Move metadata updates into their own thread pool so that failed metadata updates cannot exhaust the traffic thread pool; add better logic for tracking in-flight and pending metadata announcements to peers; add a check that drops an update entirely if the peer disappears.

pull/121/merge
Bryan Biedenkapp 1 week ago
parent 739c8bbf85
commit 608a7a5b48

@ -42,7 +42,7 @@ MetadataNetwork::MetadataNetwork(HostFNE* host, TrafficNetwork* trafficNetwork,
m_status(NET_STAT_INVALID),
m_peerReplicaActPkt(),
m_peerTreeListPkt(),
m_threadPool(workerCnt, "diag")
m_threadPool(workerCnt, "meta")
{
assert(trafficNetwork != nullptr);
assert(host != nullptr);

@ -141,6 +141,9 @@ TrafficNetwork::TrafficNetwork(HostFNE* host, const std::string& address, uint16
m_jitterMaxSize(4U),
m_jitterMaxWait(40000U),
m_threadPool(workerCnt, "fne"),
m_metadataUpdateThreadPool(workerCnt / 2U, "mupdt"),
m_metadataUpdateMutex(),
m_metadataUpdateState(),
m_disablePacketData(false),
m_dumpPacketData(false),
m_verbosePacketData(false),
@ -724,6 +727,9 @@ bool TrafficNetwork::open()
// start thread pool
m_threadPool.start();
// start metadata thread pool
m_metadataUpdateThreadPool.start();
// start FluxQL thread pool
if (m_enableInfluxDB) {
influxdb::detail::TSCaller::start();
@ -780,6 +786,16 @@ void TrafficNetwork::close()
m_threadPool.stop();
m_threadPool.wait();
// stop metadata thread pool
m_metadataUpdateThreadPool.stop();
m_metadataUpdateThreadPool.wait();
// scope is intentional
{
std::lock_guard<std::mutex> lock(m_metadataUpdateMutex);
m_metadataUpdateState.clear();
}
// stop FluxQL thread pool
if (m_enableInfluxDB) {
influxdb::detail::TSCaller::stop();
@ -2408,17 +2424,63 @@ void TrafficNetwork::processInCallCtrl(network::NET_ICC::ENUM command, network::
/* Requests a metadata update for the specified peer on the metadata update thread pool, coalescing duplicate requests. */
void TrafficNetwork::peerMetadataUpdate(uint32_t peerId)
{
    // a peer ID of zero is ignored
    if (peerId == 0U) {
        return;
    }

    // scope is intentional
    {
        std::lock_guard<std::mutex> lock(m_metadataUpdateMutex);

        // operator[] creates a default (not in flight, not pending) entry for a peer seen for the first time
        MetadataUpdateState& state = m_metadataUpdateState[peerId];
        if (state.inFlight) {
            // coalesce duplicate requests while one update is running
            LogWarning(LOG_MASTER, "PEER %u metadata update already in flight, coalescing duplicate request", peerId);
            state.pending = true;
            return;
        }

        if (state.pending) {
            // a request is already queued for this peer
            LogWarning(LOG_MASTER, "PEER %u metadata update already pending, coalescing duplicate request", peerId);
            return;
        }

        // mark the request as queued before the task is handed to the pool
        state.pending = true;
    }

    MetadataUpdateRequest* req = new MetadataUpdateRequest();
    req->obj = this;
    req->peerId = peerId;

    // enqueue the task on the dedicated metadata update pool only
    if (!m_metadataUpdateThreadPool.enqueue(new_pooltask(taskMetadataUpdate, req))) {
        LogError(LOG_NET, "Failed to task enqueue metadata update, peerId = %u", peerId);

        // roll back the pending flag set above so a later request for this peer is not
        // mistaken for a duplicate; scope is intentional
        {
            std::lock_guard<std::mutex> lock(m_metadataUpdateMutex);
            auto it = m_metadataUpdateState.find(peerId);
            if (it != m_metadataUpdateState.end()) {
                it->second.pending = false;
                if (!it->second.inFlight) {
                    m_metadataUpdateState.erase(it);
                }
            }
        }

        // the request was not queued, so it is released here
        delete req;
    }
}
/* Helper to send the network metadata to the specified peer in a separate thread. */
@ -2435,6 +2497,44 @@ void TrafficNetwork::taskMetadataUpdate(MetadataUpdateRequest* req)
if (req == nullptr)
return;
while (true) {
// scope is intentional
{
std::lock_guard<std::mutex> lock(network->m_metadataUpdateMutex);
// check if there is a pending metadata update for this peer
MetadataUpdateState& state = network->m_metadataUpdateState[req->peerId];
if (!state.pending) {
// no pending metadata update for this peer, exit the loop
state.inFlight = false;
network->m_metadataUpdateState.erase(req->peerId);
break;
}
// check if the peer connection is still valid and connected
FNEPeerConnection* connection = network->m_peers[req->peerId];
if (connection != nullptr) {
if (!connection->connected()) {
// peer connection is not connected, skip the metadata update
LogWarning(LOG_MASTER, "PEER %u (%s) not connected, skipping metadata update", req->peerId, connection->identWithQualifier().c_str());
state.pending = false;
state.inFlight = false;
network->m_metadataUpdateState.erase(req->peerId);
break;
}
} else {
// peer connection is not found, skip the metadata update
LogWarning(LOG_MASTER, "PEER %u not found, skipping metadata update", req->peerId);
state.pending = false;
state.inFlight = false;
network->m_metadataUpdateState.erase(req->peerId);
break;
}
state.pending = false;
state.inFlight = true;
}
std::string peerIdentity = network->resolvePeerIdentity(req->peerId);
FNEPeerConnection* connection = network->m_peers[req->peerId];
@ -2468,6 +2568,7 @@ void TrafficNetwork::taskMetadataUpdate(MetadataUpdateRequest* req)
connection->unlock();
}
}
}
delete req;
}

@ -404,6 +404,24 @@ namespace network
uint32_t m_jitterMaxWait;
ThreadPool m_threadPool;
ThreadPool m_metadataUpdateThreadPool;
/**
 * @brief Tracks the progress of a metadata update for a single peer ID.
 * @ingroup fne_network
 */
struct MetadataUpdateState {
    /**
     * @brief True while a metadata update for this peer ID is in flight.
     */
    bool inFlight{false};
    /**
     * @brief True while a metadata update for this peer ID is pending.
     */
    bool pending{false};
};
std::mutex m_metadataUpdateMutex;
std::unordered_map<uint32_t, MetadataUpdateState> m_metadataUpdateState;
bool m_disablePacketData;
bool m_dumpPacketData;

Loading…
Cancel
Save

Powered by TurnKey Linux.