diff --git a/configs/fne-config.example.yml b/configs/fne-config.example.yml index 58072975..2fb1c119 100644 --- a/configs/fne-config.example.yml +++ b/configs/fne-config.example.yml @@ -281,6 +281,15 @@ peers: # Textual location for this host. location: Anywhere, USA + # Flag indicating whether or not NAKs sent from the master to this peer will cause this peer to fall over (restart). + # (NOTE: Restarting is not handled by the FNE itself, this is simply a flag to indicate that if the master sends + # too many NAKs to this peer, this FNE instance will stop, and should be restarted by an external watchdog process.) + nakFallOver: false + # Amount of NAKs from the master to this peer that will cause this peer to fall over (restart). + # (This is only applicable if nakFallOver is true. This is a protection mechanism to prevent a bad network connection + # from causing instability in the FNE master.) + nakFallOverCount: 10 + # Flag indicating whether or not packet dumping is enabled. packetDump: false # Flag indicating whether or not verbose debug logging is enabled. diff --git a/src/fne/HostFNE.cpp b/src/fne/HostFNE.cpp index 5d0237c3..27cc2f8d 100644 --- a/src/fne/HostFNE.cpp +++ b/src/fne/HostFNE.cpp @@ -857,7 +857,10 @@ bool HostFNE::createPeerNetworks() float longitude = peerConf["longitude"].as(0.0F); std::string location = peerConf["location"].as(); - ::LogInfoEx(LOG_HOST, "Peer ID %u Master Address %s Master Port %u Enabled %u Encrypted %u", id, masterAddress.c_str(), masterPort, enabled, encrypted); + bool nakFallOver = peerConf["nakFallOver"].as(false); + uint32_t nakFallOverCount = peerConf["nakFallOverCount"].as(10U); + + ::LogInfoEx(LOG_HOST, "Peer ID %u Master Address %s Master Port %u Enabled %u Encrypted %u NAK Fall Over %u", id, masterAddress.c_str(), masterPort, enabled, encrypted, nakFallOver); std::string identOverride = peerConf["identity"].as(); if (identOverride != "") { @@ -879,6 +882,7 @@ bool HostFNE::createPeerNetworks() network->setMasterPeerId(masterPeerId); network->setPeerLookups(m_peerListLookup); network->setPeerReplicationSaveACL(m_peerReplicaSavesACL); + network->setNakFallOver(nakFallOver, nakFallOverCount); if (encrypted) { network->setPresharedKey(presharedKey); } diff --git a/src/fne/network/PeerNetwork.cpp b/src/fne/network/PeerNetwork.cpp index 206b5176..b2d517cb 100644 --- a/src/fne/network/PeerNetwork.cpp +++ b/src/fne/network/PeerNetwork.cpp @@ -13,6 +13,7 @@ #include "common/Log.h" #include "common/Utils.h" #include "fne/network/PeerNetwork.h" +#include "FNEMain.h" using namespace network; using namespace compress; @@ -55,7 +56,10 @@ PeerNetwork::PeerNetwork(const std::string& address, uint16_t port, uint16_t loc m_ridPkt(true, "Peer Replication, RID List"), m_pidPkt(true, "Peer Replication, PID List"), m_threadPool(WORKER_CNT, "peer"), - m_prevSpanningTreeChildren(0U) + m_prevSpanningTreeChildren(0U), + m_nakFallOver(false), + m_nakFallOverCount(0U), + m_nakFallOverCountThreshold(10U) { assert(!address.empty()); assert(port > 0U); @@ -97,6 +101,8 @@ bool PeerNetwork::open() if (!m_enabled) return false; + m_nakFallOverCount = 0U; + return Network::open(); } @@ -321,6 +327,9 @@ void PeerNetwork::userPacketHandler(uint32_t peerId, FrameQueue::OpcodePair opco if (m_peerReplicaCallback != nullptr) m_peerReplicaCallback(this); + // reset NAK count on reception of a replica TG + m_nakFallOverCount = 0U; + // cleanup temporary file ::remove(filename.c_str()); m_tgidPkt.clear(); @@ -380,6 +389,9 @@ void PeerNetwork::userPacketHandler(uint32_t peerId, FrameQueue::OpcodePair opco if (m_peerReplicaCallback != nullptr) m_peerReplicaCallback(this); + // reset NAK count on reception of a replica TG + m_nakFallOverCount = 0U; + // cleanup temporary file ::remove(filename.c_str()); m_ridPkt.clear(); @@ -439,6 +451,9 @@ void PeerNetwork::userPacketHandler(uint32_t peerId, FrameQueue::OpcodePair opco if (m_peerReplicaCallback != nullptr) m_peerReplicaCallback(this); + // reset NAK count on reception of a replica TG + m_nakFallOverCount = 0U; + // cleanup temporary file ::remove(filename.c_str()); m_pidPkt.clear(); @@ -502,6 +517,18 @@ bool PeerNetwork::userNakHandler(uint32_t peerId, uint16_t reason, const frame:: break; } + // if NAK fall over is enabled, track the count of NAKs received from the master and fall over (restart) if too + // many are received, this is to help mitigate cases where the master is repeatedly sending NAKs to this peer + if (m_nakFallOver) { + m_nakFallOverCount++; + LogWarning(LOG_PEER, "PEER %u received NAK from master, reason = %u, nakFallOverCount = %u", peerId, reason, m_nakFallOverCount); + + if (m_nakFallOverCount >= m_nakFallOverCountThreshold) { + LogError(LOG_PEER, "PEER %u received too many NAKs from master, nakFallOverCount = %u exceeds threshold of %u, falling over (restart)", peerId, m_nakFallOverCount, m_nakFallOverCountThreshold); + g_killed = true; + } + } + return false; // return false to perform default handling of the NAK } diff --git a/src/fne/network/PeerNetwork.h b/src/fne/network/PeerNetwork.h index fbc948f9..48fbf40b 100644 --- a/src/fne/network/PeerNetwork.h +++ b/src/fne/network/PeerNetwork.h @@ -248,6 +248,14 @@ namespace network */ void setPeerReplicationSaveACL(bool enabled) { m_peerReplicaSavesACL = enabled; } + /** + * @brief Set whether or not this peer has NAK fall-over enabled, and the amount of NAKs that will cause + * this peer to fall over (restart). + * @param enabled Flag to enable NAK fall-over. + * @param count Number of NAKs that will trigger fall-over. + */ + void setNakFallOver(bool enabled, uint32_t count) { m_nakFallOver = enabled; m_nakFallOverCountThreshold = count; } + /** * @brief Gets the remote peer ID. * @returns uint32_t Remote Peer ID. @@ -333,6 +341,10 @@ namespace network uint32_t m_prevSpanningTreeChildren; + bool m_nakFallOver; + uint32_t m_nakFallOverCount; + uint32_t m_nakFallOverCountThreshold; + /** * @brief Entry point to process a given network packet. * @param req Instance of the PeerPacketRequest structure.