add experimental NAK fallover logic, this is intended for use in situations where excess NAKs from an upstream master can be used to trigger a shutdown of the FNE, intentionally, this requires external scripts or systemd/init system to then restart the process after it automatically shuts down, the intention here is to very heavy handedly force a full state reset of the FNE in a case where it is unable to reconnect properly through other means;

pull/121/head
Bryan Biedenkapp 2 months ago
parent 20f5a45cfe
commit ed028fdb94

@ -281,6 +281,15 @@ peers:
# Textual location for this host. # Textual location for this host.
location: Anywhere, USA location: Anywhere, USA
# Flag indicating whether or not NAKs sent from the master to this peer will cause this peer to fall over (restart).
# (NOTE: Restarting is not handled by the FNE itself, this is simply a flag to indicate that if the master sends
# too many NAKs to this peer, this FNE instance will stop, and should be restarted by an external watchdog process.)
nakFallOver: false
# Amount of NAKs from the master to this peer that will cause this peer to fall over (restart).
# (This is only applicable if nakFallOver is true. This is a protection mechanism to prevent a bad network connection
# from causing instability in the FNE master.)
nakFallOverCount: 10
# Flag indicating whether or not packet dumping is enabled. # Flag indicating whether or not packet dumping is enabled.
packetDump: false packetDump: false
# Flag indicating whether or not verbose debug logging is enabled. # Flag indicating whether or not verbose debug logging is enabled.

@ -857,7 +857,10 @@ bool HostFNE::createPeerNetworks()
float longitude = peerConf["longitude"].as<float>(0.0F); float longitude = peerConf["longitude"].as<float>(0.0F);
std::string location = peerConf["location"].as<std::string>(); std::string location = peerConf["location"].as<std::string>();
::LogInfoEx(LOG_HOST, "Peer ID %u Master Address %s Master Port %u Enabled %u Encrypted %u", id, masterAddress.c_str(), masterPort, enabled, encrypted); bool nakFallOver = peerConf["nakFallOver"].as<bool>(false);
uint32_t nakFallOverCount = peerConf["nakFallOverCount"].as<uint32_t>(10U);
::LogInfoEx(LOG_HOST, "Peer ID %u Master Address %s Master Port %u Enabled %u Encrypted %u NAK Fall Over %u", id, masterAddress.c_str(), masterPort, enabled, encrypted, nakFallOver);
std::string identOverride = peerConf["identity"].as<std::string>(); std::string identOverride = peerConf["identity"].as<std::string>();
if (identOverride != "") { if (identOverride != "") {
@ -879,6 +882,7 @@ bool HostFNE::createPeerNetworks()
network->setMasterPeerId(masterPeerId); network->setMasterPeerId(masterPeerId);
network->setPeerLookups(m_peerListLookup); network->setPeerLookups(m_peerListLookup);
network->setPeerReplicationSaveACL(m_peerReplicaSavesACL); network->setPeerReplicationSaveACL(m_peerReplicaSavesACL);
network->setNakFallOver(nakFallOver, nakFallOverCount);
if (encrypted) { if (encrypted) {
network->setPresharedKey(presharedKey); network->setPresharedKey(presharedKey);
} }

@ -13,6 +13,7 @@
#include "common/Log.h" #include "common/Log.h"
#include "common/Utils.h" #include "common/Utils.h"
#include "fne/network/PeerNetwork.h" #include "fne/network/PeerNetwork.h"
#include "FNEMain.h"
using namespace network; using namespace network;
using namespace compress; using namespace compress;
@ -55,7 +56,10 @@ PeerNetwork::PeerNetwork(const std::string& address, uint16_t port, uint16_t loc
m_ridPkt(true, "Peer Replication, RID List"), m_ridPkt(true, "Peer Replication, RID List"),
m_pidPkt(true, "Peer Replication, PID List"), m_pidPkt(true, "Peer Replication, PID List"),
m_threadPool(WORKER_CNT, "peer"), m_threadPool(WORKER_CNT, "peer"),
m_prevSpanningTreeChildren(0U) m_prevSpanningTreeChildren(0U),
m_nakFallOver(false),
m_nakFallOverCount(0U),
m_nakFallOverCountThreshold(10U)
{ {
assert(!address.empty()); assert(!address.empty());
assert(port > 0U); assert(port > 0U);
@ -97,6 +101,8 @@ bool PeerNetwork::open()
if (!m_enabled) if (!m_enabled)
return false; return false;
m_nakFallOverCount = 0U;
return Network::open(); return Network::open();
} }
@ -321,6 +327,9 @@ void PeerNetwork::userPacketHandler(uint32_t peerId, FrameQueue::OpcodePair opco
if (m_peerReplicaCallback != nullptr) if (m_peerReplicaCallback != nullptr)
m_peerReplicaCallback(this); m_peerReplicaCallback(this);
// reset NAK count on reception of a replica TG
m_nakFallOverCount = 0U;
// cleanup temporary file // cleanup temporary file
::remove(filename.c_str()); ::remove(filename.c_str());
m_tgidPkt.clear(); m_tgidPkt.clear();
@ -380,6 +389,9 @@ void PeerNetwork::userPacketHandler(uint32_t peerId, FrameQueue::OpcodePair opco
if (m_peerReplicaCallback != nullptr) if (m_peerReplicaCallback != nullptr)
m_peerReplicaCallback(this); m_peerReplicaCallback(this);
// reset NAK count on reception of a replica TG
m_nakFallOverCount = 0U;
// cleanup temporary file // cleanup temporary file
::remove(filename.c_str()); ::remove(filename.c_str());
m_ridPkt.clear(); m_ridPkt.clear();
@ -439,6 +451,9 @@ void PeerNetwork::userPacketHandler(uint32_t peerId, FrameQueue::OpcodePair opco
if (m_peerReplicaCallback != nullptr) if (m_peerReplicaCallback != nullptr)
m_peerReplicaCallback(this); m_peerReplicaCallback(this);
// reset NAK count on reception of a replica TG
m_nakFallOverCount = 0U;
// cleanup temporary file // cleanup temporary file
::remove(filename.c_str()); ::remove(filename.c_str());
m_pidPkt.clear(); m_pidPkt.clear();
@ -502,6 +517,18 @@ bool PeerNetwork::userNakHandler(uint32_t peerId, uint16_t reason, const frame::
break; break;
} }
// if NAK fall over is enabled, track the count of NAKs received from the master and fall over (restart) if too
// many are received, this is to help mitigate cases where the master is repeatedly sending NAKs to this peer
if (m_nakFallOver) {
m_nakFallOverCount++;
LogWarning(LOG_PEER, "PEER %u received NAK from master, reason = %u, nakFallOverCount = %u", peerId, reason, m_nakFallOverCount);
if (m_nakFallOverCount >= m_nakFallOverCountThreshold) {
LogError(LOG_PEER, "PEER %u received too many NAKs from master, nakFallOverCount = %u exceeds threshold of %u, falling over (restart)", peerId, m_nakFallOverCount, m_nakFallOverCountThreshold);
g_killed = true;
}
}
return false; // return false to perform default handling of the NAK return false; // return false to perform default handling of the NAK
} }

@ -248,6 +248,14 @@ namespace network
*/ */
void setPeerReplicationSaveACL(bool enabled) { m_peerReplicaSavesACL = enabled; } void setPeerReplicationSaveACL(bool enabled) { m_peerReplicaSavesACL = enabled; }
/**
* @brief Set whether or not this peer has NAK fall-over enabled, and the amount of NAKs that will cause
* this peer to fall over (restart).
* @param enabled Flag to enable NAK fall-over.
* @param count Number of NAKs that will trigger fall-over.
*/
void setNakFallOver(bool enabled, uint32_t count) { m_nakFallOver = enabled; m_nakFallOverCountThreshold = count; }
/** /**
* @brief Gets the remote peer ID. * @brief Gets the remote peer ID.
* @returns uint32_t Remote Peer ID. * @returns uint32_t Remote Peer ID.
@ -333,6 +341,10 @@ namespace network
uint32_t m_prevSpanningTreeChildren; uint32_t m_prevSpanningTreeChildren;
bool m_nakFallOver;
uint32_t m_nakFallOverCount;
uint32_t m_nakFallOverCountThreshold;
/** /**
* @brief Entry point to process a given network packet. * @brief Entry point to process a given network packet.
* @param req Instance of the PeerPacketRequest structure. * @param req Instance of the PeerPacketRequest structure.

Loading…
Cancel
Save

Powered by TurnKey Linux.