Просмотр исходного кода

More tweaks to algorithm for determining when to fail over to TCP, and stop supernodes from resynchronizing unless explicitly ordered.

Adam Ierymenko 11 лет назад
Родитель
Сommit
700a450806
6 измененных файлов с 70 добавлено и 43 удалено
  1. 7 2
      node/Constants.hpp
  2. 16 11
      node/Node.cpp
  3. 2 0
      node/PacketDecoder.cpp
  4. 14 23
      node/Peer.cpp
  5. 28 2
      node/Peer.hpp
  6. 3 5
      node/Topology.hpp

+ 7 - 2
node/Constants.hpp

@@ -305,9 +305,14 @@ error_no_byte_order_defined;
 #define ZT_TCP_TUNNEL_ACTIVITY_TIMEOUT ZT_PEER_PATH_ACTIVITY_TIMEOUT
 
 /**
- * Try TCP tunnels if no response to UDP PINGs in this many milliseconds
+ * Try TCP tunnels if nothing received for this long
  */
-#define ZT_PING_UNANSWERED_AFTER 5000
+#define ZT_TCP_TUNNEL_FAILOVER_TIMEOUT 5000
+
+/**
+ * Try to ping supernodes this often until we get something from somewhere
+ */
+#define ZT_STARTUP_AGGRO 5000
 
 /**
  * Stop relaying via peers that have not responded to direct sends in this long

+ 16 - 11
node/Node.cpp

@@ -540,7 +540,7 @@ Node::ReasonForTermination Node::run()
 		long lastDelayDelta = 0;
 
 		uint64_t networkConfigurationFingerprint = 0;
-		_r->timeOfLastResynchronize = 0;
+		_r->timeOfLastResynchronize = Utils::now();
 
 		while (impl->reasonForTermination == NODE_RUNNING) {
 			if (Utils::fileExists(shutdownIfUnreadablePath.c_str(),false)) {
@@ -551,13 +551,7 @@ Node::ReasonForTermination Node::run()
 			}
 
 			uint64_t now = Utils::now();
-
-			// Did the user send SIGHUP or otherwise order network resync? (mostly for debugging)
-			bool resynchronize = impl->resynchronize;
-			impl->resynchronize = false;
-			if (resynchronize) {
-				LOG("manual resynchronize ordered, resyncing with network");
-			}
+			bool resynchronize = false;
 
 			// If it looks like the computer slept and woke, resynchronize.
 			if (lastDelayDelta >= ZT_SLEEP_WAKE_DETECTION_THRESHOLD) {
@@ -577,18 +571,29 @@ Node::ReasonForTermination Node::run()
 				}
 			}
 
+			// Supernodes do not resynchronize unless explicitly ordered via SIGHUP.
+			if ((resynchronize)&&(_r->topology->amSupernode()))
+				resynchronize = false;
+
+			// Check for SIGHUP / force resync.
+			if (impl->resynchronize) {
+				impl->resynchronize = false;
+				resynchronize = true;
+				LOG("resynchronize forced by user, syncing with network");
+			}
+
 			if (resynchronize)
 				_r->timeOfLastResynchronize = now;
 
 			/* Ping supernodes separately, and do so more aggressively if we haven't
 			 * heard anything from anyone since our last resynchronize / startup. */
 			if ( ((now - lastSupernodePing) >= ZT_PEER_DIRECT_PING_DELAY) ||
-			     ((_r->timeOfLastResynchronize > _r->timeOfLastPacketReceived) && ((now - lastSupernodePing) >= ZT_PING_UNANSWERED_AFTER)) ) {
+			     ((_r->timeOfLastResynchronize > _r->timeOfLastPacketReceived) && ((now - lastSupernodePing) >= ZT_STARTUP_AGGRO)) ) {
 				lastSupernodePing = now;
 				std::vector< SharedPtr<Peer> > sns(_r->topology->supernodePeers());
 				TRACE("pinging %d supernodes",(int)sns.size());
 				for(std::vector< SharedPtr<Peer> >::const_iterator p(sns.begin());p!=sns.end();++p)
-					(*p)->sendPing(_r,now,resynchronize);
+					(*p)->sendPing(_r,now);
 			}
 
 			if (resynchronize) {
@@ -625,7 +630,7 @@ Node::ReasonForTermination Node::run()
 					if ((now - lastPingCheck) >= ZT_PING_CHECK_DELAY) {
 						lastPingCheck = now;
 						try {
-							_r->topology->eachPeer(Topology::PingPeersThatNeedPing(_r,now,resynchronize));
+							_r->topology->eachPeer(Topology::PingPeersThatNeedPing(_r,now));
 							_r->topology->eachPeer(Topology::OpenPeersThatNeedFirewallOpener(_r,now));
 						} catch (std::exception &exc) {
 							LOG("unexpected exception running ping check cycle: %s",exc.what());

+ 2 - 0
node/PacketDecoder.cpp

@@ -490,6 +490,7 @@ bool PacketDecoder::_doMULTICAST_FRAME(const RuntimeEnvironment *_r,const Shared
 		const unsigned int signatureLen = at<uint16_t>(ZT_PROTO_VERB_MULTICAST_FRAME_IDX_FRAME + frameLen);
 		const unsigned char *const signature = field(ZT_PROTO_VERB_MULTICAST_FRAME_IDX_FRAME + frameLen + 2,signatureLen);
 
+		/*
 		TRACE("MULTICAST_FRAME @%.16llx #%.16llx from %s<%s> via %s(%s) to %s [ %s, %d bytes, depth %d ]",
 			(unsigned long long)nwid,
 			(unsigned long long)guid,
@@ -499,6 +500,7 @@ bool PacketDecoder::_doMULTICAST_FRAME(const RuntimeEnvironment *_r,const Shared
 			Switch::etherTypeName(etherType),
 			(int)frameLen,
 			(int)depth);
+		*/
 
 		SharedPtr<Network> network(_r->nc->network(nwid));
 

+ 14 - 23
node/Peer.cpp

@@ -117,8 +117,13 @@ void Peer::receive(
 		_lastMulticastFrame = now;
 }
 
+
 bool Peer::send(const RuntimeEnvironment *_r,const void *data,unsigned int len,uint64_t now)
 {
+	// Note: we'll still use TCP here if that's all we have, but if this
+	// is false we will prefer UDP.
+	bool useTcp = isTcpFailoverTime(_r,now);
+
 	Mutex::Lock _l(_lock);
 
 	std::vector<Path>::iterator p(_paths.begin());
@@ -127,11 +132,13 @@ bool Peer::send(const RuntimeEnvironment *_r,const void *data,unsigned int len,u
 
 	uint64_t bestPathLastReceived = p->lastReceived();
 	std::vector<Path>::iterator bestPath = p;
+	bool bestPathIsTcp = p->tcp();
 	while (++p != _paths.end()) {
 		uint64_t lr = p->lastReceived();
-		if (lr > bestPathLastReceived) {
+		if ( (lr > bestPathLastReceived) || ((bestPathIsTcp)&&(!useTcp)) ) {
 			bestPathLastReceived = lr;
 			bestPath = p;
+			bestPathIsTcp = p->tcp();
 		}
 	}
 
@@ -156,35 +163,19 @@ bool Peer::sendFirewallOpener(const RuntimeEnvironment *_r,uint64_t now)
 	return sent;
 }
 
-bool Peer::sendPing(const RuntimeEnvironment *_r,uint64_t now,bool firstSinceReset)
+bool Peer::sendPing(const RuntimeEnvironment *_r,uint64_t now)
 {
 	bool sent = false;
 	SharedPtr<Peer> self(this);
 
-	Mutex::Lock _l(_lock);
-
-	// NOTE: this will never ping a peer that has *only* TCP paths. Right
-	// now there's never such a thing as TCP is only for failover.
-
-	bool pingTcp;
-	if (!firstSinceReset) {
-		uint64_t lastUdp = 0;
-		uint64_t lastTcp = 0;
-		uint64_t lastPing = 0;
-		for(std::vector<Path>::iterator p(_paths.begin());p!=_paths.end();++p) {
-			if (p->tcp())
-				lastTcp = std::max(p->lastReceived(),lastTcp);
-			else lastUdp = std::max(p->lastReceived(),lastUdp);
-			lastPing = std::max(p->lastPing(),lastPing);
-		}
-		uint64_t lastAny = std::max(lastUdp,lastTcp);
-		pingTcp = ( ( (lastAny < lastPing) && ((lastPing - lastAny) >= ZT_PING_UNANSWERED_AFTER) ) || (lastTcp > lastUdp) );
-	} else pingTcp = false;
+	// In the ping case we will never send TCP unless this returns true.
+	bool useTcp = isTcpFailoverTime(_r,now);
 
-	TRACE("PING %s (pingTcp==%d)",_id.address().toString().c_str(),(int)pingTcp);
+	TRACE("PING %s (useTcp==%d)",_id.address().toString().c_str(),(int)useTcp);
 
+	Mutex::Lock _l(_lock);
 	for(std::vector<Path>::iterator p(_paths.begin());p!=_paths.end();++p) {
-		if ((pingTcp)||(!p->tcp())) {
+		if ((useTcp)||(!p->tcp())) {
 			if (_r->sw->sendHELLO(self,*p)) {
 				p->sent(now);
 				p->pinged(now);

+ 28 - 2
node/Peer.hpp

@@ -160,10 +160,9 @@ public:
 	 * 
 	 * @param _r Runtime environment
 	 * @param now Current time
-	 * @param firstSinceReset If true, this is the first ping sent since a network reset
 	 * @return True if send appears successful for at least one address type
 	 */
-	bool sendPing(const RuntimeEnvironment *_r,uint64_t now,bool firstSinceReset);
+	bool sendPing(const RuntimeEnvironment *_r,uint64_t now);
 
 	/**
 	 * Called periodically by Topology::clean() to remove stale paths and do other cleanup
@@ -263,6 +262,33 @@ public:
 		return _lastAnnouncedTo;
 	}
 
+	/**
+	 * @param _r Runtime environment
+	 * @param now Current time
+	 * @return True if it's time to attempt TCP failover (if we have TCP_OUT paths)
+	 */
+	inline bool isTcpFailoverTime(const RuntimeEnvironment *_r,uint64_t now) const
+		throw()
+	{
+		if ((now - _r->timeOfLastResynchronize) >= ZT_TCP_TUNNEL_FAILOVER_TIMEOUT) {
+			uint64_t lastUdpPingSent = 0;
+			uint64_t lastUdpReceive = 0;
+
+			{
+				Mutex::Lock _l(_lock);
+				for(std::vector<Path>::const_iterator p(_paths.begin());p!=_paths.end();++p) {
+					if (p->type() == Path::PATH_TYPE_UDP) {
+						lastUdpPingSent = std::max(lastUdpPingSent,p->lastPing());
+						lastUdpReceive = std::max(lastUdpReceive,p->lastReceived());
+					}
+				}
+			}
+
+			return ( (lastUdpPingSent > lastUdpReceive) && ((now - lastUdpPingSent) >= ZT_TCP_TUNNEL_FAILOVER_TIMEOUT) );
+		}
+		return false;
+	}
+
 	/**
 	 * @return Current latency or 0 if unknown (max: 65535)
 	 */

+ 3 - 5
node/Topology.hpp

@@ -207,11 +207,10 @@ public:
 	class PingPeersThatNeedPing
 	{
 	public:
-		PingPeersThatNeedPing(const RuntimeEnvironment *renv,uint64_t now,bool firstSinceReset) throw() :
+		PingPeersThatNeedPing(const RuntimeEnvironment *renv,uint64_t now) throw() :
 			_now(now),
 			_supernodeAddresses(renv->topology->supernodeAddresses()),
-			_r(renv),
-			_firstSinceReset(firstSinceReset) {}
+			_r(renv) {}
 
 		inline void operator()(Topology &t,const SharedPtr<Peer> &p)
 		{
@@ -228,14 +227,13 @@ public:
 			       /* 2b: peer is not a supernode */
 					   (!_supernodeAddresses.count(p->address()))
 			     )
-			   ) { p->sendPing(_r,_now,_firstSinceReset); }
+			   ) { p->sendPing(_r,_now); }
 		}
 
 	private:
 		uint64_t _now;
 		std::set<Address> _supernodeAddresses;
 		const RuntimeEnvironment *_r;
-		bool _firstSinceReset;
 	};
 
 	/**