Browse Source

Tighten up dead path detection. Should now auto-detect dead paths in less than 10 seconds at a very small cost in ECHO requests (or HELLOs for older peers). GitHib issue #272

Adam Ierymenko 9 years ago
parent
commit
05b2c0743f
3 changed files with 65 additions and 11 deletions
  1. 1 1
      node/Constants.hpp
  2. 37 6
      node/Path.hpp
  3. 27 4
      node/Peer.cpp

+ 1 - 1
node/Constants.hpp

@@ -274,7 +274,7 @@
 /**
 /**
  * No answer timeout to trigger dead path detection
  * No answer timeout to trigger dead path detection
  */
  */
-#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 3000
+#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 2500
 
 
 /**
 /**
  * Probation threshold after which a path becomes dead
  * Probation threshold after which a path becomes dead

+ 37 - 6
node/Path.hpp

@@ -66,6 +66,8 @@ class Path
 public:
 public:
 	Path() :
 	Path() :
 		_lastSend(0),
 		_lastSend(0),
+		_lastPing(0),
+		_lastKeepalive(0),
 		_lastReceived(0),
 		_lastReceived(0),
 		_addr(),
 		_addr(),
 		_localAddress(),
 		_localAddress(),
@@ -76,6 +78,8 @@ public:
 
 
 	Path(const InetAddress &localAddress,const InetAddress &addr) :
 	Path(const InetAddress &localAddress,const InetAddress &addr) :
 		_lastSend(0),
 		_lastSend(0),
+		_lastPing(0),
+		_lastKeepalive(0),
 		_lastReceived(0),
 		_lastReceived(0),
 		_addr(addr),
 		_addr(addr),
 		_localAddress(localAddress),
 		_localAddress(localAddress),
@@ -98,10 +102,21 @@ public:
 	 *
 	 *
 	 * @param t Time of send
 	 * @param t Time of send
 	 */
 	 */
-	inline void sent(uint64_t t)
-	{
-		_lastSend = t;
-	}
+	inline void sent(uint64_t t) { _lastSend = t; }
+
+	/**
+	 * Called when we've sent a ping or echo
+	 *
+	 * @param t Time of send
+	 */
+	inline void pinged(uint64_t t) { _lastPing = t; }
+
+	/**
+	 * Called when we send a NAT keepalive
+	 *
+	 * @param t Time of send
+	 */
+	inline void sentKeepalive(uint64_t t) { _lastKeepalive = t; }
 
 
 	/**
 	/**
 	 * Called when a packet is received from this remote path
 	 * Called when a packet is received from this remote path
@@ -145,6 +160,16 @@ public:
 	 */
 	 */
 	inline uint64_t lastSend() const throw() { return _lastSend; }
 	inline uint64_t lastSend() const throw() { return _lastSend; }
 
 
+	/**
+	 * @return Time we last pinged or dead path checked this link
+	 */
+	inline uint64_t lastPing() const throw() { return _lastPing; }
+
+	/**
+	 * @return Time of last keepalive
+	 */
+	inline uint64_t lastKeepalive() const throw() { return _lastKeepalive; }
+
 	/**
 	/**
 	 * @return Time of last receive from this path
 	 * @return Time of last receive from this path
 	 */
 	 */
@@ -260,8 +285,10 @@ public:
 	template<unsigned int C>
 	template<unsigned int C>
 	inline void serialize(Buffer<C> &b) const
 	inline void serialize(Buffer<C> &b) const
 	{
 	{
-		b.append((uint8_t)1); // version
+		b.append((uint8_t)2); // version
 		b.append((uint64_t)_lastSend);
 		b.append((uint64_t)_lastSend);
+		b.append((uint64_t)_lastPing);
+		b.append((uint64_t)_lastKeepalive);
 		b.append((uint64_t)_lastReceived);
 		b.append((uint64_t)_lastReceived);
 		_addr.serialize(b);
 		_addr.serialize(b);
 		_localAddress.serialize(b);
 		_localAddress.serialize(b);
@@ -273,9 +300,11 @@ public:
 	inline unsigned int deserialize(const Buffer<C> &b,unsigned int startAt = 0)
 	inline unsigned int deserialize(const Buffer<C> &b,unsigned int startAt = 0)
 	{
 	{
 		unsigned int p = startAt;
 		unsigned int p = startAt;
-		if (b[p++] != 1)
+		if (b[p++] != 2)
 			throw std::invalid_argument("invalid serialized Path");
 			throw std::invalid_argument("invalid serialized Path");
 		_lastSend = b.template at<uint64_t>(p); p += 8;
 		_lastSend = b.template at<uint64_t>(p); p += 8;
+		_lastPing = b.template at<uint64_t>(p); p += 8;
+		_lastKeepalive = b.template at<uint64_t>(p); p += 8;
 		_lastReceived = b.template at<uint64_t>(p); p += 8;
 		_lastReceived = b.template at<uint64_t>(p); p += 8;
 		p += _addr.deserialize(b,p);
 		p += _addr.deserialize(b,p);
 		p += _localAddress.deserialize(b,p);
 		p += _localAddress.deserialize(b,p);
@@ -290,6 +319,8 @@ public:
 
 
 private:
 private:
 	uint64_t _lastSend;
 	uint64_t _lastSend;
+	uint64_t _lastPing;
+	uint64_t _lastKeepalive;
 	uint64_t _lastReceived;
 	uint64_t _lastReceived;
 	InetAddress _addr;
 	InetAddress _addr;
 	InetAddress _localAddress;
 	InetAddress _localAddress;

+ 27 - 4
node/Peer.cpp

@@ -168,7 +168,10 @@ void Peer::received(
 					} else {
 					} else {
 						uint64_t slotLRmin = 0xffffffffffffffffULL;
 						uint64_t slotLRmin = 0xffffffffffffffffULL;
 						for(unsigned int p=0;p<ZT_MAX_PEER_NETWORK_PATHS;++p) {
 						for(unsigned int p=0;p<ZT_MAX_PEER_NETWORK_PATHS;++p) {
-							if (_paths[p].lastReceived() <= slotLRmin) {
+							if (!_paths[p].active(now)) {
+								slot = &(_paths[p]);
+								break;
+							} else if (_paths[p].lastReceived() <= slotLRmin) {
 								slotLRmin = _paths[p].lastReceived();
 								slotLRmin = _paths[p].lastReceived();
 								slot = &(_paths[p]);
 								slot = &(_paths[p]);
 							}
 							}
@@ -249,11 +252,12 @@ bool Peer::doPingAndKeepalive(uint64_t now,int inetAddressFamily)
 			//TRACE("PING %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived());
 			//TRACE("PING %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived());
 			sendHELLO(p->localAddress(),p->address(),now);
 			sendHELLO(p->localAddress(),p->address(),now);
 			p->sent(now);
 			p->sent(now);
-		} else if (((now - p->lastSend()) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) {
+			p->pinged(now);
+		} else if (((now - std::max(p->lastSend(),p->lastKeepalive())) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) {
 			//TRACE("NAT keepalive %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived());
 			//TRACE("NAT keepalive %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived());
 			_natKeepaliveBuf += (uint32_t)((now * 0x9e3779b1) >> 1); // tumble this around to send constantly varying (meaningless) payloads
 			_natKeepaliveBuf += (uint32_t)((now * 0x9e3779b1) >> 1); // tumble this around to send constantly varying (meaningless) payloads
 			RR->node->putPacket(p->localAddress(),p->address(),&_natKeepaliveBuf,sizeof(_natKeepaliveBuf));
 			RR->node->putPacket(p->localAddress(),p->address(),&_natKeepaliveBuf,sizeof(_natKeepaliveBuf));
-			p->sent(now);
+			p->sentKeepalive(now);
 		} else {
 		} else {
 			//TRACE("no PING or NAT keepalive: addr==%s reliable==%d %llums/%llums send/receive inactivity",p->address().toString().c_str(),(int)p->reliable(),now - p->lastSend(),now - p->lastReceived());
 			//TRACE("no PING or NAT keepalive: addr==%s reliable==%d %llums/%llums send/receive inactivity",p->address().toString().c_str(),(int)p->reliable(),now - p->lastSend(),now - p->lastReceived());
 		}
 		}
@@ -339,6 +343,8 @@ bool Peer::resetWithinScope(InetAddress::IpScope scope,uint64_t now)
 	unsigned int y = 0;
 	unsigned int y = 0;
 	while (x < np) {
 	while (x < np) {
 		if (_paths[x].address().ipScope() == scope) {
 		if (_paths[x].address().ipScope() == scope) {
+			// Resetting a path means sending a HELLO and then forgetting it. If we
+			// get OK(HELLO) then it will be re-learned.
 			sendHELLO(_paths[x].localAddress(),_paths[x].address(),now);
 			sendHELLO(_paths[x].localAddress(),_paths[x].address(),now);
 		} else {
 		} else {
 			_paths[y++] = _paths[x];
 			_paths[y++] = _paths[x];
@@ -491,7 +497,22 @@ bool Peer::_checkPath(Path &p,const uint64_t now)
 	if (!p.active(now))
 	if (!p.active(now))
 		return false;
 		return false;
 
 
-	if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) {
+	/* Dead path detection: if we have sent something to this peer and have not
+	 * yet received a reply, double check this path. The majority of outbound
+	 * packets including Ethernet frames do generate some kind of reply either
+	 * immediately or at some point in the near future. This will occasionally
+	 * (every NO_ANSWER_TIMEOUT ms) check paths unnecessarily if traffic that
+	 * does not generate a response is being sent such as multicast announcements
+	 * or frames belonging to unidirectional UDP protocols, but the cost is very
+	 * tiny and the benefit in reliability is very large. This takes care of many
+	 * failure modes including crap NATs that forget links and spurious changes
+	 * to physical network topology that cannot be otherwise detected.
+	 *
+	 * Each time we do this we increment a probation counter in the path. This
+	 * counter is reset on any packet receive over this path. If it reaches the
+	 * MAX_PROBATION threshold the path is considred dead. */
+
+	if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) && ((now - p.lastPing()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) {
 		TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation());
 		TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation());
 
 
 		if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) {
 		if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) {
@@ -499,9 +520,11 @@ bool Peer::_checkPath(Path &p,const uint64_t now)
 			Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO);
 			Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO);
 			outp.armor(_key,true);
 			outp.armor(_key,true);
 			p.send(RR,outp.data(),outp.size(),now);
 			p.send(RR,outp.data(),outp.size(),now);
+			p.pinged(now);
 		} else {
 		} else {
 			sendHELLO(p.localAddress(),p.address(),now);
 			sendHELLO(p.localAddress(),p.address(),now);
 			p.sent(now);
 			p.sent(now);
+			p.pinged(now);
 		}
 		}
 
 
 		p.increaseProbation();
 		p.increaseProbation();