Browse Source

Introduced basic multipath support

Joseph Henry 7 years ago
parent
commit
6a2ba4baca
17 changed files with 1310 additions and 41 deletions
  1. 36 0
      include/ZeroTierOne.h
  2. 92 0
      node/Constants.hpp
  3. 4 1
      node/IncomingPacket.cpp
  4. 1 1
      node/Multicaster.cpp
  5. 2 2
      node/Node.cpp
  6. 5 0
      node/Node.hpp
  7. 260 5
      node/Path.hpp
  8. 272 17
      node/Peer.cpp
  9. 31 7
      node/Peer.hpp
  10. 315 0
      node/RingBuffer.hpp
  11. 3 3
      node/Switch.cpp
  12. 1 1
      node/Topology.hpp
  13. 20 0
      node/Trace.cpp
  14. 6 0
      node/Trace.hpp
  15. 15 0
      osdep/Binder.hpp
  16. 194 1
      osdep/Phy.hpp
  17. 53 3
      service/OneService.cpp

+ 36 - 0
include/ZeroTierOne.h

@@ -422,6 +422,42 @@ enum ZT_ResultCode
  */
  */
 #define ZT_ResultCode_isFatal(x) ((((int)(x)) >= 100)&&(((int)(x)) < 1000))
 #define ZT_ResultCode_isFatal(x) ((((int)(x)) >= 100)&&(((int)(x)) < 1000))
 
 
+/**
+ * The multipath algorithm in use by this node.
+ */
+enum ZT_MultipathMode
+{
+	/**
+	 * No active multipath.
+	 *
+	 * Traffic is merely sent over the strongest path. That being
+	 * said, this mode will automatically failover in the event that a link goes down.
+	 */
+	ZT_MULTIPATH_NONE = 0,
+
+	/**
+	 * Traffic is randomly distributed among all active paths.
+	 *
+	 * Will cease sending traffic over links that appear to be stale.
+	 */
+	ZT_MULTIPATH_RANDOM = 1,
+
+	/**
+	 * Traffic is allocated across all active paths in proportion to their strength and
+	 * reliability.
+	 *
+	 * Will cease sending traffic over links that appear to be stale.
+	 */
+	ZT_MULTIPATH_PROPORTIONALLY_BALANCED = 2,
+
+	/**
+	 * Traffic is allocated across a user-defined interface/allocation
+	 *
+	 * Will cease sending traffic over links that appear to be stale.
+	 */
+	ZT_MULTIPATH_MANUALLY_BALANCED = 3
+};
+
 /**
 /**
  * Status codes sent to status update callback when things happen
  * Status codes sent to status update callback when things happen
  */
  */

+ 92 - 0
node/Constants.hpp

@@ -267,6 +267,98 @@
  */
  */
 #define ZT_PING_CHECK_INVERVAL 5000
 #define ZT_PING_CHECK_INVERVAL 5000
 
 
+/**
+ * Length of interface name
+ */
+#define ZT_PATH_INTERFACE_NAME_SZ 16
+
+/**
+ * How frequently to check for changes to the system's network interfaces. When
+ * the service decides to use this constant it's because we want to react more
+ * quickly to new interfaces that pop up or go down.
+ */
+#define ZT_MULTIPATH_BINDER_REFRESH_PERIOD 5000
+
+/**
+ * Path choice history window size. This is used to keep track of which paths were
+ * previously selected so that we can maintain a target allocation over time.
+ */
+#define ZT_MULTIPATH_PROPORTION_WIN_SZ 128
+
+/**
+ * Threshold for flow to be considered balanced.
+ */
+#define ZT_MULTIPATH_FLOW_BALANCE_THESHOLD 0.80
+
+/**
+ * Number of samples to consider when computing path statistics
+ */
+#define ZT_PATH_QUALITY_METRIC_WIN_SZ 128
+
+/**
+ * How often important path metrics are sampled (in ms). These metrics are later used
+ * for path quality estimates
+ */
+#define ZT_PATH_QUALITY_SAMPLE_INTERVAL 100
+
+/**
+ * How often new path quality estimates are computed
+ */
+#define ZT_PATH_QUALITY_ESTIMATE_INTERVAL 100
+
+/**
+ * How often we will sample packet latency. Should be at least greater than ZT_PING_CHECK_INVERVAL
+ * since we will record a 0 bit/s measurement if no valid latency measurement was made within this
+ * window of time.
+ */
+#define ZT_PATH_LATENCY_SAMPLE_INTERVAL ZT_PING_CHECK_INVERVAL * 2
+
+/**
+ * Interval used for rate-limiting the computation of path quality estimates. Set at 0
+ * to compute as new packets arrive with no delay.
+ */
+#define ZT_PATH_QUALITY_COMPUTE_INTERVAL 0
+
+/**
+ * Path error rate history window size. This is used to keep track of packet error
+ * measurements over a path's medium-term history.
+ */
+#define ZT_PATH_ERROR_HIST_WIN_SZ 10
+
+/**
+ * The number of packet error measurements in each sample
+ */
+#define ZT_PATH_ERROR_SAMPLE_WIN_SZ 1024
+
+/**
+ * How often a peer will prune its own paths. Pruning is important when multipath is
+ * enabled because we want to prevent the allocation algorithms from sending anything
+ * out on known dead paths. Additionally, quickly marking paths as dead helps when
+ * a new path is learned and needs to replace an older path.
+ */
+#define ZT_CLOSED_PATH_PRUNING_INTERVAL 1000
+
+/**
+ * Datagram used to test link throughput. Contents are random.
+ */
+#define ZT_LINK_TEST_DATAGRAM_SZ 1024
+
+/**
+ * Size of datagram expected as a reply to a link speed test
+ */
+#define ZT_LINK_TEST_DATAGRAM_RESPONSE_SZ 8
+
+/**
+ * Time before a link test datagram is considered lost. Any corresponding
+ * timing records that would have been used to compute a RTT are purged.
+ */
+#define ZT_LINK_TEST_TIMEOUT 10000
+
+/**
+ * How often the service tests the link throughput.
+ */
+#define ZT_LINK_SPEED_TEST_INTERVAL 1000
+
 /**
 /**
  * How frequently to send heartbeats over in-use paths
  * How frequently to send heartbeats over in-use paths
  */
  */

+ 4 - 1
node/IncomingPacket.cpp

@@ -80,6 +80,7 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
 			if (!trusted) {
 			if (!trusted) {
 				if (!dearmor(peer->key())) {
 				if (!dearmor(peer->key())) {
 					RR->t->incomingPacketMessageAuthenticationFailure(tPtr,_path,packetId(),sourceAddress,hops(),"invalid MAC");
 					RR->t->incomingPacketMessageAuthenticationFailure(tPtr,_path,packetId(),sourceAddress,hops(),"invalid MAC");
+					_path->recordPacket(false);
 					return true;
 					return true;
 				}
 				}
 			}
 			}
@@ -89,6 +90,8 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
 				return true;
 				return true;
 			}
 			}
 
 
+			_path->recordPacket(true);
+
 			const Packet::Verb v = verb();
 			const Packet::Verb v = verb();
 			switch(v) {
 			switch(v) {
 				//case Packet::VERB_NOP:
 				//case Packet::VERB_NOP:
@@ -446,7 +449,7 @@ bool IncomingPacket::_doOK(const RuntimeEnvironment *RR,void *tPtr,const SharedP
 			}
 			}
 
 
 			if (!hops())
 			if (!hops())
-				_path->updateLatency((unsigned int)latency);
+				_path->updateLatency((unsigned int)latency, RR->node->now());
 
 
 			peer->setRemoteVersion(vProto,vMajor,vMinor,vRevision);
 			peer->setRemoteVersion(vProto,vMajor,vMinor,vRevision);
 
 

+ 1 - 1
node/Multicaster.cpp

@@ -190,7 +190,7 @@ void Multicaster::send(
 				for(unsigned int i=0;i<multicastReplicatorCount;++i) {
 				for(unsigned int i=0;i<multicastReplicatorCount;++i) {
 					const SharedPtr<Peer> p(RR->topology->getPeerNoCache(multicastReplicators[i]));
 					const SharedPtr<Peer> p(RR->topology->getPeerNoCache(multicastReplicators[i]));
 					if ((p)&&(p->isAlive(now))) {
 					if ((p)&&(p->isAlive(now))) {
-						const SharedPtr<Path> pp(p->getBestPath(now,false));
+						const SharedPtr<Path> pp(p->getAppropriatePath(now,false));
 						if ((pp)&&(pp->latency() < bestMulticastReplicatorLatency)) {
 						if ((pp)&&(pp->latency() < bestMulticastReplicatorLatency)) {
 							bestMulticastReplicatorLatency = pp->latency();
 							bestMulticastReplicatorLatency = pp->latency();
 							bestMulticastReplicatorPath = pp;
 							bestMulticastReplicatorPath = pp;

+ 2 - 2
node/Node.cpp

@@ -234,7 +234,7 @@ public:
 			}
 			}
 
 
 			if ((!contacted)&&(_bestCurrentUpstream)) {
 			if ((!contacted)&&(_bestCurrentUpstream)) {
-				const SharedPtr<Path> up(_bestCurrentUpstream->getBestPath(_now,true));
+				const SharedPtr<Path> up(_bestCurrentUpstream->getAppropriatePath(_now,true));
 				if (up)
 				if (up)
 					p->sendHELLO(_tPtr,up->localSocket(),up->address(),_now);
 					p->sendHELLO(_tPtr,up->localSocket(),up->address(),_now);
 			}
 			}
@@ -465,7 +465,7 @@ ZT_PeerList *Node::peers() const
 		p->role = RR->topology->role(pi->second->identity().address());
 		p->role = RR->topology->role(pi->second->identity().address());
 
 
 		std::vector< SharedPtr<Path> > paths(pi->second->paths(_now));
 		std::vector< SharedPtr<Path> > paths(pi->second->paths(_now));
-		SharedPtr<Path> bestp(pi->second->getBestPath(_now,false));
+		SharedPtr<Path> bestp(pi->second->getAppropriatePath(_now,false));
 		p->pathCount = 0;
 		p->pathCount = 0;
 		for(std::vector< SharedPtr<Path> >::iterator path(paths.begin());path!=paths.end();++path) {
 		for(std::vector< SharedPtr<Path> >::iterator path(paths.begin());path!=paths.end();++path) {
 			ZT_FAST_MEMCPY(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage));
 			ZT_FAST_MEMCPY(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage));

+ 5 - 0
node/Node.hpp

@@ -260,6 +260,9 @@ public:
 	inline const Address &remoteTraceTarget() const { return _remoteTraceTarget; }
 	inline const Address &remoteTraceTarget() const { return _remoteTraceTarget; }
 	inline Trace::Level remoteTraceLevel() const { return _remoteTraceLevel; }
 	inline Trace::Level remoteTraceLevel() const { return _remoteTraceLevel; }
 
 
+	inline void setMultipathMode(uint8_t mode) { _multipathMode = mode; }
+	inline uint8_t getMultipathMode() { return _multipathMode; }
+
 private:
 private:
 	RuntimeEnvironment _RR;
 	RuntimeEnvironment _RR;
 	RuntimeEnvironment *RR;
 	RuntimeEnvironment *RR;
@@ -284,6 +287,8 @@ private:
 	Address _remoteTraceTarget;
 	Address _remoteTraceTarget;
 	enum Trace::Level _remoteTraceLevel;
 	enum Trace::Level _remoteTraceLevel;
 
 
+	uint8_t _multipathMode;
+
 	volatile int64_t _now;
 	volatile int64_t _now;
 	int64_t _lastPingCheck;
 	int64_t _lastPingCheck;
 	int64_t _lastHousekeepingRun;
 	int64_t _lastHousekeepingRun;

+ 260 - 5
node/Path.hpp

@@ -39,6 +39,9 @@
 #include "SharedPtr.hpp"
 #include "SharedPtr.hpp"
 #include "AtomicCounter.hpp"
 #include "AtomicCounter.hpp"
 #include "Utils.hpp"
 #include "Utils.hpp"
+#include "RingBuffer.hpp"
+
+#include "../osdep/Phy.hpp"
 
 
 /**
 /**
  * Maximum return value of preferenceRank()
  * Maximum return value of preferenceRank()
@@ -55,6 +58,7 @@ class RuntimeEnvironment;
 class Path
 class Path
 {
 {
 	friend class SharedPtr<Path>;
 	friend class SharedPtr<Path>;
+	Phy<Path *> *_phy;
 
 
 public:
 public:
 	/**
 	/**
@@ -93,22 +97,71 @@ public:
 		_lastOut(0),
 		_lastOut(0),
 		_lastIn(0),
 		_lastIn(0),
 		_lastTrustEstablishedPacketReceived(0),
 		_lastTrustEstablishedPacketReceived(0),
+		_lastPathQualityComputeTime(0),
 		_localSocket(-1),
 		_localSocket(-1),
 		_latency(0xffff),
 		_latency(0xffff),
 		_addr(),
 		_addr(),
-		_ipScope(InetAddress::IP_SCOPE_NONE)
+		_ipScope(InetAddress::IP_SCOPE_NONE),
+		_currentPacketSampleCounter(0),
+		_meanPacketErrorRatio(0.0),
+		_meanLatency(0.0),
+		_lastLatencyUpdate(0),
+		_jitter(0.0),
+		_lastPathQualitySampleTime(0),
+		_lastComputedQuality(0.0),
+		_lastPathQualityEstimate(0),
+		_meanAge(0.0),
+		_meanThroughput(0.0),
+		_packetLossRatio(0)
 	{
 	{
+		memset(_ifname, 0, sizeof(_ifname));
+		memset(_addrString, 0, sizeof(_addrString));
+		_throughputSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
+		_ageSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
+		_latencySamples = new RingBuffer<uint32_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
+		_errSamples = new RingBuffer<float>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
 	}
 	}
 
 
 	Path(const int64_t localSocket,const InetAddress &addr) :
 	Path(const int64_t localSocket,const InetAddress &addr) :
 		_lastOut(0),
 		_lastOut(0),
 		_lastIn(0),
 		_lastIn(0),
 		_lastTrustEstablishedPacketReceived(0),
 		_lastTrustEstablishedPacketReceived(0),
+		_lastPathQualityComputeTime(0),
 		_localSocket(localSocket),
 		_localSocket(localSocket),
 		_latency(0xffff),
 		_latency(0xffff),
 		_addr(addr),
 		_addr(addr),
-		_ipScope(addr.ipScope())
+		_ipScope(addr.ipScope()),
+		_currentPacketSampleCounter(0),
+		_meanPacketErrorRatio(0.0),
+		_meanLatency(0.0),
+		_lastLatencyUpdate(0),
+		_jitter(0.0),
+		_lastPathQualitySampleTime(0),
+		_lastComputedQuality(0.0),
+		_lastPathQualityEstimate(0),
+		_meanAge(0.0),
+		_meanThroughput(0.0),
+		_packetLossRatio(0)
+	{
+		memset(_ifname, 0, sizeof(_ifname));
+		memset(_addrString, 0, sizeof(_addrString));
+		_throughputSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
+		_ageSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
+		_latencySamples = new RingBuffer<uint32_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
+		_errSamples = new RingBuffer<float>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
+	}
+
+	~Path()
 	{
 	{
+		delete _throughputSamples;
+		delete _ageSamples;
+		delete _latencySamples;
+		delete _errSamples;
+
+		_throughputSamples = NULL;
+		_ageSamples = NULL;
+		_latencySamples = NULL;
+		_errSamples = NULL;
 	}
 	}
 
 
 	/**
 	/**
@@ -147,12 +200,17 @@ public:
 	 *
 	 *
 	 * @param l Measured latency
 	 * @param l Measured latency
 	 */
 	 */
-	inline void updateLatency(const unsigned int l)
+	inline void updateLatency(const unsigned int l, int64_t now)
 	{
 	{
 		unsigned int pl = _latency;
 		unsigned int pl = _latency;
-		if (pl < 0xffff)
+		if (pl < 0xffff) {
 			_latency = (pl + l) / 2;
 			_latency = (pl + l) / 2;
-		else _latency = l;
+		}
+		else {
+			_latency = l;
+		}
+		_lastLatencyUpdate = now;
+		_latencySamples->push(l);
 	}
 	}
 
 
 	/**
 	/**
@@ -240,11 +298,180 @@ public:
 		return (((age < (ZT_PATH_HEARTBEAT_PERIOD + 5000)) ? l : (l + 0xffff + age)) * (long)((ZT_INETADDRESS_MAX_SCOPE - _ipScope) + 1));
 		return (((age < (ZT_PATH_HEARTBEAT_PERIOD + 5000)) ? l : (l + 0xffff + age)) * (long)((ZT_INETADDRESS_MAX_SCOPE - _ipScope) + 1));
 	}
 	}
 
 
+	/**
+	 * @return An estimate of path quality -- higher is better.
+	 */
+	inline float computeQuality(const int64_t now)
+	{
+		float latency_contrib    = _meanLatency ? 1.0 / _meanLatency : 0;
+		float jitter_contrib     = _jitter ? 1.0 / _jitter : 0;
+		float throughput_contrib = _meanThroughput ? _meanThroughput / 1000000 : 0; // in Mbps
+		float age_contrib        = _meanAge > 0 ? (float)sqrt(_meanAge) : 1;
+		float error_contrib      = 1.0 - _meanPacketErrorRatio;
+		float sum = (latency_contrib + jitter_contrib + throughput_contrib + error_contrib) / age_contrib;
+		_lastComputedQuality = sum * (long)((_ipScope) + 1);
+		return _lastComputedQuality;
+	}
+
+	/**
+	 * Since quality estimates can become expensive we should cache the most recent result for traffic allocation
+	 * algorithms which may need to reference this value multiple times through the course of their execution.
+	 */
+	inline float lastComputedQuality() {
+		return _lastComputedQuality;
+	}
+
+	/**
+	 * @return A pointer to a cached copy of the human-readable name of the interface this Path's localSocket is bound to
+	 */
+	inline char *getName() { return _ifname; }
+
+	/**
+	 * @return Estimated throughput in bps of this link
+	 */
+	inline uint64_t getThroughput() { return _phy->getThroughput((PhySocket *)((uintptr_t)_localSocket)); }
+
+	/**
+	 * @return Packet delay varience
+	 */
+	inline float jitter() { return _jitter; }
+
+	/**
+	 * @return Previously-computed mean latency
+	 */
+	inline float meanLatency() { return _meanLatency; }
+
+	/**
+	 * @return Packet loss rate
+	 */
+	inline float packetLossRatio() { return _packetLossRatio; }
+
+	/**
+	 * @return Mean packet error ratio
+	 */
+	inline float meanPacketErrorRatio() { return _meanPacketErrorRatio; }
+
+	/**
+	 * @return Current packet error ratio (possibly incomplete sample set)
+	 */
+	inline float currentPacketErrorRatio() {
+		int errorsPerSample = 0;
+		for (int i=0; i<_currentPacketSampleCounter; i++) {
+			if (_packetValidity[i] == false) {
+				errorsPerSample++;
+			}
+		}
+		return (float)errorsPerSample / (float)ZT_PATH_ERROR_SAMPLE_WIN_SZ;
+	}
+
+	/**
+	 * @return Whether the Path's local socket is in a CLOSED state
+	 */
+	inline bool isClosed() { return _phy->isClosed((PhySocket *)((uintptr_t)_localSocket)); }
+
+	/**
+	 * @return The state of a Path's local socket
+	 */
+	inline int getState() { return _phy->getState((PhySocket *)((uintptr_t)_localSocket)); }
+
+	/**
+	 * @return Whether this socket may have been erased by the virtual physical link layer
+	 */
+	inline bool isValidState() { return _phy->isValidState((PhySocket *)((uintptr_t)_localSocket)); }
+
+	/**
+	 * @return Whether the path quality monitors have collected enough data to provide a quality value
+	 * TODO: expand this
+	 */
+	inline bool monitorsReady() {
+		return _latencySamples->count() && _ageSamples->count() && _throughputSamples->count();
+	}
+
+	/**
+	 * @return A pointer to a cached copy of the address string for this Path (For debugging only)
+	 */
+	inline char *getAddressString() { return _addrString; }
+
+	/**
+	 * Handle path sampling, computation of quality estimates, and other periodic tasks
+	 * @param now Current time
+	 */
+	inline void measureLink(int64_t now) {
+		// Sample path properties and store them in a continuously-revolving buffer
+		if (now - _lastPathQualitySampleTime > ZT_PATH_QUALITY_SAMPLE_INTERVAL) {
+			_lastPathQualitySampleTime = now;
+			_throughputSamples->push(getThroughput()); // Thoughtput in bits/s
+			_ageSamples->push(now - _lastIn); // Age (time since last received packet)
+			if (now - _lastLatencyUpdate > ZT_PATH_LATENCY_SAMPLE_INTERVAL) {
+				_lastLatencyUpdate = now;
+				// Record 0 bp/s. Since we're using this to detect possible packet loss
+				updateLatency(0, now);
+			}
+		}
+		// Compute statistical values for use in link quality estimates
+		if (now - _lastPathQualityComputeTime > ZT_PATH_QUALITY_COMPUTE_INTERVAL) {
+			_lastPathQualityComputeTime = now;
+			// Cache Path address string
+			address().toString(_addrString);
+			_phy->getIfName((PhySocket *)((uintptr_t)_localSocket), _ifname, ZT_PATH_INTERFACE_NAME_SZ); // Cache Interface name
+			// Derived values
+			if (_throughputSamples->count()) {
+				_packetLossRatio = (float)_throughputSamples->zeroCount() / (float)_throughputSamples->count();
+			}
+			_meanThroughput = _throughputSamples->mean();
+			_meanAge = _ageSamples->mean();
+			_meanLatency = _latencySamples->mean();
+			// Jitter
+			// SEE: RFC 3393, RFC 4689
+			_jitter = _latencySamples->stddev();
+			_meanPacketErrorRatio = _errSamples->mean(); // Packet Error Ratio (PER)
+		}
+		// Periodically compute a path quality estimate
+		if (now - _lastPathQualityEstimate > ZT_PATH_QUALITY_ESTIMATE_INTERVAL) {
+			computeQuality(now);
+		}
+	}
+
+	/**
+	 * Record whether a packet is considered invalid by MAC/compression/cipher checks. This
+	 * could be an indication of a bit error. This function will keep a running counter of
+	 * up to a given window size and with each counter overflow it will compute a mean error rate
+	 * and store that in a continuously shifting sample window.
+	 *
+	 * @param isValid Whether the packet in question is considered invalid
+	 */
+	inline void recordPacket(bool isValid) {
+		if (_currentPacketSampleCounter < ZT_PATH_ERROR_SAMPLE_WIN_SZ) {
+			_packetValidity[_currentPacketSampleCounter] = isValid;
+			_currentPacketSampleCounter++;
+		}
+		else {
+			// Sample array is full, compute an mean and stick it in the ring buffer for trend analysis
+			_errSamples->push(currentPacketErrorRatio());
+			_currentPacketSampleCounter=0;
+		}
+	}
+
+	/**
+	 * @return The mean age (in ms) of this link
+	 */
+	inline float meanAge() { return _meanAge; }
+
+	/**
+	 * @return The mean throughput (in bits/s) of this link
+	 */
+	inline float meanThroughput() { return _meanThroughput; }
+
 	/**
 	/**
 	 * @return True if this path is alive (receiving heartbeats)
 	 * @return True if this path is alive (receiving heartbeats)
 	 */
 	 */
 	inline bool alive(const int64_t now) const { return ((now - _lastIn) < (ZT_PATH_HEARTBEAT_PERIOD + 5000)); }
 	inline bool alive(const int64_t now) const { return ((now - _lastIn) < (ZT_PATH_HEARTBEAT_PERIOD + 5000)); }
 
 
+	/**
+	 * @return True if this path hasn't received a packet in a "significant" amount of time
+	 */
+	inline bool stale(const int64_t now) const { return ((now - _lastIn) > ZT_LINK_SPEED_TEST_INTERVAL * 10); }
+
 	/**
 	/**
 	 * @return True if this path needs a heartbeat
 	 * @return True if this path needs a heartbeat
 	 */
 	 */
@@ -269,11 +496,39 @@ private:
 	volatile int64_t _lastOut;
 	volatile int64_t _lastOut;
 	volatile int64_t _lastIn;
 	volatile int64_t _lastIn;
 	volatile int64_t _lastTrustEstablishedPacketReceived;
 	volatile int64_t _lastTrustEstablishedPacketReceived;
+	volatile int64_t _lastPathQualityComputeTime;
 	int64_t _localSocket;
 	int64_t _localSocket;
 	volatile unsigned int _latency;
 	volatile unsigned int _latency;
 	InetAddress _addr;
 	InetAddress _addr;
 	InetAddress::IpScope _ipScope; // memoize this since it's a computed value checked often
 	InetAddress::IpScope _ipScope; // memoize this since it's a computed value checked often
 	AtomicCounter __refCount;
 	AtomicCounter __refCount;
+
+	// Packet Error Ratio (PER)
+	int _packetValidity[ZT_PATH_ERROR_SAMPLE_WIN_SZ];
+	int _currentPacketSampleCounter;
+	volatile float _meanPacketErrorRatio;
+
+	// Latency and Jitter
+	volatile float _meanLatency;
+	int64_t _lastLatencyUpdate;
+	volatile float _jitter;
+
+	int64_t _lastPathQualitySampleTime;
+	float _lastComputedQuality;
+	int64_t _lastPathQualityEstimate;
+	float _meanAge;
+	float _meanThroughput;
+
+	// Circular buffers used to efficiently store large time series
+	RingBuffer<uint64_t> *_throughputSamples;
+	RingBuffer<uint32_t> *_latencySamples;
+	RingBuffer<uint64_t> *_ageSamples;
+	RingBuffer<float> *_errSamples;
+
+	float _packetLossRatio;
+
+	char _ifname[ZT_PATH_INTERFACE_NAME_SZ];
+	char _addrString[256];
 };
 };
 
 
 } // namespace ZeroTier
 } // namespace ZeroTier

+ 272 - 17
node/Peer.cpp

@@ -35,6 +35,7 @@
 #include "Packet.hpp"
 #include "Packet.hpp"
 #include "Trace.hpp"
 #include "Trace.hpp"
 #include "InetAddress.hpp"
 #include "InetAddress.hpp"
+#include "RingBuffer.hpp"
 
 
 namespace ZeroTier {
 namespace ZeroTier {
 
 
@@ -59,10 +60,14 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
 	_vRevision(0),
 	_vRevision(0),
 	_id(peerIdentity),
 	_id(peerIdentity),
 	_directPathPushCutoffCount(0),
 	_directPathPushCutoffCount(0),
-	_credentialsCutoffCount(0)
+	_credentialsCutoffCount(0),
+	_linkBalanceStatus(false),
+	_linkRedundancyStatus(false)
 {
 {
 	if (!myIdentity.agree(peerIdentity,_key,ZT_PEER_SECRET_KEY_LENGTH))
 	if (!myIdentity.agree(peerIdentity,_key,ZT_PEER_SECRET_KEY_LENGTH))
 		throw ZT_EXCEPTION_INVALID_ARGUMENT;
 		throw ZT_EXCEPTION_INVALID_ARGUMENT;
+	_pathChoiceHist = new RingBuffer<int>(ZT_MULTIPATH_PROPORTION_WIN_SZ);
+	_flowBalanceHist = new RingBuffer<float>(ZT_MULTIPATH_PROPORTION_WIN_SZ);
 }
 }
 
 
 void Peer::received(
 void Peer::received(
@@ -95,6 +100,18 @@ void Peer::received(
 		path->trustedPacketReceived(now);
 		path->trustedPacketReceived(now);
 	}
 	}
 
 
+	if (RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) {
+		if ((now - _lastPathPrune) > ZT_CLOSED_PATH_PRUNING_INTERVAL) {
+			_lastPathPrune = now;
+			prunePaths();
+		}
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p) {
+				_paths[i].p->measureLink(now);
+			}
+		}
+	}
+
 	if (hops == 0) {
 	if (hops == 0) {
 		// If this is a direct packet (no hops), update existing paths or learn new ones
 		// If this is a direct packet (no hops), update existing paths or learn new ones
 
 
@@ -232,26 +249,246 @@ void Peer::received(
 	}
 	}
 }
 }
 
 
-SharedPtr<Path> Peer::getBestPath(int64_t now,bool includeExpired) const
+SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired)
 {
 {
 	Mutex::Lock _l(_paths_m);
 	Mutex::Lock _l(_paths_m);
-
 	unsigned int bestPath = ZT_MAX_PEER_NETWORK_PATHS;
 	unsigned int bestPath = ZT_MAX_PEER_NETWORK_PATHS;
-	long bestPathQuality = 2147483647;
+
+	/**
+	 * Send traffic across the highest quality path only. This algorithm will still
+	 * use the old path quality metric.
+	 */
+	if (RR->node->getMultipathMode() == ZT_MULTIPATH_NONE) {
+		long bestPathQuality = 2147483647;
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p && _paths[i].p->isValidState()) {
+				if ((includeExpired)||((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION)) {
+					const long q = _paths[i].p->quality(now) / _paths[i].priority;
+					if (q <= bestPathQuality) {
+						bestPathQuality = q;
+						bestPath = i;
+					}
+				}
+			} else break;
+		}
+		if (bestPath != ZT_MAX_PEER_NETWORK_PATHS) {
+			return _paths[bestPath].p;
+		}
+		return SharedPtr<Path>();
+	}
+
+	if ((now - _lastPathPrune) > ZT_CLOSED_PATH_PRUNING_INTERVAL) {
+		_lastPathPrune = now;
+		prunePaths();
+	}
 	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 		if (_paths[i].p) {
 		if (_paths[i].p) {
-			if ((includeExpired)||((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION)) {
-				const long q = _paths[i].p->quality(now) / _paths[i].priority;
-				if (q <= bestPathQuality) {
-					bestPathQuality = q;
+			_paths[i].p->measureLink(now);
+		}
+	}
+
+	/**
+	 * Randomly distribute traffic across all paths
+	 *
+	 * Behavior:
+	 *  - If path DOWN: Stop randomly choosing that path
+	 *  - If path UP: Start randomly choosing that path
+	 *  - If all paths are unresponsive: randomly choose from all paths
+	 */
+	int numAlivePaths = 0;
+	int numStalePaths = 0;
+	if (RR->node->getMultipathMode() == ZT_MULTIPATH_RANDOM) {
+		int alivePaths[ZT_MAX_PEER_NETWORK_PATHS];
+		int stalePaths[ZT_MAX_PEER_NETWORK_PATHS];
+		memset(&alivePaths, -1, sizeof(alivePaths));
+		memset(&stalePaths, -1, sizeof(stalePaths));
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p) {
+				if (_paths[i].p->isValidState()) {
+					if (_paths[i].p->alive(now)) {
+						alivePaths[numAlivePaths] = i;
+						numAlivePaths++;
+					}
+					else {
+						stalePaths[numStalePaths] = i;
+						numStalePaths++;
+					}
+				}
+			}
+		}
+		unsigned int r;
+		Utils::getSecureRandom(&r, 1);
+		if (numAlivePaths > 0) {
+			// pick a random out of the set deemed "alive"
+			int rf = (float)(r %= numAlivePaths);
+			return _paths[alivePaths[rf]].p;
+		}
+		else if(numStalePaths > 0) {
+			// resort to trying any non-expired path
+			int rf = (float)(r %= numStalePaths);
+			return _paths[stalePaths[rf]].p;
+		}
+	}
+
+	/**
+	 * Proportionally allocate traffic according to dynamic path quality measurements
+	 */
+	if (RR->node->getMultipathMode() == ZT_MULTIPATH_PROPORTIONALLY_BALANCED) {
+		float relq[ZT_MAX_PEER_NETWORK_PATHS];
+		memset(&relq, 0, sizeof(relq));
+		float alloc[ZT_MAX_PEER_NETWORK_PATHS];
+		memset(&alloc, 0, sizeof(alloc));
+
+		// Survey
+		//
+		// Take a survey of all available link qualities. We use this to determine if we
+		// can skip this algorithm altogether and if not, to establish baseline for physical
+		// link quality used in later calculations.
+		//
+		// We find the min/max quality of our currently-active links so
+		// that we can form a relative scale to rank each link proportionally
+		// to each other link.
+		uint16_t alivePaths[ZT_MAX_PEER_NETWORK_PATHS];
+		uint16_t stalePaths[ZT_MAX_PEER_NETWORK_PATHS];
+		memset(&alivePaths, -1, sizeof(alivePaths));
+		memset(&stalePaths, -1, sizeof(stalePaths));
+		uint16_t numAlivePaths = 0;
+		uint16_t numStalePaths = 0;
+		float minQuality = 10000;
+		float maxQuality = -1;
+		float currQuality;
+		for(uint16_t i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p && _paths[i].p->isValidState()) {
+				if (!_paths[i].p->monitorsReady()) {
+					// TODO: This should fix itself anyway but we should test whether forcing the use of a new path will
+					// aid in establishing flow balance more quickly.
+				}
+				// Compute quality here, going forward we will use lastComputedQuality()
+				currQuality = _paths[i].p->computeQuality(now);
+				if (!_paths[i].p->stale(now)) {
+					alivePaths[i] = currQuality;
+					numAlivePaths++;
+				}
+				else {
+					stalePaths[i] = currQuality;
+					numStalePaths++;
+				}
+				if (currQuality > maxQuality) {
+					maxQuality = currQuality;
 					bestPath = i;
 					bestPath = i;
 				}
 				}
+				if (currQuality < minQuality) {
+					minQuality = currQuality;
+				}
+				relq[i] = currQuality;
 			}
 			}
-		} else break;
+		}
+
+		// Attempt to find an excuse not to use the rest of this algorithm
+		if (bestPath == ZT_MAX_PEER_NETWORK_PATHS || (numAlivePaths == 0 && numStalePaths == 0)) {
+			return SharedPtr<Path>();
+		} if (numAlivePaths == 1) {
+			return _paths[bestPath].p;
+		} if (numStalePaths == 1) {
+			return _paths[bestPath].p;
+		}
+
+		// Relative quality
+		//
+		// The strongest link will have a value of 1.0 whereas every other
+		// link will have a value which represents some fraction of the strongest link.
+		float totalRelativeQuality = 0;
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p && _paths[i].p->isValidState()) {
+				relq[i] /= maxQuality ? maxQuality : 1;
+				totalRelativeQuality += relq[i];
+			}
+		}
+
+		// Convert the relative quality values into flow allocations.
+		// Additionally, determine whether each path in the flow is
+		// contributing more or less than its target allocation. If
+		// it is contributing more than required, don't allow it to be
+		// randomly selected for the next packet. If however the path
+		// needs to contribute more to the flow, we should record
+		float imbalance = 0;
+		float qualityScalingFactor = 1.0 / totalRelativeQuality;
+		for(uint16_t i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			// Out of the last N packets to this peer, how many were sent by this path?
+			int numPktSentWithinWin = (int)_pathChoiceHist->countValue((float)i);
+			// Compute traffic allocation for each path in the flow
+			if (_paths[i].p && _paths[i].p->isValidState()) {
+				// Allocation
+				// This is the percentage of traffic we want to send over a given path
+				alloc[i] = relq[i] * qualityScalingFactor;
+				float currProportion = numPktSentWithinWin / (float)ZT_MULTIPATH_PROPORTION_WIN_SZ;
+				float targetProportion = alloc[i];
+				float diffProportion = currProportion - targetProportion;
+				// Imbalance
+				//
+				// This is the sum of the distances of each path's currently observed flow contributions
+				// from its most recent target allocation. In other words, this is a measure of how closely we
+				// are adhering to our desired allocations. It is worth noting that this value can be greater
+				// than 1.0 if a significant change to allocations is made by the algorithm, this will
+				// eventually correct itself.
+				imbalance += fabs(diffProportion);
+				if (diffProportion < 0) {
+					alloc[i] = targetProportion;
+				}
+				else {
+					alloc[i] = targetProportion;
+				}
+			}
+		}
+
+		// Compute and record current flow balance
+		float balance = 1.0 - imbalance;
+		if (balance >= ZT_MULTIPATH_FLOW_BALANCE_THESHOLD) {
+			if (!_linkBalanceStatus) {
+				_linkBalanceStatus = true;
+				RR->t->peerLinkBalanced(NULL,0,*this);
+			}
+		}
+		else {
+			if (_linkBalanceStatus) {
+				_linkBalanceStatus = false;
+				RR->t->peerLinkImbalanced(NULL,0,*this);
+			}
+		}
+
+		// Record the current flow balance. Later used for computing a mean flow balance value.
+		_flowBalanceHist->push(balance);
+
+		// Randomly choose path from allocated candidates
+		unsigned int r;
+		Utils::getSecureRandom(&r, 1);
+		float rf = (float)(r %= 100) / 100;
+		for(int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p && _paths[i].p->isValidState() && _paths[i].p->address().isV4()) {
+				if (alloc[i] > 0 && rf < alloc[i]) {
+					bestPath = i;
+					_pathChoiceHist->push(bestPath); // Record which path we chose
+					break;
+				}
+				if (alloc[i] > 0) {
+					rf -= alloc[i];
+				}
+				else {
+					rf -= alloc[i]*-1;
+				}
+			}
+		}
+		if (bestPath < ZT_MAX_PEER_NETWORK_PATHS) {
+			return _paths[bestPath].p;
+		}
+		return SharedPtr<Path>();
+	}
+
+	// Adhere to a user-defined interface/allocation scheme
+	if (RR->node->getMultipathMode() == ZT_MULTIPATH_MANUALLY_BALANCED) {
+		// TODO
 	}
 	}
 
 
-	if (bestPath != ZT_MAX_PEER_NETWORK_PATHS)
-		return _paths[bestPath].p;
 	return SharedPtr<Path>();
 	return SharedPtr<Path>();
 }
 }
 
 
@@ -477,16 +714,34 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
 			}
 			}
 		} else break;
 		} else break;
 	}
 	}
-	while(j < ZT_MAX_PEER_NETWORK_PATHS) {
-		_paths[j].lr = 0;
-		_paths[j].p.zero();
-		_paths[j].priority = 1;
-		++j;
+	if (RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) {
+		while(j < ZT_MAX_PEER_NETWORK_PATHS) {
+			_paths[j].lr = 0;
+			_paths[j].p.zero();
+			_paths[j].priority = 1;
+			++j;
+		}
 	}
 	}
-
 	return sent;
 	return sent;
 }
 }
 
 
+unsigned int Peer::prunePaths()
+{
+	Mutex::Lock _l(_paths_m);
+	unsigned int pruned = 0;
+	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+		if (_paths[i].p) {
+			if(_paths[i].p->isClosed() || !_paths[i].p->isValidState()) {
+				_paths[i].lr = 0;
+				_paths[i].p.zero();
+				_paths[i].priority = 1;
+				pruned++;
+			}
+		}
+	}
+	return pruned;
+}
+
 void Peer::clusterRedirect(void *tPtr,const SharedPtr<Path> &originatingPath,const InetAddress &remoteAddress,const int64_t now)
 void Peer::clusterRedirect(void *tPtr,const SharedPtr<Path> &originatingPath,const InetAddress &remoteAddress,const int64_t now)
 {
 {
 	SharedPtr<Path> np(RR->topology->getPath(originatingPath->localSocket(),remoteAddress));
 	SharedPtr<Path> np(RR->topology->getPath(originatingPath->localSocket(),remoteAddress));

+ 31 - 7
node/Peer.hpp

@@ -65,7 +65,13 @@ private:
 	Peer() {} // disabled to prevent bugs -- should not be constructed uninitialized
 	Peer() {} // disabled to prevent bugs -- should not be constructed uninitialized
 
 
 public:
 public:
-	~Peer() { Utils::burn(_key,sizeof(_key)); }
+	~Peer() {
+		Utils::burn(_key,sizeof(_key));
+		delete _pathChoiceHist;
+		delete _flowBalanceHist;
+		_pathChoiceHist = NULL;
+		_flowBalanceHist = NULL;
+	}
 
 
 	/**
 	/**
 	 * Construct a new peer
 	 * Construct a new peer
@@ -145,20 +151,20 @@ public:
 	 */
 	 */
 	inline bool sendDirect(void *tPtr,const void *data,unsigned int len,int64_t now,bool force)
 	inline bool sendDirect(void *tPtr,const void *data,unsigned int len,int64_t now,bool force)
 	{
 	{
-		SharedPtr<Path> bp(getBestPath(now,force));
+		SharedPtr<Path> bp(getAppropriatePath(now,force));
 		if (bp)
 		if (bp)
 			return bp->send(RR,tPtr,data,len,now);
 			return bp->send(RR,tPtr,data,len,now);
 		return false;
 		return false;
 	}
 	}
 
 
 	/**
 	/**
-	 * Get the best current direct path
+	 * Get the most appropriate direct path based on current multipath configuration
 	 *
 	 *
 	 * @param now Current time
 	 * @param now Current time
 	 * @param includeExpired If true, include even expired paths
 	 * @param includeExpired If true, include even expired paths
 	 * @return Best current path or NULL if none
 	 * @return Best current path or NULL if none
 	 */
 	 */
-	SharedPtr<Path> getBestPath(int64_t now,bool includeExpired) const;
+	SharedPtr<Path> getAppropriatePath(int64_t now, bool includeExpired);
 
 
 	/**
 	/**
 	 * Send VERB_RENDEZVOUS to this and another peer via the best common IP scope and path
 	 * Send VERB_RENDEZVOUS to this and another peer via the best common IP scope and path
@@ -212,6 +218,16 @@ public:
 	 */
 	 */
 	unsigned int doPingAndKeepalive(void *tPtr,int64_t now);
 	unsigned int doPingAndKeepalive(void *tPtr,int64_t now);
 
 
+	/**
+	 * Clear paths whose localSocket(s) are in a CLOSED state or have an otherwise INVALID state.
+	 * This should be called frequently so that we can detect and remove unproductive or invalid paths.
+	 *
+	 * Under the hood this is done periodically based on ZT_CLOSED_PATH_PRUNING_INTERVAL.
+	 *
+	 * @return Number of paths that were pruned this round
+	 */
+	unsigned int prunePaths();
+
 	/**
 	/**
 	 * Process a cluster redirect sent by this peer
 	 * Process a cluster redirect sent by this peer
 	 *
 	 *
@@ -270,9 +286,9 @@ public:
 	/**
 	/**
 	 * @return Latency in milliseconds of best path or 0xffff if unknown / no paths
 	 * @return Latency in milliseconds of best path or 0xffff if unknown / no paths
 	 */
 	 */
-	inline unsigned int latency(const int64_t now) const
+	inline unsigned int latency(const int64_t now)
 	{
 	{
-		SharedPtr<Path> bp(getBestPath(now,false));
+		SharedPtr<Path> bp(getAppropriatePath(now,false));
 		if (bp)
 		if (bp)
 			return bp->latency();
 			return bp->latency();
 		return 0xffff;
 		return 0xffff;
@@ -289,7 +305,7 @@ public:
 	 *
 	 *
 	 * @return Relay quality score computed from latency and other factors, lower is better
 	 * @return Relay quality score computed from latency and other factors, lower is better
 	 */
 	 */
-	inline unsigned int relayQuality(const int64_t now) const
+	inline unsigned int relayQuality(const int64_t now)
 	{
 	{
 		const uint64_t tsr = now - _lastReceive;
 		const uint64_t tsr = now - _lastReceive;
 		if (tsr >= ZT_PEER_ACTIVITY_TIMEOUT)
 		if (tsr >= ZT_PEER_ACTIVITY_TIMEOUT)
@@ -515,6 +531,7 @@ private:
 	int64_t _lastCredentialsReceived;
 	int64_t _lastCredentialsReceived;
 	int64_t _lastTrustEstablishedPacketReceived;
 	int64_t _lastTrustEstablishedPacketReceived;
 	int64_t _lastSentFullHello;
 	int64_t _lastSentFullHello;
+	int64_t _lastPathPrune;
 
 
 	uint16_t _vProto;
 	uint16_t _vProto;
 	uint16_t _vMajor;
 	uint16_t _vMajor;
@@ -530,6 +547,13 @@ private:
 	unsigned int _credentialsCutoffCount;
 	unsigned int _credentialsCutoffCount;
 
 
 	AtomicCounter __refCount;
 	AtomicCounter __refCount;
+
+	RingBuffer<int> *_pathChoiceHist;
+	RingBuffer<float> *_flowBalanceHist;
+
+	bool _linkBalanceStatus;
+	bool _linkRedundancyStatus;
+
 };
 };
 
 
 } // namespace ZeroTier
 } // namespace ZeroTier

+ 315 - 0
node/RingBuffer.hpp

@@ -0,0 +1,315 @@
+/*
+ * ZeroTier One - Network Virtualization Everywhere
+ * Copyright (C) 2011-2018  ZeroTier, Inc.  https://www.zerotier.com/
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * --
+ *
+ * You can be released from the requirements of the license by purchasing
+ * a commercial license. Buying such a license is mandatory as soon as you
+ * develop commercial closed-source software that incorporates or links
+ * directly against ZeroTier software without disclosing the source code
+ * of your own application.
+ */
+
+#ifndef ZT_RINGBUFFER_H
+#define ZT_RINGBUFFER_H
+
+#include <typeinfo>
+#include <cstdint>
+#include <stdlib.h>
+#include <memory.h>
+#include <algorithm>
+#include <math.h>
+
+namespace ZeroTier {
+
+/**
+ * A revolving (ring) buffer. 
+ *
+ * For fast handling of continuously-evolving variables (such as path quality metrics).
+ * Using this, we can maintain longer sliding historical windows for important path 
+ * metrics without the need for potentially expensive calls to memcpy/memmove.
+ *
+ * Some basic statistical functionality is implemented here in an attempt
+ * to reduce the complexity of code needed to interact with this type of buffer.
+ */
+
+template <class T>
+class RingBuffer
+{
+private:
+	T * buf;
+	size_t size;
+	size_t begin;
+	size_t end;
+	bool wrap;
+
+public:
+
+	/**
+	 * create a RingBuffer with space for up to size elements.
+	 */
+	explicit RingBuffer(size_t size)
+		: size(size),
+		begin(0),
+		end(0),
+		wrap(false)
+	{
+		buf = new T[size];
+		memset(buf, 0, sizeof(T) * size);
+	}
+
+	/**
+	 * @return A pointer to the underlying buffer
+	 */
+	T* get_buf()
+	{
+		return buf + begin;
+	}
+
+	/** 
+	 * Adjust buffer index pointer as if we copied data in
+	 * @param n Number of elements to copy in
+	 * @return Number of elements we copied in
+	 */
+	size_t produce(size_t n)
+	{
+		n = std::min(n, getFree());
+		if (n == 0) {
+			return n;
+		}
+		const size_t first_chunk = std::min(n, size - end);
+		end = (end + first_chunk) % size;
+		if (first_chunk < n) {
+			const size_t second_chunk = n - first_chunk;
+			end = (end + second_chunk) % size;
+		}
+		if (begin == end) {
+			wrap = true;
+		}
+		return n;
+	}
+
+	/** 
+	 * Fast erase, O(1). 
+	 * Merely reset the buffer pointer, doesn't erase contents
+	 */
+	void reset()
+	{
+		consume(count());
+	}
+
+	/** 
+	 * adjust buffer index pointer as if we copied data out
+	 * @param n Number of elements we copied from the buffer
+	 * @return Number of elements actually available from the buffer
+	 */
+	size_t consume(size_t n)
+	{
+		n = std::min(n, count());
+		if (n == 0) {
+			return n;
+		}
+		if (wrap) {
+			wrap = false;
+		}
+		const size_t first_chunk = std::min(n, size - begin);
+		begin = (begin + first_chunk) % size;
+		if (first_chunk < n) {
+			const size_t second_chunk = n - first_chunk;
+			begin = (begin + second_chunk) % size;
+		}
+		return n;
+	}
+
+	/**
+	 * @param data Buffer that is to be written to the ring
+	 * @param n Number of elements to write to the buffer
+	 */
+	size_t write(const T * data, size_t n)
+	{
+		n = std::min(n, getFree());
+		if (n == 0) {
+			return n;
+		}
+		const size_t first_chunk = std::min(n, size - end);
+		memcpy(buf + end, data, first_chunk * sizeof(T));
+		end = (end + first_chunk) % size;
+		if (first_chunk < n) {
+			const size_t second_chunk = n - first_chunk;
+			memcpy(buf + end, data + first_chunk, second_chunk * sizeof(T));
+			end = (end + second_chunk) % size;
+		}
+		if (begin == end) {
+			wrap = true;
+		}
+		return n;
+	}
+
+	/**
+	 * Place a single value on the buffer. If the buffer is full, consume a value first.
+	 *
+	 * @param value A single value to be placed in the buffer
+	 */
+	void push(const T value)
+	{
+		if (count() == size) {
+			consume(1);
+		}
+		write(&value, 1);
+	}
+
+	/**
+	 * @param dest Destination buffer
+	 * @param n Size (in terms of number of elements) of the destination buffer
+	 * @return Number of elements read from the buffer
+	 */
+	size_t read(T * dest, size_t n)
+	{
+		n = std::min(n, count());
+		if (n == 0) {
+			return n;
+		}
+		if (wrap) {
+			wrap = false;
+		}
+		const size_t first_chunk = std::min(n, size - begin);
+		memcpy(dest, buf + begin, first_chunk * sizeof(T));
+		begin = (begin + first_chunk) % size;
+		if (first_chunk < n) {
+			const size_t second_chunk = n - first_chunk;
+			memcpy(dest + first_chunk, buf + begin, second_chunk * sizeof(T));
+			begin = (begin + second_chunk) % size;
+		}
+		return n;
+	}
+
+	/**
+	 * Return how many elements are in the buffer, O(1).
+	 *
+	 * @return The number of elements in the buffer
+	 */
+	size_t count()
+	{
+		if (end == begin) {
+			return wrap ? size : 0;
+		}
+		else if (end > begin) {
+			return end - begin;
+		}
+		else {
+			return size + end - begin;
+		}
+	}
+
+	/**
+	 * @return The number of slots that are unused in the buffer
+	 */
+	size_t getFree()
+	{
+		return size - count();
+	}
+
+	/**
+	 * @return The arithmetic mean of the contents of the buffer
+	 */
+	T mean()
+	{
+		size_t iterator = begin;
+		T mean = 0;
+		for (int i=0; i<size; i++) {
+			iterator = (iterator + size - 1) % size;
+			mean += *(buf + iterator);
+		}
+		return count() ? mean / (T)count() : 0;
+	}
+
+	/**
+	 * @return The sample standard deviation of the contents of the ring buffer
+	 */
+	T stddev()
+	{
+		size_t iterator = begin;
+		T cached_mean = mean();
+		if (size) {
+			T sum_of_squared_deviations = 0;
+			for (int i=0; i<size; i++) {
+				iterator = (iterator + size - 1) % size;
+				T deviation = (buf[i] - cached_mean);
+				T sdev = deviation*deviation;
+				sum_of_squared_deviations += sdev;
+			}
+			T variance = sum_of_squared_deviations / (size - 1);
+			T sd = sqrt(variance);
+			return sd;
+		}
+		return 0;
+	}
+
+	/**
+	 * @return The number of elements of zero value, O(n)
+	 */
+	size_t zeroCount()
+	{
+		size_t iterator = begin;
+		size_t zeros = 0;
+		for (int i=0; i<size; i++) {
+			iterator = (iterator + size - 1) % size;
+			if (*(buf + iterator) == 0) {
+				zeros++;
+			}
+		}
+		return zeros;
+	}
+
+	/**
+	 * @param value Value to match against in buffer
+	 * @return The number of values held in the ring buffer which match a given value
+	 */
+	size_t countValue(T value)
+	{
+		size_t iterator = begin;
+		size_t count = 0;
+		for (int i=0; i<size; i++) {
+			iterator = (iterator + size - 1) % size;
+			if (*(buf + iterator) == value) {
+				count++;
+			}
+		}
+		return count;
+	}
+
+	/**
+	 * Print the contents of the buffer
+	 */
+	void dump()
+	{
+		size_t iterator = begin;
+		for (int i=0; i<size; i++) {
+			iterator = (iterator + size - 1) % size;
+			if (typeid(T) == typeid(int)) {
+				// DEBUG_INFO("buf[%2zu]=%2d", iterator, (int)*(buf + iterator));
+			}
+			else {
+				// DEBUG_INFO("buf[%2zu]=%2f", iterator, (float)*(buf + iterator));
+			}
+		}
+	}
+};
+
+} // namespace ZeroTier
+
+#endif

+ 3 - 3
node/Switch.cpp

@@ -646,12 +646,12 @@ bool Switch::_trySend(void *tPtr,Packet &packet,bool encrypt)
 
 
 	const SharedPtr<Peer> peer(RR->topology->getPeer(tPtr,destination));
 	const SharedPtr<Peer> peer(RR->topology->getPeer(tPtr,destination));
 	if (peer) {
 	if (peer) {
-		viaPath = peer->getBestPath(now,false);
+		viaPath = peer->getAppropriatePath(now,false);
 		if (!viaPath) {
 		if (!viaPath) {
 			peer->tryMemorizedPath(tPtr,now); // periodically attempt memorized or statically defined paths, if any are known
 			peer->tryMemorizedPath(tPtr,now); // periodically attempt memorized or statically defined paths, if any are known
 			const SharedPtr<Peer> relay(RR->topology->getUpstreamPeer());
 			const SharedPtr<Peer> relay(RR->topology->getUpstreamPeer());
-			if ( (!relay) || (!(viaPath = relay->getBestPath(now,false))) ) {
-				if (!(viaPath = peer->getBestPath(now,true)))
+			if ( (!relay) || (!(viaPath = relay->getAppropriatePath(now,false))) ) {
+				if (!(viaPath = peer->getAppropriatePath(now,true)))
 					return false;
 					return false;
 			}
 			}
 		}
 		}

+ 1 - 1
node/Topology.hpp

@@ -299,7 +299,7 @@ public:
 		Address *a = (Address *)0;
 		Address *a = (Address *)0;
 		SharedPtr<Peer> *p = (SharedPtr<Peer> *)0;
 		SharedPtr<Peer> *p = (SharedPtr<Peer> *)0;
 		while (i.next(a,p)) {
 		while (i.next(a,p)) {
-			const SharedPtr<Path> pp((*p)->getBestPath(now,false));
+			const SharedPtr<Path> pp((*p)->getAppropriatePath(now,false));
 			if (pp)
 			if (pp)
 				++cnt;
 				++cnt;
 		}
 		}

+ 20 - 0
node/Trace.cpp

@@ -106,6 +106,26 @@ void Trace::peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId,
 	}
 	}
 }
 }
 
 
+void Trace::peerLinkNowRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath)
+{
+	ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is fully redundant",peer.address().toInt(),networkId);
+}
+
+void Trace::peerLinkNoLongerRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath)
+{
+	ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is no longer redundant",peer.address().toInt(),networkId);
+}
+
+void Trace::peerLinkBalanced(void *const tPtr,const uint64_t networkId,Peer &peer)
+{
+	ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is balanced",peer.address().toInt(),networkId);
+}
+
+void Trace::peerLinkImbalanced(void *const tPtr,const uint64_t networkId,Peer &peer)
+{
+	ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is unbalanced",peer.address().toInt(),networkId);
+}
+
 void Trace::peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId)
 void Trace::peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId)
 {
 {
 	char tmp[128];
 	char tmp[128];

+ 6 - 0
node/Trace.hpp

@@ -121,6 +121,12 @@ public:
 	void resettingPathsInScope(void *const tPtr,const Address &reporter,const InetAddress &reporterPhysicalAddress,const InetAddress &myPhysicalAddress,const InetAddress::IpScope scope);
 	void resettingPathsInScope(void *const tPtr,const Address &reporter,const InetAddress &reporterPhysicalAddress,const InetAddress &myPhysicalAddress,const InetAddress::IpScope scope);
 
 
 	void peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &path,const uint64_t packetId,const Packet::Verb verb);
 	void peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &path,const uint64_t packetId,const Packet::Verb verb);
+
+	void peerLinkNowRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath);
+	void peerLinkNoLongerRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath);
+	void peerLinkBalanced(void *const tPtr,const uint64_t networkId,Peer &peer);
+	void peerLinkImbalanced(void *const tPtr,const uint64_t networkId,Peer &peer);
+
 	void peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId);
 	void peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId);
 	void peerRedirected(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath);
 	void peerRedirected(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath);
 
 

+ 15 - 0
osdep/Binder.hpp

@@ -388,6 +388,7 @@ public:
 						_bindings[_bindingCount].udpSock = udps;
 						_bindings[_bindingCount].udpSock = udps;
 						_bindings[_bindingCount].tcpListenSock = tcps;
 						_bindings[_bindingCount].tcpListenSock = tcps;
 						_bindings[_bindingCount].address = ii->first;
 						_bindings[_bindingCount].address = ii->first;
+						phy.setIfName(udps, (char*)ii->second.c_str(), ii->second.length());
 						++_bindingCount;
 						++_bindingCount;
 					}
 					}
 				} else {
 				} else {
@@ -455,6 +456,20 @@ public:
 		return false;
 		return false;
 	}
 	}
 
 
+	/**
+	 * Get a list of socket pointers for all bindings.
+	 * 
+	 * @return A list of socket pointers for current bindings
+	 */
+	inline std::vector<PhySocket*> getBoundSockets()
+	{
+		std::vector<PhySocket*> sockets;
+		for (int i=0; i<ZT_BINDER_MAX_BINDINGS; i++) {
+			sockets.push_back(_bindings[i].udpSock);
+		}
+		return sockets;
+	}
+
 private:
 private:
 	_Binding _bindings[ZT_BINDER_MAX_BINDINGS];
 	_Binding _bindings[ZT_BINDER_MAX_BINDINGS];
 	std::atomic<unsigned int> _bindingCount;
 	std::atomic<unsigned int> _bindingCount;

+ 194 - 1
osdep/Phy.hpp

@@ -27,6 +27,8 @@
 #ifndef ZT_PHY_HPP
 #ifndef ZT_PHY_HPP
 #define ZT_PHY_HPP
 #define ZT_PHY_HPP
 
 
+#include "../osdep/OSUtils.hpp"
+
 #include <stdio.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdlib.h>
 #include <string.h>
 #include <string.h>
@@ -86,6 +88,22 @@ namespace ZeroTier {
  */
  */
 typedef void PhySocket;
 typedef void PhySocket;
 
 
+struct link_test_record
+{
+	link_test_record(PhySocket *_s, uint64_t _id, uint64_t _egress_time, uint32_t _length) :
+		s(_s),
+		id(_id),
+		egress_time(_egress_time),
+		length(_length)
+	{
+		//
+	}
+	PhySocket *s;
+	uint64_t id;
+	uint64_t egress_time;
+	uint32_t length;
+};
+
 /**
 /**
  * Simple templated non-blocking sockets implementation
  * Simple templated non-blocking sockets implementation
  *
  *
@@ -154,10 +172,17 @@ private:
 
 
 	struct PhySocketImpl
 	struct PhySocketImpl
 	{
 	{
+		PhySocketImpl() :
+			throughput(0)
+		{
+			memset(ifname, 0, sizeof(ifname));
+		}
 		PhySocketType type;
 		PhySocketType type;
 		ZT_PHY_SOCKFD_TYPE sock;
 		ZT_PHY_SOCKFD_TYPE sock;
 		void *uptr; // user-settable pointer
 		void *uptr; // user-settable pointer
 		ZT_PHY_SOCKADDR_STORAGE_TYPE saddr; // remote for TCP_OUT and TCP_IN, local for TCP_LISTEN, RAW, and UDP
 		ZT_PHY_SOCKADDR_STORAGE_TYPE saddr; // remote for TCP_OUT and TCP_IN, local for TCP_LISTEN, RAW, and UDP
+		char ifname[16];
+		uint64_t throughput;
 	};
 	};
 
 
 	std::list<PhySocketImpl> _socks;
 	std::list<PhySocketImpl> _socks;
@@ -173,6 +198,7 @@ private:
 
 
 	bool _noDelay;
 	bool _noDelay;
 	bool _noCheck;
 	bool _noCheck;
+	std::vector<struct link_test_record*> link_test_records;
 
 
 public:
 public:
 	/**
 	/**
@@ -249,6 +275,173 @@ public:
 	 */
 	 */
 	static inline void** getuptr(PhySocket *s) throw() { return &(reinterpret_cast<PhySocketImpl *>(s)->uptr); }
 	static inline void** getuptr(PhySocket *s) throw() { return &(reinterpret_cast<PhySocketImpl *>(s)->uptr); }
 
 
+	/**
+	 * @param s Socket object
+	 * @param nameBuf Buffer to store name of interface which this Socket object is bound to
+	 * @param buflen Length of buffer to copy name into
+	 */
+	static inline void getIfName(PhySocket *s, char *nameBuf, int buflen)
+	{
+		memcpy(nameBuf, reinterpret_cast<PhySocketImpl *>(s)->ifname, buflen);
+	}
+
+	/**
+	 * @param s Socket object
+	 * @param ifname Buffer containing name of interface that this Socket object is bound to
+	 * @param len Length of name of interface
+	 */
+	static inline void setIfName(PhySocket *s, char *ifname, int len)
+	{
+		memcpy(&(reinterpret_cast<PhySocketImpl *>(s)->ifname), ifname, len);
+	}
+
+	/**
+	 * Get result of most recent throughput test
+	 *
+	 * @param s Socket object
+	 */
+	inline uint64_t getThroughput(PhySocket *s)
+	{
+		PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
+		return sws ? sws->throughput : 0;
+	}
+
+	/**
+	 * Whether or not the socket object is in a closed state
+	 *
+	 * @param s Socket object
+	 * @return true if socket is closed, false if otherwise
+	 */
+	inline bool isClosed(PhySocket *s)
+	{
+		PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
+		return sws->type == ZT_PHY_SOCKET_CLOSED;
+	}
+
+	/**
+	 * Get state of socket object
+	 *
+	 * @param s Socket object
+	 * @return State of socket
+	 */
+	inline int getState(PhySocket *s)
+	{
+		PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
+		return sws->type;
+	}
+
+	/**
+	 * In the event that this socket is erased, we need a way to convey to the multipath logic
+	 * that this path is no longer valid.
+	 *
+	 * @param s Socket object
+	 * @return Whether the state of this socket is within an acceptable range of values
+	 */
+	inline bool isValidState(PhySocket *s)
+	{
+		PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
+		return sws->type >= ZT_PHY_SOCKET_CLOSED && sws->type <= ZT_PHY_SOCKET_UNIX_LISTEN;
+	}
+
+	/**
+	 * Send a datagram of a known size to a selected peer and record egress time. The peer
+	 * shall eventually respond by echoing back a smaller datagram.
+	 *
+	 * @param s Socket object
+	 * @param remoteAddress Address of remote peer to receive link test packet
+	 * @param data Buffer containing random packet data
+	 * @param len Length of packet data buffer
+	 * @return Number of bytes successfully written to socket
+	 */
+	inline int test_link_speed(PhySocket *s, const struct sockaddr *to, void *data, uint32_t len) {
+		if (!reinterpret_cast<PhySocketImpl *>(s)) {
+			return 0;
+		}
+		uint64_t *buf = (uint64_t*)data;
+		uint64_t id = buf[0];
+		if (to->sa_family != AF_INET && to->sa_family != AF_INET6) {
+			return 0;
+		}
+		uint64_t egress_time = OSUtils::now();
+		PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
+#if defined(_WIN32) || defined(_WIN64)
+		return ((long)::sendto(sws->sock,reinterpret_cast<const char *>(data),len,0,to,(to->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in)) == (long)len);
+#else
+		int w = ::sendto(sws->sock,data,len,0,to,(to->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in));
+#endif
+		if (w > 0) {
+			link_test_records.push_back(new link_test_record(s, id, egress_time, len));
+		}
+		return w;
+	}
+
+	/**
+	 * Remove link speed test records which have timed-out and record a 0 bits/s measurement 
+	 */
+	inline void refresh_link_speed_records()
+	{
+		for(int i=0;i<link_test_records.size();i++) {
+			if(OSUtils::now() - link_test_records[i]->egress_time > ZT_LINK_TEST_TIMEOUT) {
+				PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(link_test_records[i]->s));
+				if (sws) {
+					sws->throughput = 0;
+				}
+				link_test_records.erase(link_test_records.begin() + i);
+			}
+		}
+	}
+
+	/**
+	 * Upon receipt of a link speed test datagram we echo back only the identification portion
+	 *
+	 * @param s Socket object
+	 * @param from Address of remote peer that sent this datagram 
+	 * @param data Buffer containing datagram's contents
+	 * @param len Length of datagram
+	 * @return Number of bytes successfully written to socket in response
+	 */
+	inline int respond_to_link_test(PhySocket *s,const struct sockaddr *from,void *data,unsigned long len) {
+		PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
+		uint64_t *id = (uint64_t*)data;
+#if defined(_WIN32) || defined(_WIN64)
+		return ((long)::sendto(sws->sock,reinterpret_cast<const char *>(data),len,0,from,(from->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in)) == (long)len);
+#else
+		int w = ::sendto(sws->sock,id,sizeof(id[0]),0,from,(from->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in));
+#endif
+		return w;
+	}
+
+	/**
+	 * Upon receipt of a response to our original link test datagram, correlate this new datagram with the record
+	 * of the one we sent. Compute the transit time and update the throughput field of the relevant socket. This
+	 * value will later be read by the path quality estimation logic located in Path.hpp.
+	 *
+	 * @param s Socket object
+	 * @param from Address of remote peer that sent this datagram
+	 * @param data Buffer containing datagram contents (ID of original link test datagram)
+	 * @param len Length of datagram
+	 * @return true if datagram correponded to previous record, false if otherwise
+	 */
+	inline bool handle_link_test_response(PhySocket *s,const struct sockaddr *from,void *data,unsigned long len) {
+		uint64_t *id = (uint64_t*)data;
+		for(int i=0;i<link_test_records.size();i++) {
+			if(link_test_records[i]->id == id[0]) {
+				float rtt  = (OSUtils::now()-link_test_records[i]->egress_time) / (float)1000; // s
+				uint32_t sz   = (link_test_records[i]->length) * 8; // bits
+				float transit_time = rtt / 2.0;
+				int64_t raw = sz / transit_time;
+				PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
+				if (sws) {
+					sws->throughput = raw;
+				}
+				delete link_test_records[i];
+				link_test_records.erase(link_test_records.begin() + i);
+				return true;
+			}
+		}
+		return false;
+	}
+
 	/**
 	/**
 	 * Cause poll() to stop waiting immediately
 	 * Cause poll() to stop waiting immediately
 	 *
 	 *
@@ -985,7 +1178,7 @@ public:
 					ZT_PHY_SOCKFD_TYPE sock = s->sock; // if closed, s->sock becomes invalid as s is no longer dereferencable
 					ZT_PHY_SOCKFD_TYPE sock = s->sock; // if closed, s->sock becomes invalid as s is no longer dereferencable
 					if ((FD_ISSET(sock,&wfds))&&(FD_ISSET(sock,&_writefds))) {
 					if ((FD_ISSET(sock,&wfds))&&(FD_ISSET(sock,&_writefds))) {
 						try {
 						try {
-							_handler->phyOnUnixWritable((PhySocket *)&(*s),&(s->uptr),false);
+							_handler->phyOnUnixWritable((PhySocket *)&(*s),&(s->uptr));
 						} catch ( ... ) {}
 						} catch ( ... ) {}
 					}
 					}
 					if (FD_ISSET(sock,&rfds)) {
 					if (FD_ISSET(sock,&rfds)) {

+ 53 - 3
service/OneService.cpp

@@ -37,6 +37,7 @@
 
 
 #include "../version.h"
 #include "../version.h"
 #include "../include/ZeroTierOne.h"
 #include "../include/ZeroTierOne.h"
+#include "../include/ZeroTierDebug.h"
 
 
 #include "../node/Constants.hpp"
 #include "../node/Constants.hpp"
 #include "../node/Mutex.hpp"
 #include "../node/Mutex.hpp"
@@ -417,6 +418,7 @@ public:
 	PhySocket *_localControlSocket6;
 	PhySocket *_localControlSocket6;
 	bool _updateAutoApply;
 	bool _updateAutoApply;
 	bool _allowTcpFallbackRelay;
 	bool _allowTcpFallbackRelay;
+	unsigned int _multipathMode;
 	unsigned int _primaryPort;
 	unsigned int _primaryPort;
 	volatile unsigned int _udpPortPickerCounter;
 	volatile unsigned int _udpPortPickerCounter;
 
 
@@ -455,6 +457,9 @@ public:
 	// Last potential sleep/wake event
 	// Last potential sleep/wake event
 	uint64_t _lastRestart;
 	uint64_t _lastRestart;
 
 
+	// Last time link throughput was tested
+	uint64_t _lastLinkSpeedTest;
+
 	// Deadline for the next background task service function
 	// Deadline for the next background task service function
 	volatile int64_t _nextBackgroundTaskDeadline;
 	volatile int64_t _nextBackgroundTaskDeadline;
 
 
@@ -818,6 +823,7 @@ public:
 			_lastRestart = clockShouldBe;
 			_lastRestart = clockShouldBe;
 			int64_t lastTapMulticastGroupCheck = 0;
 			int64_t lastTapMulticastGroupCheck = 0;
 			int64_t lastBindRefresh = 0;
 			int64_t lastBindRefresh = 0;
+			int64_t lastMultipathModeUpdate = 0;
 			int64_t lastUpdateCheck = clockShouldBe;
 			int64_t lastUpdateCheck = clockShouldBe;
 			int64_t lastCleanedPeersDb = 0;
 			int64_t lastCleanedPeersDb = 0;
 			int64_t lastLocalInterfaceAddressCheck = (clockShouldBe - ZT_LOCAL_INTERFACE_CHECK_INTERVAL) + 15000; // do this in 15s to give portmapper time to configure and other things time to settle
 			int64_t lastLocalInterfaceAddressCheck = (clockShouldBe - ZT_LOCAL_INTERFACE_CHECK_INTERVAL) + 15000; // do this in 15s to give portmapper time to configure and other things time to settle
@@ -849,8 +855,9 @@ public:
 						_updater->apply();
 						_updater->apply();
 				}
 				}
 
 
-				// Refresh bindings in case device's interfaces have changed, and also sync routes to update any shadow routes (e.g. shadow default)
-				if (((now - lastBindRefresh) >= ZT_BINDER_REFRESH_PERIOD)||(restarted)) {
+				// Refresh bindings
+				int interfaceRefreshPeriod = _multipathMode ? ZT_MULTIPATH_BINDER_REFRESH_PERIOD : ZT_BINDER_REFRESH_PERIOD;
+				if (((now - lastBindRefresh) >= interfaceRefreshPeriod)||(restarted)) {
 					lastBindRefresh = now;
 					lastBindRefresh = now;
 					unsigned int p[3];
 					unsigned int p[3];
 					unsigned int pc = 0;
 					unsigned int pc = 0;
@@ -867,6 +874,34 @@ public:
 						}
 						}
 					}
 					}
 				}
 				}
+				// Update multipath mode (if needed)
+				if (((now - lastMultipathModeUpdate) >= interfaceRefreshPeriod)||(restarted)) {
+					lastMultipathModeUpdate = now;
+					_node->setMultipathMode(_multipathMode);
+				}
+
+				// Test link speeds
+				// TODO: This logic should eventually find its way into the core or as part of a passive
+				// measure within the protocol.
+				if (_multipathMode && ((now - _lastLinkSpeedTest) >= ZT_LINK_SPEED_TEST_INTERVAL)) {
+					_phy.refresh_link_speed_records();
+					_lastLinkSpeedTest = now;
+					// Generate random data to fill UDP packet
+					uint64_t pktBuf[ZT_LINK_TEST_DATAGRAM_SZ / sizeof(uint64_t)];
+					Utils::getSecureRandom(pktBuf, ZT_LINK_TEST_DATAGRAM_SZ);
+					ZT_PeerList *pl = _node->peers();
+					// get bindings (specifically just the sockets)
+					std::vector<PhySocket*> sockets = _binder.getBoundSockets();
+					// interfaces
+					for (int i=0; i<ZT_BINDER_MAX_BINDINGS; i++) {
+						for(int j=0;j<pl->peerCount;++j) {
+							for (int k=0; k<(ZT_MAX_PEER_NETWORK_PATHS/4); k++) {
+								Utils::getSecureRandom(pktBuf, 8); // generate one random integer for unique id
+								_phy.test_link_speed(sockets[i], (struct sockaddr*)&(pl->peers[j].paths[k].address), pktBuf, ZT_LINK_TEST_DATAGRAM_SZ);
+							}
+						}
+					}
+				}
 
 
 				// Run background task processor in core if it's time to do so
 				// Run background task processor in core if it's time to do so
 				int64_t dl = _nextBackgroundTaskDeadline;
 				int64_t dl = _nextBackgroundTaskDeadline;
@@ -1190,6 +1225,7 @@ public:
 					json &settings = res["config"]["settings"];
 					json &settings = res["config"]["settings"];
 					settings["primaryPort"] = OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
 					settings["primaryPort"] = OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
 					settings["allowTcpFallbackRelay"] = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],_allowTcpFallbackRelay);
 					settings["allowTcpFallbackRelay"] = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],_allowTcpFallbackRelay);
+					settings["multipathMode"] = OSUtils::jsonInt(settings["multipathMode"],_multipathMode);
 #ifdef ZT_USE_MINIUPNPC
 #ifdef ZT_USE_MINIUPNPC
 					settings["portMappingEnabled"] = OSUtils::jsonBool(settings["portMappingEnabled"],true);
 					settings["portMappingEnabled"] = OSUtils::jsonBool(settings["portMappingEnabled"],true);
 #else
 #else
@@ -1518,6 +1554,11 @@ public:
 
 
 		_primaryPort = (unsigned int)OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
 		_primaryPort = (unsigned int)OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
 		_allowTcpFallbackRelay = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],true);
 		_allowTcpFallbackRelay = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],true);
+		_multipathMode = OSUtils::jsonInt(settings["multipathMode"],0);
+		if (_multipathMode != 0 && _allowTcpFallbackRelay) {
+			fprintf(stderr,"WARNING: multipathMode cannot be used with allowTcpFallbackRelay. Disabling allowTcpFallbackRelay");
+			_allowTcpFallbackRelay = false;
+		}
 		_portMappingEnabled = OSUtils::jsonBool(settings["portMappingEnabled"],true);
 		_portMappingEnabled = OSUtils::jsonBool(settings["portMappingEnabled"],true);
 
 
 		json &ignoreIfs = settings["interfacePrefixBlacklist"];
 		json &ignoreIfs = settings["interfacePrefixBlacklist"];
@@ -1758,6 +1799,15 @@ public:
 
 
 	inline void phyOnDatagram(PhySocket *sock,void **uptr,const struct sockaddr *localAddr,const struct sockaddr *from,void *data,unsigned long len)
 	inline void phyOnDatagram(PhySocket *sock,void **uptr,const struct sockaddr *localAddr,const struct sockaddr *from,void *data,unsigned long len)
 	{
 	{
+		if (_multipathMode) {
+			// Handle link test packets (should eventually be moved into the protocol itself)
+			if (len == ZT_LINK_TEST_DATAGRAM_SZ) {
+				_phy.respond_to_link_test(sock, from, data, len);
+			}
+			if (len == ZT_LINK_TEST_DATAGRAM_RESPONSE_SZ) {
+				_phy.handle_link_test_response(sock, from, data, len);
+			}
+		}
 		if ((len >= 16)&&(reinterpret_cast<const InetAddress *>(from)->ipScope() == InetAddress::IP_SCOPE_GLOBAL))
 		if ((len >= 16)&&(reinterpret_cast<const InetAddress *>(from)->ipScope() == InetAddress::IP_SCOPE_GLOBAL))
 			_lastDirectReceiveFromGlobal = OSUtils::now();
 			_lastDirectReceiveFromGlobal = OSUtils::now();
 		const ZT_ResultCode rc = _node->processWirePacket(
 		const ZT_ResultCode rc = _node->processWirePacket(
@@ -2007,7 +2057,7 @@ public:
 	inline void phyOnUnixAccept(PhySocket *sockL,PhySocket *sockN,void **uptrL,void **uptrN) {}
 	inline void phyOnUnixAccept(PhySocket *sockL,PhySocket *sockN,void **uptrL,void **uptrN) {}
 	inline void phyOnUnixClose(PhySocket *sock,void **uptr) {}
 	inline void phyOnUnixClose(PhySocket *sock,void **uptr) {}
 	inline void phyOnUnixData(PhySocket *sock,void **uptr,void *data,unsigned long len) {}
 	inline void phyOnUnixData(PhySocket *sock,void **uptr,void *data,unsigned long len) {}
-	inline void phyOnUnixWritable(PhySocket *sock,void **uptr,bool lwip_invoked) {}
+	inline void phyOnUnixWritable(PhySocket *sock,void **uptr) {}
 
 
 	inline int nodeVirtualNetworkConfigFunction(uint64_t nwid,void **nuptr,enum ZT_VirtualNetworkConfigOperation op,const ZT_VirtualNetworkConfig *nwc)
 	inline int nodeVirtualNetworkConfigFunction(uint64_t nwid,void **nuptr,enum ZT_VirtualNetworkConfigOperation op,const ZT_VirtualNetworkConfig *nwc)
 	{
 	{