
Add Bonds, Slaves, and Flows

Joseph Henry 5 years ago
parent
commit
a50e8e9878

+ 120 - 38
include/ZeroTierOne.h

@@ -415,55 +415,128 @@ enum ZT_ResultCode
  */
 #define ZT_ResultCode_isFatal(x) ((((int)(x)) >= 100)&&(((int)(x)) < 1000))
 
+
 /**
- * The multipath algorithm in use by this node.
+ *  Multipath bonding policy
  */
-enum ZT_MultipathMode
+enum ZT_MultipathBondingPolicy
 {
 	/**
-	 * No fault tolerance or balancing.
+	 * Normal operation. No fault tolerance, no load balancing
 	 */
-	ZT_MULTIPATH_NONE = 0,
+	ZT_BONDING_POLICY_NONE = 0,
 
 	/**
-	 * Sends traffic out on all paths.
+	 * Sends traffic out on only one path at a time. Configurable immediate
+	 * fail-over.
 	 */
-	ZT_MULTIPATH_BROADCAST = 1,
+	ZT_BONDING_POLICY_ACTIVE_BACKUP = 1,
 
 	/**
-	 * Sends traffic out on only one path at a time. Immediate fail-over.
+	 * Sends traffic out on all paths
 	 */
-	ZT_MULTIPATH_ACTIVE_BACKUP= 2,
+	ZT_BONDING_POLICY_BROADCAST = 2,
 
 	/**
-	 * Sends traffic out on all interfaces according to a uniform random distribution.
+	 * Stripes packets across all paths
 	 */
-	ZT_MULTIPATH_BALANCE_RANDOM = 3,
+	ZT_BONDING_POLICY_BALANCE_RR = 3,
 
 	/**
-	 * Stripes packets across all paths.
+	 * Packets destined for specific peers will always be sent over the same
+	 * path.
 	 */
-	ZT_MULTIPATH_BALANCE_RR_OPAQUE = 4,
+	ZT_BONDING_POLICY_BALANCE_XOR = 4,
 
 	/**
-	 * Balances flows across all paths.
+	 * Balances flows among all paths according to path performance
 	 */
-	ZT_MULTIPATH_BALANCE_RR_FLOW = 5,
+	ZT_BONDING_POLICY_BALANCE_AWARE = 5
+};
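
An illustrative sketch, not part of this commit: how a policy name from configuration might map to one of the codes above. The string names are assumptions modeled on the enum values and on what BondController::getPolicyCodeByStr() (referenced in Bond.cpp below) presumably accepts.

#include <string>

// Hypothetical helper; the authoritative mapping lives in BondController.
static int policyCodeFromName(const std::string &name)
{
	if (name == "active-backup") return ZT_BONDING_POLICY_ACTIVE_BACKUP;
	if (name == "broadcast")     return ZT_BONDING_POLICY_BROADCAST;
	if (name == "balance-rr")    return ZT_BONDING_POLICY_BALANCE_RR;
	if (name == "balance-xor")   return ZT_BONDING_POLICY_BALANCE_XOR;
	if (name == "balance-aware") return ZT_BONDING_POLICY_BALANCE_AWARE;
	return ZT_BONDING_POLICY_NONE;
}
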
 
+/**
+ * Multipath active re-selection policy (slaveSelectMethod)
+ */
+enum ZT_MultipathSlaveSelectMethod
+{
 	/**
-	 * Hashes flows across all paths.
+	 * Primary slave regains status as active slave whenever it comes back up
+	 * (default when slaves are explicitly specified)
 	 */
-	ZT_MULTIPATH_BALANCE_XOR_FLOW = 6,
+	ZT_MULTIPATH_RESELECTION_POLICY_ALWAYS = 0,
 
 	/**
-	 * Balances traffic across all paths according to observed performance.
+	 * Primary slave regains status as active slave when it comes back up,
+	 * but only if it is better than the currently-active slave.
 	 */
-	ZT_MULTIPATH_BALANCE_DYNAMIC_OPAQUE = 7,
+	ZT_MULTIPATH_RESELECTION_POLICY_BETTER = 1,
 
 	/**
-	 * Balances flows across all paths.
+	 * Primary slave regains status as active slave only if the currently-active
+	 * slave fails.
 	 */
-	ZT_MULTIPATH_BALANCE_DYNAMIC_FLOW = 8,
+	ZT_MULTIPATH_RESELECTION_POLICY_FAILURE = 2,
+
+	/**
+	 * The primary slave can change if a superior path is detected.
+	 * (default if user provides no fail-over guidance)
+	 */
+	ZT_MULTIPATH_RESELECTION_POLICY_OPTIMIZE = 3
+};
+
+/**
+ * Mode of multipath slave interface
+ */
+enum ZT_MultipathSlaveMode
+{
+	ZT_MULTIPATH_SLAVE_MODE_PRIMARY = 0,
+	ZT_MULTIPATH_SLAVE_MODE_SPARE = 1
+};
+
+/**
+ * Strategy for path monitoring
+ */
+enum ZT_MultipathMonitorStrategy
+{
+	/**
+	 * Use bonding policy's default strategy
+	 */
+	ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DEFAULT = 0,
+
+	/**
+	 * Does not actively send probes to judge aliveness; relies on
+	 * conventional traffic and summary statistics.
+	 */
+	ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_PASSIVE = 1,
+
+	/**
+	 * Sends probes at a constant rate to judge aliveness.
+	 */
+	ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_ACTIVE = 2,
+
+	/**
+	 * Sends probes at varying rates which correlate to native
+	 * traffic loads to judge aliveness.
+	 */
+	ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DYNAMIC = 3
+};
+
+/**
+ * Indices for the path quality weight vector
+ */
+enum ZT_MultipathQualityWeightIndex
+{
+	ZT_QOS_LAT_IDX,
+	ZT_QOS_LTM_IDX,
+	ZT_QOS_PDV_IDX,
+	ZT_QOS_PLR_IDX,
+	ZT_QOS_PER_IDX,
+	ZT_QOS_THR_IDX,
+	ZT_QOS_THM_IDX,
+	ZT_QOS_THV_IDX,
+	ZT_QOS_AGE_IDX,
+	ZT_QOS_SCP_IDX,
+	ZT_QOS_WEIGHT_SIZE
 };
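
An illustrative sketch, not part of this commit: the indices above are meant to address a per-metric weight vector, and a composite path quality is a weighted sum over normalized metric values (Bond::estimatePathQuality() below does this for a subset of the indices). The weights and metric values here are placeholders.

float weights[ZT_QOS_WEIGHT_SIZE] = {0};
weights[ZT_QOS_LAT_IDX] = 0.4f; // latency mean
weights[ZT_QOS_PDV_IDX] = 0.2f; // packet delay variance
weights[ZT_QOS_PLR_IDX] = 0.2f; // packet loss ratio
weights[ZT_QOS_PER_IDX] = 0.2f; // packet error ratio

float metrics[ZT_QOS_WEIGHT_SIZE] = {0}; // each normalized to [0,1], higher is better
float quality = 0.0f;
for (int m = 0; m < ZT_QOS_WEIGHT_SIZE; ++m)
	quality += metrics[m] * weights[m];
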
 
 /**
@@ -1272,44 +1345,49 @@ typedef struct
 	uint64_t trustedPathId;
 
 	/**
-	 * One-way latency
+	 * Mean latency
+	 */
+	float latencyMean;
+
+	/**
+	 * Maximum observed latency
 	 */
-	float latency;
+	float latencyMax;
 
 	/**
-	 * How much latency varies over time
+	 * Variance of latency
 	 */
-	float packetDelayVariance;
+	float latencyVariance;
 
 	/**
-	 * How much observed throughput varies over time
+	 * Packet loss ratio
 	 */
-	float throughputDisturbCoeff;
+	float packetLossRatio;
 
 	/**
-	 * Packet Error Ratio (PER)
+	 * Packet error ratio
 	 */
 	float packetErrorRatio;
 
 	/**
-	 * Packet Loss Ratio (PLR)
+	 * Mean throughput
 	 */
-	float packetLossRatio;
+	uint64_t throughputMean;
 
 	/**
-	 * Stability of the path
+	 * Maximum observed throughput
 	 */
-	float stability;
+	float throughputMax;
 
 	/**
-	 * Current throughput (moving average)
+	 * Throughput variance
 	 */
-	uint64_t throughput;
+	float throughputVariance;
 
 	/**
-	 * Maximum observed throughput for this path
+	 * Address scope
 	 */
-	uint64_t maxThroughput;
+	uint8_t scope;
 
 	/**
 	 * Percentage of traffic allocated to this path
@@ -1319,7 +1397,9 @@ typedef struct
 	/**
 	 * Name of physical interface (for monitoring)
 	 */
-	char *ifname;
+	char ifname[32];
+
+	uint64_t localSocket;
 
 	/**
 	 * Is path expired?
@@ -1373,9 +1453,11 @@ typedef struct
 	unsigned int pathCount;
 
 	/**
-	 * Whether this peer was ever reachable via an aggregate link
+	 * Whether multiple paths to this peer are bonded
 	 */
-	bool hadAggregateLink;
+	bool isBonded;
+
+	int bondingPolicy;
 
 	/**
 	 * Known network paths to peer

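
An illustrative sketch, not part of this commit: reading the renamed per-path statistics from a peer listing. It assumes the fields above belong to the ZT_PeerPhysicalPath entries carried inside ZT_Peer elsewhere in this header; anything not visible in the hunks is an assumption.

#include <stdio.h>

static void printPeerPaths(const ZT_Peer *peer) /* ZT_Peer/ZT_PeerPhysicalPath assumed from this header */
{
	printf("isBonded=%d bondingPolicy=%d\n", (int)peer->isBonded, peer->bondingPolicy);
	for (unsigned int i = 0; i < peer->pathCount; ++i) {
		const ZT_PeerPhysicalPath *p = &(peer->paths[i]);
		printf("  if=%s latencyMean=%.2f packetLossRatio=%.4f throughputMean=%llu scope=%u\n",
			p->ifname, p->latencyMean, p->packetLossRatio,
			(unsigned long long)p->throughputMean, (unsigned int)p->scope);
	}
}
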
+ 1730 - 0
node/Bond.cpp

@@ -0,0 +1,1730 @@
+/*
+ * Copyright (c)2013-2020 ZeroTier, Inc.
+ *
+ * Use of this software is governed by the Business Source License included
+ * in the LICENSE.TXT file in the project's root directory.
+ *
+ * Change Date: 2024-01-01
+ *
+ * On the date above, in accordance with the Business Source License, use
+ * of this software will be governed by version 2.0 of the Apache License.
+ */
+/****/
+
+#include <cmath>
+
+#include "Peer.hpp"
+#include "Bond.hpp"
+#include "Switch.hpp"
+#include "Flow.hpp"
+#include "Path.hpp"
+
+namespace ZeroTier {
+
+Bond::Bond(const RuntimeEnvironment *renv, int policy, const SharedPtr<Peer>& peer) :
+	RR(renv),
+	_peer(peer)
+{
+	setReasonableDefaults(policy);
+	_policyAlias = BondController::getPolicyStrByCode(policy);
+}
+
+Bond::Bond(std::string& basePolicy, std::string& policyAlias, const SharedPtr<Peer>& peer) :
+	_policyAlias(policyAlias),
+	_peer(peer)
+{
+	setReasonableDefaults(BondController::getPolicyCodeByStr(basePolicy));
+}
+
+Bond::Bond(const RuntimeEnvironment *renv, const Bond &originalBond, const SharedPtr<Peer>& peer) :
+	RR(renv),
+	_peer(peer)
+{
+	// First, set everything to sane defaults
+	setReasonableDefaults(originalBond._bondingPolicy);
+	_policyAlias = originalBond._policyAlias;
+	// Second, apply user specified values (only if they make sense)
+	_downDelay = originalBond._downDelay;
+	_upDelay = originalBond._upDelay;
+	if (originalBond._bondMonitorInterval > 0 && originalBond._bondMonitorInterval < 65535) {
+		_bondMonitorInterval = originalBond._bondMonitorInterval;
+	}
+	else {
+		fprintf(stderr, "warning: bondMonitorInterval (%d) is out of range, using default (%d)\n", originalBond._bondMonitorInterval, _bondMonitorInterval);
+	}
+	if (originalBond._slaveMonitorStrategy == ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_PASSIVE
+		&& originalBond._failoverInterval != 0) {
+		fprintf(stderr, "warning: passive path monitoring was specified, this will prevent failovers from happening in a timely manner.\n");
+	}
+	_abSlaveSelectMethod = originalBond._abSlaveSelectMethod;
+	memcpy(_qualityWeights, originalBond._qualityWeights, ZT_QOS_WEIGHT_SIZE * sizeof(float));
+}
+
+void Bond::nominatePath(const SharedPtr<Path>& path, int64_t now)
+{
+	char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "nominatePath: %s %s\n", getSlave(path)->ifname().c_str(), pathStr);
+	Mutex::Lock _l(_paths_m);
+	if (!RR->bc->slaveAllowed(_policyAlias, getSlave(path))) {
+		return;
+	}
+	bool alreadyPresent = false;
+	for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+		if (path.ptr() == _paths[i].ptr()) {
+			fprintf(stderr, "previously encountered path, not notifying bond (%s)\n", pathStr);
+			alreadyPresent = true;
+			break;
+		}
+	}
+	if (!alreadyPresent) {
+		for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+			if (!_paths[i]) {
+				fprintf(stderr, "notifyOfNewPath(): Setting path %s to idx=%d\n", pathStr, i);
+				_paths[i] = path;
+				//_paths[i]->slave = RR->bc->getSlaveBySocket(_policyAlias, path->localSocket());
+				_paths[i]->startTrial(now);
+				break;
+			}
+		}
+	}
+	curateBond(now, true);
+	estimatePathQuality(now);
+}
+
+SharedPtr<Path> Bond::getAppropriatePath(int64_t now, int32_t flowId)
+{
+	Mutex::Lock _l(_paths_m);
+	/**
+	 * active-backup
+	 */
+	if (_bondingPolicy== ZT_BONDING_POLICY_ACTIVE_BACKUP) {
+		if (_abPath) {
+			return _abPath;
+		}
+	}
+	/**
+	 * broadcast
+	 */
+	if (_bondingPolicy== ZT_BONDING_POLICY_BROADCAST) {
+		return SharedPtr<Path>(); // Handled in Switch::_trySend()
+	}
+	if (!_numBondedPaths) {
+		return SharedPtr<Path>(); // No paths assigned to bond yet, cannot balance traffic
+	}
+	/**
+	 * balance-rr
+	 */
+	if (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_RR) {
+		if (!_allowFlowHashing) {
+			//fprintf(stderr, "_rrPacketsSentOnCurrSlave=%d, _numBondedPaths=%d, _rrIdx=%d\n", _rrPacketsSentOnCurrSlave, _numBondedPaths, _rrIdx);
+			if (_packetsPerSlave == 0) {
+				// Randomly select a path
+				return _paths[_bondedIdx[_freeRandomByte % _numBondedPaths]]; // TODO: Optimize
+			}
+			if (_rrPacketsSentOnCurrSlave < _packetsPerSlave) {
+				// Continue to use this slave
+				++_rrPacketsSentOnCurrSlave;
+				return _paths[_bondedIdx[_rrIdx]];
+			}
+			// Reset striping counter
+			_rrPacketsSentOnCurrSlave = 0;
+			if (_numBondedPaths == 1) {
+				_rrIdx = 0;
+			}
+			else {
+				int _tempIdx = _rrIdx;
+				for (int searchCount = 0; searchCount < (_numBondedPaths-1); searchCount++) {
+					_tempIdx = (_tempIdx == (_numBondedPaths-1)) ? 0 : _tempIdx+1;
+					if (_paths[_bondedIdx[_tempIdx]] && _paths[_bondedIdx[_tempIdx]]->eligible(now,_ackSendInterval)) {
+						_rrIdx = _tempIdx;
+						break;
+					}
+				}
+			}
+			//fprintf(stderr, "resultant _rrIdx=%d\n", _rrIdx);
+			if (_paths[_bondedIdx[_rrIdx]]) {
+				return _paths[_bondedIdx[_rrIdx]];
+			}
+		}
+	}
+	/**
+	 * balance-xor
+	 */
+	if (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_XOR || _bondingPolicy== ZT_BONDING_POLICY_BALANCE_AWARE) {
+		if (!_allowFlowHashing || flowId == -1) {
+			// No specific path required for unclassified traffic, send on anything
+			return _paths[_bondedIdx[_freeRandomByte % _numBondedPaths]]; // TODO: Optimize
+		}
+		else if (_allowFlowHashing) {
+			// TODO: Optimize
+			Mutex::Lock _l(_flows_m);
+			SharedPtr<Flow> flow;
+			if (_flows.count(flowId)) {
+				flow = _flows[flowId];
+				flow->updateActivity(now);
+			}
+			else {
+				unsigned char entropy;
+				Utils::getSecureRandom(&entropy, 1);
+				flow = createFlow(SharedPtr<Path>(), flowId, entropy, now);
+			}
+			if (flow) {
+				return flow->assignedPath();
+			}
+		}
+	}
+	return SharedPtr<Path>();
+}
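
A short trace of the balance-rr branch above, not part of this commit (illustrative values):

// With two bonded paths A and B and _packetsPerSlave = 4, outgoing packets are
// striped roughly as
//   A A A A  B B B B  A A A A  ...
// advancing to the next eligible bonded path each time the per-slave counter is
// exhausted. With _packetsPerSlave == 0, every packet is instead sent on a
// uniformly random bonded path.
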
+
+void Bond::recordIncomingInvalidPacket(const SharedPtr<Path>& path)
+{
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "recordIncomingInvalidPacket() %s %s\n", getSlave(path)->ifname().c_str(), pathStr);
+	Mutex::Lock _l(_paths_m);
+	for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+		if (_paths[i] == path) {
+			_paths[i]->packetValiditySamples.push(false);
+		}
+	}
+}
+
+void Bond::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
+	uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
+{
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "recordOutgoingPacket() %s %s, packetId=%llx, payloadLength=%d, verb=%x, flowId=%lx\n", getSlave(path)->ifname().c_str(), pathStr, packetId, payloadLength, verb, flowId);
+	_freeRandomByte += (unsigned char)(packetId >> 8); // Grab entropy to use in path selection logic
+	if (!_shouldCollectPathStatistics) {
+		return;
+	}
+	bool isFrame = (verb == Packet::VERB_FRAME || verb == Packet::VERB_EXT_FRAME);
+	bool shouldRecord = (packetId & (ZT_QOS_ACK_DIVISOR - 1)
+		&& (verb != Packet::VERB_ACK)
+		&& (verb != Packet::VERB_QOS_MEASUREMENT));
+	if (isFrame || shouldRecord) {
+		Mutex::Lock _l(_paths_m);
+		if (isFrame) {
+			++(path->_packetsOut);
+			_lastFrame=now;
+		}
+		if (shouldRecord) {
+			path->_unackedBytes += payloadLength;
+			// Take note that we're expecting a VERB_ACK on this path as of a specific time
+			if (path->qosStatsOut.size() < ZT_QOS_MAX_OUTSTANDING_RECORDS) {
+				path->qosStatsOut[packetId] = now;
+			}
+		}
+	}
+	if (_allowFlowHashing) {
+		if (_allowFlowHashing && (flowId != ZT_QOS_NO_FLOW)) {
+			Mutex::Lock _l(_flows_m);
+			if (_flows.count(flowId)) {
+				_flows[flowId]->recordOutgoingBytes(payloadLength);
+			}
+		}
+	}
+}
+
+void Bond::recordIncomingPacket(const SharedPtr<Path>& path, uint64_t packetId, uint16_t payloadLength,
+	Packet::Verb verb, int32_t flowId, int64_t now)
+{
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "recordIncomingPacket() %s %s, packetId=%llx, payloadLength=%d, verb=%x, flowId=%lx\n", getSlave(path)->ifname().c_str(), pathStr, packetId, payloadLength, verb, flowId);
+	bool isFrame = (verb == Packet::VERB_FRAME || verb == Packet::VERB_EXT_FRAME);
+	bool shouldRecord = (packetId & (ZT_QOS_ACK_DIVISOR - 1)
+		&& (verb != Packet::VERB_ACK)
+		&& (verb != Packet::VERB_QOS_MEASUREMENT));
+	if (isFrame || shouldRecord) {
+		Mutex::Lock _l(_paths_m);
+		if (isFrame) {
+			++(path->_packetsIn);
+			_lastFrame=now;
+		}
+		if (shouldRecord) {
+			path->ackStatsIn[packetId] = payloadLength;
+			++(path->_packetsReceivedSinceLastAck);
+			path->qosStatsIn[packetId] = now;
+			++(path->_packetsReceivedSinceLastQoS);
+			path->packetValiditySamples.push(true);
+		}
+	}
+	/**
+	 * Learn new flows and pro-actively create entries for them in the bond so
+	 * that the next time we send a packet out that is part of a flow we know 
+	 * which path to use.
+	 */
+	if ((flowId != ZT_QOS_NO_FLOW)
+		&& (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_RR
+			|| _bondingPolicy== ZT_BONDING_POLICY_BALANCE_XOR
+			|| _bondingPolicy== ZT_BONDING_POLICY_BALANCE_AWARE)) {
+		Mutex::Lock _l(_flows_m);
+		SharedPtr<Flow> flow;
+		if (!_flows.count(flowId)) {
+			flow = createFlow(path, flowId, 0, now);
+		} else {
+			flow = _flows[flowId];
+		}
+		if (flow) {
+			flow->recordIncomingBytes(payloadLength);
+		}
+	}
+}
+
+void Bond::receivedQoS(const SharedPtr<Path>& path, int64_t now, int count, uint64_t *rx_id, uint16_t *rx_ts)
+{
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "receivedQoS() %s %s\n", getSlave(path)->ifname().c_str(), pathStr);
+	Mutex::Lock _l(_paths_m);
+	// Look up egress times and compute latency values for each record
+	std::map<uint64_t,uint64_t>::iterator it;
+	for (int j=0; j<count; j++) {
+		it = path->qosStatsOut.find(rx_id[j]);
+		if (it != path->qosStatsOut.end()) {
+			path->latencySamples.push(((uint16_t)(now - it->second) - rx_ts[j]) / 2);
+			path->qosStatsOut.erase(it);
+		}
+	}
+	path->qosRecordSize.push(count);
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "receivedQoS() on path %s %s, count=%d, successful=%d, qosStatsOut.size()=%d\n", getSlave(path)->ifname().c_str(), pathStr, count, path->aknowledgedQoSRecordCountSinceLastCheck, path->qosStatsOut.size());
+}
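
A worked example of the latency computation above, not part of this commit (illustrative values):

// A QoS record for packet id P was created locally at qosStatsOut[P] = 1000 ms.
// The peer reports it held P for rx_ts[j] = 30 ms before answering, and the
// report is processed at now = 1090 ms:
//   round trip             = now - qosStatsOut[P] = 1090 - 1000 = 90 ms
//   minus remote hold time = 90 - 30              = 60 ms
//   one-way estimate       = 60 / 2               = 30 ms
// which is the value pushed into path->latencySamples.
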
+
+void Bond::receivedAck(const SharedPtr<Path>& path, int64_t now, int32_t ackedBytes)
+{
+	Mutex::Lock _l(_paths_m);
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "receivedAck() %s %s, (ackedBytes=%d, lastAckReceived=%lld, ackAge=%lld)\n", getSlave(path)->ifname().c_str(), pathStr, ackedBytes, path->lastAckReceived, path->ackAge(now));
+	path->_lastAckReceived = now;
+	path->_unackedBytes = (ackedBytes > path->_unackedBytes) ? 0 : path->_unackedBytes - ackedBytes;
+	int64_t timeSinceThroughputEstimate = (now - path->_lastThroughputEstimation);
+	if (timeSinceThroughputEstimate >= throughputMeasurementInterval) {
+		// TODO: See if this floating point math can be reduced
+		uint64_t throughput = (uint64_t)((float)(path->_bytesAckedSinceLastThroughputEstimation) / ((float)timeSinceThroughputEstimate / (float)1000));
+		throughput /= 1000;
+		if (throughput > 0.0) {
+			path->throughputSamples.push(throughput);
+			path->_throughputMax = throughput > path->_throughputMax ? throughput : path->_throughputMax;
+		}
+		path->_lastThroughputEstimation = now;
+		path->_bytesAckedSinceLastThroughputEstimation = 0;
+	} else {
+		path->_bytesAckedSinceLastThroughputEstimation += ackedBytes;
+	}
+}
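
A worked example of the throughput estimate above, not part of this commit (illustrative values):

// If 5,000,000 bytes were ACKed over a 2,000 ms measurement window:
//   bytes per second = 5,000,000 / (2000 / 1000) = 2,500,000
//   recorded sample  = 2,500,000 / 1000          = 2,500   (roughly KB/s)
// The sample is pushed into path->throughputSamples and, if it exceeds the
// previous maximum, also updates path->_throughputMax.
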
+
+int32_t Bond::generateQoSPacket(const SharedPtr<Path>& path, int64_t now, char *qosBuffer)
+{
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "generateQoSPacket() %s %s\n", getSlave(path)->ifname().c_str(), pathStr);
+	int32_t len = 0;
+	std::map<uint64_t,uint64_t>::iterator it = path->qosStatsIn.begin();
+	int i=0;
+	int numRecords = std::min(path->_packetsReceivedSinceLastQoS,ZT_QOS_TABLE_SIZE);
+	while (i<numRecords && it != path->qosStatsIn.end()) {
+		uint64_t id = it->first;
+		memcpy(qosBuffer, &id, sizeof(uint64_t));
+		qosBuffer+=sizeof(uint64_t);
+		uint16_t holdingTime = (uint16_t)(now - it->second);
+		memcpy(qosBuffer, &holdingTime, sizeof(uint16_t));
+		qosBuffer+=sizeof(uint16_t);
+		len+=sizeof(uint64_t)+sizeof(uint16_t);
+		path->qosStatsIn.erase(it++);
+		++i;
+	}
+	return len;
+}
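
Layout of the QoS buffer produced above, derived from the code (not part of this commit):

// Each record is a fixed 10-byte pair:
//   [ uint64_t packetId ][ uint16_t holdingTimeMs ]
// so a buffer holding N records is N * 10 bytes, which is the length returned
// here and later appended to a VERB_QOS_MEASUREMENT packet by sendQOS_MEASUREMENT().
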
+
+bool Bond::assignFlowToBondedPath(SharedPtr<Flow> &flow, int64_t now)
+{
+	//fprintf(stderr, "assignFlowToBondedPath\n");
+	char curPathStr[128];
+	unsigned int idx = ZT_MAX_PEER_NETWORK_PATHS;
+	if (_bondingPolicy == ZT_BONDING_POLICY_BALANCE_XOR) {
+		idx = abs((int)(flow->id() % (_numBondedPaths)));
+		flow->assignPath(_paths[_bondedIdx[idx]],now);
+	}
+	if (_bondingPolicy == ZT_BONDING_POLICY_BALANCE_AWARE) {
+		unsigned char entropy;
+		Utils::getSecureRandom(&entropy, 1);
+		if (_totalBondUnderload) {
+			entropy %= _totalBondUnderload;
+		}
+		if (!_numBondedPaths) {
+			fprintf(stderr, "no bonded paths for flow assignment\n");
+			return false;
+		}
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i] && _paths[i]->bonded()) {
+				SharedPtr<Slave> slave = RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+				_paths[i]->address().toString(curPathStr);
+				uint8_t probabilitySegment = (_totalBondUnderload > 0) ? _paths[i]->_affinity : _paths[i]->_allocation;
+				//fprintf(stderr, "i=%2d, entropy=%3d, alloc=%3d, byteload=%4d, segment=%3d, _totalBondUnderload=%3d, ifname=%s, path=%20s\n", i, entropy, _paths[i]->allocation, _paths[i]->relativeByteLoad, probabilitySegment, _totalBondUnderload, slave->ifname().c_str(), curPathStr);
+				if (entropy <= probabilitySegment) {
+					idx = i;
+					//fprintf(stderr, "\t is best path\n");
+					break;
+				}
+				entropy -= probabilitySegment;
+			}
+		}
+		if (idx < ZT_MAX_PEER_NETWORK_PATHS) {
+			flow->assignPath(_paths[idx],now);
+			++(_paths[idx]->_assignedFlowCount);
+		}
+		else {
+			fprintf(stderr, "could not assign flow?\n"); exit(0); // TODO: Remove
+			return false;
+		}
+	}
+	flow->assignedPath()->address().toString(curPathStr);
+	SharedPtr<Slave> slave = RR->bc->getSlaveBySocket(_policyAlias, flow->assignedPath()->localSocket());
+	fprintf(stderr, "assigned (tx) flow %x with peer %llx to path %s on %s (idx=%d)\n", flow->id(), _peer->_id.address().toInt(), curPathStr, slave->ifname().c_str(), idx);
+	return true;
+}
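
A worked example of the balance-aware branch above, not part of this commit (illustrative values):

// Three bonded paths with _allocation values 128, 64 and 63 (out of 255), no
// measured bond underload, and a random byte entropy = 150:
//   path 0: 150 <= 128 ? no  -> entropy = 150 - 128 = 22
//   path 1:  22 <=  64 ? yes -> the flow is pinned to path 1
// so flows land on paths in proportion to their allocation segments. For
// balance-xor the choice is simply flow->id() % _numBondedPaths.
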
+
+SharedPtr<Flow> Bond::createFlow(const SharedPtr<Path> &path, int32_t flowId, unsigned char entropy, int64_t now)
+{
+	//fprintf(stderr, "createFlow\n");
+	char curPathStr[128];
+	// ---
+	if (!_numBondedPaths) {
+		fprintf(stderr, "there are no bonded paths, cannot assign flow\n");
+		return SharedPtr<Flow>();
+	}
+	if (_flows.size() >= ZT_FLOW_MAX_COUNT) {
+		fprintf(stderr, "max number of flows reached (%d), forcibly forgetting oldest flow\n", ZT_FLOW_MAX_COUNT);
+		forgetFlowsWhenNecessary(0,true,now);
+	}
+	SharedPtr<Flow> flow = new Flow(flowId, now);
+	_flows[flowId] = flow;
+	fprintf(stderr, "new flow %x detected with peer %llx, %lu active flow(s)\n", flowId, _peer->_id.address().toInt(), (_flows.size()));
+	/**
+	 * Add a flow with a given Path already provided. This is the case when a packet
+	 * is received on a path but no flow exists; in this case we simply assign the path
+	 * that the remote peer chose for us.
+	 */
+	if (path) {
+		flow->assignPath(path,now);
+		path->address().toString(curPathStr);
+		SharedPtr<Slave> slave = RR->bc->getSlaveBySocket(_policyAlias, flow->assignedPath()->localSocket());
+		fprintf(stderr, "assigned (rx) flow %x with peer %llx to path %s on %s\n", flow->id(), _peer->_id.address().toInt(), curPathStr, slave->ifname().c_str());
+	}
+	/**
+	 * Add a flow when no path was provided. This means that it is an outgoing packet
+	 * and that it is up to the local peer to decide how to load-balance its transmission.
+	 */
+	else if (!path) {
+		assignFlowToBondedPath(flow, now);
+	}
+	return flow;
+}
+
+void Bond::forgetFlowsWhenNecessary(uint64_t age, bool oldest, int64_t now)
+{
+	//fprintf(stderr, "forgetFlowsWhenNecessary\n");
+	std::map<int32_t,SharedPtr<Flow> >::iterator it = _flows.begin();
+	std::map<int32_t,SharedPtr<Flow> >::iterator oldestFlow = _flows.end();
+	SharedPtr<Flow> expiredFlow;
+	if (age) { // Remove by specific age
+		while (it != _flows.end()) {
+			if (it->second->age(now) > age) {
+				fprintf(stderr, "forgetting flow %x between this node and %llx, %lu active flow(s)\n", it->first, _peer->_id.address().toInt(), (_flows.size()-1));
+				it = _flows.erase(it);
+			} else {
+				++it;
+			}
+		}
+	}
+	else if (oldest) { // Remove single oldest by natural expiration
+		uint64_t maxAge = 0;
+		while (it != _flows.end()) {
+			if (it->second->age(now) > maxAge) {
+				maxAge = (now - it->second->age(now));
+				oldestFlow = it;
+			}
+			++it;
+		}
+		if (oldestFlow != _flows.end()) {
+			fprintf(stderr, "forgetting oldest flow %x (of age %llu) between this node and %llx, %lu active flow(s)\n", oldestFlow->first, oldestFlow->second->age(now), _peer->_id.address().toInt(), (_flows.size()-1));
+			_flows.erase(oldestFlow);
+		}
+	}
+	fprintf(stderr, "000\n");
+}
+
+void Bond::processIncomingPathNegotiationRequest(uint64_t now, SharedPtr<Path> &path, int16_t remoteUtility)
+{
+	//fprintf(stderr, "processIncomingPathNegotiationRequest\n");
+	if (_abSlaveSelectMethod != ZT_MULTIPATH_RESELECTION_POLICY_OPTIMIZE) {
+		return;
+	}
+	Mutex::Lock _l(_paths_m);
+	char pathStr[128];
+	path->address().toString(pathStr);
+	if (!_lastPathNegotiationCheck) {
+		return;
+	}
+	SharedPtr<Slave> slave = RR->bc->getSlaveBySocket(_policyAlias, path->localSocket());
+	if (remoteUtility > _localUtility) {
+		fprintf(stderr, "peer suggests path, its utility (%d) is greater than ours (%d), we will switch to %s on %s (ls=%llx)\n", remoteUtility, _localUtility, pathStr, slave->ifname().c_str(), path->localSocket());
+		negotiatedPath = path;
+	}
+	if (remoteUtility < _localUtility) {
+		fprintf(stderr, "peer suggests path, its utility (%d) is less than ours (%d), we will NOT switch to %s on %s (ls=%llx)\n", remoteUtility, _localUtility, pathStr, slave->ifname().c_str(), path->localSocket());
+	}
+	if (remoteUtility == _localUtility) {
+		fprintf(stderr, "peer suggests path, but utility is equal, picking choice made by peer with greater identity.\n");
+		if (_peer->_id.address().toInt() > RR->node->identity().address().toInt()) {
+			fprintf(stderr, "peer identity was greater, going with their choice of %s on %s (ls=%llx)\n", pathStr, slave->ifname().c_str(), path->localSocket());
+			negotiatedPath = path;
+		} else {
+			fprintf(stderr, "our identity was greater, no change\n");
+		}
+	}
+}
+
+void Bond::pathNegotiationCheck(void *tPtr, const int64_t now)
+{
+	//fprintf(stderr, "pathNegotiationCheck\n");
+	char pathStr[128];
+	int maxInPathIdx = ZT_MAX_PEER_NETWORK_PATHS;
+	int maxOutPathIdx = ZT_MAX_PEER_NETWORK_PATHS;
+	uint64_t maxInCount = 0;
+	uint64_t maxOutCount = 0;
+	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+		if (!_paths[i]) {
+			continue;
+		}
+		if (_paths[i]->_packetsIn > maxInCount) {
+			maxInCount = _paths[i]->_packetsIn;
+			maxInPathIdx = i;
+		}
+		if (_paths[i]->_packetsOut > maxOutCount) {
+			maxOutCount = _paths[i]->_packetsOut;
+			maxOutPathIdx = i;
+		}
+		_paths[i]->resetPacketCounts();
+	}
+	bool _peerLinksSynchronized = ((maxInPathIdx != ZT_MAX_PEER_NETWORK_PATHS)
+		&& (maxOutPathIdx != ZT_MAX_PEER_NETWORK_PATHS)
+		&& (maxInPathIdx != maxOutPathIdx)) ? false : true;
+	/**
+	 * Determine utility and attempt to petition remote peer to switch to our chosen path
+	 */
+	if (!_peerLinksSynchronized) {
+		_localUtility = _paths[maxOutPathIdx]->_failoverScore - _paths[maxInPathIdx]->_failoverScore;
+		if (_paths[maxOutPathIdx]->_negotiated) {
+			_localUtility -= ZT_MULTIPATH_FAILOVER_HANDICAP_NEGOTIATED;
+		}
+		if ((now - _lastSentPathNegotiationRequest) > ZT_PATH_NEGOTIATION_CUTOFF_TIME) {
+			fprintf(stderr, "BT: (sync) it's been long enough, sending more requests.\n");
+			_numSentPathNegotiationRequests = 0;
+		}
+		if (_numSentPathNegotiationRequests < ZT_PATH_NEGOTIATION_TRY_COUNT) {
+			if (_localUtility >= 0) {
+				fprintf(stderr, "BT: (sync) paths appear to be out of sync (utility=%d)\n", _localUtility);
+				sendPATH_NEGOTIATION_REQUEST(tPtr, _paths[maxOutPathIdx]);
+				++_numSentPathNegotiationRequests;
+				_lastSentPathNegotiationRequest = now;
+				_paths[maxOutPathIdx]->address().toString(pathStr);
+				SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[maxOutPathIdx]->localSocket());
+				fprintf(stderr, "sending request to use %s on %s, ls=%llx, utility=%d\n", pathStr, slave->ifname().c_str(), _paths[maxOutPathIdx]->localSocket(), _localUtility);
+			}
+		}
+		/**
+		 * Give up negotiating and consider switching
+		 */
+		else if ((now - _lastSentPathNegotiationRequest) > (2 * ZT_PATH_NEGOTIATION_CHECK_INTERVAL)) {
+			if (_localUtility == 0) {
+				// There's no loss to us, just switch without sending a another request
+				fprintf(stderr, "BT: (sync) giving up, switching to remote peer's path.\n");
+				negotiatedPath = _paths[maxInPathIdx];
+			}
+		}
+	}
+}
+
+void Bond::sendPATH_NEGOTIATION_REQUEST(void *tPtr, const SharedPtr<Path> &path)
+{
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "sendPATH_NEGOTIATION_REQUEST() %s %s\n", getSlave(path)->ifname().c_str(), pathStr);
+	if (_abSlaveSelectMethod != ZT_MULTIPATH_RESELECTION_POLICY_OPTIMIZE) {
+		return;
+	}
+	Packet outp(_peer->_id.address(),RR->identity.address(),Packet::VERB_PATH_NEGOTIATION_REQUEST);
+	outp.append<int16_t>(_localUtility);
+	if (path->address()) {
+		outp.armor(_peer->key(),false);
+		RR->node->putPacket(tPtr,path->localSocket(),path->address(),outp.data(),outp.size());
+	}
+}
+
+void Bond::sendACK(void *tPtr,const SharedPtr<Path> &path,const int64_t localSocket,
+	const InetAddress &atAddress,int64_t now)
+{
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "sendACK() %s %s\n", getSlave(path)->ifname().c_str(), pathStr);
+	Packet outp(_peer->_id.address(),RR->identity.address(),Packet::VERB_ACK);
+	int32_t bytesToAck = 0;
+	std::map<uint64_t,uint16_t>::iterator it = path->ackStatsIn.begin();
+	while (it != path->ackStatsIn.end()) {
+		bytesToAck += it->second;
+		++it;
+	}
+	outp.append<uint32_t>(bytesToAck);
+	if (atAddress) {
+		outp.armor(_peer->key(),false);
+		RR->node->putPacket(tPtr,localSocket,atAddress,outp.data(),outp.size());
+	} else {
+		RR->sw->send(tPtr,outp,false);
+	}
+	path->ackStatsIn.clear();
+	path->_packetsReceivedSinceLastAck = 0;
+	path->_lastAckSent = now;
+}
+
+void Bond::sendQOS_MEASUREMENT(void *tPtr,const SharedPtr<Path> &path,const int64_t localSocket,
+	const InetAddress &atAddress,int64_t now)
+{
+	//char pathStr[128];path->address().toString(pathStr);fprintf(stderr, "sendQOS() %s %s\n", getSlave(path)->ifname().c_str(), pathStr);
+	const int64_t _now = RR->node->now();
+	Packet outp(_peer->_id.address(),RR->identity.address(),Packet::VERB_QOS_MEASUREMENT);
+	char qosData[ZT_QOS_MAX_PACKET_SIZE];
+	int16_t len = generateQoSPacket(path, _now,qosData);
+	outp.append(qosData,len);
+	if (atAddress) {
+		outp.armor(_peer->key(),false);
+		RR->node->putPacket(tPtr,localSocket,atAddress,outp.data(),outp.size());
+	} else {
+		RR->sw->send(tPtr,outp,false);
+	}
+    // Account for the fact that a VERB_QOS_MEASUREMENT was just sent. Reset timers.
+	path->_packetsReceivedSinceLastQoS = 0;
+	path->_lastQoSMeasurement = now;
+}
+
+void Bond::processBackgroundTasks(void *tPtr, const int64_t now)
+{
+	Mutex::Lock _l(_paths_m);
+	if (!_peer->_canUseMultipath || (now - _lastBackgroundTaskCheck) < ZT_BOND_BACKGROUND_TASK_MIN_INTERVAL) {
+		return;
+	}
+	_lastBackgroundTaskCheck = now;
+
+	// Compute dynamic path monitor timer interval
+	if (_slaveMonitorStrategy == ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DYNAMIC) {
+		int suggestedMonitorInterval  = (now - _lastFrame) / 100;
+		_dynamicPathMonitorInterval = std::min(ZT_PATH_HEARTBEAT_PERIOD, ((suggestedMonitorInterval > _bondMonitorInterval) ? suggestedMonitorInterval : _bondMonitorInterval));
+		//fprintf(stderr, "_lastFrame=%llu, suggestedMonitorInterval=%d, _dynamicPathMonitorInterval=%d\n",
+		//	(now-_lastFrame), suggestedMonitorInterval, _dynamicPathMonitorInterval);
+	}
+		
+	if (_slaveMonitorStrategy == ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DYNAMIC) {
+		_shouldCollectPathStatistics = true;
+	}
+
+	// Memoize oft-used properties in the packet ingress/egress logic path
+	if (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_AWARE) {
+		// Required for real-time balancing
+		_shouldCollectPathStatistics = true;
+	}
+	if (_bondingPolicy== ZT_BONDING_POLICY_ACTIVE_BACKUP) {
+		if (_abSlaveSelectMethod == ZT_MULTIPATH_RESELECTION_POLICY_BETTER) {
+			// Required for judging suitability of primary slave after recovery
+			_shouldCollectPathStatistics = true;
+		}
+		if (_abSlaveSelectMethod == ZT_MULTIPATH_RESELECTION_POLICY_OPTIMIZE) {
+			// Required for judging suitability of new candidate primary
+			_shouldCollectPathStatistics = true;
+		}
+	}
+	if ((now - _lastCheckUserPreferences) > 1000) {
+		_lastCheckUserPreferences = now;
+		applyUserPrefs();
+	}
+
+	curateBond(now,false);
+	if ((now - _lastQualityEstimation) > _qualityEstimationInterval) {
+		_lastQualityEstimation = now;
+		estimatePathQuality(now);
+	}
+	dumpInfo(now);
+
+	// Send QOS/ACK packets as needed
+	if (_shouldCollectPathStatistics) {
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i] && _paths[i]->allowed()) {
+				if (_paths[i]->needsToSendQoS(now,_qosSendInterval)) {
+					sendQOS_MEASUREMENT(tPtr, _paths[i], _paths[i]->localSocket(), _paths[i]->address(), now);
+				}
+				if (_paths[i]->needsToSendAck(now,_ackSendInterval)) {
+					sendACK(tPtr, _paths[i], _paths[i]->localSocket(), _paths[i]->address(), now);
+				}
+			}
+		}
+	}
+	// Perform periodic background tasks unique to each bonding policy
+	switch (_bondingPolicy)
+	{
+		case ZT_BONDING_POLICY_ACTIVE_BACKUP:
+			processActiveBackupTasks(now);
+			break;
+		case ZT_BONDING_POLICY_BROADCAST:
+			break;
+		case ZT_BONDING_POLICY_BALANCE_RR:
+		case ZT_BONDING_POLICY_BALANCE_XOR:
+		case ZT_BONDING_POLICY_BALANCE_AWARE:
+			processBalanceTasks(now);
+			break;
+		default:
+			break;
+	}
+	// Check whether or not a path negotiation needs to be performed
+	if (((now - _lastPathNegotiationCheck) > ZT_PATH_NEGOTIATION_CHECK_INTERVAL) && _allowPathNegotiation) {
+		_lastPathNegotiationCheck = now;
+		pathNegotiationCheck(tPtr, now);
+	}	
+}
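
A worked example of the dynamic monitor interval above, not part of this commit (illustrative values):

// If the last user frame was seen 4,000 ms ago and _bondMonitorInterval is 100 ms:
//   suggestedMonitorInterval    = 4000 / 100 = 40   (below 100, so the bond interval wins)
//   _dynamicPathMonitorInterval = min(ZT_PATH_HEARTBEAT_PERIOD, 100)
// i.e. probing becomes less frequent as traffic goes idle, with the interval
// clamped between _bondMonitorInterval and ZT_PATH_HEARTBEAT_PERIOD.
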
+
+void Bond::applyUserPrefs()
+{
+	fprintf(stderr, "applyUserPrefs, _minReqPathMonitorInterval=%d\n", RR->bc->minReqPathMonitorInterval());
+	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+		if (!_paths[i]) {
+			continue;
+		}
+		SharedPtr<Slave> sl = getSlave(_paths[i]);
+		if (sl) {
+			if (sl->monitorInterval() == 0) { // If no interval was specified for this slave, use more generic bond-wide interval
+				sl->setMonitorInterval(_bondMonitorInterval);
+			}
+			RR->bc->setMinReqPathMonitorInterval((sl->monitorInterval() < RR->bc->minReqPathMonitorInterval()) ? sl->monitorInterval() : RR->bc->minReqPathMonitorInterval());
+			bool bFoundCommonSlave = false;
+			SharedPtr<Slave> commonSlave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+			for(unsigned int j=0;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
+				if (_paths[j] && _paths[j].ptr() != _paths[i].ptr()) {
+					if (RR->bc->getSlaveBySocket(_policyAlias, _paths[j]->localSocket()) == commonSlave) {
+						bFoundCommonSlave = true;
+					}
+				}
+			}
+			_paths[i]->_monitorInterval = sl->monitorInterval();
+			_paths[i]->_upDelay = sl->upDelay() ? sl->upDelay() : _upDelay;
+			_paths[i]->_downDelay = sl->downDelay() ? sl->downDelay() : _downDelay;
+			_paths[i]->_ipvPref = sl->ipvPref();
+			_paths[i]->_mode = sl->mode();
+			_paths[i]->_enabled = sl->enabled();
+			_paths[i]->_onlyPathOnSlave = !bFoundCommonSlave;
+		}
+	}
+	if (_peer) {
+		_peer->_shouldCollectPathStatistics = _shouldCollectPathStatistics;
+		_peer->_bondingPolicy = _bondingPolicy;
+	}
+}
+
+void Bond::curateBond(const int64_t now, bool rebuildBond)
+{
+	//fprintf(stderr, "%lu curateBond (rebuildBond=%d)\n", ((now - RR->bc->getBondStartTime())), rebuildBond);
+	char pathStr[128];
+	/**
+	 * Update path states
+	 */
+	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+		if (!_paths[i]) {
+			continue;
+		}
+		bool currEligibility = _paths[i]->eligible(now,_ackSendInterval);
+		if (currEligibility != _paths[i]->_lastEligibilityState) {
+			_paths[i]->address().toString(pathStr);
+			//fprintf(stderr, "\n\n%ld path eligibility (for %s, %s) has changed (from %d to %d)\n", (RR->node->now() - RR->bc->getBondStartTime()), getSlave(_paths[i])->ifname().c_str(), pathStr, _paths[i]->lastCheckedEligibility, _paths[i]->eligible(now,_ackSendInterval));
+			if (currEligibility) {
+				rebuildBond = true;
+			}
+			if (!currEligibility) {
+				_paths[i]->adjustRefractoryPeriod(now, _defaultPathRefractoryPeriod, !currEligibility);
+				if (_paths[i]->bonded()) {
+					//fprintf(stderr, "the path was bonded, reallocation of its flows will occur soon\n");
+					rebuildBond = true;
+					_paths[i]->_shouldReallocateFlows = _paths[i]->bonded();
+					_paths[i]->setBonded(false);
+				} else {
+					//fprintf(stderr, "the path was not bonded, no consequences\n");
+				}
+			}
+		}
+		if (currEligibility) {
+			_paths[i]->adjustRefractoryPeriod(now, _defaultPathRefractoryPeriod, false);
+		}
+		_paths[i]->_lastEligibilityState = currEligibility;
+	}
+	/**
+	 * Curate the set of paths that are part of the bond proper. Selects a single path
+	 * per logical slave according to eligibility and user-specified constraints.
+	 */
+	if ((_bondingPolicy== ZT_BONDING_POLICY_BALANCE_RR)
+			|| (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_XOR)
+			|| (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_AWARE)) {
+		if (!_numBondedPaths) {
+			rebuildBond = true;
+		}
+		// TODO: Optimize
+		if (rebuildBond) {
+			int updatedBondedPathCount = 0;
+			std::map<SharedPtr<Slave>,int> slaveMap;
+			for (int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+				if (_paths[i] && _paths[i]->allowed() && (_paths[i]->eligible(now,_ackSendInterval) || !_numBondedPaths)) {
+					SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+					if (!slaveMap.count(slave)) {
+						slaveMap[slave] = i;
+					}
+					else {
+						bool overriden = false;
+						_paths[i]->address().toString(pathStr);
+						//fprintf(stderr, " slave representative path already exists! (%s %s)\n", getSlave(_paths[i])->ifname().c_str(), pathStr);
+						if (_paths[i]->preferred() && !_paths[slaveMap[slave]]->preferred()) {
+							// Override previous choice if preferred
+							//fprintf(stderr, "overriding since its preferred!\n");
+							if (_paths[slaveMap[slave]]->_assignedFlowCount) {
+								_paths[slaveMap[slave]]->_deprecated = true;
+							}
+							else {
+								_paths[slaveMap[slave]]->_deprecated = true;
+								_paths[slaveMap[slave]]->setBonded(false);
+							}
+							slaveMap[slave] = i;
+							overriden = true;
+						}
+						if ((_paths[i]->preferred() && _paths[slaveMap[slave]]->preferred())
+							|| (!_paths[i]->preferred() && !_paths[slaveMap[slave]]->preferred())) {
+							if (_paths[i]->preferenceRank() > _paths[slaveMap[slave]]->preferenceRank()) {
+								// Override if higher preference
+								//fprintf(stderr, "overriding according to preference preferenceRank!\n");
+								if (_paths[slaveMap[slave]]->_assignedFlowCount) {
+									_paths[slaveMap[slave]]->_deprecated = true;
+								}
+								else {
+									_paths[slaveMap[slave]]->_deprecated = true;
+									_paths[slaveMap[slave]]->setBonded(false);
+								}
+								slaveMap[slave] = i;
+							}
+						}
+					}
+				}
+			}
+			std::map<SharedPtr<Slave>,int>::iterator it = slaveMap.begin();
+			for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+				if (!_paths[i]) {
+					continue;
+				}
+				_bondedIdx[i] = ZT_MAX_PEER_NETWORK_PATHS;
+				if (it != slaveMap.end()) {
+					_bondedIdx[i] = it->second;
+					_paths[_bondedIdx[i]]->setBonded(true);
+					++it;
+					++updatedBondedPathCount;
+					_paths[_bondedIdx[i]]->address().toString(pathStr);
+					fprintf(stderr, "setting i=%d, _bondedIdx[%d]=%d to bonded (%s %s)\n", i, i, _bondedIdx[i], getSlave(_paths[_bondedIdx[i]])->ifname().c_str(), pathStr);
+				}
+			}
+			_numBondedPaths = updatedBondedPathCount;
+
+			if (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_RR) {
+				// Cause a RR reset since the currently used index might no longer be valid
+				_rrPacketsSentOnCurrSlave = _packetsPerSlave;
+			}
+		}
+	}
+}
+
+void Bond::estimatePathQuality(const int64_t now)
+{
+	char pathStr[128];
+	//---
+
+	uint32_t totUserSpecifiedSlaveSpeed = 0;
+	if (_numBondedPaths) { // Compute relative user-specified speeds of slaves
+		for(unsigned int i=0;i<_numBondedPaths;++i) {
+			if (_paths[i] && _paths[i]->allowed()) {
+				SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+				totUserSpecifiedSlaveSpeed += slave->speed();
+			}
+		}
+		for(unsigned int i=0;i<_numBondedPaths;++i) {
+			if (_paths[i] && _paths[i]->allowed()) {
+				SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+				slave->setRelativeSpeed(round( ((float)slave->speed() / (float)totUserSpecifiedSlaveSpeed) * 255));
+			}
+		}
+	}
+
+	float lat[ZT_MAX_PEER_NETWORK_PATHS];
+	float pdv[ZT_MAX_PEER_NETWORK_PATHS];
+	float plr[ZT_MAX_PEER_NETWORK_PATHS];
+	float per[ZT_MAX_PEER_NETWORK_PATHS];
+	float thr[ZT_MAX_PEER_NETWORK_PATHS];
+	float thm[ZT_MAX_PEER_NETWORK_PATHS];
+	float thv[ZT_MAX_PEER_NETWORK_PATHS];
+
+	float maxLAT = 0;
+	float maxPDV = 0;
+	float maxPLR = 0;
+	float maxPER = 0;
+	float maxTHR = 0;
+	float maxTHM = 0;
+	float maxTHV = 0;
+
+	float quality[ZT_MAX_PEER_NETWORK_PATHS];
+	uint8_t alloc[ZT_MAX_PEER_NETWORK_PATHS];
+	
+	float totQuality = 0.0f;
+
+	memset(&lat, 0, sizeof(lat));
+	memset(&pdv, 0, sizeof(pdv));
+	memset(&plr, 0, sizeof(plr));
+	memset(&per, 0, sizeof(per));
+	memset(&thr, 0, sizeof(thr));
+	memset(&thm, 0, sizeof(thm));
+	memset(&thv, 0, sizeof(thv));
+	memset(&quality, 0, sizeof(quality));
+	memset(&alloc, 0, sizeof(alloc));
+
+	// Compute initial summary statistics
+	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+		if (!_paths[i] || !_paths[i]->allowed()) {
+			continue;
+		}
+		// Compute/Smooth average of real-world observations
+		_paths[i]->_latencyMean = _paths[i]->latencySamples.mean();
+		_paths[i]->_latencyVariance = _paths[i]->latencySamples.stddev();
+		_paths[i]->_packetErrorRatio = 1.0 - (_paths[i]->packetValiditySamples.count() ? _paths[i]->packetValiditySamples.mean() : 1.0);
+
+		if (userHasSpecifiedSlaveSpeeds()) {
+			// Use user-reported metrics
+			SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+			if (slave) {
+				_paths[i]->_throughputMean = slave->speed();
+				_paths[i]->_throughputVariance = 0;
+			}
+		}
+		/*
+		else {
+			// Use estimated metrics
+			if (_paths[i]->throughputSamples.count()) {
+				// If we have samples, use them
+				_paths[i]->throughputMean = (uint64_t)_paths[i]->throughputSamples.mean();
+				if (_paths[i]->throughputMean > 0) {
+					_paths[i]->throughputVarianceSamples.push((float)_paths[i]->throughputSamples.stddev() / (float)_paths[i]->throughputMean);
+					_paths[i]->throughputVariance = _paths[i]->throughputVarianceSamples.mean();
+				}
+			}
+			else {
+				// No samples have been collected yet, assume best case scenario
+				_paths[i]->throughputMean = ZT_QOS_THR_NORM_MAX;
+				_paths[i]->throughputVariance = 0;
+			}
+		}
+		*/
+		// Drain unacknowledged QoS records
+		std::map<uint64_t,uint64_t>::iterator it = _paths[i]->qosStatsOut.begin();
+		uint64_t currentLostRecords = 0;
+		while (it != _paths[i]->qosStatsOut.end()) {
+			int qosRecordTimeout = 5000; //_paths[i]->monitorInterval() * ZT_MULTIPATH_QOS_ACK_INTERVAL_MULTIPLIER * 8;
+			if ((now - it->second) >= qosRecordTimeout) {
+				//fprintf(stderr, "packetId=%llx was lost\n", it->first);
+				it = _paths[i]->qosStatsOut.erase(it);
+				++currentLostRecords;
+			} else { ++it; }
+		}
+
+		quality[i]=0;
+		totQuality=0;
+		// Normalize raw observations according to sane limits and/or user specified values
+		lat[i] = 1.0 / expf(4*Utils::normalize(_paths[i]->_latencyMean, 0, _maxAcceptableLatency, 0, 1));
+		pdv[i] = 1.0 / expf(4*Utils::normalize(_paths[i]->_latencyVariance, 0, _maxAcceptablePacketDelayVariance, 0, 1));
+		plr[i] = 1.0 / expf(4*Utils::normalize(_paths[i]->_packetLossRatio, 0, _maxAcceptablePacketLossRatio, 0, 1));
+		per[i] = 1.0 / expf(4*Utils::normalize(_paths[i]->_packetErrorRatio, 0, _maxAcceptablePacketErrorRatio, 0, 1));
+		//thr[i] = 1.0; //Utils::normalize(_paths[i]->throughputMean, 0, ZT_QOS_THR_NORM_MAX, 0, 1);
+		//thm[i] = 1.0; //Utils::normalize(_paths[i]->throughputMax, 0, ZT_QOS_THM_NORM_MAX, 0, 1);
+		//thv[i] = 1.0; //1.0 / expf(4*Utils::normalize(_paths[i]->throughputVariance, 0, ZT_QOS_THV_NORM_MAX, 0, 1));
+		//scp[i] = _paths[i]->ipvPref != 0 ? 1.0 : Utils::normalize(_paths[i]->ipScope(), InetAddress::IP_SCOPE_NONE, InetAddress::IP_SCOPE_PRIVATE, 0, 1);
+		// Record bond-wide maximums to determine relative values
+		maxLAT = lat[i] > maxLAT ? lat[i] : maxLAT;
+		maxPDV = pdv[i] > maxPDV ? pdv[i] : maxPDV;
+		maxPLR = plr[i] > maxPLR ? plr[i] : maxPLR;
+		maxPER = per[i] > maxPER ? per[i] : maxPER;
+		//maxTHR = thr[i] > maxTHR ? thr[i] : maxTHR;
+		//maxTHM = thm[i] > maxTHM ? thm[i] : maxTHM;
+		//maxTHV = thv[i] > maxTHV ? thv[i] : maxTHV;
+
+		//fprintf(stdout, "EH   %d: lat=%8.3f,  ltm=%8.3f,  pdv=%8.3f,  plr=%5.3f,  per=%5.3f,  thr=%8f,  thm=%5.3f,  thv=%5.3f,  avl=%5.3f,  age=%8.2f,  scp=%4d,  q=%5.3f,  qtot=%5.3f,  ac=%d if=%s, path=%s\n",
+		//	              i,   lat[i],     ltm[i],     pdv[i],     plr[i],     per[i],     thr[i],     thm[i],     thv[i],     avl[i],     age[i],     scp[i], quality[i], totQuality, alloc[i], getSlave(_paths[i])->ifname().c_str(), pathStr);
+		
+	}
+	// Convert metrics to relative quantities and apply contribution weights
+	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+		if (_paths[i] && _paths[i]->bonded()) {
+			quality[i] += ((maxLAT > 0.0f ? lat[i] / maxLAT : 0.0f) * _qualityWeights[ZT_QOS_LAT_IDX]);
+			quality[i] += ((maxPDV > 0.0f ? pdv[i] / maxPDV : 0.0f) * _qualityWeights[ZT_QOS_PDV_IDX]);
+			quality[i] += ((maxPLR > 0.0f ? plr[i] / maxPLR : 0.0f) * _qualityWeights[ZT_QOS_PLR_IDX]);
+			quality[i] += ((maxPER > 0.0f ? per[i] / maxPER : 0.0f) * _qualityWeights[ZT_QOS_PER_IDX]);
+			//quality[i] += ((maxTHR > 0.0f ? thr[i] / maxTHR : 0.0f) * _qualityWeights[ZT_QOS_THR_IDX]);
+			//quality[i] += ((maxTHM > 0.0f ? thm[i] / maxTHM : 0.0f) * _qualityWeights[ZT_QOS_THM_IDX]);
+			//quality[i] += ((maxTHV > 0.0f ? thv[i] / maxTHV : 0.0f) * _qualityWeights[ZT_QOS_THV_IDX]);
+			//quality[i] += (scp[i] * _qualityWeights[ZT_QOS_SCP_IDX]);
+			totQuality += quality[i];
+		}
+	}
+	// Convert quality into a relative allocation (out of 255) for each bonded path
+	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+		if (_paths[i] && _paths[i]->bonded()) {
+			alloc[i] = std::ceil((quality[i] / totQuality) * (float)255);
+			_paths[i]->_allocation = alloc[i];
+		}
+	}
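
A worked example of the allocation step above, not part of this commit (illustrative values):

// Two bonded paths whose weighted qualities come out as 0.6 and 0.3:
//   totQuality = 0.9
//   alloc[0] = ceil((0.6 / 0.9) * 255) = 170
//   alloc[1] = ceil((0.3 / 0.9) * 255) = 85
// Each path's _allocation is its share of 255; assignFlowToBondedPath() above
// then uses these values as probability segments when pinning new flows.
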
+	/*
+	if ((now - _lastLogTS) > 500) {
+		if (!relevant()) {return;}
+		//fprintf(stderr, "\n");
+		_lastPrintTS = now;
+		_lastLogTS = now;
+		int numPlottablePaths=0;
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i]) {
+				++numPlottablePaths;
+				_paths[i]->address().toString(pathStr);
+				//fprintf(stderr, "%lu FIN [%d/%d]: pmi=%5d, lat=%4.3f, ltm=%4.3f, pdv=%4.3f, plr=%4.3f, per=%4.3f, thr=%4.3f, thm=%4.3f, thv=%4.3f, age=%4.3f, scp=%4d, q=%4.3f, qtot=%4.3f, ac=%4d, asf=%3d, if=%s, path=%20s, bond=%d, qosout=%d, plrraw=%d\n",
+				//	((now - RR->bc->getBondStartTime())), i, _numBondedPaths,   _paths[i]->monitorInterval,
+				//	lat[i],     ltm[i],     pdv[i],     plr[i],     per[i],     thr[i],     thm[i],     thv[i],     age[i],     scp[i],
+				//	quality[i], totQuality, alloc[i], _paths[i]->assignedFlowCount, getSlave(_paths[i])->ifname().c_str(), pathStr, _paths[i]->bonded(), _paths[i]->qosStatsOut.size(), _paths[i]->packetLossRatio);
+			}
+		}
+		if (numPlottablePaths < 2) {
+			return;
+		}
+		if (!_header) {
+			fprintf(stdout, "now, bonded, relativeUnderload, flows, ");
+			for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+				if (_paths[i]) {
+					_paths[i]->address().toString(pathStr);
+					std::string label = std::string((pathStr)) + " " + getSlave(_paths[i])->ifname();
+					for (int i=0; i<19; ++i) {
+						fprintf(stdout, "%s, ", label.c_str());
+					}
+				}
+			}
+			_header=true;
+		}
+		fprintf(stdout, "%ld, %d, %d, %d, ",((now - RR->bc->getBondStartTime())),_numBondedPaths,_totalBondUnderload, _flows.size());
+		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i]) {
+				_paths[i]->address().toString(pathStr);
+				fprintf(stdout, "%s, %s, %8.3f, %8.3f, %8.3f, %5.3f, %5.3f, %5.3f, %8f, %5.3f, %5.3f, %d, %5.3f, %d, %d, %d, %d, %d, %d, ",
+					              getSlave(_paths[i])->ifname().c_str(), pathStr, _paths[i]->latencyMean, lat[i],pdv[i], _paths[i]->packetLossRatio, plr[i],per[i],thr[i],thm[i],thv[i],(now - _paths[i]->lastIn()),quality[i],alloc[i],
+					              _paths[i]->relativeByteLoad, _paths[i]->assignedFlowCount, _paths[i]->alive(now, true), _paths[i]->eligible(now,_ackSendInterval), _paths[i]->qosStatsOut.size());
+			}
+		}
+		fprintf(stdout, "\n");
+	}
+	*/
+}
+
+void Bond::processBalanceTasks(const int64_t now)
+{
+	// Omitted
+}
+
+void Bond::dequeueNextActiveBackupPath(const uint64_t now)
+{
+	//fprintf(stderr, "dequeueNextActiveBackupPath\n");
+	if (_abFailoverQueue.empty()) {
+		return;
+	}
+	_abPath = _abFailoverQueue.front();
+	_abFailoverQueue.pop_front();
+	_lastActiveBackupPathChange = now;
+	for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+		if (_paths[i]) {
+			_paths[i]->resetPacketCounts();
+		}
+	}
+}
+
+void Bond::processActiveBackupTasks(const int64_t now)
+{	
+	//fprintf(stderr, "%llu processActiveBackupTasks\n", (now - RR->bc->getBondStartTime()));
+	char pathStr[128]; char prevPathStr[128]; char curPathStr[128];
+
+	SharedPtr<Path> prevActiveBackupPath = _abPath;
+	SharedPtr<Path> nonPreferredPath;
+	bool bFoundPrimarySlave = false;
+
+	/**
+	 * Select initial "active" active-backup slave
+	 */
+	if (!_abPath) {
+		fprintf(stderr, "%llu no active backup path yet...\n", ((now - RR->bc->getBondStartTime())));
+		/**
+		 * [Automatic mode]
+		 * The user has not explicitly specified slaves or their failover schedule.
+		 * The bonding policy will select the first eligible path and set it as
+		 * its active backup path. If a substantially better path is detected, the
+		 * bonding policy will assign it as the new active backup path; if the path
+		 * fails, it will simply find the next eligible path.
+		 */
+		if (!userHasSpecifiedSlaves()) {
+			fprintf(stderr, "%llu AB: (auto) user did not specify any slaves. waiting until we know more\n", ((now - RR->bc->getBondStartTime())));
+			for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+				if (_paths[i] && _paths[i]->eligible(now,_ackSendInterval)) {
+					_paths[i]->address().toString(curPathStr);
+					SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+					if (slave) {
+						fprintf(stderr, "%llu AB: (initial) [%d] found eligible path %s on: %s\n", ((now - RR->bc->getBondStartTime())), i, curPathStr, slave->ifname().c_str());
+					}
+					_abPath = _paths[i];
+					break;
+				}
+			}
+		}
+		/**
+	 	 * [Manual mode]
+	 	 * The user has specified slaves or failover rules that the bonding policy should adhere to.
+	 	 */
+		else if (userHasSpecifiedSlaves()) {
+			fprintf(stderr, "%llu AB: (manual) no active backup slave, checking local.conf\n", ((now - RR->bc->getBondStartTime())));
+			if (userHasSpecifiedPrimarySlave()) {
+				fprintf(stderr, "%llu AB: (manual) user has specified primary slave, looking for it.\n", ((now - RR->bc->getBondStartTime())));
+				for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+					if (!_paths[i]) {
+						continue;
+					}
+					SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+					if (_paths[i]->eligible(now,_ackSendInterval) && slave->primary()) {
+						if (!_paths[i]->preferred()) {
+							_paths[i]->address().toString(curPathStr);
+							fprintf(stderr, "%llu AB: (initial) [%d] found path on primary slave, taking note in case we don't find a preferred path\n", ((now - RR->bc->getBondStartTime())), i);
+							nonPreferredPath = _paths[i];
+							bFoundPrimarySlave = true;
+						}
+						if (_paths[i]->preferred()) {
+							_abPath = _paths[i];
+							_abPath->address().toString(curPathStr);
+							SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+							if (slave) {
+								fprintf(stderr, "%llu AB: (initial) [%d] found preferred path %s on primary slave: %s\n", ((now - RR->bc->getBondStartTime())), i, curPathStr, slave->ifname().c_str());
+							}
+							bFoundPrimarySlave = true;
+							break;
+						}
+					}
+				}
+				if (_abPath) {
+					_abPath->address().toString(curPathStr);
+					SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _abPath->localSocket());
+					if (slave) {
+						fprintf(stderr, "%llu AB: (initial) found preferred primary path: %s on %s\n", ((now - RR->bc->getBondStartTime())), curPathStr, slave->ifname().c_str());
+					}
+				}
+				else {
+					if (bFoundPrimarySlave && nonPreferredPath) {
+						fprintf(stderr, "%llu AB: (initial) found a non-preferred primary path\n", ((now - RR->bc->getBondStartTime())));
+						_abPath = nonPreferredPath;
+					}
+				}
+				if (!_abPath) {
+					fprintf(stderr, "%llu AB: (initial) designated primary slave is not yet ready\n", ((now - RR->bc->getBondStartTime())));
+					// TODO: Should fail-over to specified backup or just wait?
+				}
+			}
+			else if (!userHasSpecifiedPrimarySlave()) {
+				int _abIdx = ZT_MAX_PEER_NETWORK_PATHS;
+				fprintf(stderr, "%llu AB: (initial) user did not specify primary slave, just picking something\n", ((now - RR->bc->getBondStartTime())));
+				for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+					if (_paths[i] && _paths[i]->eligible(now,_ackSendInterval)) {
+						_abIdx = i;
+						break;
+					}
+				}
+				if (_abIdx == ZT_MAX_PEER_NETWORK_PATHS) {
+					fprintf(stderr, "%llu AB: (initial) unable to find a candidate next-best, no change\n", ((now - RR->bc->getBondStartTime())));
+				}
+				else {
+					_abPath = _paths[_abIdx];
+					_abPath->address().toString(curPathStr);
+					SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _abPath->localSocket());
+					if (slave) {
+						fprintf(stderr, "%llu AB: (initial) selected non-primary slave idx=%d, %s on %s\n", ((now - RR->bc->getBondStartTime())), _abIdx, curPathStr, slave->ifname().c_str());
+					}
+				}
+			}
+		}
+	}
+	/**
+	 * Update and maintain the active-backup failover queue
+	 */
+	if (_abPath) {
+		// Don't worry about the failover queue until we have an active slave
+		// Remove ineligible paths from the failover slave queue
+		for (std::list<SharedPtr<Path> >::iterator it(_abFailoverQueue.begin()); it!=_abFailoverQueue.end();) {
+			if ((*it) && !(*it)->eligible(now,_ackSendInterval)) {
+				(*it)->address().toString(curPathStr);
+				SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, (*it)->localSocket());
+				if (slave) {
+					fprintf(stderr, "%llu AB: (fq) %s on %s is now ineligible, removing from failover queue\n", ((now - RR->bc->getBondStartTime())), curPathStr, slave->ifname().c_str());
+				}
+				it = _abFailoverQueue.erase(it);
+			} else {
+				++it;
+			}
+		}
+		/**
+		 * Failover instructions were provided by the user; build the queue according
+		 * to those as well as IPv preference, disregarding performance.
+		 */
+		if (userHasSpecifiedFailoverInstructions()) {
+			/**
+			 * Clear failover scores
+			 */
+			for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+				if (_paths[i]) {
+					_paths[i]->_failoverScore = 0;
+				}
+			}
+			//fprintf(stderr, "AB: (fq) user has specified specific failover instructions, will follow them.\n");
+			for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+				if (!_paths[i] || !_paths[i]->allowed() || !_paths[i]->eligible(now,_ackSendInterval)) {
+					continue;
+				}
+				SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+				_paths[i]->address().toString(pathStr);
+				
+				int failoverScoreHandicap = _paths[i]->_failoverScore;
+				if (_paths[i]->preferred()) 
+				{
+					failoverScoreHandicap += ZT_MULTIPATH_FAILOVER_HANDICAP_PREFERRED;
+					//fprintf(stderr, "%s on %s ----> %d for preferred\n", pathStr, _paths[i]->ifname().c_str(), failoverScoreHandicap);
+				}
+				if (slave->primary()) {
+					// If using "optimize" primary reselect mode, ignore user slave designations
+					failoverScoreHandicap += ZT_MULTIPATH_FAILOVER_HANDICAP_PRIMARY;
+					//fprintf(stderr, "%s on %s ----> %d for primary\n", pathStr, _paths[i]->ifname().c_str(), failoverScoreHandicap);
+				}
+				if (!_paths[i]->_failoverScore) {
+					// If we didn't inherit a failover score from a "parent" that wants to use this path as a failover
+					int newHandicap = failoverScoreHandicap ? failoverScoreHandicap : _paths[i]->_allocation;
+					_paths[i]->_failoverScore = newHandicap;
+					//fprintf(stderr, "%s on %s ----> %d for allocation\n", pathStr, _paths[i]->ifname().c_str(), newHandicap);
+				}
+				SharedPtr<Slave> failoverSlave;
+				if (slave->failoverToSlave().length()) {
+					failoverSlave = RR->bc->getSlaveByName(_policyAlias, slave->failoverToSlave());
+				}
+				if (failoverSlave) {
+					for (int j=0; j<ZT_MAX_PEER_NETWORK_PATHS; j++) {
+						if (_paths[j] && getSlave(_paths[j]) == failoverSlave.ptr()) {
+							_paths[j]->address().toString(pathStr);
+							int inheritedHandicap = failoverScoreHandicap - 10;
+							int newHandicap = _paths[j]->_failoverScore > inheritedHandicap ? _paths[j]->_failoverScore : inheritedHandicap;
+							//fprintf(stderr, "\thanding down %s on %s ----> %d\n", pathStr, getSlave(_paths[j])->ifname().c_str(), newHandicap);
+							if (!_paths[j]->preferred()) {
+								newHandicap--;
+							}
+							_paths[j]->_failoverScore = newHandicap;
+						}
+					}
+				}
+				if (_paths[i].ptr() != _abPath.ptr()) {
+					bool bFoundPathInQueue = false;
+					for (std::list<SharedPtr<Path> >::iterator it(_abFailoverQueue.begin()); it!=_abFailoverQueue.end();++it) {
+						if (_paths[i].ptr() == (*it).ptr()) {
+							bFoundPathInQueue = true;
+						}
+					}
+					if (!bFoundPathInQueue) {
+						_paths[i]->address().toString(curPathStr);
+						fprintf(stderr, "%llu AB: (fq) [%d] added %s on %s to queue\n", ((now - RR->bc->getBondStartTime())), i, curPathStr, getSlave(_paths[i])->ifname().c_str());
+						_abFailoverQueue.push_front(_paths[i]);
+					}
+				}
+			}
+		}
+		/**
+		 * No failover instructions provided by user, build queue according to performance
+		 * and IPv preference.
+		 */
+		else if (!userHasSpecifiedFailoverInstructions()) {
+			for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+				if (!_paths[i]
+					|| !_paths[i]->allowed()
+					|| !_paths[i]->eligible(now,_ackSendInterval)) {
+					continue;
+				}
+				int failoverScoreHandicap = 0;
+				if (_paths[i]->preferred()) {
+					failoverScoreHandicap = ZT_MULTIPATH_FAILOVER_HANDICAP_PREFERRED;
+				}
+				bool includeRefractoryPeriod = true;
+				if (!_paths[i]->eligible(now,includeRefractoryPeriod)) {
+					failoverScoreHandicap = -10000;
+				}
+				if (getSlave(_paths[i])->primary() && _abSlaveSelectMethod != ZT_MULTIPATH_RESELECTION_POLICY_OPTIMIZE) {
+					// If using "optimize" primary reselect mode, ignore user slave designations
+					failoverScoreHandicap = ZT_MULTIPATH_FAILOVER_HANDICAP_PRIMARY;
+				}
+				if (_paths[i].ptr() == negotiatedPath.ptr()) {
+					_paths[i]->_negotiated = true;
+					failoverScoreHandicap = ZT_MULTIPATH_FAILOVER_HANDICAP_NEGOTIATED;
+				} else { 
+					_paths[i]->_negotiated = false;
+				}
+				_paths[i]->_failoverScore = _paths[i]->_allocation + failoverScoreHandicap;
+				if (_paths[i].ptr() != _abPath.ptr()) {
+					bool bFoundPathInQueue = false;
+					for (std::list<SharedPtr<Path> >::iterator it(_abFailoverQueue.begin()); it!=_abFailoverQueue.end();++it) {
+						if (_paths[i].ptr() == (*it).ptr()) {
+							bFoundPathInQueue = true;
+						}
+					}
+					if (!bFoundPathInQueue) {
+						_paths[i]->address().toString(curPathStr);
+						fprintf(stderr, "%llu AB: (fq) [%d] added %s on %s to queue\n", ((now - RR->bc->getBondStartTime())), i, curPathStr, getSlave(_paths[i])->ifname().c_str());
+						_abFailoverQueue.push_front(_paths[i]);
+					}
+				}
+			}
+		}
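+		// Highest failover score first (see PathQualityComparator in Bond.hpp); the front of
+		// the queue is the next path to be promoted if the active path fails.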
+		_abFailoverQueue.sort(PathQualityComparator());
+		if (_abFailoverQueue.empty()) {
+			fprintf(stderr, "%llu AB: (fq) the failover queue is empty, the active-backup bond is no longer fault-tolerant\n", ((now - RR->bc->getBondStartTime())));
+		}
+	}
+	/**
+	 * Short-circuit if we have no queued paths
+	 */
+	if (_abFailoverQueue.empty()) {
+		return;
+	}
+	/**
+	 * Fulfill primary reselect obligations
+	 */
+	if (_abPath && !_abPath->eligible(now,_ackSendInterval)) { // Implicit ZT_MULTIPATH_RESELECTION_POLICY_FAILURE
+		_abPath->address().toString(curPathStr);
+		fprintf(stderr, "%llu AB: (failure) failover event! active backup path (%s) is no longer eligible\n", ((now - RR->bc->getBondStartTime())), curPathStr);
+		if (!_abFailoverQueue.empty()) {
+			fprintf(stderr, "%llu AB: (failure) there are (%lu) slaves in queue to choose from...\n", ((now - RR->bc->getBondStartTime())), _abFailoverQueue.size());
+			dequeueNextActiveBackupPath(now);
+			_abPath->address().toString(curPathStr);
+			fprintf(stderr, "%llu AB: (failure) switched to %s on %s\n", ((now - RR->bc->getBondStartTime())), curPathStr, getSlave(_abPath)->ifname().c_str());
+		} else {
+			fprintf(stderr, "%llu AB: (failure) nothing available in the slave queue, doing nothing.\n", ((now - RR->bc->getBondStartTime())));
+		}
+	}
+	/**
+	 * Detect change to prevent flopping during later optimization step.
+	 */
+	if (prevActiveBackupPath != _abPath) {
+		_lastActiveBackupPathChange = now;
+	}
+	if (_abSlaveSelectMethod == ZT_MULTIPATH_RESELECTION_POLICY_ALWAYS) {
+		if (_abPath && !getSlave(_abPath)->primary()
+			&& getSlave(_abFailoverQueue.front())->primary()) {
+			fprintf(stderr, "%llu AB: (always) switching to available primary\n", ((now - RR->bc->getBondStartTime())));
+			dequeueNextActiveBackupPath(now);
+		}
+	}
+	if (_abSlaveSelectMethod == ZT_MULTIPATH_RESELECTION_POLICY_BETTER) {
+		if (_abPath && !getSlave(_abPath)->primary()) {
+			// Only switch back to the user-defined primary slave if it now scores better than the currently-active slave
+			if (getSlave(_abFailoverQueue.front())->primary()
+				&& (_abFailoverQueue.front()->_failoverScore > _abPath->_failoverScore)) {
+				dequeueNextActiveBackupPath(now);
+				fprintf(stderr, "%llu AB: (better) switched back to \"better\" user-defined primary according to re-select policy\n", ((now - RR->bc->getBondStartTime())));
+			}
+		}
+	}
+	if (_abSlaveSelectMethod == ZT_MULTIPATH_RESELECTION_POLICY_OPTIMIZE && !_abFailoverQueue.empty()) {
+		/**
+		 * Implement link negotiation that was previously-decided
+		 */
+		if (_abFailoverQueue.front()->_negotiated) {
+			dequeueNextActiveBackupPath(now);
+			_abPath->address().toString(prevPathStr);
+			fprintf(stderr, "%llu AB: (optimize) switched to negotiated path %s on %s\n", ((now - RR->bc->getBondStartTime())), prevPathStr, getSlave(_abPath)->ifname().c_str());
+			_lastPathNegotiationCheck = now;
+		}
+		else {
+			// Try to find a better path and automatically switch to it -- not too often, though.
+			if ((now - _lastActiveBackupPathChange) > ZT_MULTIPATH_MIN_ACTIVE_BACKUP_AUTOFLOP_INTERVAL) {
+				if (!_abFailoverQueue.empty()) {
+					//fprintf(stderr, "AB: (optimize) there are (%d) slaves in queue to choose from...\n", _abFailoverQueue.size());
+					int newFScore = _abFailoverQueue.front()->_failoverScore;
+					int prevFScore = _abPath->_failoverScore;
+					// Establish a minimum switch threshold to prevent flapping
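+					// (Illustration: if ZT_MULTIPATH_ACTIVE_BACKUP_OPTIMIZE_MIN_THRESHOLD were 0.10 and the
+					// active path's allocation were 200, a candidate would need a failover score at least
+					// 20 points above the active path's before we would switch.)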
+					int failoverScoreDifference = _abFailoverQueue.front()->_failoverScore - _abPath->_failoverScore;
+					int thresholdQuantity = (int)(ZT_MULTIPATH_ACTIVE_BACKUP_OPTIMIZE_MIN_THRESHOLD * (float)_abPath->_allocation);
+					if ((failoverScoreDifference > 0) && (failoverScoreDifference > thresholdQuantity)) {
+						SharedPtr<Path> oldPath = _abPath;
+						_abPath->address().toString(prevPathStr);
+						dequeueNextActiveBackupPath(now);
+						_abPath->address().toString(curPathStr);
+						fprintf(stderr, "%llu AB: (optimize) switched from %s on %s (fs=%d) to %s on %s (fs=%d)\n", ((now - RR->bc->getBondStartTime())), prevPathStr, getSlave(oldPath)->ifname().c_str(), prevFScore, curPathStr, getSlave(_abPath)->ifname().c_str(), newFScore);
+					}
+				}
+			}
+		}
+	}
+}
+
+void Bond::setReasonableDefaults(int policy)
+{
+	// If an invalid bonding policy was given, fall back to the default
+	int defaultPolicy = BondController::defaultBondingPolicy();
+	if (policy <= ZT_BONDING_POLICY_NONE || policy > ZT_BONDING_POLICY_BALANCE_AWARE) {
+		if (defaultPolicy < ZT_BONDING_POLICY_NONE || defaultPolicy > ZT_BONDING_POLICY_BALANCE_AWARE) {
+			// If no (valid) default is set, use NONE (effectively disabling this bond)
+			_bondingPolicy = ZT_BONDING_POLICY_NONE;
+		} else {
+			_bondingPolicy = defaultPolicy;
+		}
+	} else {
+		_bondingPolicy = policy;
+	}
+
+	_downDelay = 0;
+	_upDelay = 0;
+	_allowFlowHashing = false;
+	_bondMonitorInterval = 0;
+	_allowPathNegotiation = false;
+	_shouldCollectPathStatistics = false;
+	_lastPathNegotiationReceived = 0;
+	_lastBackgroundTaskCheck = 0;
+	_lastPathNegotiationCheck = 0;
+	_lastFlowStatReset = 0;
+	_lastFlowExpirationCheck = 0;
+	_localUtility = 0;
+	_numBondedPaths = 0;
+	_rrPacketsSentOnCurrSlave = 0;
+	_rrIdx = 0;
+	_pathNegotiationCutoffCount = 0;
+	_lastFlowRebalance = 0;
+	_totalBondUnderload = 0;
+
+	//_maxAcceptableLatency
+	_maxAcceptablePacketDelayVariance = 50;
+	_maxAcceptablePacketLossRatio = 0.10;
+	_maxAcceptablePacketErrorRatio = 0.10;
+	_userHasSpecifiedSlaveSpeeds = false;
+
+	_lastFrame = 0;
+
+	// TODO: Remove
+	_header = false;
+	_lastLogTS = 0;
+	_lastPrintTS = 0;
+
+	/**
+	 * Paths are actively monitored to provide a real-time quality/preference-ordered rapid failover queue.
+	 */
+	switch (policy) {
+		case ZT_BONDING_POLICY_ACTIVE_BACKUP:
+			_failoverInterval = 5000;
+			_abSlaveSelectMethod = ZT_MULTIPATH_RESELECTION_POLICY_OPTIMIZE;
+			_slaveMonitorStrategy = ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DYNAMIC;
+			_qualityWeights[ZT_QOS_LAT_IDX] = 0.2f;
+			_qualityWeights[ZT_QOS_LTM_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_PDV_IDX] = 0.2f;
+			_qualityWeights[ZT_QOS_PLR_IDX] = 0.2f;
+			_qualityWeights[ZT_QOS_PER_IDX] = 0.2f;
+			_qualityWeights[ZT_QOS_THR_IDX] = 0.2f;
+			_qualityWeights[ZT_QOS_THM_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_THV_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_SCP_IDX] = 0.0f;
+			break;
+		/**
+		 * All seemingly-alive paths are used. Paths are not actively monitored.
+		 */
+		case ZT_BONDING_POLICY_BROADCAST:
+			_downDelay = 30000;
+			_upDelay = 0;
+			break;
+		/**
+		 * Paths are monitored to determine when/if one needs to be added or removed from the rotation
+		 */
+		case ZT_BONDING_POLICY_BALANCE_RR:
+			_failoverInterval = 5000;
+			_allowFlowHashing = false;
+			_packetsPerSlave = 8;
+			_slaveMonitorStrategy = ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DYNAMIC;
+			_qualityWeights[ZT_QOS_LAT_IDX] = 0.4f;
+			_qualityWeights[ZT_QOS_LTM_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_PDV_IDX] = 0.2f;
+			_qualityWeights[ZT_QOS_PLR_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_PER_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_THR_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_THM_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_THV_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_SCP_IDX] = 0.0f;
+			break;
+		/**
+		 * Path monitoring is used to determine the capacity of each
+		 * path and where to place the next flow.
+		 */
+		case ZT_BONDING_POLICY_BALANCE_XOR:
+			_failoverInterval = 5000;
+			_upDelay = _bondMonitorInterval * 2;
+			_allowFlowHashing = true;
+			_slaveMonitorStrategy = ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DYNAMIC;
+			_qualityWeights[ZT_QOS_LAT_IDX] = 0.4f;
+			_qualityWeights[ZT_QOS_LTM_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_PDV_IDX] = 0.2f;
+			_qualityWeights[ZT_QOS_PLR_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_PER_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_THR_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_THM_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_THV_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_SCP_IDX] = 0.0f;
+			break;
+		/**
+		 * Path monitoring is used to determine the capacity of each
+		 * path and where to place the next flow. Additionally, re-shuffling
+		 * of flows may take place.
+		 */
+		case ZT_BONDING_POLICY_BALANCE_AWARE:
+			_failoverInterval = 3000;
+			_allowFlowHashing = true;
+			_slaveMonitorStrategy = ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DYNAMIC;
+			_qualityWeights[ZT_QOS_LAT_IDX] = 0.3f;
+			_qualityWeights[ZT_QOS_LTM_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_PDV_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_PLR_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_PER_IDX] = 0.1f;
+			_qualityWeights[ZT_QOS_THR_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_THM_IDX] = 0.4f;
+			_qualityWeights[ZT_QOS_THV_IDX] = 0.0f;
+			_qualityWeights[ZT_QOS_SCP_IDX] = 0.0f;
+			break;
+		default:
+			break;
+	}
+
+	/**
+	 * Timer geometries and counters
+	 */
+	_bondMonitorInterval = _failoverInterval / 3;
+	_ackSendInterval = _failoverInterval;
+	_qualityEstimationInterval = _failoverInterval * 2;
+
+	_dynamicPathMonitorInterval = 0;
+
+	_ackCutoffCount = 0;
+	_lastAckRateCheck = 0;
+	_qosSendInterval = _bondMonitorInterval * 4;
+	_qosCutoffCount = 0;
+	_lastQoSRateCheck = 0;
+	throughputMeasurementInterval = _ackSendInterval * 2;
+	BondController::setMinReqPathMonitorInterval(_bondMonitorInterval);
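+	// Worked example (active-backup default _failoverInterval = 5000 ms): _bondMonitorInterval = 1666 ms,
+	// _qosSendInterval = 6664 ms, _ackSendInterval = 5000 ms, and _qualityEstimationInterval = 10000 ms.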
+
+	_defaultPathRefractoryPeriod = 8000;
+
+	fprintf(stderr, "TIMERS: strat=%d, fi= %d, bmi= %d, qos= %d, ack= %d, estimateInt= %d, refractory= %d, ud= %d, dd= %d\n",
+		_slaveMonitorStrategy,
+		_failoverInterval,
+		_bondMonitorInterval,
+		_qosSendInterval,
+		_ackSendInterval,
+		_qualityEstimationInterval,
+		_defaultPathRefractoryPeriod,
+		_upDelay,
+		_downDelay);
+
+	_lastQualityEstimation=0;
+}
+
+void Bond::setUserQualityWeights(float weights[], int len)
+{
+	if (len == ZT_QOS_WEIGHT_SIZE) {
+		float weightTotal = 0.0;
+		for (unsigned int i=0; i<ZT_QOS_WEIGHT_SIZE; ++i) {
+			weightTotal += weights[i];
+		}
+		if (weightTotal > 0.99 && weightTotal < 1.01) {
+			memcpy(_qualityWeights, weights, len * sizeof(float));
+		}
+	}
+}
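+
+// Illustrative usage sketch (hypothetical caller, not a prescribed API flow): one weight is
+// supplied per ZT_QOS_*_IDX slot and the vector must sum to approximately 1.0, otherwise it
+// is silently ignored and the per-policy defaults from setReasonableDefaults() remain in effect.
+//
+//   float w[ZT_QOS_WEIGHT_SIZE] = { 0.0f };
+//   w[ZT_QOS_LAT_IDX] = 0.5f;   // latency
+//   w[ZT_QOS_PDV_IDX] = 0.2f;   // packet delay variance
+//   w[ZT_QOS_PLR_IDX] = 0.2f;   // packet loss ratio
+//   w[ZT_QOS_PER_IDX] = 0.1f;   // packet error ratio
+//   bond->setUserQualityWeights(w, ZT_QOS_WEIGHT_SIZE);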
+
+
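+// TODO: Remove -- debug gate that restricts dumpInfo() logging to a hard-coded set of peers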
+bool Bond::relevant() {
+	return _peer->identity().address().toInt() == 0x16a03a3d03 
+		|| _peer->identity().address().toInt() == 0x4410300d03 
+		|| _peer->identity().address().toInt() == 0x795cbf86fa;
+}
+
+SharedPtr<Slave> Bond::getSlave(const SharedPtr<Path>& path)
+{
+	return RR->bc->getSlaveBySocket(_policyAlias, path->localSocket());
+}
+
+void Bond::dumpInfo(const int64_t now)
+{
+	char pathStr[128];
+	//char oldPathStr[128];
+	char currPathStr[128];
+
+	if (!relevant()) {	
+		return;
+	}
+	/*
+	fprintf(stderr, "---[ bp=%d, id=%llx, dd=%d, up=%d, pmi=%d, specifiedSlaves=%d, _specifiedPrimarySlave=%d, _specifiedFailInst=%d ]\n",
+			_policy, _peer->identity().address().toInt(), _downDelay, _upDelay, _monitorInterval, _userHasSpecifiedSlaves, _userHasSpecifiedPrimarySlave, _userHasSpecifiedFailoverInstructions);
+
+	if (_bondingPolicy== ZT_BONDING_POLICY_ACTIVE_BACKUP) {
+		fprintf(stderr, "Paths (bp=%d, stats=%d, primaryReselect=%d) :\n",
+			_policy, _shouldCollectPathStatistics, _abSlaveSelectMethod);
+	}
+	if (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_RR
+		|| _bondingPolicy== ZT_BONDING_POLICY_BALANCE_XOR
+		|| _bondingPolicy== ZT_BONDING_POLICY_BALANCE_AWARE) {
+		fprintf(stderr, "Paths (bp=%d, stats=%d, fh=%d) :\n",
+			_policy, _shouldCollectPathStatistics, _allowFlowHashing);
+	}*/
+
+	if ((now - _lastLogTS) < 1000) {
+		return;
+	}
+	_lastPrintTS = now;
+	_lastLogTS = now;
+	
+	fprintf(stderr, "\n\n");
+
+	for(int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; ++i) {
+		if (_paths[i]) {
+			SharedPtr<Slave> slave = RR->bc->getSlaveBySocket(_policyAlias, _paths[i]->localSocket());
+			_paths[i]->address().toString(pathStr);
+			fprintf(stderr, " %2d: lat=%8.3f, ac=%3d, fail%5s, fscore=%6d, in=%7d, out=%7d, age=%7ld, ack=%7ld, ref=%6d, ls=%llx",
+				i,
+				_paths[i]->_latencyMean,
+				_paths[i]->_allocation,
+				slave->failoverToSlave().c_str(),
+				_paths[i]->_failoverScore,
+				_paths[i]->_packetsIn,
+				_paths[i]->_packetsOut,
+				(long)_paths[i]->age(now),
+				(long)_paths[i]->ackAge(now),
+				_paths[i]->_refractoryPeriod,
+				_paths[i]->localSocket()
+			);
+			if (slave->spare()) {
+				fprintf(stderr, " SPR.");
+			} else {
+				fprintf(stderr, "     ");
+			}
+			if (slave->primary()) {
+				fprintf(stderr, " PRIM.");
+			} else {
+				fprintf(stderr, "      ");
+			}
+			if (_paths[i]->allowed()) {
+				fprintf(stderr, " ALL.");
+			} else {
+				fprintf(stderr, "     ");
+			}
+			if (_paths[i]->eligible(now,_ackSendInterval)) {
+				fprintf(stderr, " ELI.");
+			} else {
+				fprintf(stderr, "     ");
+			}
+			if (_paths[i]->preferred()) {
+				fprintf(stderr, " PREF.");
+			} else {
+				fprintf(stderr, "      ");
+			}
+			if (_paths[i]->_negotiated) {
+				fprintf(stderr, " NEG.");
+			} else {
+				fprintf(stderr, "     ");
+			}
+			if (_paths[i]->bonded()) {
+				fprintf(stderr, " BOND ");
+			} else {
+				fprintf(stderr, "      ");
+			}
+			if (_bondingPolicy== ZT_BONDING_POLICY_ACTIVE_BACKUP && _abPath && (_abPath == _paths[i].ptr())) {
+				fprintf(stderr, " ACTIVE  ");
+			} else if (_bondingPolicy== ZT_BONDING_POLICY_ACTIVE_BACKUP) {
+				fprintf(stderr, "         ");
+			}
+			if (_bondingPolicy== ZT_BONDING_POLICY_ACTIVE_BACKUP && _abFailoverQueue.size() && (_abFailoverQueue.front().ptr() == _paths[i].ptr())) {
+				fprintf(stderr, " NEXT    ");
+			} else  if (_bondingPolicy== ZT_BONDING_POLICY_ACTIVE_BACKUP) {
+				fprintf(stderr, "         ");
+			}
+			fprintf(stderr, "%5s %s\n", slave->ifname().c_str(), pathStr);
+		}
+	}
+
+	if (_bondingPolicy== ZT_BONDING_POLICY_ACTIVE_BACKUP) {
+		if (!_abFailoverQueue.empty()) {
+			fprintf(stderr, "\nFailover Queue:\n");
+			for (std::list<SharedPtr<Path> >::iterator it(_abFailoverQueue.begin()); it!=_abFailoverQueue.end();++it) {
+				(*it)->address().toString(currPathStr);
+				SharedPtr<Slave> slave = RR->bc->getSlaveBySocket(_policyAlias, (*it)->localSocket());
+				fprintf(stderr, "\t%8s\tspeed=%7d\trelSpeed=%3d\tipvPref=%3d\tfscore=%9d\t\t%s\n",
+					slave->ifname().c_str(),
+					slave->speed(),
+					slave->relativeSpeed(),
+					slave->ipvPref(),
+					(*it)->_failoverScore,
+					currPathStr);
+			}
+		}
+		else {
+			fprintf(stderr, "\nFailover Queue is empty\n");
+		}
+	}
+
+	if (_bondingPolicy== ZT_BONDING_POLICY_BALANCE_RR
+		|| _bondingPolicy== ZT_BONDING_POLICY_BALANCE_XOR
+		|| _bondingPolicy== ZT_BONDING_POLICY_BALANCE_AWARE) {
+		/*
+		if (_numBondedPaths) {
+			fprintf(stderr, "\nBonded Paths:\n");
+			for (int i=0; i<_numBondedPaths; ++i) {
+				_paths[_bondedIdx[i]].p->address().toString(currPathStr);
+				SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, _paths[_bondedIdx[i]].p->localSocket());
+				fprintf(stderr, " [%d]\t%8s\tflows=%3d\tspeed=%7d\trelSpeed=%3d\tipvPref=%3d\tfscore=%9d\t\t%s\n", i,
+				//fprintf(stderr, " [%d]\t%8s\tspeed=%7d\trelSpeed=%3d\tflowCount=%2d\tipvPref=%3d\tfscore=%9d\t\t%s\n", i,
+					slave->ifname().c_str(),
+					numberOfAssignedFlows(_paths[_bondedIdx[i]].p),
+					slave->speed(),
+					slave->relativeSpeed(),
+					//_paths[_bondedIdx[i]].p->assignedFlows.size(),
+					slave->ipvPref(),
+					_paths[_bondedIdx[i]].p->failoverScore(),
+					currPathStr);
+			}
+		}
+		*/
+		/*
+		if (_allowFlowHashing) {
+			//Mutex::Lock _l(_flows_m);
+			if (_flows.size()) {
+				fprintf(stderr, "\nFlows:\n");
+				std::map<int32_t,SharedPtr<Flow> >::iterator it = _flows.begin();
+				while (it != _flows.end()) {
+					it->second->assignedPath()->address().toString(currPathStr);
+					SharedPtr<Slave> slave =RR->bc->getSlaveBySocket(_policyAlias, it->second->assignedPath()->localSocket());
+					fprintf(stderr, " [%4x] in=%16llu, out=%16llu, bytes=%16llu, last=%16llu, if=%8s\t\t%s\n",
+						it->second->id(),
+						it->second->bytesInPerUnitTime(),
+						it->second->bytesOutPerUnitTime(),
+						it->second->totalBytes(),
+						it->second->age(now),
+						slave->ifname().c_str(),
+						currPathStr);
+					++it;
+				}
+			}
+		}
+		*/
+	}
+	//fprintf(stderr, "\n\n\n\n\n");
+}
+
+} // namespace ZeroTier

+ 689 - 0
node/Bond.hpp

@@ -0,0 +1,689 @@
+/*
+ * Copyright (c)2013-2020 ZeroTier, Inc.
+ *
+ * Use of this software is governed by the Business Source License included
+ * in the LICENSE.TXT file in the project's root directory.
+ *
+ * Change Date: 2024-01-01
+ *
+ * On the date above, in accordance with the Business Source License, use
+ * of this software will be governed by version 2.0 of the Apache License.
+ */
+/****/
+
+#ifndef ZT_BOND_HPP
+#define ZT_BOND_HPP
+
+#include <map>
+
+#include "Path.hpp"
+#include "Peer.hpp"
+#include "../osdep/Slave.hpp"
+#include "Flow.hpp"
+
+namespace ZeroTier {
+
+class RuntimeEnvironment;
+class Slave;
+
+class Bond
+{
+	friend class SharedPtr<Bond>;
+	friend class Peer;
+	friend class BondController;
+
+	struct PathQualityComparator
+	{
+		bool operator ()(const SharedPtr<Path> & a, const SharedPtr<Path> & b)
+		{
+			if(a->_failoverScore == b->_failoverScore) {
+				return a < b;
+			}
+			return a->_failoverScore > b->_failoverScore;
+		}
+	};
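+	// Used as _abFailoverQueue.sort(PathQualityComparator()) so that the path with the highest
+	// failover score sits at the front of the queue; ties fall back to pointer order.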
+
+public:
+
+    // TODO: Remove
+    bool _header;
+    int64_t _lastLogTS;
+    int64_t _lastPrintTS;
+    void dumpInfo(const int64_t now);
+    bool relevant();
+
+    SharedPtr<Slave> getSlave(const SharedPtr<Path>& path);
+
+    /**
+     * Constructor. For use only in first initialization in Node
+     *
+     * @param renv Runtime environment
+     */
+    Bond(const RuntimeEnvironment *renv);
+
+    /**
+     * Constructor. Creates a bond based off of ZT defaults
+     *
+     * @param renv Runtime environment
+     * @param policy Bonding policy
+     * @param peer Remote peer that this bond services
+     */
+    Bond(const RuntimeEnvironment *renv, int policy, const SharedPtr<Peer>& peer);
+
+    /**
+     * Constructor. For use when user intends to manually specify parameters
+     *
+     * @param basePolicy Base bonding policy to use as a starting point
+     * @param policyAlias User-defined custom name for this variant of the base policy
+     * @param peer Remote peer that this bond services
+     */
+    Bond(std::string& basePolicy, std::string& policyAlias, const SharedPtr<Peer>& peer);
+
+    /**
+     * Constructor. Creates a bond based off of a user-defined bond template
+     *
+     * @param renv Runtime environment
+     * @param original Bond object to use as a template
+     * @param peer Remote peer that this bond services
+     */
+    Bond(const RuntimeEnvironment *renv, const Bond &original, const SharedPtr<Peer>& peer);
+
+	/**
+	 * @return The custom name (alias) given to this bond's policy
+	 */
+	std::string policyAlias() { return _policyAlias; }
+
+	/**
+	 * Inform the bond about the path that its peer just learned about
+	 *
+	 * @param path Newly-learned Path which should now be handled by the Bond
+	 * @param now Current time
+	 */
+	void nominatePath(const SharedPtr<Path>& path, int64_t now);
+	
+	/**
+	 * Propagate and memoize often-used bonding preferences for each path
+	 */
+	void applyUserPrefs();
+
+	/**
+	 * Check path states and perform bond rebuilds if needed.
+	 * 
+	 * @param now Current time
+	 * @param rebuild Whether or not the bond should be reconstructed. 
+	 */
+	void curateBond(const int64_t now, bool rebuild);
+
+	/**
+	 * Periodically perform statistical summaries of quality metrics for all paths.
+	 *
+	 * @param now Current time
+	 */
+	void estimatePathQuality(int64_t now);
+
+	/**
+	 * Record an invalid incoming packet. This packet failed
+	 * MAC/compression/cipher checks and will now contribute to a
+	 * Packet Error Ratio (PER).
+	 *
+	 * @param path Path over which packet was received
+	 */
+	void recordIncomingInvalidPacket(const SharedPtr<Path>& path);
+
+	/**
+	 * Record statistics on an outbound packet.
+	 *
+	 * @param path Path over which packet is being sent
+	 * @param packetId Packet ID
+	 * @param payloadLength Packet data length
+	 * @param verb Packet verb
+	 * @param flowId Flow ID
+	 * @param now Current time
+	 */
+	void recordOutgoingPacket(const SharedPtr<Path> &path, uint64_t packetId,
+		uint16_t payloadLength, Packet::Verb verb, int32_t flowId, int64_t now);
+
+	/**
+	 * Process the contents of an inbound VERB_QOS_MEASUREMENT to gather path quality observations.
+	 *
+	 * @param path Path over which packet was received
+	 * @param now Current time
+	 * @param count Number of records
+	 * @param rx_id Table of packet IDs
+	 * @param rx_ts Table of holding times
+	 */
+	void receivedQoS(const SharedPtr<Path>& path, int64_t now, int count, uint64_t *rx_id, uint16_t *rx_ts);
+
+	/**
+	 * Process the contents of an inbound VERB_ACK to gather path quality observations.
+	 * 
+	 * @param path Path over which packet was received
+	 * @param now Current time
+	 * @param ackedBytes Number of bytes ACKed by this VERB_ACK
+	 */
+	void receivedAck(const SharedPtr<Path>& path, int64_t now, int32_t ackedBytes);
+
+	/**
+	 * Generate the contents of a VERB_QOS_MEASUREMENT packet.
+	 *
+	 * @param path Path for which the QoS packet is being generated
+	 * @param now Current time
+	 * @param qosBuffer Destination buffer
+	 * @return Size of payload
+	 */
+	int32_t generateQoSPacket(const SharedPtr<Path>& path, int64_t now, char *qosBuffer);
+
+	/**
+	 * Record statistics for an inbound packet.
+	 * 
+	 * @param path Path over which packet was received
+	 * @param packetId Packet ID
+	 * @param payloadLength Packet data length
+	 * @param verb Packet verb
+	 * @param flowId Flow ID
+	 * @param now Current time
+	 */
+	void recordIncomingPacket(const SharedPtr<Path>& path, uint64_t packetId, uint16_t payloadLength,
+	        Packet::Verb verb, int32_t flowId, int64_t now);
+
+	/**
+	 * Determines the most appropriate path for packet and flow egress. This decision is made by
+	 * the underlying bonding policy as well as QoS-related statistical observations of path quality.
+	 *
+	 * @param now Current time
+	 * @param flowId Flow ID
+	 * @return Pointer to suggested Path
+	 */
+	SharedPtr<Path> getAppropriatePath(int64_t now, int32_t flowId);
+
+	/**
+	 * Creates a new flow record
+	 * 
+	 * @param path Path over which flow shall be handled
+	 * @param flowId Flow ID
+	 * @param entropy A byte of entropy to be used by the bonding algorithm
+	 * @param now Current time
+	 * @return Pointer to newly-created Flow
+	 */
+	SharedPtr<Flow> createFlow(const SharedPtr<Path> &path, int32_t flowId, unsigned char entropy, int64_t now);
+
+	/**
+	 * Removes flow records that are past a certain age limit.
+	 * 
+	 * @param age Age threshold to be forgotten
+	 * @param oldest Whether only the oldest shall be forgotten
+	 * @param now Current time
+	 */
+	void forgetFlowsWhenNecessary(uint64_t age, bool oldest, int64_t now);
+
+	/**
+	 * Assigns a new flow to a bonded path
+	 * 
+	 * @param flow Flow to be assigned
+	 * @param now Current time
+	 */
+	bool assignFlowToBondedPath(SharedPtr<Flow> &flow, int64_t now);
+
+    /**
+	 * Determine whether a path change should occur given the remote peer's reported utility and our
+	 * local peer's known utility. This has the effect of assigning inbound and outbound traffic to
+	 * the same path.  
+	 * 
+	 * @param now Current time
+	 * @param path Path over which the negotiation request was received
+	 * @param remoteUtility How much utility the remote peer claims to gain by using the declared path
+	 */
+	void processIncomingPathNegotiationRequest(uint64_t now, SharedPtr<Path> &path, int16_t remoteUtility);
+
+	/**
+	 * Determine state of path synchronization and whether a negotiation request
+	 * shall be sent to the peer.
+	 *
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 * @param now Current time
+	 */
+	void pathNegotiationCheck(void *tPtr, const int64_t now);
+
+	/**
+	 * Sends a VERB_ACK to the remote peer.
+	 * 
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 * @param path Path over which packet should be sent
+	 * @param localSocket Local source socket
+	 * @param atAddress Destination address to send to
+	 * @param now Current time
+	 */
+	void sendACK(void *tPtr,const SharedPtr<Path> &path,int64_t localSocket,
+	        const InetAddress &atAddress,int64_t now);
+
+	/**
+	 * Sends a VERB_QOS_MEASUREMENT to the remote peer.
+	 * 
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 * @param path Path over which packet should be sent
+	 * @param localSocket Local source socket
+	 * @param atAddress Destination address to send to
+	 * @param now Current time
+	 */
+	void sendQOS_MEASUREMENT(void *tPtr,const SharedPtr<Path> &path,int64_t localSocket,
+	        const InetAddress &atAddress,int64_t now);
+
+	/**
+	 * Sends a VERB_PATH_NEGOTIATION_REQUEST to the remote peer.
+	 * 
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 * @param path Path over which packet should be sent
+	 */
+	void sendPATH_NEGOTIATION_REQUEST(void *tPtr, const SharedPtr<Path> &path);
+
+	/**
+	 * Perform periodic tasks unique to the balancing policies
+	 *
+	 * @param now Current time
+	 */
+	void processBalanceTasks(int64_t now);
+	
+	/**
+	 * Perform periodic tasks unique to active-backup
+	 * 
+	 * @param now Current time
+	 */
+	void processActiveBackupTasks(int64_t now);
+
+	/**
+	 * Switches the active slave in an active-backup scenario to the next best during
+	 * a failover event.
+	 *
+	 * @param now Current time
+	 */
+	void dequeueNextActiveBackupPath(uint64_t now);
+
+    /**
+     * Set bond parameters to reasonable defaults; these may later be overwritten by
+     * user-specified parameters.
+     *
+     * @param policy Bonding policy
+     */
+	void setReasonableDefaults(int policy);
+
+	/**
+	 * Check and assign user-specified quality weights to this bond.
+	 *
+	 * @param weights Set of user-specified weights
+	 * @param len Length of weight vector
+	 */
+	void setUserQualityWeights(float weights[], int len);
+
+	/**
+	 * @param latencyInMilliseconds Maximum acceptable latency.
+	 */
+	void setMaxAcceptableLatency(int16_t latencyInMilliseconds) {
+		_maxAcceptableLatency = latencyInMilliseconds;
+	}
+
+	/**
+	 * @param latencyInMilliseconds Maximum acceptable (mean) latency.
+	 */
+	void setMaxAcceptableMeanLatency(int16_t latencyInMilliseconds) {
+		_maxAcceptableMeanLatency = latencyInMilliseconds;
+	}
+
+	/**
+	 * @param latencyVarianceInMilliseconds Maximum acceptable packet delay variance (jitter).
+	 */
+	void setMaxAcceptablePacketDelayVariance(int16_t latencyVarianceInMilliseconds) {
+		_maxAcceptablePacketDelayVariance = latencyVarianceInMilliseconds;
+	}
+
+	/**
+	 * @param lossRatio Maximum acceptable packet loss ratio (PLR).
+	 */
+	void setMaxAcceptablePacketLossRatio(float lossRatio) {
+		_maxAcceptablePacketLossRatio = lossRatio;
+	}
+
+	/**
+	 * @param errorRatio Maximum acceptable packet error ratio (PER).
+	 */
+	void setMaxAcceptablePacketErrorRatio(float errorRatio) {
+		_maxAcceptablePacketErrorRatio = errorRatio;
+	}
+
+	/**
+	 * @param minAlloc Minimum acceptable path allocation, as a fraction (0.0 to 1.0).
+	 */
+	void setMinAcceptableAllocation(float minAlloc) {
+		_minAcceptableAllocation = minAlloc * 255;
+	}
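+	// (Example: setMinAcceptableAllocation(0.10f) stores 25 (0.10 * 255, truncated), matching
+	// the 8-bit per-path allocation values used elsewhere in the bond.)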
+
+	/**
+	 * @return Whether the user has defined slaves for use on this bond
+	 */
+	inline bool userHasSpecifiedSlaves() { return _userHasSpecifiedSlaves; }
+
+	/**
+	 * @return Whether the user has defined a set of failover slave(s) for this bond
+	 */
+	inline bool userHasSpecifiedFailoverInstructions() { return _userHasSpecifiedFailoverInstructions; }
+
+	/**
+	 * @return Whether the user has specified a primary slave
+	 */
+	inline bool userHasSpecifiedPrimarySlave() { return _userHasSpecifiedPrimarySlave; }
+
+	/**
+	 * @return Whether the user has specified slave speeds
+	 */
+	inline bool userHasSpecifiedSlaveSpeeds() { return _userHasSpecifiedSlaveSpeeds; }
+
+	/**
+	 * Periodically perform maintenance tasks for each active bond.
+	 *
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 * @param now Current time
+	 */
+	void processBackgroundTasks(void *tPtr, int64_t now);
+
+	/**
+	 * Rate limit gate for VERB_ACK
+	 *
+	 * @param now Current time
+	 * @return Whether the incoming packet should be rate-gated
+	 */
+	inline bool rateGateACK(const int64_t now)
+	{
+		_ackCutoffCount++;
+		int numToDrain = _lastAckRateCheck ? (now - _lastAckRateCheck) / ZT_ACK_DRAINAGE_DIVISOR  : _ackCutoffCount;
+		_lastAckRateCheck = now;
+		if (_ackCutoffCount > numToDrain) {
+			_ackCutoffCount-=numToDrain;
+		} else {
+			_ackCutoffCount = 0;
+		}
+		return (_ackCutoffCount < ZT_ACK_CUTOFF_LIMIT);
+	}
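+	// (The counter drains in proportion to the time since the last check, so short bursts of
+	// ACKs are tolerated while a sustained flood eventually exceeds ZT_ACK_CUTOFF_LIMIT.)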
+
+	/**
+	 * Rate limit gate for VERB_QOS_MEASUREMENT
+	 *
+	 * @param now Current time
+	 * @return Whether the incoming packet should be rate-gated
+	 */
+	inline bool rateGateQoS(const int64_t now)
+	{
+		_qosCutoffCount++;
+		int numToDrain = (now - _lastQoSRateCheck) / ZT_QOS_DRAINAGE_DIVISOR;
+		_lastQoSRateCheck = now;
+		if (_qosCutoffCount > numToDrain) {
+			_qosCutoffCount-=numToDrain;
+		} else {
+			_qosCutoffCount = 0;
+		}
+		return (_qosCutoffCount < ZT_QOS_CUTOFF_LIMIT);
+	}
+
+	/**
+	 * Rate limit gate for VERB_PATH_NEGOTIATION_REQUEST
+	 *
+	 * @param now Current time
+	 * @return Whether the incoming packet should be rate-gated
+	 */
+	inline bool rateGatePathNegotiation(const int64_t now)
+	{
+		if ((now - _lastPathNegotiationReceived) <= ZT_PATH_NEGOTIATION_CUTOFF_TIME)
+			++_pathNegotiationCutoffCount;
+		else _pathNegotiationCutoffCount = 0;
+		_lastPathNegotiationReceived = now;
+		return (_pathNegotiationCutoffCount < ZT_PATH_NEGOTIATION_CUTOFF_LIMIT);
+	}
+
+	/**
+	 * @param interval Maximum amount of time user expects a failover to take on this bond.
+	 */
+	inline void setFailoverInterval(uint32_t interval) { _failoverInterval = interval; }
+
+	/**
+	 * @param strategy The strategy that the bond uses to probe for path aliveness and quality
+	 */
+	inline void setSlaveMonitorStrategy(uint8_t strategy) { _slaveMonitorStrategy = strategy; }
+
+	/**
+	 * @return the current up delay parameter
+	 */
+	inline uint16_t getUpDelay() { return _upDelay; }
+
+    /**
+     * @param upDelay Length of time before a newly-discovered path is admitted to the bond
+     */
+	inline void setUpDelay(int upDelay) { if (upDelay >= 0) { _upDelay = upDelay; } }
+
+    /**
+     * @return Length of time before a newly-failed path is removed from the bond
+     */
+	inline uint16_t getDownDelay() { return _downDelay; }
+
+    /**
+     * @param downDelay Length of time before a newly-failed path is removed from the bond
+     */
+	inline void setDownDelay(int downDelay) { if (downDelay >= 0) { _downDelay = downDelay; } }
+
+	/**
+	 * @return the current monitoring interval for the bond (can be overridden with intervals specific to certain slaves.)
+	 */
+	inline uint16_t getBondMonitorInterval() { return _bondMonitorInterval; }
+
+    /**
+     * Set the current monitoring interval for the bond (can be overridden with intervals specific to certain slaves.)
+     *
+     * @param monitorInterval How often gratuitous VERB_HELLO(s) are sent to remote peer.
+     */
+	inline void setBondMonitorInterval(uint16_t interval) { _bondMonitorInterval = interval; }
+
+	/**
+	 * @param policy Bonding policy for this bond
+	 */
+	inline void setPolicy(uint8_t policy) { _bondingPolicy = policy; }
+
+	/**
+	 * @return the current bonding policy
+	 */
+	inline uint8_t getPolicy() { return _bondingPolicy; }
+
+    /**
+     *
+     * @param allowFlowHashing Whether flow hashing should be used to assign traffic to paths
+     */
+	inline void setFlowHashing(bool allowFlowHashing) { _allowFlowHashing = allowFlowHashing; }
+
+	/**
+	 * @return Whether flow-hashing is currently enabled for this bond.
+	 */
+	bool flowHashingEnabled() { return _allowFlowHashing; }
+
+    /**
+     *
+     * @param packetsPerSlave Number of packets to send on a path before moving to the next in the round-robin sequence (0 selects a random path for each packet)
+     */
+	inline void setPacketsPerSlave(int packetsPerSlave) { _packetsPerSlave = packetsPerSlave; }
+
+	/**
+	 *
+	 * @param method Slave re-selection policy for the primary slave in active-backup
+	 */
+	inline void setSlaveSelectMethod(uint8_t method) { _abSlaveSelectMethod = method; }
+
+	/**
+	 * @return The slave re-selection policy in use for active-backup
+	 */
+    inline uint8_t getSlaveSelectMethod() { return _abSlaveSelectMethod; }
+
+	/**
+	 *
+	 * @param allowPathNegotiation Whether this bond is allowed to negotiate a mutually-preferred path with its peer
+	 */
+	inline void setAllowPathNegotiation(bool allowPathNegotiation) { _allowPathNegotiation = allowPathNegotiation; }
+
+	/**
+	 * @return Whether path negotiation is allowed
+	 */
+	inline bool allowPathNegotiation() { return _allowPathNegotiation; }
+
+private:
+
+	const RuntimeEnvironment *RR;
+	AtomicCounter __refCount;
+
+	/**
+	 * Custom name given by the user to this bond type.
+	 */
+	std::string _policyAlias;
+
+	/**
+	 * Paths that this bond has been made aware of but that are not necessarily
+	 * part of the bond proper.
+	 */
+	SharedPtr<Path> _paths[ZT_MAX_PEER_NETWORK_PATHS];
+
+	/**
+	 * Set of indices corresponding to paths currently included in the bond proper. This
+	 * may only be updated during a call to curateBond(). The reason for this is so that
+	 * we can simplify the high frequency packet egress logic.
+	 */
+	int _bondedIdx[ZT_MAX_PEER_NETWORK_PATHS];
+
+	/**
+	 * Number of paths currently included in the _bondedIdx set.
+	 */
+	int _numBondedPaths;
+
+	/**
+	 * Flows hashed according to port and protocol
+	 */
+	std::map<int32_t,SharedPtr<Flow> > _flows;
+
+	float _qualityWeights[ZT_QOS_WEIGHT_SIZE]; // How much each factor contributes to the "quality" score of a path.
+
+	uint8_t _bondingPolicy;
+	uint32_t _upDelay;
+	uint32_t _downDelay;
+
+	// active-backup
+	SharedPtr<Path> _abPath; // current active path
+	std::list<SharedPtr<Path> > _abFailoverQueue;
+	uint8_t _abSlaveSelectMethod; // slave re-selection policy for the primary slave in active-backup
+	uint64_t _lastActiveBackupPathChange;
+
+	// balance-rr
+	uint8_t _rrIdx; // index to path currently in use during Round Robin operation
+	uint16_t _rrPacketsSentOnCurrSlave; // number of packets sent on this slave since the most recent path switch.
+	/**
+	 * How many packets will be sent on a path before moving to the next path
+	 * in the round-robin sequence. A value of zero will cause a random path
+	 * selection for each outgoing packet.
+	 */
+	int _packetsPerSlave;
+
+	// balance-aware
+	uint64_t _totalBondUnderload;
+
+	// dynamic slave monitoring
+	uint8_t _slaveMonitorStrategy;
+	uint64_t _lastFrame;
+	uint32_t _dynamicPathMonitorInterval;
+
+	// path negotiation
+	int16_t _localUtility;
+	SharedPtr<Path> negotiatedPath;
+	uint8_t _numSentPathNegotiationRequests;
+	unsigned int _pathNegotiationCutoffCount;
+	bool _allowPathNegotiation;
+	uint64_t _lastPathNegotiationReceived;
+	uint64_t _lastSentPathNegotiationRequest;
+
+	// timers
+	uint32_t _failoverInterval;
+	uint32_t _qosSendInterval;
+	uint32_t _ackSendInterval;
+	uint16_t _ackCutoffCount;
+	uint64_t _lastAckRateCheck;
+	uint16_t _qosCutoffCount;
+	uint64_t _lastQoSRateCheck;
+	uint32_t throughputMeasurementInterval;
+	uint32_t _qualityEstimationInterval;
+
+	// timestamps
+	uint64_t _lastCheckUserPreferences;
+	uint64_t _lastQualityEstimation;
+	uint64_t _lastFlowStatReset;
+	uint64_t _lastFlowExpirationCheck;
+	uint64_t _lastFlowRebalance;
+	uint64_t _lastPathNegotiationCheck;
+	uint64_t _lastBackgroundTaskCheck;
+
+	float _maxAcceptablePacketLossRatio;
+	float _maxAcceptablePacketErrorRatio;
+	uint16_t _maxAcceptableLatency;
+	uint16_t _maxAcceptableMeanLatency;
+	uint16_t _maxAcceptablePacketDelayVariance;
+	uint8_t _minAcceptableAllocation;
+
+	/**
+	 * Default initial punishment inflicted on misbehaving paths. Punishment slowly
+	 * drains linearly. For each eligibility change the remaining punishment is doubled.
+	 */
+	uint32_t _defaultPathRefractoryPeriod;
+
+	/**
+	 * Whether the current bonding policy requires computation of path statistics
+	 */
+	bool _shouldCollectPathStatistics;
+
+	/**
+	 * Free byte of entropy that is updated on every packet egress event.
+	 */
+	unsigned char _freeRandomByte;
+
+	/**
+	 * Remote peer that this bond services
+	 */
+	SharedPtr<Peer> _peer;
+
+	Mutex _paths_m;
+	Mutex _flows_m;
+
+	/**
+	 * Whether the user has specified slaves for this bond.
+	 */
+	bool _userHasSpecifiedSlaves;
+
+	/**
+	 * Whether the user has specified a primary slave for this bond.
+	 */
+	bool _userHasSpecifiedPrimarySlave;
+
+	/**
+	 * Whether the user has specified failover instructions for this bond.
+	 */
+	bool _userHasSpecifiedFailoverInstructions;
+
+	/**
+	 * Whether the user has specified slave speeds for this bond.
+	 */
+	bool _userHasSpecifiedSlaveSpeeds;
+
+	/**
+	 * How frequently (in ms) a VERB_ECHO is sent to a peer to verify that a
+	 * path is still active. A value of zero (0) will disable active path
+	 * monitoring; as a result, all monitoring will be a function of traffic.
+	 */
+	uint16_t _bondMonitorInterval;
+
+	/**
+	 * Whether or not flow hashing is allowed.
+	 */
+	bool _allowFlowHashing;
+};
+
+} // namespace ZeroTier
+
+#endif

+ 203 - 0
node/BondController.cpp

@@ -0,0 +1,203 @@
+/*
+ * Copyright (c)2013-2020 ZeroTier, Inc.
+ *
+ * Use of this software is governed by the Business Source License included
+ * in the LICENSE.TXT file in the project's root directory.
+ *
+ * Change Date: 2024-01-01
+ *
+ * On the date above, in accordance with the Business Source License, use
+ * of this software will be governed by version 2.0 of the Apache License.
+ */
+/****/
+
+#include "BondController.hpp"
+#include "Peer.hpp"
+
+namespace ZeroTier {
+
+int BondController::_minReqPathMonitorInterval;
+uint8_t BondController::_defaultBondingPolicy;
+
+BondController::BondController(const RuntimeEnvironment *renv) :
+	RR(renv)
+{
+	bondStartTime = RR->node->now();
+}
+
+bool BondController::slaveAllowed(std::string &policyAlias, SharedPtr<Slave> slave)
+{
+	if (!_slaveDefinitions.count(policyAlias)) {
+		return true; // No slaves specified for this policy, no restrictions
+	}
+	std::vector<SharedPtr<Slave> > &definitions = _slaveDefinitions[policyAlias];
+	if (definitions.empty()) {
+		return true;
+	}
+	auto it = definitions.begin();
+	while (it != definitions.end()) {
+		if (slave->ifname() == (*it)->ifname()) {
+			return true;
+		}
+		++it;
+	}
+	return false;
+}
+
+void BondController::addCustomSlave(std::string& policyAlias, SharedPtr<Slave> slave)
+{
+	Mutex::Lock _l(_slaves_m);
+	_slaveDefinitions[policyAlias].push_back(slave);
+	auto search = _interfaceToSlaveMap[policyAlias].find(slave->ifname());
+	if (search == _interfaceToSlaveMap[policyAlias].end()) {
+		slave->setAsUserSpecified(true);
+		_interfaceToSlaveMap[policyAlias].insert(std::pair<std::string, SharedPtr<Slave>>(slave->ifname(), slave));
+	} else {
+		fprintf(stderr, "slave already exists=%s\n", slave->ifname().c_str());
+		// Slave is already defined, overlay user settings
+	}
+}
+
+bool BondController::addCustomPolicy(const SharedPtr<Bond>& newBond)
+{
+	Mutex::Lock _l(_bonds_m);
+	if (!_bondPolicyTemplates.count(newBond->policyAlias())) {
+		_bondPolicyTemplates[newBond->policyAlias()] = newBond;
+		return true;
+	}
+	return false;
+}
+
+bool BondController::assignBondingPolicyToPeer(int64_t identity, const std::string& policyAlias)
+{
+	Mutex::Lock _l(_bonds_m);
+	if (!_policyTemplateAssignments.count(identity)) {
+		_policyTemplateAssignments[identity] = policyAlias;
+		return true;
+	}
+	return false;
+}
+
+SharedPtr<Bond> BondController::createTransportTriggeredBond(const RuntimeEnvironment *renv, const SharedPtr<Peer>& peer)
+{
+	fprintf(stderr, "createTransportTriggeredBond\n");
+	Mutex::Lock _l(_bonds_m);
+	int64_t identity = peer->identity().address().toInt();
+	Bond *bond = nullptr;
+	if (!_bonds.count(identity)) {
+		std::string policyAlias;
+		int defaultPolicy = defaultBondingPolicy();
+		fprintf(stderr, "new bond, registering for %llx\n", (unsigned long long)identity);
+		if (!_policyTemplateAssignments.count(identity)) {
+			if (defaultPolicy) {
+				fprintf(stderr, "  no assignment, using default (%d)\n", defaultPolicy);
+				bond = new Bond(renv, defaultPolicy, peer);
+			}
+			if (!defaultPolicy && _defaultBondingPolicyStr.length()) {
+				fprintf(stderr, "  no assignment, using default custom (%s)\n", _defaultBondingPolicyStr.c_str());
+				bond = new Bond(renv, *(_bondPolicyTemplates[_defaultBondingPolicyStr].ptr()), peer);
+			}
+		}
+		else {
+			fprintf(stderr, "  assignment found for %llx, using it as a template (%s)\n", (unsigned long long)identity, _policyTemplateAssignments[identity].c_str());
+			if (!_bondPolicyTemplates[_policyTemplateAssignments[identity]]) {
+				fprintf(stderr, "unable to locate template (%s), ignoring assignment for (%llx), using defaults\n", _policyTemplateAssignments[identity].c_str(), (unsigned long long)identity);
+				bond = new Bond(renv, defaultPolicy, peer);
+			}
+			else {
+				bond = new Bond(renv, *(_bondPolicyTemplates[_policyTemplateAssignments[identity]].ptr()), peer);
+			}
+		}
+	}
+	else {
+		fprintf(stderr, "bond already exists for %llx, cannot re-register. exiting\n", identity); exit(0); // TODO: Remove
+	}
+	if (bond) {
+		_bonds[identity] = bond;
+		/**
+		 * Determine if user has specified anything that could affect the bonding policy's decisions
+		 */
+		if (_interfaceToSlaveMap.count(bond->policyAlias())) {
+			std::map<std::string, SharedPtr<Slave> >::iterator it = _interfaceToSlaveMap[bond->policyAlias()].begin();
+			while (it != _interfaceToSlaveMap[bond->policyAlias()].end()) {
+				if (it->second->isUserSpecified()) {
+					bond->_userHasSpecifiedSlaves = true;
+				}
+				if (it->second->isUserSpecified() && it->second->primary()) {
+					bond->_userHasSpecifiedPrimarySlave = true;
+				}
+				if (it->second->isUserSpecified() && it->second->userHasSpecifiedFailoverInstructions()) {
+					bond->_userHasSpecifiedFailoverInstructions = true;
+				}
+				if (it->second->isUserSpecified() && (it->second->speed() > 0)) {
+					bond->_userHasSpecifiedSlaveSpeeds = true;
+				}
+				++it;
+			}
+		}
+		return bond;
+	}
+	return SharedPtr<Bond>();
+}
+
+SharedPtr<Slave> BondController::getSlaveBySocket(const std::string& policyAlias, uint64_t localSocket)
+{
+	Mutex::Lock _l(_slaves_m);
+	char ifname[16];
+	_phy->getIfName((PhySocket *) ((uintptr_t)localSocket), ifname, 16);
+	std::string ifnameStr(ifname);
+	auto search = _interfaceToSlaveMap[policyAlias].find(ifnameStr);
+	if (search == _interfaceToSlaveMap[policyAlias].end()) {
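+		// Slave not defined by the user: synthesize a default definition and treat this
+		// interface as a spare so that the bond can still account for it.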
+		SharedPtr<Slave> s = new Slave(ifnameStr, 0, 0, 0, 0, 0, true, ZT_MULTIPATH_SLAVE_MODE_SPARE, "", 0.0);
+		_interfaceToSlaveMap[policyAlias].insert(std::pair<std::string,SharedPtr<Slave> >(ifnameStr, s));
+		return s;
+	}
+	else {
+		return search->second;
+	}
+}
+
+SharedPtr<Slave> BondController::getSlaveByName(const std::string& policyAlias, const std::string& ifname)
+{
+	Mutex::Lock _l(_slaves_m);
+	auto search = _interfaceToSlaveMap[policyAlias].find(ifname);
+	if (search != _interfaceToSlaveMap[policyAlias].end()) {
+		return search->second;
+	}
+	return SharedPtr<Slave>();
+}
+
+bool BondController::allowedToBind(const std::string& ifname)
+{
+	return true;
+	/*
+	if (!_defaultBondingPolicy) {
+		return true; // no restrictions
+	}
+	Mutex::Lock _l(_slaves_m);
+	if (_interfaceToSlaveMap.empty()) {
+		return true; // no restrictions
+	}
+	std::map<std::string, std::map<std::string, SharedPtr<Slave> > >::iterator policyItr = _interfaceToSlaveMap.begin();
+	while (policyItr != _interfaceToSlaveMap.end()) {
+		std::map<std::string, SharedPtr<Slave> >::iterator slaveItr = policyItr->second.begin();
+		while (slaveItr != policyItr->second.end()) {
+			if (slaveItr->first == ifname) {
+				return true;
+			}
+			++slaveItr;
+		}
+		++policyItr;
+	}
+	return false;
+	*/
+}
+
+void BondController::processBackgroundTasks(void *tPtr, const int64_t now)
+{
+	Mutex::Lock _l(_bonds_m);
+	std::map<int64_t,SharedPtr<Bond> >::iterator bondItr = _bonds.begin();
+	while (bondItr != _bonds.end()) {
+		bondItr->second->processBackgroundTasks(tPtr, now);
+		++bondItr;
+	}
+}
+
+} // namespace ZeroTier

+ 231 - 0
node/BondController.hpp

@@ -0,0 +1,231 @@
+/*
+ * Copyright (c)2013-2020 ZeroTier, Inc.
+ *
+ * Use of this software is governed by the Business Source License included
+ * in the LICENSE.TXT file in the project's root directory.
+ *
+ * Change Date: 2024-01-01
+ *
+ * On the date above, in accordance with the Business Source License, use
+ * of this software will be governed by version 2.0 of the Apache License.
+ */
+/****/
+
+#ifndef ZT_BONDCONTROLLER_HPP
+#define ZT_BONDCONTROLLER_HPP
+
+#include <map>
+#include <vector>
+
+#include "SharedPtr.hpp"
+#include "../osdep/Phy.hpp"
+#include "../osdep/Slave.hpp"
+
+namespace ZeroTier {
+
+class RuntimeEnvironment;
+class Bond;
+class Peer;
+
+class BondController
+{
+	friend class Bond;
+
+public:
+
+	BondController(const RuntimeEnvironment *renv);
+
+	/**
+	 * @return Whether the given slave is allowed for use under the given bonding policy
+	 */
+	bool slaveAllowed(std::string &policyAlias, SharedPtr<Slave> slave);
+
+	/**
+	 * @return The minimum interval required to poll the active bonds to fulfill all active monitoring timing requirements.
+	 */
+	int minReqPathMonitorInterval() { return _minReqPathMonitorInterval; }
+
+	/**
+	 * @param minReqPathMonitorInterval The minimum interval required to poll the active bonds to fulfill all active monitoring timing requirements.
+	 */
+	static void setMinReqPathMonitorInterval(int minReqPathMonitorInterval) { _minReqPathMonitorInterval = minReqPathMonitorInterval; }
+
+	/**
+	 * @return Whether the bonding layer is currently set up to be used.
+	 */
+	bool inUse() { return !_bondPolicyTemplates.empty() || _defaultBondingPolicy; }
+
+    /**
+     * @param basePolicyName Bonding policy name (See ZeroTierOne.h)
+     * @return The bonding policy code for a given human-readable bonding policy name
+     */
+	static int getPolicyCodeByStr(const std::string& basePolicyName)
+	{
+		if (basePolicyName == "active-backup") { return 1; }
+		if (basePolicyName == "broadcast") { return 2; }
+		if (basePolicyName == "balance-rr") { return 3; }
+		if (basePolicyName == "balance-xor") { return 4; }
+		if (basePolicyName == "balance-aware") { return 5; }
+		return 0; // "none"
+	}
+
+	/**
+	 * @param policy Bonding policy code (See ZeroTierOne.h)
+	 * @return The human-readable name for the given bonding policy code
+	 */
+	static std::string getPolicyStrByCode(int policy)
+	{
+		if (policy == 1) { return "active-backup"; }
+		if (policy == 2) { return "broadcast"; }
+		if (policy == 3) { return "balance-rr"; }
+		if (policy == 4) { return "balance-xor"; }
+		if (policy == 5) { return "balance-aware"; }
+		return "none";
+	}
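+	// Example: getPolicyCodeByStr("active-backup") == ZT_BONDING_POLICY_ACTIVE_BACKUP (1),
+	// getPolicyStrByCode(1) == "active-backup", and any unrecognized input maps to 0 / "none".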
+
+    /**
+     * Sets the default bonding policy for new or undefined bonds.
+	 *
+     * @param bp Bonding policy
+     */
+	void setBondingLayerDefaultPolicy(uint8_t bp) { _defaultBondingPolicy = bp; }
+
+    /**
+     * Sets the default (custom) bonding policy for new or undefined bonds.
+	 *
+     * @param alias Human-readable string alias for bonding policy
+     */
+	void setBondingLayerDefaultPolicyStr(std::string alias) { _defaultBondingPolicyStr = alias; }
+
+	/**
+	 * @return The default bonding policy
+	 */
+	static int defaultBondingPolicy() { return _defaultBondingPolicy; }
+
+	/**
+	 * Add a user-defined slave to a given bonding policy.
+	 *
+	 * @param policyAlias User-defined custom name for variant of bonding policy
+	 * @param slave Pointer to new slave definition
+	 */
+	void addCustomSlave(std::string& policyAlias, SharedPtr<Slave> slave);
+
+	/**
+	 * Add a user-defined bonding policy that is based on one of the standard types.
+	 *
+	 * @param newBond Pointer to custom Bond object
+	 * @return Whether a uniquely-named custom policy was successfully added
+	 */
+	bool addCustomPolicy(const SharedPtr<Bond>& newBond);
+
+	/**
+	 * Assigns a specific bonding policy to a peer
+	 *
+	 * @param identity Identity (address) of the peer
+	 * @param policyAlias Name of the bonding policy template to assign
+	 * @return Whether the assignment was recorded (false if the peer already had one)
+	 */
+	bool assignBondingPolicyToPeer(int64_t identity, const std::string& policyAlias);
+
+	/**
+	 * Add a new bond to the bond controller.
+	 *
+	 * @param renv Runtime environment
+	 * @param peer Remote peer that this bond services
+	 * @return A pointer to the newly created Bond
+	 */
+	SharedPtr<Bond> createTransportTriggeredBond(const RuntimeEnvironment *renv, const SharedPtr<Peer>& peer);
+
+	/**
+	 * Periodically perform maintenance tasks for the bonding layer.
+	 *
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 * @param now Current time
+	 */
+	void processBackgroundTasks(void *tPtr, int64_t now);
+
+	/**
+	 * Gets a reference to a physical slave definition given a policy alias and a local socket.
+	 *
+	 * @param policyAlias Policy in use
+	 * @param localSocket Local source socket
+	 * @return Physical slave definition
+	 */
+	SharedPtr<Slave> getSlaveBySocket(const std::string& policyAlias, uint64_t localSocket);
+
+	/**
+	 * Gets a reference to a physical slave definition given its human-readable system name.
+	 *
+	 * @param policyAlias Policy in use
+	 * @param ifname Alphanumeric human-readable name
+	 * @return Physical slave definition
+	 */
+	SharedPtr<Slave> getSlaveByName(const std::string& policyAlias, const std::string& ifname);
+
+	/**
+	 * @param ifname Name of the interface that we want to know if we can bind to
+	 * @return Whether the bonding layer allows binding to this interface
+	 */
+	bool allowedToBind(const std::string& ifname);
+
+	uint64_t getBondStartTime() { return bondStartTime; }
+
+private:
+
+	Phy<BondController *> *_phy;
+	const RuntimeEnvironment *RR;
+
+	Mutex _bonds_m;
+	Mutex _slaves_m;
+
+	/**
+	 * The last time that the bond controller updated the set of bonds.
+	 */
+	uint64_t _lastBackgroundBondControlTaskCheck;
+
+	/**
+	 * The minimum monitoring interval among all paths in this bond.
+	 */
+	static int _minReqPathMonitorInterval;
+
+	/**
+	 * The default bonding policy used for new bonds unless otherwise specified.
+	 */
+	static uint8_t _defaultBondingPolicy;
+
+	/**
+	 * The alias of the default custom (user-defined) bonding policy used for new bonds unless otherwise specified.
+	 */
+	std::string _defaultBondingPolicyStr;
+
+	/**
+	 * All currently active bonds.
+	 */
+	std::map<int64_t,SharedPtr<Bond> > _bonds;
+
+	/**
+	 * Map of peers to custom bonding policies
+	 */
+	std::map<int64_t,std::string> _policyTemplateAssignments;
+
+	/**
+	 * User-defined bonding policies (can be assigned to a peer)
+	 */
+	std::map<std::string,SharedPtr<Bond> > _bondPolicyTemplates;
+
+	/**
+	 * Set of slaves defined for a given bonding policy
+	 */
+	std::map<std::string,std::vector<SharedPtr<Slave> > > _slaveDefinitions;
+
+	/**
+	 * Set of slave objects mapped to their physical interfaces
+	 */
+	std::map<std::string, std::map<std::string, SharedPtr<Slave> > > _interfaceToSlaveMap;
+
+	// TODO: Remove
+	uint64_t bondStartTime;
+};
+
+} // namespace ZeroTier
+
+#endif

+ 136 - 132
node/Constants.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -192,7 +192,7 @@
 /**
  * Minimum delay between timer task checks to prevent thrashing
  */
-#define ZT_CORE_TIMER_TASK_GRANULARITY 500
+#define ZT_CORE_TIMER_TASK_GRANULARITY 60
 
 /**
  * How often Topology::clean() and Network::clean() and similar are called, in ms
@@ -254,35 +254,50 @@
 #define ZT_LOCAL_CONF_FILE_CHECK_INTERVAL 10000
 
 /**
- * How long before we consider a flow to be dead and remove it from the balancing
- * policy's list.
+ * How frequently to send heartbeats over in-use paths
+ */
+#define ZT_PATH_HEARTBEAT_PERIOD 14000
+
+/**
+ * Do not accept HELLOs over a given path more often than this
+ */
+#define ZT_PATH_HELLO_RATE_LIMIT 1000
+
+/**
+ * Delay between full-fledge pings of directly connected peers
+ */
+#define ZT_PEER_PING_PERIOD 60000
+
+/**
+ * Paths are considered expired if they have not sent us a real packet in this long
  */
-#define ZT_MULTIPATH_FLOW_EXPIRATION 60000
+#define ZT_PEER_PATH_EXPIRATION ((ZT_PEER_PING_PERIOD * 4) + 3000)
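+
+/**
+ * (With ZT_PEER_PING_PERIOD = 60000 the above evaluates to 243000 ms, i.e. a path is
+ * considered expired a little over four minutes after its last real packet.)
+ */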
 
 /**
- * How frequently to check for changes to the system's network interfaces. When
- * the service decides to use this constant it's because we want to react more
- * quickly to new interfaces that pop up or go down.
+ * How often to retry expired paths that we're still remembering
  */
-#define ZT_MULTIPATH_BINDER_REFRESH_PERIOD 5000
+#define ZT_PEER_EXPIRED_PATH_TRIAL_PERIOD (ZT_PEER_PING_PERIOD * 10)
 
 /**
- * Packets are only used for QoS/ACK statistical sampling if their packet ID is divisible by
- * this integer. This is to provide a mechanism for both peers to agree on which packets need
- * special treatment without having to exchange information. Changing this value would be
- * a breaking change and would necessitate a protocol version upgrade. Since each incoming and
- * outgoing packet ID is checked against this value its evaluation is of the form:
+ * Outgoing packets are only used for QoS/ACK statistical sampling if their
+ * packet ID is divisible by this integer. This is to provide a mechanism for
+ * both peers to agree on which packets need special treatment without having
+ * to exchange information. Changing this value would be a breaking change and
+ * would necessitate a protocol version upgrade. Since each incoming and
+ * outgoing packet ID is checked against this value its evaluation is of the
+ * form:
+ *
  * (id & (divisor - 1)) == 0, thus the divisor must be a power of 2.
  *
- * This value is set at (16) so that given a normally-distributed RNG output we will sample
- * 1/16th (or ~6.25%) of packets.
+ * This value is set at (2) so that given a normally-distributed RNG output
+ * we will sample 1/2 (or 50%) of packets.
  */
-#define ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR 0x10
+#define ZT_QOS_ACK_DIVISOR 0x2
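+
+/**
+ * (Example: with a divisor of 2 the check reduces to ((id & 1) == 0), i.e. packets
+ * with an even packet ID are sampled for QoS/ACK accounting.)
+ */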
 
 /**
  * Time horizon for VERB_QOS_MEASUREMENT and VERB_ACK packet processing cutoff
  */
-#define ZT_PATH_QOS_ACK_CUTOFF_TIME 30000
+#define ZT_QOS_ACK_CUTOFF_TIME 30000
 
 /**
  * Maximum number of VERB_QOS_MEASUREMENT and VERB_ACK packets allowed to be
@@ -294,201 +309,178 @@
  * CUTOFF_LIMIT times per CUTOFF_TIME milliseconds per peer to prevent
  * this from being useful for DOS amplification attacks.
  */
-#define ZT_PATH_QOS_ACK_CUTOFF_LIMIT 128
+#define ZT_QOS_ACK_CUTOFF_LIMIT 128
 
 /**
- * Path choice history window size. This is used to keep track of which paths were
- * previously selected so that we can maintain a target allocation over time.
+ * Minimum acceptable size for a VERB_QOS_MEASUREMENT packet
  */
-#define ZT_MULTIPATH_PROPORTION_WIN_SZ 128
+#define ZT_QOS_MIN_PACKET_SIZE (8 + 1)
 
 /**
- * How often we will sample packet latency. Should be at least greater than ZT_PING_CHECK_INVERVAL
- * since we will record a 0 bit/s measurement if no valid latency measurement was made within this
- * window of time.
+ * Maximum acceptable size for a VERB_QOS_MEASUREMENT packet
  */
-#define ZT_PATH_LATENCY_SAMPLE_INTERVAL (ZT_MULTIPATH_PEER_PING_PERIOD * 2)
+#define ZT_QOS_MAX_PACKET_SIZE 1400
 
 /**
- * Interval used for rate-limiting the computation of path quality estimates.
+ * How many ID:sojourn time pairs are in a single QoS packet
  */
-#define ZT_PATH_QUALITY_COMPUTE_INTERVAL 1000
+#define ZT_QOS_TABLE_SIZE ((ZT_QOS_MAX_PACKET_SIZE * 8) / (64 + 16))
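+
+/**
+ * (With ZT_QOS_MAX_PACKET_SIZE = 1400 this works out to (1400 * 8) / 80 = 140
+ * ID:sojourn-time pairs per VERB_QOS_MEASUREMENT packet, each pair being a 64-bit
+ * packet ID plus a 16-bit holding time.)
+ */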
 
 /**
- * Number of samples to consider when computing real-time path statistics
- */
-#define ZT_PATH_QUALITY_METRIC_REALTIME_CONSIDERATION_WIN_SZ 128
-
-/**
- * Number of samples to consider when computing performing long-term path quality analysis.
- * By default this value is set to ZT_PATH_QUALITY_METRIC_REALTIME_CONSIDERATION_WIN_SZ but can
- * be set to any value greater than that to observe longer-term path quality behavior.
- */
-#define ZT_PATH_QUALITY_METRIC_WIN_SZ ZT_PATH_QUALITY_METRIC_REALTIME_CONSIDERATION_WIN_SZ
-
-/**
- * Maximum acceptable Packet Delay Variance (PDV) over a path
+ * Maximum number of outgoing packets we monitor for QoS information
  */
-#define ZT_PATH_MAX_PDV 1000
+#define ZT_QOS_MAX_OUTSTANDING_RECORDS (1024*16)
 
 /**
- * Maximum acceptable time interval between expectation and receipt of at least one ACK over a path
+ * Interval used for rate-limiting the computation of path quality estimates.
  */
-#define ZT_PATH_MAX_AGE 30000
+#define ZT_QOS_COMPUTE_INTERVAL 1000
 
 /**
- * Maximum acceptable mean latency over a path
+ * Number of samples to consider when processing real-time path statistics
  */
-#define ZT_PATH_MAX_MEAN_LATENCY 1000
+#define ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE 32
 
 /**
- * How much each factor contributes to the "stability" score of a path
+ * Number of samples to consider when processing long-term trends
  */
-
-#if 0
-#define ZT_PATH_CONTRIB_PDV                    (1.5 / 3.0)
-#define ZT_PATH_CONTRIB_LATENCY                (0.0 / 3.0)
-#define ZT_PATH_CONTRIB_THROUGHPUT_DISTURBANCE (1.5 / 3.0)
-#else
-#define ZT_PATH_CONTRIB_PDV                    (1.0 / 3.0)
-#define ZT_PATH_CONTRIB_LATENCY                (1.0 / 3.0)
-#define ZT_PATH_CONTRIB_THROUGHPUT_DISTURBANCE (1.0 / 3.0)
-#endif
+#define ZT_QOS_LONGTERM_SAMPLE_WIN_SIZE (ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE * 4)
 
 /**
- * How much each factor contributes to the "quality" score of a path
+ * Max allowable time spent in any queue (in ms)
  */
-#if 0
-#define ZT_PATH_CONTRIB_STABILITY  (2.00 / 3.0)
-#define ZT_PATH_CONTRIB_THROUGHPUT (0.50 / 3.0)
-#define ZT_PATH_CONTRIB_SCOPE      (0.50 / 3.0)
-#else
-#define ZT_PATH_CONTRIB_STABILITY  (0.75 / 3.0)
-#define ZT_PATH_CONTRIB_THROUGHPUT (1.50 / 3.0)
-#define ZT_PATH_CONTRIB_SCOPE      (0.75 / 3.0)
-#endif
+#define ZT_AQM_TARGET 5
 
 /**
- * How often a QoS packet is sent
+ * Time period within which the time spent in the queue by a packet should
+ * fall below the target at least once (in ms)
  */
-#define ZT_PATH_QOS_INTERVAL 3000
+#define ZT_AQM_INTERVAL 100
 
 /**
- * Min and max acceptable sizes for a VERB_QOS_MEASUREMENT packet
+ * The number of bytes that each queue is allowed to send during each DRR cycle.
+ * This approximates a single-byte-based fairness queuing scheme.
  */
-#define ZT_PATH_MIN_QOS_PACKET_SZ 8 + 1
-#define ZT_PATH_MAX_QOS_PACKET_SZ 1400
+#define ZT_AQM_QUANTUM ZT_DEFAULT_MTU
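
The quantum's role is easiest to see in a deficit round-robin pass; the following is a hedged, self-contained sketch (hypothetical DemoQueue type, not the scheduler added by this commit), assuming the constants above are in scope:

#include <deque>
#include <vector>

// Hypothetical DRR pass: each backlogged queue earns ZT_AQM_QUANTUM bytes of
// credit per cycle and dequeues packets until that credit is exhausted.
struct DemoQueue { std::deque<int> packetSizes; int deficit = 0; };

void drrPass(std::vector<DemoQueue> &queues)
{
	for (auto &q : queues) {
		if (q.packetSizes.empty()) { q.deficit = 0; continue; } // idle queues keep no credit
		q.deficit += ZT_AQM_QUANTUM;
		while (!q.packetSizes.empty() && q.packetSizes.front() <= q.deficit) {
			q.deficit -= q.packetSizes.front();
			q.packetSizes.pop_front(); // "send" the head packet
		}
	}
}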
 
 /**
- * How many ID:sojourn time pairs in a single QoS packet
+ * The maximum total number of packets that can be queued among all
+ * active/inactive, old/new queues.
  */
-#define ZT_PATH_QOS_TABLE_SIZE ((ZT_PATH_MAX_QOS_PACKET_SZ * 8) / (64 + 16))
+#define ZT_AQM_MAX_ENQUEUED_PACKETS 1024
 
 /**
- * Maximum number of outgoing packets we monitor for QoS information
+ * Number of QoS queues (buckets)
  */
-#define ZT_PATH_MAX_OUTSTANDING_QOS_RECORDS 128
+#define ZT_AQM_NUM_BUCKETS 9
 
 /**
- * Timeout for QoS records
+ * All unspecified traffic is put in this bucket. Anything in a bucket with a
+ * smaller value is deprioritized. Anything in a bucket with a higher value is
+ * prioritized over other traffic.
  */
-#define ZT_PATH_QOS_TIMEOUT (ZT_PATH_QOS_INTERVAL * 2)
+#define ZT_AQM_DEFAULT_BUCKET 0
 
 /**
- * How often the service tests the path throughput
+ * How long before we consider a path to be dead in the general sense. This is
+ * used while searching for default or alternative paths to try in the absence
+ * of direct guidance from the user or a selection policy.
  */
-#define ZT_PATH_THROUGHPUT_MEASUREMENT_INTERVAL (ZT_PATH_ACK_INTERVAL * 8)
+#define ZT_MULTIPATH_DEFAULT_FAILOVER_INTERVAL 10000
 
 /**
- * Minimum amount of time between each ACK packet
+ * How often flows are evaluated
  */
-#define ZT_PATH_ACK_INTERVAL 1000
+#define ZT_MULTIPATH_FLOW_CHECK_INTERVAL 10000
 
 /**
- * How often an aggregate link statistics report is emitted into this tracing system
+ * How long before we consider a flow to be dead and remove it from the
+ * policy's list.
  */
-#define ZT_PATH_AGGREGATE_STATS_REPORT_INTERVAL 30000
+#define ZT_MULTIPATH_FLOW_EXPIRATION_INTERVAL 30000
 
 /**
- * How much an aggregate link's component paths can vary from their target allocation
- * before the link is considered to be in a state of imbalance.
+ * How often a flow's statistical counters are reset
  */
-#define ZT_PATH_IMBALANCE_THRESHOLD 0.20
+#define ZT_FLOW_STATS_RESET_INTERVAL ZT_MULTIPATH_FLOW_EXPIRATION_INTERVAL
 
 /**
- * Max allowable time spent in any queue
+ * Maximum number of flows allowed before we start forcibly forgetting old ones
  */
-#define ZT_QOS_TARGET 5 // ms
+#define ZT_FLOW_MAX_COUNT (1024*64)
 
 /**
- * Time period where the time spent in the queue by a packet should fall below
- * target at least once
+ * Minimum amount of time between flow rebalances across slave interfaces (if at all)
  */
-#define ZT_QOS_INTERVAL 100 // ms
+#define ZT_FLOW_MIN_REBALANCE_INTERVAL 5000
 
 /**
- * The number of bytes that each queue is allowed to send during each DRR cycle.
- * This approximates a single-byte-based fairness queuing scheme
+ * How often flows are rebalanced across slave interfaces (if at all)
  */
-#define ZT_QOS_QUANTUM ZT_DEFAULT_MTU
+#define ZT_FLOW_REBALANCE_INTERVAL 5000
 
 /**
- * The maximum total number of packets that can be queued among all
- * active/inactive, old/new queues
+ * A defensive timer to prevent path quality metrics from being
+ * processed too often.
  */
-#define ZT_QOS_MAX_ENQUEUED_PACKETS 1024
+#define ZT_BOND_BACKGROUND_TASK_MIN_INTERVAL ZT_CORE_TIMER_TASK_GRANULARITY
 
 /**
- * Number of QoS queues (buckets)
+ * How often a bonding policy's background tasks are processed;
+ * some policies need more frequent attention than others.
  */
-#define ZT_QOS_NUM_BUCKETS 9
+#define ZT_MULTIPATH_ACTIVE_BACKUP_CHECK_INTERVAL ZT_CORE_TIMER_TASK_GRANULARITY
 
 /**
- * All unspecified traffic is put in this bucket. Anything in a bucket with a smaller
- * value is de-prioritized. Anything in a bucket with a higher value is prioritized over
- * other traffic.
+ * Minimum amount of time (since a previous transition) before the active-backup bonding
+ * policy is allowed to transition to a different slave. Only valid for active-backup.
  */
-#define ZT_QOS_DEFAULT_BUCKET 0
+#define ZT_MULTIPATH_MIN_ACTIVE_BACKUP_AUTOFLOP_INTERVAL 10000
 
 /**
- * How frequently to send heartbeats over in-use paths
+ * How often a peer checks that incoming (and outgoing) traffic on a bonded link is
+ * appropriately paired.
  */
-#define ZT_PATH_HEARTBEAT_PERIOD 14000
+#define ZT_PATH_NEGOTIATION_CHECK_INTERVAL 15000
 
 /**
- * Do not accept HELLOs over a given path more often than this
+ * Time horizon for path negotiation cutoff
  */
-#define ZT_PATH_HELLO_RATE_LIMIT 1000
+#define ZT_PATH_NEGOTIATION_CUTOFF_TIME 60000
 
 /**
- * Delay between full-fledge pings of directly connected peers
+ * Maximum number of path negotiations within cutoff time
+ *
+ * This limits response to PATH_NEGOTIATION to CUTOFF_LIMIT responses
+ * per CUTOFF_TIME milliseconds per peer to prevent this from being
+ * useful for DOS amplification attacks.
  */
-#define ZT_PEER_PING_PERIOD 60000
+#define ZT_PATH_NEGOTIATION_CUTOFF_LIMIT 8
 
 /**
- * Delay between full-fledge pings of directly connected peers.
- *
- * With multipath bonding enabled ping peers more often to measure
- * packet loss and latency. This uses more bandwidth so is disabled
- * by default to avoid increasing idle bandwidth use for regular
- * links.
+ * How many times a peer will attempt to petition another peer to synchronize its
+ * traffic to the same path before giving up and surrendering to the other peer's preference.
  */
-#define ZT_MULTIPATH_PEER_PING_PERIOD (ZT_PEER_PING_PERIOD / 10)
+#define ZT_PATH_NEGOTIATION_TRY_COUNT 3
 
 /**
- * How long before we consider a path to be dead in rapid fail-over scenarios
+ * How much greater the quality of a path should be before an
+ * optimization procedure triggers a switch.
  */
-#define ZT_MULTIPATH_ACTIVE_BACKUP_RAPID_FAILOVER_PERIOD 250
+#define ZT_MULTIPATH_ACTIVE_BACKUP_OPTIMIZE_MIN_THRESHOLD 0.10
 
 /**
- * Paths are considered expired if they have not sent us a real packet in this long
+ * Artificially inflates the failover score for paths which meet
+ * certain non-performance-related policy ranking criteria.
  */
-#define ZT_PEER_PATH_EXPIRATION ((ZT_PEER_PING_PERIOD * 4) + 3000)
+#define ZT_MULTIPATH_FAILOVER_HANDICAP_PREFERRED 500
+#define ZT_MULTIPATH_FAILOVER_HANDICAP_PRIMARY 1000
+#define ZT_MULTIPATH_FAILOVER_HANDICAP_NEGOTIATED 5000
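
Read loosely, a handicap is simply added on top of whatever performance-derived score a path earns; the sketch below is a hypothetical illustration only and is not the Bond scoring code introduced by this commit:

// Hypothetical illustration: combine a quality-derived score with the
// policy handicaps above. Larger scores win failover decisions.
int illustrativeFailoverScore(int qualityScore, bool preferred, bool primary, bool negotiated)
{
	int score = qualityScore;
	if (preferred)  score += ZT_MULTIPATH_FAILOVER_HANDICAP_PREFERRED;
	if (primary)    score += ZT_MULTIPATH_FAILOVER_HANDICAP_PRIMARY;
	if (negotiated) score += ZT_MULTIPATH_FAILOVER_HANDICAP_NEGOTIATED;
	return score;
}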
 
 /**
- * How often to retry expired paths that we're still remembering
+ * An indicator that no flow is to be associated with the given packet
  */
-#define ZT_PEER_EXPIRED_PATH_TRIAL_PERIOD (ZT_PEER_PING_PERIOD * 10)
+#define ZT_QOS_NO_FLOW -1
 
 /**
  * Timeout for overall peer activity (measured from last receive)
@@ -558,18 +550,30 @@
 #define ZT_DIRECT_PATH_PUSH_INTERVAL_HAVEPATH 120000
 
 /**
- * Interval between direct path pushes in milliseconds if we are currently in multipath
- * mode. In this mode the distinction between ZT_DIRECT_PATH_PUSH_INTERVAL and
- * ZT_DIRECT_PATH_PUSH_INTERVAL_HAVEPATH does not exist since we want to inform other
- * peers of this peer's new link/address as soon as possible so that both peers can
- * begin forming an aggregated link.
+ * Time horizon for push direct paths cutoff
  */
-#define ZT_DIRECT_PATH_PUSH_INTERVAL_MULTIPATH (ZT_DIRECT_PATH_PUSH_INTERVAL_HAVEPATH / 16)
+#define ZT_PUSH_DIRECT_PATHS_CUTOFF_TIME 30000
 
 /**
- * Time horizon for push direct paths cutoff
+ * Drainage constants for VERB_ECHO rate-limiters
  */
-#define ZT_PUSH_DIRECT_PATHS_CUTOFF_TIME 30000
+#define ZT_ECHO_CUTOFF_LIMIT ((1000 / ZT_CORE_TIMER_TASK_GRANULARITY) * ZT_MAX_PEER_NETWORK_PATHS)
+#define ZT_ECHO_DRAINAGE_DIVISOR (1000 / ZT_ECHO_CUTOFF_LIMIT)
+
+/**
+ * Drainage constants for VERB_QOS rate-limiters
+ */
+#define ZT_QOS_CUTOFF_LIMIT ((1000 / ZT_CORE_TIMER_TASK_GRANULARITY) * ZT_MAX_PEER_NETWORK_PATHS)
+#define ZT_QOS_DRAINAGE_DIVISOR (1000 / ZT_QOS_CUTOFF_LIMIT)
+
+/**
+ * Drainage constants for VERB_ACK rate-limiters
+ */
+#define ZT_ACK_CUTOFF_LIMIT 128
+#define ZT_ACK_DRAINAGE_DIVISOR (1000 / ZT_ACK_CUTOFF_LIMIT)
+
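+/**
+ * Default and maximum refractory period (in ms) applied to a path after a
+ * failure, during which the path is deprioritized for re-selection.
+ */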
+#define ZT_MULTIPATH_DEFAULT_REFRCTORY_PERIOD 8000
+#define ZT_MULTIPATH_MAX_REFRACTORY_PERIOD 600000
 
 /**
  * Maximum number of direct path pushes within cutoff time

+ 123 - 0
node/Flow.hpp

@@ -0,0 +1,123 @@
+/*
+ * Copyright (c)2013-2020 ZeroTier, Inc.
+ *
+ * Use of this software is governed by the Business Source License included
+ * in the LICENSE.TXT file in the project's root directory.
+ *
+ * Change Date: 2024-01-01
+ *
+ * On the date above, in accordance with the Business Source License, use
+ * of this software will be governed by version 2.0 of the Apache License.
+ */
+/****/
+
+#ifndef ZT_FLOW_HPP
+#define ZT_FLOW_HPP
+
+#include "Path.hpp"
+#include "SharedPtr.hpp"
+
+namespace ZeroTier {
+
+/**
+ * A protocol flow identified by its source and destination ports.
+ */
+struct Flow
+{
+    /**
+     * @param flowId Given flow ID
+     * @param now Current time
+     */
+	Flow(int32_t flowId, int64_t now) :
+		_flowId(flowId),
+		_bytesInPerUnitTime(0),
+		_bytesOutPerUnitTime(0),
+		_lastActivity(now),
+		_lastPathReassignment(0),
+		_assignedPath(SharedPtr<Path>())
+	{}
+
+	/**
+	 * Reset flow statistics
+	 */
+	void resetByteCounts()
+	{
+		_bytesInPerUnitTime = 0;
+		_bytesOutPerUnitTime = 0;
+	}
+
+	/**
+	 * @return The Flow's ID
+	 */
+	int32_t id() { return _flowId; }
+	
+	/**
+	 * @return Number of incoming bytes processed on this flow per unit time
+	 */
+	int64_t bytesInPerUnitTime() { return _bytesInPerUnitTime; }
+	
+	/**
+	 * Record number of incoming bytes on this flow
+	 *
+	 * @param bytes Number of incoming bytes
+	 */
+	void recordIncomingBytes(uint64_t bytes) { _bytesInPerUnitTime += bytes; }
+
+	/**
+	 * @return Number of outgoing bytes processed on this flow per unit time
+	 */
+	int64_t bytesOutPerUnitTime() { return _bytesOutPerUnitTime; }
+
+	/**
+	 * Record number of outgoing bytes on this flow
+	 *
+	 * @param bytes Number of outgoing bytes
+	 */
+	void recordOutgoingBytes(uint64_t bytes) { _bytesOutPerUnitTime += bytes; }
+
+	/**
+	 * @return The total number of bytes processed on this flow
+	 * @return The total number of bytes (in and out) processed on this flow per unit time
+	uint64_t totalBytes() { return _bytesInPerUnitTime + _bytesOutPerUnitTime; }
+
+	/**
+	 * How long since a packet was sent or received in this flow
+	 *
+	 * @param now Current time
+	 * @return The age of the flow in terms of last recorded activity
+	 */
+	int64_t age(int64_t now) { return now - _lastActivity; }
+
+	/**
+	 * Record that traffic was processed on this flow at the given time.
+	 *
+	 * @param now Current time
+	 */
+	void updateActivity(int64_t now) { _lastActivity = now; }
+
+	/**
+	 * @return Path assigned to this flow
+	 */
+	SharedPtr<Path> assignedPath() { return _assignedPath; }
+
+	/**
+	 * @param path Assigned path over which this flow should be handled
+	 */
+	void assignPath(const SharedPtr<Path> &path, int64_t now) {
+		_assignedPath = path;
+		_lastPathReassignment = now;
+	}
+
+	AtomicCounter __refCount;
+
+	int32_t _flowId;
+	uint64_t _bytesInPerUnitTime;
+	uint64_t _bytesOutPerUnitTime;
+	int64_t _lastActivity;
+	int64_t _lastPathReassignment;
+	SharedPtr<Path> _assignedPath;
+};
+
+} // namespace ZeroTier
+
+#endif
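
A brief usage sketch of the Flow record defined above (illustrative only; in this change flows are created and maintained by the bonding layer, and the path and timestamps below are stand-ins):

// Illustrative Flow bookkeeping; 'somePath' and 'now' are assumed inputs.
SharedPtr<Path> somePath;            // would come from the peer's path list
int64_t now = 0;                     // would come from RR->node->now()
Flow flow(0xC886, now);              // flow ID as computed from the packet
flow.assignPath(somePath, now);
flow.recordIncomingBytes(1500);
flow.recordOutgoingBytes(400);
if (flow.age(now) > ZT_MULTIPATH_FLOW_EXPIRATION_INTERVAL) {
	// stale: the policy would drop this flow from its table
}
flow.resetByteCounts();              // e.g. every ZT_FLOW_STATS_RESET_INTERVAL ms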

+ 204 - 80
node/IncomingPacket.cpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -35,10 +35,12 @@
 #include "Tag.hpp"
 #include "Revocation.hpp"
 #include "Trace.hpp"
+#include "Path.hpp"
+#include "Bond.hpp"
 
 namespace ZeroTier {
 
-bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
+bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr,int32_t flowId)
 {
 	const Address sourceAddress(source());
 
@@ -67,7 +69,7 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
 			if (!trusted) {
 				if (!dearmor(peer->key())) {
 					RR->t->incomingPacketMessageAuthenticationFailure(tPtr,_path,packetId(),sourceAddress,hops(),"invalid MAC");
-					_path->recordInvalidPacket();
+					peer->recordIncomingInvalidPacket(_path);
 					return true;
 				}
 			}
@@ -78,11 +80,12 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
 			}
 
 			const Packet::Verb v = verb();
+
 			bool r = true;
 			switch(v) {
 				//case Packet::VERB_NOP:
 				default: // ignore unknown verbs, but if they pass auth check they are "received"
-					peer->received(tPtr,_path,hops(),packetId(),payloadLength(),v,0,Packet::VERB_NOP,false,0);
+					peer->received(tPtr,_path,hops(),packetId(),payloadLength(),v,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 					break;
 				case Packet::VERB_HELLO:                      r = _doHELLO(RR,tPtr,true); break;
 				case Packet::VERB_ACK:                        r = _doACK(RR,tPtr,peer); break;
@@ -91,8 +94,8 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
 				case Packet::VERB_OK:                         r = _doOK(RR,tPtr,peer); break;
 				case Packet::VERB_WHOIS:                      r = _doWHOIS(RR,tPtr,peer); break;
 				case Packet::VERB_RENDEZVOUS:                 r = _doRENDEZVOUS(RR,tPtr,peer); break;
-				case Packet::VERB_FRAME:                      r = _doFRAME(RR,tPtr,peer); break;
-				case Packet::VERB_EXT_FRAME:                  r = _doEXT_FRAME(RR,tPtr,peer); break;
+				case Packet::VERB_FRAME:                      r = _doFRAME(RR,tPtr,peer,flowId); break;
+				case Packet::VERB_EXT_FRAME:                  r = _doEXT_FRAME(RR,tPtr,peer,flowId); break;
 				case Packet::VERB_ECHO:                       r = _doECHO(RR,tPtr,peer); break;
 				case Packet::VERB_MULTICAST_LIKE:             r = _doMULTICAST_LIKE(RR,tPtr,peer); break;
 				case Packet::VERB_NETWORK_CREDENTIALS:        r = _doNETWORK_CREDENTIALS(RR,tPtr,peer); break;
@@ -103,6 +106,7 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
 				case Packet::VERB_PUSH_DIRECT_PATHS:          r = _doPUSH_DIRECT_PATHS(RR,tPtr,peer); break;
 				case Packet::VERB_USER_MESSAGE:               r = _doUSER_MESSAGE(RR,tPtr,peer); break;
 				case Packet::VERB_REMOTE_TRACE:               r = _doREMOTE_TRACE(RR,tPtr,peer); break;
+				case Packet::VERB_PATH_NEGOTIATION_REQUEST:   r = _doPATH_NEGOTIATION_REQUEST(RR,tPtr,peer); break;
 			}
 			if (r) {
 				RR->node->statsLogVerb((unsigned int)v,(unsigned int)size());
@@ -113,9 +117,6 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr)
 			RR->sw->requestWhois(tPtr,RR->node->now(),sourceAddress);
 			return false;
 		}
-	} catch (int ztExcCode) {
-		RR->t->incomingPacketInvalid(tPtr,_path,packetId(),sourceAddress,hops(),verb(),"unexpected exception in tryDecode()");
-		return true;
 	} catch ( ... ) {
 		RR->t->incomingPacketInvalid(tPtr,_path,packetId(),sourceAddress,hops(),verb(),"unexpected exception in tryDecode()");
 		return true;
@@ -193,59 +194,59 @@ bool IncomingPacket::_doERROR(const RuntimeEnvironment *RR,void *tPtr,const Shar
 		default: break;
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_ERROR,inRePacketId,inReVerb,false,networkId);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_ERROR,inRePacketId,inReVerb,false,networkId,ZT_QOS_NO_FLOW);
 
 	return true;
 }
 
 bool IncomingPacket::_doACK(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer)
 {
-	if (!peer->rateGateACK(RR->node->now()))
+	SharedPtr<Bond> bond = peer->bond();
+	if (!bond || !bond->rateGateACK(RR->node->now())) {
 		return true;
+	}
 	/* Dissect incoming ACK packet. From this we can estimate current throughput of the path, establish known
 	 * maximums and detect packet loss. */
-	if (peer->localMultipathSupport()) {
-		int32_t ackedBytes;
-		if (payloadLength() != sizeof(ackedBytes)) {
-			return true; // ignore
-		}
-		memcpy(&ackedBytes, payload(), sizeof(ackedBytes));
-		_path->receivedAck(RR->node->now(), Utils::ntoh(ackedBytes));
-		peer->inferRemoteMultipathEnabled();
+	int32_t ackedBytes;
+	if (payloadLength() != sizeof(ackedBytes)) {
+		return true; // ignore
+	}
+	memcpy(&ackedBytes, payload(), sizeof(ackedBytes));
+	if (bond) {
+		bond->receivedAck(_path, RR->node->now(), Utils::ntoh(ackedBytes));
 	}
-
 	return true;
 }
 
 bool IncomingPacket::_doQOS_MEASUREMENT(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer)
 {
-	if (!peer->rateGateQoS(RR->node->now()))
+	SharedPtr<Bond> bond = peer->bond();
+	if (!bond || !bond->rateGateQoS(RR->node->now())) {
 		return true;
+	}
 	/* Dissect incoming QoS packet. From this we can compute latency values and their variance.
 	 * The latency variance is used as a measure of "jitter". */
-	if (peer->localMultipathSupport()) {
-		if (payloadLength() > ZT_PATH_MAX_QOS_PACKET_SZ || payloadLength() < ZT_PATH_MIN_QOS_PACKET_SZ) {
-			return true; // ignore
-		}
-		const int64_t now = RR->node->now();
-		uint64_t rx_id[ZT_PATH_QOS_TABLE_SIZE];
-		uint16_t rx_ts[ZT_PATH_QOS_TABLE_SIZE];
-		char *begin = (char *)payload();
-		char *ptr = begin;
-		int count = 0;
-		int len = payloadLength();
-		// Read packet IDs and latency compensation intervals for each packet tracked by this QoS packet
-		while (ptr < (begin + len) && (count < ZT_PATH_QOS_TABLE_SIZE)) {
-			memcpy((void*)&rx_id[count], ptr, sizeof(uint64_t));
-			ptr+=sizeof(uint64_t);
-			memcpy((void*)&rx_ts[count], ptr, sizeof(uint16_t));
-			ptr+=sizeof(uint16_t);
-			count++;
-		}
-		_path->receivedQoS(now, count, rx_id, rx_ts);
-		peer->inferRemoteMultipathEnabled();
+	if (payloadLength() > ZT_QOS_MAX_PACKET_SIZE || payloadLength() < ZT_QOS_MIN_PACKET_SIZE) {
+		return true; // ignore
+	}
+	const int64_t now = RR->node->now();
+	uint64_t rx_id[ZT_QOS_TABLE_SIZE];
+	uint16_t rx_ts[ZT_QOS_TABLE_SIZE];
+	char *begin = (char *)payload();
+	char *ptr = begin;
+	int count = 0;
+	unsigned int len = payloadLength();
+	// Read packet IDs and latency compensation intervals for each packet tracked by this QoS packet
+	while (ptr < (begin + len) && (count < ZT_QOS_TABLE_SIZE)) {
+		memcpy((void*)&rx_id[count], ptr, sizeof(uint64_t));
+		ptr+=sizeof(uint64_t);
+		memcpy((void*)&rx_ts[count], ptr, sizeof(uint16_t));
+		ptr+=sizeof(uint16_t);
+		count++;
+	}
+	if (bond) {
+		bond->receivedQoS(_path, now, count, rx_id, rx_ts);
 	}
-
 	return true;
 }
 
@@ -441,11 +442,12 @@ bool IncomingPacket::_doHELLO(const RuntimeEnvironment *RR,void *tPtr,const bool
 	}
 	outp.setAt<uint16_t>(worldUpdateSizeAt,(uint16_t)(outp.size() - (worldUpdateSizeAt + 2)));
 
+	peer->recordOutgoingPacket(_path,outp.packetId(),outp.payloadLength(),outp.verb(),ZT_QOS_NO_FLOW,now);
 	outp.armor(peer->key(),true);
 	_path->send(RR,tPtr,outp.data(),outp.size(),now);
 
 	peer->setRemoteVersion(protoVersion,vMajor,vMinor,vRevision); // important for this to go first so received() knows the version
-	peer->received(tPtr,_path,hops(),pid,payloadLength(),Packet::VERB_HELLO,0,Packet::VERB_NOP,false,0);
+	peer->received(tPtr,_path,hops(),pid,payloadLength(),Packet::VERB_HELLO,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -493,7 +495,10 @@ bool IncomingPacket::_doOK(const RuntimeEnvironment *RR,void *tPtr,const SharedP
 			}
 
 			if (!hops()) {
-				_path->updateLatency((unsigned int)latency,RR->node->now());
+				SharedPtr<Bond> bond = peer->bond();
+				if (!bond) {
+					_path->updateLatency((unsigned int)latency,RR->node->now());
+				}
 			}
 
 			peer->setRemoteVersion(vProto,vMajor,vMinor,vRevision);
@@ -522,8 +527,7 @@ bool IncomingPacket::_doOK(const RuntimeEnvironment *RR,void *tPtr,const SharedP
 			if (network) {
 				const MulticastGroup mg(MAC(field(ZT_PROTO_VERB_MULTICAST_GATHER__OK__IDX_MAC,6),6),at<uint32_t>(ZT_PROTO_VERB_MULTICAST_GATHER__OK__IDX_ADI));
 				const unsigned int count = at<uint16_t>(ZT_PROTO_VERB_MULTICAST_GATHER__OK__IDX_GATHER_RESULTS + 4);
-				if (((ZT_PROTO_VERB_MULTICAST_GATHER__OK__IDX_GATHER_RESULTS + 6) + (count * 5)) <= size())
-					RR->mc->addMultiple(tPtr,RR->node->now(),networkId,mg,field(ZT_PROTO_VERB_MULTICAST_GATHER__OK__IDX_GATHER_RESULTS + 6,count * 5),count,at<uint32_t>(ZT_PROTO_VERB_MULTICAST_GATHER__OK__IDX_GATHER_RESULTS));
+				RR->mc->addMultiple(tPtr,RR->node->now(),networkId,mg,field(ZT_PROTO_VERB_MULTICAST_GATHER__OK__IDX_GATHER_RESULTS + 6,count * 5),count,at<uint32_t>(ZT_PROTO_VERB_MULTICAST_GATHER__OK__IDX_GATHER_RESULTS));
 			}
 		}	break;
 
@@ -556,7 +560,7 @@ bool IncomingPacket::_doOK(const RuntimeEnvironment *RR,void *tPtr,const SharedP
 		default: break;
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_OK,inRePacketId,inReVerb,false,networkId);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_OK,inRePacketId,inReVerb,false,networkId,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -591,7 +595,7 @@ bool IncomingPacket::_doWHOIS(const RuntimeEnvironment *RR,void *tPtr,const Shar
 		_path->send(RR,tPtr,outp.data(),outp.size(),RR->node->now());
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_WHOIS,0,Packet::VERB_NOP,false,0);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_WHOIS,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -615,13 +619,108 @@ bool IncomingPacket::_doRENDEZVOUS(const RuntimeEnvironment *RR,void *tPtr,const
 		}
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_RENDEZVOUS,0,Packet::VERB_NOP,false,0);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_RENDEZVOUS,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 
 	return true;
 }
 
-bool IncomingPacket::_doFRAME(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer)
+// Returns true if packet appears valid; pos and proto will be set
+static bool _ipv6GetPayload(const uint8_t *frameData,unsigned int frameLen,unsigned int &pos,unsigned int &proto)
 {
+	if (frameLen < 40)
+		return false;
+	pos = 40;
+	proto = frameData[6];
+	while (pos <= frameLen) {
+		switch(proto) {
+			case 0: // hop-by-hop options
+			case 43: // routing
+			case 60: // destination options
+			case 135: // mobility options
+				if ((pos + 8) > frameLen)
+					return false; // invalid!
+				proto = frameData[pos];
+				pos += ((unsigned int)frameData[pos + 1] * 8) + 8;
+				break;
+
+			//case 44: // fragment -- we currently can't parse these and they are deprecated in IPv6 anyway
+			//case 50:
+			//case 51: // IPSec ESP and AH -- we have to stop here since this is encrypted stuff
+			default:
+				return true;
+		}
+	}
+	return false; // overflow == invalid
+}
+
+bool IncomingPacket::_doFRAME(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer,int32_t flowId)
+{
+	int32_t _flowId = ZT_QOS_NO_FLOW;
+	SharedPtr<Bond> bond = peer->bond();
+	if (bond && bond->flowHashingEnabled()) {
+		if (size() > ZT_PROTO_VERB_EXT_FRAME_IDX_PAYLOAD) {
+			const unsigned int etherType = at<uint16_t>(ZT_PROTO_VERB_FRAME_IDX_ETHERTYPE);
+			const unsigned int frameLen = size() - ZT_PROTO_VERB_FRAME_IDX_PAYLOAD;
+			const uint8_t *const frameData = reinterpret_cast<const uint8_t *>(data()) + ZT_PROTO_VERB_FRAME_IDX_PAYLOAD;
+
+			if (etherType == ZT_ETHERTYPE_IPV4 && (frameLen >= 20)) {
+				uint16_t srcPort = 0;
+				uint16_t dstPort = 0;
+				uint8_t proto = (reinterpret_cast<const uint8_t *>(frameData)[9]);
+				const unsigned int headerLen = 4 * (reinterpret_cast<const uint8_t *>(frameData)[0] & 0xf);
+				switch(proto) {
+					case 0x01: // ICMP
+						//flowId = 0x01;
+						break;
+					// All these start with 16-bit source and destination port in that order
+					case 0x06: // TCP
+					case 0x11: // UDP
+					case 0x84: // SCTP
+					case 0x88: // UDPLite
+						if (frameLen > (headerLen + 4)) {
+							unsigned int pos = headerLen + 0;
+							srcPort = (reinterpret_cast<const uint8_t *>(frameData)[pos++]) << 8;
+							srcPort |= (reinterpret_cast<const uint8_t *>(frameData)[pos]);
+							pos++;
+							dstPort = (reinterpret_cast<const uint8_t *>(frameData)[pos++]) << 8;
+							dstPort |= (reinterpret_cast<const uint8_t *>(frameData)[pos]);
+							_flowId = dstPort ^ srcPort ^ proto;
+						}
+						break;
+				}
+			}
+
+			if (etherType == ZT_ETHERTYPE_IPV6 && (frameLen >= 40)) {
+				uint16_t srcPort = 0;
+				uint16_t dstPort = 0;
+				unsigned int pos;
+				unsigned int proto;
+				_ipv6GetPayload((const uint8_t *)frameData, frameLen, pos, proto);
+				switch(proto) {
+					case 0x3A: // ICMPv6
+						//flowId = 0x3A;
+						break;
+					// All these start with 16-bit source and destination port in that order
+					case 0x06: // TCP
+					case 0x11: // UDP
+					case 0x84: // SCTP
+					case 0x88: // UDPLite
+						if (frameLen > (pos + 4)) {
+							srcPort = (reinterpret_cast<const uint8_t *>(frameData)[pos++]) << 8;
+							srcPort |= (reinterpret_cast<const uint8_t *>(frameData)[pos]);
+							pos++;
+							dstPort = (reinterpret_cast<const uint8_t *>(frameData)[pos++]) << 8;
+							dstPort |= (reinterpret_cast<const uint8_t *>(frameData)[pos]);
+							_flowId = dstPort ^ srcPort ^ proto;
+						}
+						break;
+					default:
+						break;
+				}
+			}
+		}
+	}
+
 	const uint64_t nwid = at<uint64_t>(ZT_PROTO_VERB_FRAME_IDX_NETWORK_ID);
 	const SharedPtr<Network> network(RR->node->network(nwid));
 	bool trustEstablished = false;
@@ -641,13 +740,12 @@ bool IncomingPacket::_doFRAME(const RuntimeEnvironment *RR,void *tPtr,const Shar
 			return false;
 		}
 	}
-
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_FRAME,0,Packet::VERB_NOP,trustEstablished,nwid);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_FRAME,0,Packet::VERB_NOP,trustEstablished,nwid,_flowId);
 
 	return true;
 }
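
One property of the 16-bit XOR hash used above is worth noting: srcPort ^ dstPort is symmetric, so both directions of a TCP/UDP conversation land on the same flow ID. A worked example with hypothetical port numbers:

// Hypothetical example: TCP 51515 -> 443 and 443 -> 51515 share one flow ID.
uint16_t srcPort = 51515;                     // 0xC93B
uint16_t dstPort = 443;                       // 0x01BB
uint8_t  proto   = 0x06;                      // TCP
int32_t  flowId  = dstPort ^ srcPort ^ proto; // 0x01BB ^ 0xC93B ^ 0x06 == 0xC886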
 
-bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer)
+bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer,int32_t flowId)
 {
 	const uint64_t nwid = at<uint64_t>(ZT_PROTO_VERB_EXT_FRAME_IDX_NETWORK_ID);
 	const SharedPtr<Network> network(RR->node->network(nwid));
@@ -676,7 +774,7 @@ bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const
 			const uint8_t *const frameData = (const uint8_t *)field(comLen + ZT_PROTO_VERB_EXT_FRAME_IDX_PAYLOAD,frameLen);
 
 			if ((!from)||(from == network->mac())) {
-				peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid); // trustEstablished because COM is okay
+				peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid,flowId); // trustEstablished because COM is okay
 				return true;
 			}
 
@@ -687,19 +785,19 @@ bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const
 							network->learnBridgeRoute(from,peer->address());
 						} else {
 							RR->t->incomingNetworkFrameDropped(tPtr,network,_path,packetId(),size(),peer->address(),Packet::VERB_EXT_FRAME,from,to,"bridging not allowed (remote)");
-							peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid); // trustEstablished because COM is okay
+							peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid,flowId); // trustEstablished because COM is okay
 							return true;
 						}
 					} else if (to != network->mac()) {
 						if (to.isMulticast()) {
 							if (network->config().multicastLimit == 0) {
 								RR->t->incomingNetworkFrameDropped(tPtr,network,_path,packetId(),size(),peer->address(),Packet::VERB_EXT_FRAME,from,to,"multicast disabled");
-								peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid); // trustEstablished because COM is okay
+								peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid,flowId); // trustEstablished because COM is okay
 								return true;
 							}
 						} else if (!network->config().permitsBridging(RR->identity.address())) {
 							RR->t->incomingNetworkFrameDropped(tPtr,network,_path,packetId(),size(),peer->address(),Packet::VERB_EXT_FRAME,from,to,"bridging not allowed (local)");
-							peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid); // trustEstablished because COM is okay
+							peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid,flowId); // trustEstablished because COM is okay
 							return true;
 						}
 					}
@@ -715,13 +813,15 @@ bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const
 			outp.append((uint8_t)Packet::VERB_EXT_FRAME);
 			outp.append((uint64_t)packetId());
 			outp.append((uint64_t)nwid);
+			const int64_t now = RR->node->now();
+			peer->recordOutgoingPacket(_path,outp.packetId(),outp.payloadLength(),outp.verb(),ZT_QOS_NO_FLOW,now);
 			outp.armor(peer->key(),true);
 			_path->send(RR,tPtr,outp.data(),outp.size(),RR->node->now());
 		}
 
-		peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid);
+		peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,true,nwid,flowId);
 	} else {
-		peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,false,nwid);
+		peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_EXT_FRAME,0,Packet::VERB_NOP,false,nwid,flowId);
 	}
 
 	return true;
@@ -729,8 +829,10 @@ bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const
 
 bool IncomingPacket::_doECHO(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer)
 {
-	if (!peer->rateGateEchoRequest(RR->node->now()))
+	uint64_t now = RR->node->now();
+	if (!peer->rateGateEchoRequest(now)) {
 		return true;
+	}
 
 	const uint64_t pid = packetId();
 	Packet outp(peer->address(),RR->identity.address(),Packet::VERB_OK);
@@ -738,10 +840,11 @@ bool IncomingPacket::_doECHO(const RuntimeEnvironment *RR,void *tPtr,const Share
 	outp.append((uint64_t)pid);
 	if (size() > ZT_PACKET_IDX_PAYLOAD)
 		outp.append(reinterpret_cast<const unsigned char *>(data()) + ZT_PACKET_IDX_PAYLOAD,size() - ZT_PACKET_IDX_PAYLOAD);
+	peer->recordOutgoingPacket(_path,outp.packetId(),outp.payloadLength(),outp.verb(),ZT_QOS_NO_FLOW,now);
 	outp.armor(peer->key(),true);
 	_path->send(RR,tPtr,outp.data(),outp.size(),RR->node->now());
 
-	peer->received(tPtr,_path,hops(),pid,payloadLength(),Packet::VERB_ECHO,0,Packet::VERB_NOP,false,0);
+	peer->received(tPtr,_path,hops(),pid,payloadLength(),Packet::VERB_ECHO,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -767,7 +870,7 @@ bool IncomingPacket::_doMULTICAST_LIKE(const RuntimeEnvironment *RR,void *tPtr,c
 			RR->mc->add(tPtr,now,nwid,MulticastGroup(MAC(field(ptr + 8,6),6),at<uint32_t>(ptr + 14)),peer->address());
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_LIKE,0,Packet::VERB_NOP,false,0);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_LIKE,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 	return true;
 }
 
@@ -889,7 +992,7 @@ bool IncomingPacket::_doNETWORK_CREDENTIALS(const RuntimeEnvironment *RR,void *t
 		}
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_NETWORK_CREDENTIALS,0,Packet::VERB_NOP,trustEstablished,(network) ? network->id() : 0);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_NETWORK_CREDENTIALS,0,Packet::VERB_NOP,trustEstablished,(network) ? network->id() : 0,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -915,7 +1018,7 @@ bool IncomingPacket::_doNETWORK_CONFIG_REQUEST(const RuntimeEnvironment *RR,void
 		_path->send(RR,tPtr,outp.data(),outp.size(),RR->node->now());
 	}
 
-	peer->received(tPtr,_path,hopCount,requestPacketId,payloadLength(),Packet::VERB_NETWORK_CONFIG_REQUEST,0,Packet::VERB_NOP,false,nwid);
+	peer->received(tPtr,_path,hopCount,requestPacketId,payloadLength(),Packet::VERB_NETWORK_CONFIG_REQUEST,0,Packet::VERB_NOP,false,nwid,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -931,12 +1034,14 @@ bool IncomingPacket::_doNETWORK_CONFIG(const RuntimeEnvironment *RR,void *tPtr,c
 			outp.append((uint64_t)packetId());
 			outp.append((uint64_t)network->id());
 			outp.append((uint64_t)configUpdateId);
+			const int64_t now = RR->node->now();
+			peer->recordOutgoingPacket(_path,outp.packetId(),outp.payloadLength(),outp.verb(),ZT_QOS_NO_FLOW,now);
 			outp.armor(peer->key(),true);
 			_path->send(RR,tPtr,outp.data(),outp.size(),RR->node->now());
 		}
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_NETWORK_CONFIG,0,Packet::VERB_NOP,false,(network) ? network->id() : 0);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_NETWORK_CONFIG,0,Packet::VERB_NOP,false,(network) ? network->id() : 0,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -979,12 +1084,13 @@ bool IncomingPacket::_doMULTICAST_GATHER(const RuntimeEnvironment *RR,void *tPtr
 		outp.append((uint32_t)mg.adi());
 		const unsigned int gatheredLocally = RR->mc->gather(peer->address(),nwid,mg,outp,gatherLimit);
 		if (gatheredLocally > 0) {
+			peer->recordOutgoingPacket(_path,outp.packetId(),outp.payloadLength(),outp.verb(),ZT_QOS_NO_FLOW,now);
 			outp.armor(peer->key(),true);
 			_path->send(RR,tPtr,outp.data(),outp.size(),now);
 		}
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_GATHER,0,Packet::VERB_NOP,trustEstablished,nwid);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_GATHER,0,Packet::VERB_NOP,trustEstablished,nwid,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -1032,19 +1138,19 @@ bool IncomingPacket::_doMULTICAST_FRAME(const RuntimeEnvironment *RR,void *tPtr,
 
 		if (network->config().multicastLimit == 0) {
 			RR->t->incomingNetworkFrameDropped(tPtr,network,_path,packetId(),size(),peer->address(),Packet::VERB_MULTICAST_FRAME,from,to.mac(),"multicast disabled");
-			peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,false,nwid);
+			peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,false,nwid,ZT_QOS_NO_FLOW);
 			return true;
 		}
 
 		if ((frameLen > 0)&&(frameLen <= ZT_MAX_MTU)) {
 			if (!to.mac().isMulticast()) {
 				RR->t->incomingPacketInvalid(tPtr,_path,packetId(),source(),hops(),Packet::VERB_MULTICAST_FRAME,"destination not multicast");
-				peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,true,nwid); // trustEstablished because COM is okay
+				peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,true,nwid,ZT_QOS_NO_FLOW); // trustEstablished because COM is okay
 				return true;
 			}
 			if ((!from)||(from.isMulticast())||(from == network->mac())) {
 				RR->t->incomingPacketInvalid(tPtr,_path,packetId(),source(),hops(),Packet::VERB_MULTICAST_FRAME,"invalid source MAC");
-				peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,true,nwid); // trustEstablished because COM is okay
+				peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,true,nwid,ZT_QOS_NO_FLOW); // trustEstablished because COM is okay
 				return true;
 			}
 
@@ -1058,7 +1164,7 @@ bool IncomingPacket::_doMULTICAST_FRAME(const RuntimeEnvironment *RR,void *tPtr,
 					network->learnBridgeRoute(from,peer->address());
 				} else {
 					RR->t->incomingNetworkFrameDropped(tPtr,network,_path,packetId(),size(),peer->address(),Packet::VERB_MULTICAST_FRAME,from,to.mac(),"bridging not allowed (remote)");
-					peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,true,nwid); // trustEstablished because COM is okay
+					peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,true,nwid,ZT_QOS_NO_FLOW); // trustEstablished because COM is okay
 					return true;
 				}
 			}
@@ -1076,12 +1182,14 @@ bool IncomingPacket::_doMULTICAST_FRAME(const RuntimeEnvironment *RR,void *tPtr,
 			outp.append((uint32_t)to.adi());
 			outp.append((unsigned char)0x02); // flag 0x02 = contains gather results
 			if (RR->mc->gather(peer->address(),nwid,to,outp,gatherLimit)) {
+				const int64_t now = RR->node->now();
+				peer->recordOutgoingPacket(_path,outp.packetId(),outp.payloadLength(),outp.verb(),ZT_QOS_NO_FLOW,now);
 				outp.armor(peer->key(),true);
 				_path->send(RR,tPtr,outp.data(),outp.size(),RR->node->now());
 			}
 		}
 
-		peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,true,nwid);
+		peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_MULTICAST_FRAME,0,Packet::VERB_NOP,true,nwid,ZT_QOS_NO_FLOW);
 	} else {
 		_sendErrorNeedCredentials(RR,tPtr,peer,nwid);
 		return false;
@@ -1094,9 +1202,8 @@ bool IncomingPacket::_doPUSH_DIRECT_PATHS(const RuntimeEnvironment *RR,void *tPt
 {
 	const int64_t now = RR->node->now();
 
-	// First, subject this to a rate limit
 	if (!peer->rateGatePushDirectPaths(now)) {
-		peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_PUSH_DIRECT_PATHS,0,Packet::VERB_NOP,false,0);
+		peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_PUSH_DIRECT_PATHS,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 		return true;
 	}
 
@@ -1108,8 +1215,6 @@ bool IncomingPacket::_doPUSH_DIRECT_PATHS(const RuntimeEnvironment *RR,void *tPt
 	unsigned int ptr = ZT_PACKET_IDX_PAYLOAD + 2;
 
 	while (count--) { // if ptr overflows Buffer will throw
-		// TODO: some flags are not yet implemented
-
 		unsigned int flags = (*this)[ptr++];
 		unsigned int extLen = at<uint16_t>(ptr); ptr += 2;
 		ptr += extLen; // unused right now
@@ -1132,6 +1237,7 @@ bool IncomingPacket::_doPUSH_DIRECT_PATHS(const RuntimeEnvironment *RR,void *tPt
 				}
 			}	break;
 			case 6: {
+
 				const InetAddress a(field(ptr,16),16,at<uint16_t>(ptr + 16));
 				if (
 					((flags & ZT_PUSH_DIRECT_PATHS_FLAG_FORGET_PATH) == 0) && // not being told to forget
@@ -1149,7 +1255,7 @@ bool IncomingPacket::_doPUSH_DIRECT_PATHS(const RuntimeEnvironment *RR,void *tPt
 		ptr += addrLen;
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_PUSH_DIRECT_PATHS,0,Packet::VERB_NOP,false,0);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_PUSH_DIRECT_PATHS,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -1165,7 +1271,7 @@ bool IncomingPacket::_doUSER_MESSAGE(const RuntimeEnvironment *RR,void *tPtr,con
 		RR->node->postEvent(tPtr,ZT_EVENT_USER_MESSAGE,reinterpret_cast<const void *>(&um));
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_USER_MESSAGE,0,Packet::VERB_NOP,false,0);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_USER_MESSAGE,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
 
 	return true;
 }
@@ -1189,8 +1295,26 @@ bool IncomingPacket::_doREMOTE_TRACE(const RuntimeEnvironment *RR,void *tPtr,con
 		}
 	}
 
-	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_REMOTE_TRACE,0,Packet::VERB_NOP,false,0);
+	peer->received(tPtr,_path,hops(),packetId(),payloadLength(),Packet::VERB_REMOTE_TRACE,0,Packet::VERB_NOP,false,0,ZT_QOS_NO_FLOW);
+
+	return true;
+}
 
+bool IncomingPacket::_doPATH_NEGOTIATION_REQUEST(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer)
+{
+	uint64_t now = RR->node->now();
+	SharedPtr<Bond> bond = peer->bond();
+	if (!bond || !bond->rateGatePathNegotiation(now)) {
+		return true;
+	}
+	if (payloadLength() != sizeof(int16_t)) {
+		return true;
+	}
+	int16_t remoteUtility = 0;
+	memcpy(&remoteUtility, payload(), sizeof(int16_t));
+	if (peer->bond()) {
+		peer->bond()->processIncomingPathNegotiationRequest(now, _path, Utils::ntoh(remoteUtility));
+	}
 	return true;
 }
 

+ 6 - 5
node/IncomingPacket.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -100,7 +100,7 @@ public:
 	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
 	 * @return True if decoding and processing is complete, false if caller should try again
 	 */
-	bool tryDecode(const RuntimeEnvironment *RR,void *tPtr);
+	bool tryDecode(const RuntimeEnvironment *RR,void *tPtr,int32_t flowId);
 
 	/**
 	 * @return Time of packet receipt / start of decode
@@ -117,8 +117,8 @@ private:
 	bool _doOK(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
 	bool _doWHOIS(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
 	bool _doRENDEZVOUS(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
-	bool _doFRAME(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
-	bool _doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
+	bool _doFRAME(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer,int32_t flowId);
+	bool _doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer,int32_t flowId);
 	bool _doECHO(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
 	bool _doMULTICAST_LIKE(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
 	bool _doNETWORK_CREDENTIALS(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
@@ -129,6 +129,7 @@ private:
 	bool _doPUSH_DIRECT_PATHS(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
 	bool _doUSER_MESSAGE(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
 	bool _doREMOTE_TRACE(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
+	bool _doPATH_NEGOTIATION_REQUEST(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer);
 
 	void _sendErrorNeedCredentials(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer,const uint64_t nwid);
 

+ 34 - 18
node/Node.cpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -48,6 +48,7 @@ Node::Node(void *uptr,void *tptr,const struct ZT_Node_Callbacks *callbacks,int64
 	_networks(8),
 	_now(now),
 	_lastPingCheck(0),
+	_lastGratuitousPingCheck(0),
 	_lastHousekeepingRun(0),
 	_lastMemoizedTraceSettings(0)
 {
@@ -102,8 +103,9 @@ Node::Node(void *uptr,void *tptr,const struct ZT_Node_Callbacks *callbacks,int64
 		const unsigned long mcs = sizeof(Multicaster) + (((sizeof(Multicaster) & 0xf) != 0) ? (16 - (sizeof(Multicaster) & 0xf)) : 0);
 		const unsigned long topologys = sizeof(Topology) + (((sizeof(Topology) & 0xf) != 0) ? (16 - (sizeof(Topology) & 0xf)) : 0);
 		const unsigned long sas = sizeof(SelfAwareness) + (((sizeof(SelfAwareness) & 0xf) != 0) ? (16 - (sizeof(SelfAwareness) & 0xf)) : 0);
+		const unsigned long bc = sizeof(BondController) + (((sizeof(BondController) & 0xf) != 0) ? (16 - (sizeof(BondController) & 0xf)) : 0);
 
-		m = reinterpret_cast<char *>(::malloc(16 + ts + sws + mcs + topologys + sas));
+		m = reinterpret_cast<char *>(::malloc(16 + ts + sws + mcs + topologys + sas + bc));
 		if (!m)
 			throw std::bad_alloc();
 		RR->rtmem = m;
@@ -118,12 +120,15 @@ Node::Node(void *uptr,void *tptr,const struct ZT_Node_Callbacks *callbacks,int64
 		RR->topology = new (m) Topology(RR,tptr);
 		m += topologys;
 		RR->sa = new (m) SelfAwareness(RR);
+		m += sas;
+		RR->bc = new (m) BondController(RR);
 	} catch ( ... ) {
 		if (RR->sa) RR->sa->~SelfAwareness();
 		if (RR->topology) RR->topology->~Topology();
 		if (RR->mc) RR->mc->~Multicaster();
 		if (RR->sw) RR->sw->~Switch();
 		if (RR->t) RR->t->~Trace();
+		if (RR->bc) RR->bc->~BondController();
 		::free(m);
 		throw;
 	}
@@ -142,6 +147,7 @@ Node::~Node()
 	if (RR->mc) RR->mc->~Multicaster();
 	if (RR->sw) RR->sw->~Switch();
 	if (RR->t) RR->t->~Trace();
+	if (RR->bc) RR->bc->~BondController();
 	::free(RR->rtmem);
 }
 
@@ -246,9 +252,23 @@ ZT_ResultCode Node::processBackgroundTasks(void *tptr,int64_t now,volatile int64
 	_now = now;
 	Mutex::Lock bl(_backgroundTasksLock);
 
+
+	unsigned long bondCheckInterval = ZT_CORE_TIMER_TASK_GRANULARITY;
+	if (RR->bc->inUse()) {
+		// Gratuitously ping active peers so that QoS metrics have enough data to work with (if active path monitoring is enabled)
+		bondCheckInterval = std::min(std::max(RR->bc->minReqPathMonitorInterval(), ZT_CORE_TIMER_TASK_GRANULARITY), ZT_PING_CHECK_INVERVAL);
+		if ((now - _lastGratuitousPingCheck) >= bondCheckInterval) {
+			Hashtable< Address,std::vector<InetAddress> > alwaysContact;
+			_PingPeersThatNeedPing pfunc(RR,tptr,alwaysContact,now);
+			RR->topology->eachPeer<_PingPeersThatNeedPing &>(pfunc);
+			_lastGratuitousPingCheck = now;
+		}
+		RR->bc->processBackgroundTasks(tptr, now);
+	}
+
 	unsigned long timeUntilNextPingCheck = ZT_PING_CHECK_INVERVAL;
 	const int64_t timeSinceLastPingCheck = now - _lastPingCheck;
-	if (timeSinceLastPingCheck >= ZT_PING_CHECK_INVERVAL) {
+	if (timeSinceLastPingCheck >= timeUntilNextPingCheck) {
 		try {
 			_lastPingCheck = now;
 
@@ -354,7 +374,7 @@ ZT_ResultCode Node::processBackgroundTasks(void *tptr,int64_t now,volatile int64
 	}
 
 	try {
-		*nextBackgroundTaskDeadline = now + (int64_t)std::max(std::min(timeUntilNextPingCheck,RR->sw->doTimerTasks(tptr,now)),(unsigned long)ZT_CORE_TIMER_TASK_GRANULARITY);
+		*nextBackgroundTaskDeadline = now + (int64_t)std::max(std::min(bondCheckInterval,std::min(timeUntilNextPingCheck,RR->sw->doTimerTasks(tptr,now))),(unsigned long)ZT_CORE_TIMER_TASK_GRANULARITY);
 	} catch ( ... ) {
 		return ZT_RESULT_FATAL_ERROR_INTERNAL;
 	}
@@ -461,7 +481,7 @@ ZT_PeerList *Node::peers() const
 	for(std::vector< std::pair< Address,SharedPtr<Peer> > >::iterator pi(peers.begin());pi!=peers.end();++pi) {
 		ZT_Peer *p = &(pl->peers[pl->peerCount++]);
 		p->address = pi->second->address().toInt();
-		p->hadAggregateLink = 0;
+		p->isBonded = 0;
 		if (pi->second->remoteVersionKnown()) {
 			p->versionMajor = pi->second->remoteVersionMajor();
 			p->versionMinor = pi->second->remoteVersionMinor();
@@ -478,28 +498,24 @@ ZT_PeerList *Node::peers() const
 
 		std::vector< SharedPtr<Path> > paths(pi->second->paths(_now));
 		SharedPtr<Path> bestp(pi->second->getAppropriatePath(_now,false));
-		p->hadAggregateLink |= pi->second->hasAggregateLink();
 		p->pathCount = 0;
 		for(std::vector< SharedPtr<Path> >::iterator path(paths.begin());path!=paths.end();++path) {
 			memcpy(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage));
+			//memcpy(&(p->paths[p->pathCount].ifname,&((*path)->slave()),32);)
+			p->paths[p->pathCount].localSocket = (*path)->localSocket();
 			p->paths[p->pathCount].lastSend = (*path)->lastOut();
 			p->paths[p->pathCount].lastReceive = (*path)->lastIn();
 			p->paths[p->pathCount].trustedPathId = RR->topology->getOutboundPathTrust((*path)->address());
 			p->paths[p->pathCount].expired = 0;
 			p->paths[p->pathCount].preferred = ((*path) == bestp) ? 1 : 0;
-			p->paths[p->pathCount].latency = (float)(*path)->latency();
-			p->paths[p->pathCount].packetDelayVariance = (*path)->packetDelayVariance();
-			p->paths[p->pathCount].throughputDisturbCoeff = (*path)->throughputDisturbanceCoefficient();
-			p->paths[p->pathCount].packetErrorRatio = (*path)->packetErrorRatio();
-			p->paths[p->pathCount].packetLossRatio = (*path)->packetLossRatio();
-			p->paths[p->pathCount].stability = (*path)->lastComputedStability();
-			p->paths[p->pathCount].throughput = (*path)->meanThroughput();
-			p->paths[p->pathCount].maxThroughput = (*path)->maxLifetimeThroughput();
-			p->paths[p->pathCount].allocation = (float)(*path)->allocation() / (float)255;
-			p->paths[p->pathCount].ifname = (*path)->getName();
-
+			//p->paths[p->pathCount].age = (*path)->age(_now);
+			p->paths[p->pathCount].scope = (*path)->ipScope();
 			++p->pathCount;
 		}
+		if (pi->second->bond()) {
+			p->isBonded = pi->second->bond();
+			p->bondingPolicy = pi->second->bond()->getPolicy();
+		}
 	}
 
 	return pl;

+ 6 - 7
node/Node.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -34,6 +34,7 @@
 #include "Salsa20.hpp"
 #include "NetworkController.hpp"
 #include "Hashtable.hpp"
+#include "BondController.hpp"
 
 // Bit mask for "expecting reply" hash
 #define ZT_EXPECTING_REPLIES_BUCKET_MASK1 255
@@ -186,6 +187,8 @@ public:
 
 	inline const Identity &identity() const { return _RR.identity; }
 
+	inline BondController *bondController() const { return _RR.bc; }
+
 	/**
 	 * Register that we are expecting a reply to a packet ID
 	 *
@@ -247,9 +250,6 @@ public:
 	inline const Address &remoteTraceTarget() const { return _remoteTraceTarget; }
 	inline Trace::Level remoteTraceLevel() const { return _remoteTraceLevel; }
 
-	inline void setMultipathMode(uint8_t mode) { _multipathMode = mode; }
-	inline uint8_t getMultipathMode() { return _multipathMode; }
-
 	inline bool localControllerHasAuthorized(const int64_t now,const uint64_t nwid,const Address &addr) const
 	{
 		_localControllerAuthorizations_m.lock();
@@ -306,10 +306,9 @@ private:
 	Address _remoteTraceTarget;
 	enum Trace::Level _remoteTraceLevel;
 
-	uint8_t _multipathMode;
-
 	volatile int64_t _now;
 	int64_t _lastPingCheck;
+	int64_t _lastGratuitousPingCheck;
 	int64_t _lastHousekeepingRun;
 	int64_t _lastMemoizedTraceSettings;
 	volatile int64_t _prngState[2];

+ 2 - 2
node/Packet.cpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.

+ 34 - 6
node/Packet.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -931,13 +931,13 @@ public:
 		 *
 		 * Upon receipt of this packet, the local peer will verify that the correct
 		 * number of bytes were received by the remote peer. If these values do
-		 * not agree that could be an indicator of packet loss.
+		 * not agree that could be an indication of packet loss.
 		 *
 		 * Additionally, the local peer knows the interval of time that has
 		 * elapsed since the last received ACK. With this information it can compute
 		 * a rough estimate of the current throughput.
 		 *
-		 * This is sent at a maximum rate of once per every ZT_PATH_ACK_INTERVAL
+		 * This is sent at a maximum rate of once every ZT_QOS_ACK_INTERVAL
 		 */
 		VERB_ACK = 0x12,
 
@@ -963,7 +963,8 @@ public:
 		 * measure of the amount of time between when a packet was received and the
 		 * egress time of its tracking QoS packet.
 		 *
-		 * This is sent at a maximum rate of once per every ZT_PATH_QOS_INTERVAL
+		 * This is sent at a maximum rate of once every
+		 * ZT_QOS_MEASUREMENT_INTERVAL
 		 */
 		VERB_QOS_MEASUREMENT = 0x13,
 
@@ -996,7 +997,34 @@ public:
 		 * node on startup. This is helpful in identifying traces from different
 		 * members of a cluster.
 		 */
-		VERB_REMOTE_TRACE = 0x15
+		VERB_REMOTE_TRACE = 0x15,
+
+		/**
+		 * A request to a peer to use a specific path in a multi-path scenario:
+		 * <[2] 16-bit unsigned integer that encodes a path choice utility>
+		 *
+		 * This is sent when a node operating in multipath mode observes that
+		 * its inbound and outbound traffic aren't going over the same path. The
+		 * node will compute its perceived utility for using its chosen outbound
+		 * path and send this to a peer in an attempt to petition it to send
+		 * its traffic over this same path.
+		 *
+		 * Scenarios:
+		 *
+		 * (1) Remote peer utility is GREATER than ours:
+		 *     - Remote peer will refuse the petition and continue using current path
+		 * (2) Remote peer utility is LESS than ours:
+		 *     - Remote peer will accept the petition and switch to our chosen path
+		 * (3) Remote peer utility is EQUAL to our own:
+		 *     - To prevent confusion and flapping, both sides will agree to use the
+		 *       numerical values of their identities to determine which path to use.
+		 *       The peer with the greatest identity will win.
+		 *
+		 * If a node petitions a peer repeatedly with no effect it will regard
+		 * that as a refusal by the remote peer; in that case, if its own utility
+		 * is negligible, it will voluntarily switch to the remote peer's chosen path.
+		 */
+		VERB_PATH_NEGOTIATION_REQUEST = 0x16
 	};
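A minimal sketch of the acceptance rule described for VERB_PATH_NEGOTIATION_REQUEST above, written from the perspective of the peer receiving the petition; the function and parameter names are hypothetical and not the implementation added by this commit:

    #include <cstdint>

    // Sketch: decide whether to adopt the petitioner's chosen path.
    // petitionerUtility is the 16-bit utility carried in the request; the
    // identity comparison breaks ties so both sides converge without flapping.
    static bool acceptPathNegotiationRequest(uint16_t myUtility, uint16_t petitionerUtility,
                                             uint64_t myIdentity, uint64_t petitionerIdentity)
    {
        if (petitionerUtility > myUtility)
            return true;   // the petitioner values its chosen path more than we value ours
        if (petitionerUtility < myUtility)
            return false;  // keep our current path, which amounts to refusing the petition
        return petitionerIdentity > myIdentity; // equal utility: the greater identity wins
    }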
 
 	/**

+ 331 - 328
node/Path.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -26,12 +26,11 @@
 #include "SharedPtr.hpp"
 #include "AtomicCounter.hpp"
 #include "Utils.hpp"
-#include "RingBuffer.hpp"
 #include "Packet.hpp"
+#include "RingBuffer.hpp"
+//#include "Bond.hpp"
 
-#include "../osdep/Phy.hpp"
-
-#include "../include/ZeroTierDebug.h"
+#include "../osdep/Slave.hpp"
 
 /**
  * Maximum return value of preferenceRank()
@@ -48,7 +47,8 @@ class RuntimeEnvironment;
 class Path
 {
 	friend class SharedPtr<Path>;
-	Phy<Path *> *_phy;
+	friend class Bond;
+	//friend class SharedPtr<Bond>;
 
 public:
 	/**
@@ -87,77 +87,113 @@ public:
 		_lastOut(0),
 		_lastIn(0),
 		_lastTrustEstablishedPacketReceived(0),
-		_lastPathQualityComputeTime(0),
 		_localSocket(-1),
 		_latency(0xffff),
 		_addr(),
 		_ipScope(InetAddress::IP_SCOPE_NONE),
-		_lastAck(0),
-		_lastThroughputEstimation(0),
+		_lastAckReceived(0),
+		_lastAckSent(0),
 		_lastQoSMeasurement(0),
-		_lastQoSRecordPurge(0),
+		_lastThroughputEstimation(0),
+		_lastRefractoryUpdate(0),
+		_lastAliveToggle(0),
+		_lastEligibilityState(false),
+		_lastTrialBegin(0),
+		_refractoryPeriod(0),
+		_monitorInterval(0),
+		_upDelay(0),
+		_downDelay(0),
+		_ipvPref(0),
+		_mode(0),
+		_onlyPathOnSlave(false),
+		_enabled(false),
+		_bonded(false),
+		_negotiated(false),
+		_deprecated(false),
+		_shouldReallocateFlows(false),
+		_assignedFlowCount(0),
+		_latencyMean(0),
+		_latencyVariance(0),
+		_packetLossRatio(0),
+		_packetErrorRatio(0),
+		_throughputMean(0),
+		_throughputMax(0),
+		_throughputVariance(0),
+		_allocation(0),
+		_byteLoad(0),
+		_relativeByteLoad(0),
+		_affinity(0),
+		_failoverScore(0),
 		_unackedBytes(0),
-		_expectingAckAsOf(0),
 		_packetsReceivedSinceLastAck(0),
 		_packetsReceivedSinceLastQoS(0),
-		_maxLifetimeThroughput(0),
-		_lastComputedMeanThroughput(0),
 		_bytesAckedSinceLastThroughputEstimation(0),
-		_lastComputedMeanLatency(0.0),
-		_lastComputedPacketDelayVariance(0.0),
-		_lastComputedPacketErrorRatio(0.0),
-		_lastComputedPacketLossRatio(0),
-		_lastComputedStability(0.0),
-		_lastComputedRelativeQuality(0),
-		_lastComputedThroughputDistCoeff(0.0),
-		_lastAllocation(0)
-	{
-		memset(_ifname, 0, 16);
-		memset(_addrString, 0, sizeof(_addrString));
-	}
+		_packetsIn(0),
+		_packetsOut(0),
+		_prevEligibility(false)
+		{}
 
 	Path(const int64_t localSocket,const InetAddress &addr) :
 		_lastOut(0),
 		_lastIn(0),
 		_lastTrustEstablishedPacketReceived(0),
-		_lastPathQualityComputeTime(0),
 		_localSocket(localSocket),
 		_latency(0xffff),
 		_addr(addr),
 		_ipScope(addr.ipScope()),
-		_lastAck(0),
-		_lastThroughputEstimation(0),
+		_lastAckReceived(0),
+		_lastAckSent(0),
 		_lastQoSMeasurement(0),
-		_lastQoSRecordPurge(0),
+		_lastThroughputEstimation(0),
+		_lastRefractoryUpdate(0),
+		_lastAliveToggle(0),
+		_lastEligibilityState(false),
+		_lastTrialBegin(0),
+		_refractoryPeriod(0),
+		_monitorInterval(0),
+		_upDelay(0),
+		_downDelay(0),
+		_ipvPref(0),
+		_mode(0),
+		_onlyPathOnSlave(false),
+		_enabled(false),
+		_bonded(false),
+		_negotiated(false),
+		_deprecated(false),
+		_shouldReallocateFlows(false),
+		_assignedFlowCount(0),
+		_latencyMean(0),
+		_latencyVariance(0),
+		_packetLossRatio(0),
+		_packetErrorRatio(0),
+		_throughputMean(0),
+		_throughputMax(0),
+		_throughputVariance(0),
+		_allocation(0),
+		_byteLoad(0),
+		_relativeByteLoad(0),
+		_affinity(0),
+		_failoverScore(0),
 		_unackedBytes(0),
-		_expectingAckAsOf(0),
 		_packetsReceivedSinceLastAck(0),
 		_packetsReceivedSinceLastQoS(0),
-		_maxLifetimeThroughput(0),
-		_lastComputedMeanThroughput(0),
 		_bytesAckedSinceLastThroughputEstimation(0),
-		_lastComputedMeanLatency(0.0),
-		_lastComputedPacketDelayVariance(0.0),
-		_lastComputedPacketErrorRatio(0.0),
-		_lastComputedPacketLossRatio(0),
-		_lastComputedStability(0.0),
-		_lastComputedRelativeQuality(0),
-		_lastComputedThroughputDistCoeff(0.0),
-		_lastAllocation(0)
-	{
-		memset(_ifname, 0, 16);
-		memset(_addrString, 0, sizeof(_addrString));
-		if (_localSocket != -1) {
-			_phy->getIfName((PhySocket *) ((uintptr_t) _localSocket), _ifname, 16);
-		}
-	}
+		_packetsIn(0),
+		_packetsOut(0),
+		_prevEligibility(false)
+	{}
 
 	/**
 	 * Called when a packet is received from this remote path, regardless of content
 	 *
 	 * @param t Time of receive
 	 */
-	inline void received(const uint64_t t) { _lastIn = t; }
+	inline void received(const uint64_t t) {
+		_lastIn = t;
+		if (!_prevEligibility) {
+			_lastAliveToggle = _lastIn;
+		}
+	}
 
 	/**
 	 * Set time last trusted packet was received (done in Peer::received())
@@ -197,7 +233,6 @@ public:
 		else {
 			_latency = l;
 		}
-		_latencySamples.push(l);
 	}
 
 	/**
@@ -286,407 +321,375 @@ public:
 	}
 
 	/**
-	 * Record statistics on outgoing packets. Used later to estimate QoS metrics.
-	 *
-	 * @param now Current time
-	 * @param packetId ID of packet
-	 * @param payloadLength Length of payload
-	 * @param verb Packet verb
+	 * @param bonded Whether this path is part of a bond.
 	 */
-	inline void recordOutgoingPacket(int64_t now, int64_t packetId, uint16_t payloadLength, Packet::Verb verb)
-	{
-		Mutex::Lock _l(_statistics_m);
-		if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) {
-			if ((packetId & (ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR - 1)) == 0) {
-				_unackedBytes += payloadLength;
-				// Take note that we're expecting a VERB_ACK on this path as of a specific time
-				_expectingAckAsOf = ackAge(now) > ZT_PATH_ACK_INTERVAL ? _expectingAckAsOf : now;
-				if (_outQoSRecords.size() < ZT_PATH_MAX_OUTSTANDING_QOS_RECORDS) {
-					_outQoSRecords[packetId] = now;
-				}
-			}
-		}
-	}
+	inline void setBonded(bool bonded) { _bonded = bonded; }
 
 	/**
-	 * Record statistics on incoming packets. Used later to estimate QoS metrics.
-	 *
-	 * @param now Current time
-	 * @param packetId ID of packet
-	 * @param payloadLength Length of payload
-	 * @param verb Packet verb
+	 * @return True if this path is currently part of a bond.
 	 */
-	inline void recordIncomingPacket(int64_t now, int64_t packetId, uint16_t payloadLength, Packet::Verb verb)
-	{
-		Mutex::Lock _l(_statistics_m);
-		if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) {
-			if ((packetId & (ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR - 1)) == 0) {
-				_inACKRecords[packetId] = payloadLength;
-				_packetsReceivedSinceLastAck++;
-				_inQoSRecords[packetId] = now;
-				_packetsReceivedSinceLastQoS++;
-			}
-			_packetValiditySamples.push(true);
-		}
-	}
+	inline bool bonded() { return _bonded; }
 
 	/**
-	 * Record that we've received a VERB_ACK on this path, also compute throughput if required.
-	 *
-	 * @param now Current time
-	 * @param ackedBytes Number of bytes acknowledged by other peer
+	 * @return True if this path is alive (receiving heartbeats)
 	 */
-	inline void receivedAck(int64_t now, int32_t ackedBytes)
-	{
-		_expectingAckAsOf = 0;
-		_unackedBytes = (ackedBytes > _unackedBytes) ? 0 : _unackedBytes - ackedBytes;
-		int64_t timeSinceThroughputEstimate = (now - _lastThroughputEstimation);
-		if (timeSinceThroughputEstimate >= ZT_PATH_THROUGHPUT_MEASUREMENT_INTERVAL) {
-			uint64_t throughput = (uint64_t)((float)(_bytesAckedSinceLastThroughputEstimation * 8) / ((float)timeSinceThroughputEstimate / (float)1000));
-			_throughputSamples.push(throughput);
-			_maxLifetimeThroughput = throughput > _maxLifetimeThroughput ? throughput : _maxLifetimeThroughput;
-			_lastThroughputEstimation = now;
-			_bytesAckedSinceLastThroughputEstimation = 0;
-		} else {
-			_bytesAckedSinceLastThroughputEstimation += ackedBytes;
-		}
+	inline bool alive(const int64_t now, bool bondingEnabled = false) const {
+		return (bondingEnabled && _monitorInterval) ? ((now - _lastIn) < (_monitorInterval * 3)) : ((now - _lastIn) < (ZT_PATH_HEARTBEAT_PERIOD + 5000));
 	}
 
 	/**
-	 * @return Number of bytes this peer is responsible for ACKing since last ACK
+	 * @return True if this path needs a heartbeat
 	 */
-	inline int32_t bytesToAck()
-	{
-		Mutex::Lock _l(_statistics_m);
-		int32_t bytesToAck = 0;
-		std::map<uint64_t,uint16_t>::iterator it = _inACKRecords.begin();
-		while (it != _inACKRecords.end()) {
-			bytesToAck += it->second;
-			it++;
-		}
-		return bytesToAck;
-	}
+	inline bool needsHeartbeat(const int64_t now) const { return ((now - _lastOut) >= ZT_PATH_HEARTBEAT_PERIOD); }
 
 	/**
-	 * @return Number of bytes thus far sent that have not been acknowledged by the remote peer
+	 * @return True if this path needs a heartbeat in accordance with the user-specified path monitor frequency
 	 */
-	inline int64_t unackedSentBytes()
-	{
-		return _unackedBytes;
-	}
+	inline bool needsGratuitousHeartbeat(const int64_t now) { return allowed() && (_monitorInterval > 0) && ((now - _lastOut) >= _monitorInterval); }
 
 	/**
-	 * Account for the fact that an ACK was just sent. Reset counters, timers, and clear statistics buffers
-	 *
-	 * @param Current time
+	 * @return Last time we sent something
 	 */
-	inline void sentAck(int64_t now)
-	{
-		Mutex::Lock _l(_statistics_m);
-		_inACKRecords.clear();
-		_packetsReceivedSinceLastAck = 0;
-		_lastAck = now;
-	}
+	inline int64_t lastOut() const { return _lastOut; }
 
 	/**
-	 * Receive QoS data, match with recorded egress times from this peer, compute latency
-	 * estimates.
-	 *
-	 * @param now Current time
-	 * @param count Number of records
-	 * @param rx_id table of packet IDs
-	 * @param rx_ts table of holding times
+	 * @return Last time we received anything
 	 */
-	inline void receivedQoS(int64_t now, int count, uint64_t *rx_id, uint16_t *rx_ts)
-	{
-		Mutex::Lock _l(_statistics_m);
-		// Look up egress times and compute latency values for each record
-		std::map<uint64_t,uint64_t>::iterator it;
-		for (int j=0; j<count; j++) {
-			it = _outQoSRecords.find(rx_id[j]);
-			if (it != _outQoSRecords.end()) {
-				uint16_t rtt = (uint16_t)(now - it->second);
-				uint16_t rtt_compensated = rtt - rx_ts[j];
-				uint16_t latency = rtt_compensated / 2;
-				updateLatency(latency, now);
-				_outQoSRecords.erase(it);
+	inline int64_t lastIn() const { return _lastIn; }
+
+	/**
+	 * @return the age of the path in terms of receiving packets
+	 */
+	inline int64_t age(int64_t now) { return (now - _lastIn); }
+		
+	/**
+	 * @return Time last trust-established packet was received
+	 */
+	inline int64_t lastTrustEstablishedPacketReceived() const { return _lastTrustEstablishedPacketReceived; }
+
+	/**
+	 * @return Time since last VERB_ACK was received
+	 */
+	inline int64_t ackAge(int64_t now) { return _lastAckReceived ? now - _lastAckReceived : 0; }
+
+	/**
+	 * Set or update a refractory period for the path.
+	 *
+	 * @param punishment How much a path should be punished
+	 * @param pathFailure Whether this call is the result of a recent path failure
+	 */
+	inline void adjustRefractoryPeriod(int64_t now, uint32_t punishment, bool pathFailure) {
+		if (pathFailure) {
+			unsigned int suggestedRefractoryPeriod = _refractoryPeriod ? punishment + (_refractoryPeriod * 2) : punishment;
+			_refractoryPeriod = std::min(suggestedRefractoryPeriod, (unsigned int)ZT_MULTIPATH_MAX_REFRACTORY_PERIOD);
+			_lastRefractoryUpdate = 0;
+		} else {
+			uint32_t drainRefractory = 0;
+			if (_lastRefractoryUpdate) {
+				drainRefractory = (now - _lastRefractoryUpdate);
+			} else {
+				drainRefractory = (now - _lastAliveToggle);
+			}
+			_lastRefractoryUpdate = now;
+			if (_refractoryPeriod > drainRefractory) {
+				_refractoryPeriod -= drainRefractory;
+			} else {
+				_refractoryPeriod = 0;
+				_lastRefractoryUpdate = 0;
 			}
 		}
 	}
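To illustrate the growth behavior of adjustRefractoryPeriod() above: each consecutive failure adds the punishment to roughly double the previous period until the cap is reached. The punishment and cap values below are assumptions chosen only for this example:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint32_t punishment = 1000; // assumed per-failure punishment (ms)
        const uint32_t cap = 60000;       // stand-in for ZT_MULTIPATH_MAX_REFRACTORY_PERIOD
        uint32_t refractoryPeriod = 0;
        for (int failure = 1; failure <= 8; ++failure) {
            const uint32_t suggested = refractoryPeriod ? punishment + (refractoryPeriod * 2) : punishment;
            refractoryPeriod = std::min(suggested, cap);
            std::printf("failure %d -> refractory period %u ms\n", failure, (unsigned)refractoryPeriod);
        }
        return 0;
    }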
 
 	/**
-	 * Generate the contents of a VERB_QOS_MEASUREMENT packet.
+	 * Determine the current state of eligibility of the path.
 	 *
-	 * @param now Current time
-	 * @param qosBuffer destination buffer
-	 * @return Size of payload
+	 * @param includeRefractoryPeriod Whether current punishment should be taken into consideration
+	 * @return True if this path can be used in a bond at the current time
 	 */
-	inline int32_t generateQoSPacket(int64_t now, char *qosBuffer)
-	{
-		Mutex::Lock _l(_statistics_m);
-		int32_t len = 0;
-		std::map<uint64_t,uint64_t>::iterator it = _inQoSRecords.begin();
-		int i=0;
-		while (i<_packetsReceivedSinceLastQoS && it != _inQoSRecords.end()) {
-			uint64_t id = it->first;
-			memcpy(qosBuffer, &id, sizeof(uint64_t));
-			qosBuffer+=sizeof(uint64_t);
-			uint16_t holdingTime = (uint16_t)(now - it->second);
-			memcpy(qosBuffer, &holdingTime, sizeof(uint16_t));
-			qosBuffer+=sizeof(uint16_t);
-			len+=sizeof(uint64_t)+sizeof(uint16_t);
-			_inQoSRecords.erase(it++);
-			i++;
+	inline bool eligible(uint64_t now, int ackSendInterval, bool includeRefractoryPeriod = false) {
+		if (includeRefractoryPeriod && _refractoryPeriod) {
+			return false;
 		}
-		return len;
+		bool acceptableAge    = age(now) < ((_monitorInterval * 4) + _downDelay); // Simple RX age (driven by packets of any type and gratuitous VERB_HELLOs)
+		bool acceptableAckAge = ackAge(now) < (ackSendInterval); // Whether the remote peer is actually responding to our outgoing traffic or simply sending stuff to us
+		bool notTooEarly      = (now - _lastAliveToggle) >= _upDelay; // Whether we've waited long enough since the link last came online
+		bool inTrial          = (now - _lastTrialBegin) < _upDelay; // Whether this path is still in its trial period
+		bool currEligibility  = allowed() && (((acceptableAge || acceptableAckAge) && notTooEarly) || inTrial);
+		return currEligibility;
 	}
 
 	/**
-	 * Account for the fact that a VERB_QOS_MEASUREMENT was just sent. Reset timers.
-	 *
-	 * @param Current time
+	 * Record when this path first entered the bond. Each path is given a trial period where it is admitted
+	 * to the bond without requiring observations to prove its performance or reliability.
+	 */
+	inline void startTrial(uint64_t now) { _lastTrialBegin = now; }
+
+	/**
+	 * @return True if a path is permitted to be used in a bond (according to user pref.)
+	 */
+	inline bool allowed() {
+		return _enabled
+			&& (!_ipvPref
+				|| ((_addr.isV4() && (_ipvPref == 4 || _ipvPref == 46 || _ipvPref == 64))
+				|| ((_addr.isV6() && (_ipvPref == 6 || _ipvPref == 46 || _ipvPref == 64)))));
+	}
+
+	/**
+	 * @return True if a path is preferred over another on the same physical slave (according to user pref.)
 	 */
-	inline void sentQoS(int64_t now) {
-		_packetsReceivedSinceLastQoS = 0;
-		_lastQoSMeasurement = now;
+	inline bool preferred() {
+		return _onlyPathOnSlave
+			|| (_addr.isV4() && (_ipvPref == 4 || _ipvPref == 46))
+			|| (_addr.isV6() && (_ipvPref == 6 || _ipvPref == 64));
 	}
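The _ipvPref checks in allowed() and preferred() above encode the slave's IP-family preference as a single byte. The interpretation below is inferred from those checks only; it is a sketch, not separate documentation:

    #include <cstdint>
    #include <cstdio>

    // Sketch: inferred meaning of the _ipvPref byte used by allowed()/preferred().
    static const char *describeIpvPref(uint8_t ipvPref)
    {
        switch (ipvPref) {
            case 0:  return "no preference (either family allowed)";
            case 4:  return "IPv4 only";
            case 6:  return "IPv6 only";
            case 46: return "both allowed, IPv4 preferred";
            case 64: return "both allowed, IPv6 preferred";
            default: return "unrecognized encoding";
        }
    }

    int main()
    {
        const uint8_t prefs[] = { 0, 4, 6, 46, 64 };
        for (unsigned int i = 0; i < sizeof(prefs); ++i)
            std::printf("%u -> %s\n", (unsigned)prefs[i], describeIpvPref(prefs[i]));
        return 0;
    }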
 
 	/**
 	 * @param now Current time
 	 * @return Whether an ACK (VERB_ACK) packet needs to be emitted at this time
 	 */
-	inline bool needsToSendAck(int64_t now) {
-		return ((now - _lastAck) >= ZT_PATH_ACK_INTERVAL ||
-			(_packetsReceivedSinceLastAck == ZT_PATH_QOS_TABLE_SIZE)) && _packetsReceivedSinceLastAck;
+	inline bool needsToSendAck(int64_t now, int ackSendInterval) {
+		return ((now - _lastAckSent) >= ackSendInterval ||
+			(_packetsReceivedSinceLastAck == ZT_QOS_TABLE_SIZE)) && _packetsReceivedSinceLastAck;
 	}
 
 	/**
 	 * @param now Current time
 	 * @return Whether a QoS (VERB_QOS_MEASUREMENT) packet needs to be emitted at this time
 	 */
-	inline bool needsToSendQoS(int64_t now) {
-		return ((_packetsReceivedSinceLastQoS >= ZT_PATH_QOS_TABLE_SIZE) ||
-			((now - _lastQoSMeasurement) > ZT_PATH_QOS_INTERVAL)) && _packetsReceivedSinceLastQoS;
+	inline bool needsToSendQoS(int64_t now, int qosSendInterval) {
+		return ((_packetsReceivedSinceLastQoS >= ZT_QOS_TABLE_SIZE) ||
+			((now - _lastQoSMeasurement) > qosSendInterval)) && _packetsReceivedSinceLastQoS;
+	}
+
+	/**
+	 * Reset packet counters
+	 */
+	inline void resetPacketCounts()
+	{
+		_packetsIn = 0;
+		_packetsOut = 0;
 	}
 
+private:
+
+	volatile int64_t _lastOut;
+	volatile int64_t _lastIn;
+	volatile int64_t _lastTrustEstablishedPacketReceived;
+	int64_t _localSocket;
+	volatile unsigned int _latency;
+	InetAddress _addr;
+	InetAddress::IpScope _ipScope; // memoize this since it's a computed value checked often
+	AtomicCounter __refCount;
+
+	std::map<uint64_t,uint64_t> qosStatsOut; // id:egress_time
+	std::map<uint64_t,uint64_t> qosStatsIn; // id:now
+	std::map<uint64_t,uint16_t> ackStatsIn; // id:len
+
+	RingBuffer<int,ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> qosRecordSize;
+	RingBuffer<float,ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> qosRecordLossSamples;
+	RingBuffer<uint64_t,ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> throughputSamples;
+	RingBuffer<bool,ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> packetValiditySamples;
+	RingBuffer<float,ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> _throughputVarianceSamples;
+	RingBuffer<uint16_t,ZT_QOS_SHORTTERM_SAMPLE_WIN_SIZE> latencySamples;
+
 	/**
-	 * How much time has elapsed since we've been expecting a VERB_ACK on this path. This value
-	 * is used to determine a more relevant path "age". This lets us penalize paths which are no
-	 * longer ACKing, but not those that simple aren't being used to carry traffic at the
-	 * current time.
+	 * Last time that a VERB_ACK was received on this path.
 	 */
-	inline int64_t ackAge(int64_t now) { return _expectingAckAsOf ? now - _expectingAckAsOf : 0; }
+	uint64_t _lastAckReceived;
 
 	/**
-	 * The maximum observed throughput (in bits/s) for this path
+	 * Last time that a VERB_ACK was sent out on this path.
 	 */
-	inline uint64_t maxLifetimeThroughput() { return _maxLifetimeThroughput; }
+	uint64_t _lastAckSent;
 
 	/**
-	 * @return The mean throughput (in bits/s) of this link
+	 * Last time that a VERB_QOS_MEASUREMENT was sent out on this path.
 	 */
-	inline uint64_t meanThroughput() { return _lastComputedMeanThroughput; }
+	uint64_t _lastQoSMeasurement;
 
 	/**
-	 * Assign a new relative quality value for this path in the aggregate link
-	 *
-	 * @param rq Quality of this path in comparison to other paths available to this peer
+	 * Last time that the path's throughput was estimated.
 	 */
-	inline void updateRelativeQuality(float rq) { _lastComputedRelativeQuality = rq; }
+	uint64_t _lastThroughputEstimation;
 
 	/**
-	 * @return Quality of this path compared to others in the aggregate link
+	 * The last time that the refractory period was updated.
 	 */
-	inline float relativeQuality() { return _lastComputedRelativeQuality; }
+	uint64_t _lastRefractoryUpdate;
 
 	/**
-	 * Assign a new allocation value for this path in the aggregate link
-	 *
-	 * @param allocation Percentage of traffic to be sent over this path to a peer
+	 * The last time that the path was marked as "alive".
 	 */
-	inline void updateComponentAllocationOfAggregateLink(unsigned char allocation) { _lastAllocation = allocation; }
+	uint64_t _lastAliveToggle;
 
 	/**
-	 * @return Percentage of traffic allocated to this path in the aggregate link
+	 * State of eligibility at last check. Used for determining state changes.
 	 */
-	inline unsigned char allocation() { return _lastAllocation; }
+	bool _lastEligibilityState;
 
 	/**
-	 * @return Stability estimates can become expensive to compute, we cache the most recent result.
+	 * Timestamp indicating when this path's trial period began.
 	 */
-	inline float lastComputedStability() { return _lastComputedStability; }
+	uint64_t _lastTrialBegin;
 
 	/**
-	 * @return A pointer to a cached copy of the human-readable name of the interface this Path's localSocket is bound to
+	 * Amount of time that this path is prevented from becoming a member of a bond.
 	 */
-	inline char *getName() { return _ifname; }
+	uint32_t _refractoryPeriod;
 
 	/**
-	 * @return Packet delay variance
+	 * Monitor interval specific to this path or that was inherited from the bond controller.
 	 */
-	inline float packetDelayVariance() { return _lastComputedPacketDelayVariance; }
+	int32_t _monitorInterval;
 
 	/**
-	 * @return Previously-computed mean latency
+	 * Up delay interval specific to this path or that was inherited from the bond controller.
 	 */
-	inline float meanLatency() { return _lastComputedMeanLatency; }
+	uint32_t _upDelay;
 
 	/**
-	 * @return Packet loss rate (PLR)
+	 * Down delay interval specific to this path or that was inherited from the bond controller.
 	 */
-	inline float packetLossRatio() { return _lastComputedPacketLossRatio; }
+	uint32_t _downDelay;
 
 	/**
-	 * @return Packet error ratio (PER)
+	 * IP version preference inherited from the physical slave.
 	 */
-	inline float packetErrorRatio() { return _lastComputedPacketErrorRatio; }
+	uint8_t _ipvPref;
 
 	/**
-	 * Record an invalid incoming packet. This packet failed MAC/compression/cipher checks and will now
-	 * contribute to a Packet Error Ratio (PER).
+	 * Mode inherited from the physical slave.
 	 */
-	inline void recordInvalidPacket() { _packetValiditySamples.push(false); }
+	uint8_t _mode;
 
 	/**
-	 * @return A pointer to a cached copy of the address string for this Path (For debugging only)
+	 * Whether this is the only path on its physical slave.
 	 */
-	inline char *getAddressString() { return _addrString; }
+	bool _onlyPathOnSlave;
 
 	/**
-	 * @return The current throughput disturbance coefficient
+	 * Enabled state inherited from the physical slave.
 	 */
-	inline float throughputDisturbanceCoefficient() { return _lastComputedThroughputDistCoeff; }
+	bool _enabled;
 
 	/**
-	 * Compute and cache stability and performance metrics. The resultant stability coefficient is a measure of how "well behaved"
-	 * this path is. This figure is substantially different from (but required for the estimation of the path's overall "quality".
-	 *
-	 * @param now Current time
+	 * Whether this path is currently part of a bond.
 	 */
-	inline void processBackgroundPathMeasurements(const int64_t now)
-	{
-		if (now - _lastPathQualityComputeTime > ZT_PATH_QUALITY_COMPUTE_INTERVAL) {
-			Mutex::Lock _l(_statistics_m);
-			_lastPathQualityComputeTime = now;
-			address().toString(_addrString);
-			_lastComputedMeanLatency = _latencySamples.mean();
-			_lastComputedPacketDelayVariance = _latencySamples.stddev(); // Similar to "jitter" (SEE: RFC 3393, RFC 4689)
-			_lastComputedMeanThroughput = (uint64_t)_throughputSamples.mean();
-
-			// If no packet validity samples, assume PER==0
-			_lastComputedPacketErrorRatio = 1 - (_packetValiditySamples.count() ? _packetValiditySamples.mean() : 1);
-
-			// Compute path stability
-			// Normalize measurements with wildly different ranges into a reasonable range
-			float normalized_pdv = Utils::normalize(_lastComputedPacketDelayVariance, 0, ZT_PATH_MAX_PDV, 0, 10);
-			float normalized_la = Utils::normalize(_lastComputedMeanLatency, 0, ZT_PATH_MAX_MEAN_LATENCY, 0, 10);
-			float throughput_cv = _throughputSamples.mean() > 0 ? _throughputSamples.stddev() / _throughputSamples.mean() : 1;
-
-			// Form an exponential cutoff and apply contribution weights
-			float pdv_contrib = expf((-1.0f)*normalized_pdv) * (float)ZT_PATH_CONTRIB_PDV;
-			float latency_contrib = expf((-1.0f)*normalized_la) * (float)ZT_PATH_CONTRIB_LATENCY;
-
-			// Throughput Disturbance Coefficient
-			float throughput_disturbance_contrib = expf((-1.0f)*throughput_cv) * (float)ZT_PATH_CONTRIB_THROUGHPUT_DISTURBANCE;
-			_throughputDisturbanceSamples.push(throughput_cv);
-			_lastComputedThroughputDistCoeff = _throughputDisturbanceSamples.mean();
-
-			// Obey user-defined ignored contributions
-			pdv_contrib = ZT_PATH_CONTRIB_PDV > 0.0 ? pdv_contrib : 1;
-			latency_contrib = ZT_PATH_CONTRIB_LATENCY > 0.0 ? latency_contrib : 1;
-			throughput_disturbance_contrib = ZT_PATH_CONTRIB_THROUGHPUT_DISTURBANCE > 0.0 ? throughput_disturbance_contrib : 1;
-
-			// Stability
-			_lastComputedStability = pdv_contrib + latency_contrib + throughput_disturbance_contrib;
-			_lastComputedStability *= 1 - _lastComputedPacketErrorRatio;
-
-			// Prevent QoS records from sticking around for too long
-			std::map<uint64_t,uint64_t>::iterator it = _outQoSRecords.begin();
-			while (it != _outQoSRecords.end()) {
-				// Time since egress of tracked packet
-				if ((now - it->second) >= ZT_PATH_QOS_TIMEOUT) {
-					_outQoSRecords.erase(it++);
-				} else { it++; }
-			}
-		}
-	}
+	bool _bonded;
 
 	/**
-	 * @return True if this path is alive (receiving heartbeats)
+	 * Whether this path was intentionally negotiated by either peer.
 	 */
-	inline bool alive(const int64_t now) const { return ((now - _lastIn) < (ZT_PATH_HEARTBEAT_PERIOD + 5000)); }
+	bool _negotiated;
 
 	/**
-	 * @return True if this path needs a heartbeat
+	 * Whether this path has been deprecated due to performance issues. Current traffic flows
+	 * will be re-allocated to other paths in the most non-disruptive manner (if possible),
+	 * and new traffic will not be allocated to this path.
 	 */
-	inline bool needsHeartbeat(const int64_t now) const { return ((now - _lastOut) >= ZT_PATH_HEARTBEAT_PERIOD); }
+	bool _deprecated;
 
 	/**
-	 * @return Last time we sent something
+	 * Whether flows should be moved from this path. Current traffic flows will be re-allocated
+	 * immediately.
 	 */
-	inline int64_t lastOut() const { return _lastOut; }
+	bool _shouldReallocateFlows;
 
 	/**
-	 * @return Last time we received anything
+	 * The number of flows currently assigned to this path.
 	 */
-	inline int64_t lastIn() const { return _lastIn; }
+	uint16_t _assignedFlowCount;
 
 	/**
-	 * @return Time last trust-established packet was received
+	 * The mean latency (computed from a sliding window.)
 	 */
-	inline int64_t lastTrustEstablishedPacketReceived() const { return _lastTrustEstablishedPacketReceived; }
+	float _latencyMean;
 
-private:
-	Mutex _statistics_m;
+	/**
+	 * Packet delay variance (computed from a sliding window.)
+	 */
+	float _latencyVariance;
 
-	volatile int64_t _lastOut;
-	volatile int64_t _lastIn;
-	volatile int64_t _lastTrustEstablishedPacketReceived;
-	volatile int64_t _lastPathQualityComputeTime;
-	int64_t _localSocket;
-	volatile unsigned int _latency;
-	InetAddress _addr;
-	InetAddress::IpScope _ipScope; // memoize this since it's a computed value checked often
-	AtomicCounter __refCount;
+	/**
+	 * The ratio of lost packets to received packets.
+	 */
+	float _packetLossRatio;
+
+	/**
+	 * The ratio of packets that failed their MAC/CRC checks to those that did not.
+	 */
+	float _packetErrorRatio;
+
+	/**
+	 * The estimated mean throughput of this path.
+	 */
+	uint64_t _throughputMean;
+
+	/**
+	 * The maximum observed throughput of this path.
+	 */
+	uint64_t _throughputMax;
+
+	/**
+	 * The variance in the estimated throughput of this path.
+	 */
+	float _throughputVariance;
+	
+	/**
+	 * The relative quality of this path to all others in the bond, [0-255].
+	 */
+	uint8_t _allocation;
+
+	/**
+	 * How much load this path is under.
+	 */
+	uint64_t _byteLoad;
 
-	std::map<uint64_t,uint64_t> _outQoSRecords; // id:egress_time
-	std::map<uint64_t,uint64_t> _inQoSRecords; // id:now
-	std::map<uint64_t,uint16_t> _inACKRecords; // id:len
+	/**
+	 * How much load this path is under (relative to other paths in the bond.)
+	 */
+	uint8_t _relativeByteLoad;
+
+	/**
+	 * Relative value expressing how "deserving" this path is of new traffic.
+	 */
+	uint8_t _affinity;
 
-	int64_t _lastAck;
-	int64_t _lastThroughputEstimation;
-	int64_t _lastQoSMeasurement;
-	int64_t _lastQoSRecordPurge;
+	/**
+	 * Score that indicates to what degree this path is preferred over others that
+	 * are available to the bonding policy. (specifically for active-backup)
+	 */
+	uint32_t _failoverScore;
 
+	/**
+	 * Number of bytes thus far sent that have not been acknowledged by the remote peer.
+	 */
 	int64_t _unackedBytes;
-	int64_t _expectingAckAsOf;
-	int16_t _packetsReceivedSinceLastAck;
-	int16_t _packetsReceivedSinceLastQoS;
 
-	uint64_t _maxLifetimeThroughput;
-	uint64_t _lastComputedMeanThroughput;
-	uint64_t _bytesAckedSinceLastThroughputEstimation;
+	/**
+	 * Number of packets received since the last VERB_ACK was sent to the remote peer.
+	 */
+	int32_t _packetsReceivedSinceLastAck;
 
-	float _lastComputedMeanLatency;
-	float _lastComputedPacketDelayVariance;
+	/**
+	 * Number of packets received since the last VERB_QOS_MEASUREMENT was sent to the remote peer.
+	 */
+	int32_t _packetsReceivedSinceLastQoS;
 
-	float _lastComputedPacketErrorRatio;
-	float _lastComputedPacketLossRatio;
+	/**
+	 * Bytes acknowledged via incoming VERB_ACK since the last estimation of throughput.
+	 */
+	uint64_t _bytesAckedSinceLastThroughputEstimation;
 
-	// cached estimates
-	float _lastComputedStability;
-	float _lastComputedRelativeQuality;
-	float _lastComputedThroughputDistCoeff;
-	unsigned char _lastAllocation;
+	/**
+	 * Counters used for tracking path load.
+	 */
+	int _packetsIn;
+	int _packetsOut;
 
-	// cached human-readable strings for tracing purposes
-	char _ifname[16];
-	char _addrString[256];
+	// TODO: Remove
 
-	RingBuffer<uint64_t,ZT_PATH_QUALITY_METRIC_WIN_SZ> _throughputSamples;
-	RingBuffer<uint32_t,ZT_PATH_QUALITY_METRIC_WIN_SZ> _latencySamples;
-	RingBuffer<bool,ZT_PATH_QUALITY_METRIC_WIN_SZ> _packetValiditySamples;
-	RingBuffer<float,ZT_PATH_QUALITY_METRIC_WIN_SZ> _throughputDisturbanceSamples;
+	bool _prevEligibility;
 };
 
 } // namespace ZeroTier

+ 121 - 657
node/Peer.cpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -14,7 +14,6 @@
 #include "../version.h"
 #include "Constants.hpp"
 #include "Peer.hpp"
-#include "Node.hpp"
 #include "Switch.hpp"
 #include "Network.hpp"
 #include "SelfAwareness.hpp"
@@ -24,8 +23,6 @@
 #include "RingBuffer.hpp"
 #include "Utils.hpp"
 
-#include "../include/ZeroTierDebug.h"
-
 namespace ZeroTier {
 
 static unsigned char s_freeRandomByteCounter = 0;
@@ -37,20 +34,14 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
 	_lastTriedMemorizedPath(0),
 	_lastDirectPathPushSent(0),
 	_lastDirectPathPushReceive(0),
+	_lastEchoRequestReceived(0),
 	_lastCredentialRequestSent(0),
 	_lastWhoisRequestReceived(0),
-	_lastEchoRequestReceived(0),
 	_lastCredentialsReceived(0),
 	_lastTrustEstablishedPacketReceived(0),
 	_lastSentFullHello(0),
-	_lastACKWindowReset(0),
-	_lastQoSWindowReset(0),
-	_lastMultipathCompatibilityCheck(0),
+	_lastEchoCheck(0),
 	_freeRandomByte((unsigned char)((uintptr_t)this >> 4) ^ ++s_freeRandomByteCounter),
-	_uniqueAlivePathCount(0),
-	_localMultipathSupported(false),
-	_remoteMultipathSupported(false),
-	_canUseMultipath(false),
 	_vProto(0),
 	_vMajor(0),
 	_vMinor(0),
@@ -58,17 +49,17 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
 	_id(peerIdentity),
 	_directPathPushCutoffCount(0),
 	_credentialsCutoffCount(0),
-	_linkIsBalanced(false),
-	_linkIsRedundant(false),
-	_remotePeerMultipathEnabled(false),
-	_lastAggregateStatsReport(0),
-	_lastAggregateAllocation(0),
-	_virtualPathCount(0),
-	_roundRobinPathAssignmentIdx(0),
-	_pathAssignmentIdx(0)
+	_echoRequestCutoffCount(0),
+	_uniqueAlivePathCount(0),
+	_localMultipathSupported(false),
+	_remoteMultipathSupported(false),
+	_canUseMultipath(false),
+	_shouldCollectPathStatistics(0),
+	_lastComputedAggregateMeanLatency(0)
 {
-	if (!myIdentity.agree(peerIdentity,_key,ZT_PEER_SECRET_KEY_LENGTH))
+	if (!myIdentity.agree(peerIdentity,_key,ZT_PEER_SECRET_KEY_LENGTH)) {
 		throw ZT_EXCEPTION_INVALID_ARGUMENT;
+	}
 }
 
 void Peer::received(
@@ -81,7 +72,8 @@ void Peer::received(
 	const uint64_t inRePacketId,
 	const Packet::Verb inReVerb,
 	const bool trustEstablished,
-	const uint64_t networkId)
+	const uint64_t networkId,
+	const int32_t flowId)
 {
 	const int64_t now = RR->node->now();
 
@@ -98,28 +90,13 @@ void Peer::received(
 			break;
 	}
 
+	recordIncomingPacket(tPtr, path, packetId, payloadLength, verb, flowId, now);
+
 	if (trustEstablished) {
 		_lastTrustEstablishedPacketReceived = now;
 		path->trustedPacketReceived(now);
 	}
 
-	{
-		Mutex::Lock _l(_paths_m);
-
-		recordIncomingPacket(tPtr, path, packetId, payloadLength, verb, now);
-
-		if (_canUseMultipath) {
-			if (path->needsToSendQoS(now)) {
-				sendQOS_MEASUREMENT(tPtr, path, path->localSocket(), path->address(), now);
-			}
-			for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-				if (_paths[i].p) {
-					_paths[i].p->processBackgroundPathMeasurements(now);
-				}
-			}
-		}
-	}
-
 	if (hops == 0) {
 		// If this is a direct packet (no hops), update existing paths or learn new ones
 		bool havePath = false;
@@ -137,60 +114,45 @@ void Peer::received(
 		}
 
 		bool attemptToContact = false;
+
+		int replaceIdx = ZT_MAX_PEER_NETWORK_PATHS;
 		if ((!havePath)&&(RR->node->shouldUsePathForZeroTierTraffic(tPtr,_id.address(),path->localSocket(),path->address()))) {
 			Mutex::Lock _l(_paths_m);
-
-			// Paths are redundant if they duplicate an alive path to the same IP or
-			// with the same local socket and address family.
-			bool redundant = false;
-			unsigned int replacePath = ZT_MAX_PEER_NETWORK_PATHS;
 			for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 				if (_paths[i].p) {
-					if ( (_paths[i].p->alive(now)) && ( ((_paths[i].p->localSocket() == path->localSocket())&&(_paths[i].p->address().ss_family == path->address().ss_family)) || (_paths[i].p->address().ipsEqual2(path->address())) ) ) {
-						redundant = true;
-						break;
-					}
-					// If the path is the same address and port, simply assume this is a replacement
-					if ( (_paths[i].p->address().ipsEqual2(path->address()))) {
-						replacePath = i;
-						break;
+					// match addr
+					if ( (_paths[i].p->alive(now)) && ( ((_paths[i].p->localSocket() == path->localSocket())&&(_paths[i].p->address().ss_family == path->address().ss_family)) && (_paths[i].p->address().ipsEqual2(path->address())) ) ) {
+						// port
+						if (_paths[i].p->address().port() == path->address().port()) {
+							replaceIdx = i;
+							break;
+						}
 					}
-				} else break;
+				}
 			}
-
-			// If the path isn't a duplicate of the same localSocket AND we haven't already determined a replacePath,
-			// then find the worst path and replace it.
-			if (!redundant && replacePath == ZT_MAX_PEER_NETWORK_PATHS) {
-				int replacePathQuality = 0;
+			if (replaceIdx == ZT_MAX_PEER_NETWORK_PATHS) {
 				for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-					if (_paths[i].p) {
-						const int q = _paths[i].p->quality(now);
-						if (q > replacePathQuality) {
-							replacePathQuality = q;
-							replacePath = i;
-						}
-					} else {
-						replacePath = i;
+					if (!_paths[i].p) {
+						replaceIdx = i;
 						break;
 					}
 				}
 			}
-
-			if (replacePath != ZT_MAX_PEER_NETWORK_PATHS) {
+			if (replaceIdx != ZT_MAX_PEER_NETWORK_PATHS) {
 				if (verb == Packet::VERB_OK) {
 					RR->t->peerLearnedNewPath(tPtr,networkId,*this,path,packetId);
-					_paths[replacePath].lr = now;
-					_paths[replacePath].p = path;
-					_paths[replacePath].priority = 1;
+					performMultipathStateCheck(now);
+					if (_bondToPeer) {
+						_bondToPeer->nominatePath(path, now);
+					}
+					_paths[replaceIdx].lr = now;
+					_paths[replaceIdx].p = path;
+					_paths[replaceIdx].priority = 1;
 				} else {
 					attemptToContact = true;
 				}
-
-				// Every time we learn of new path, rebuild set of virtual paths
-				constructSetOfVirtualPaths();
 			}
 		}
-
 		if (attemptToContact) {
 			attemptToContactAt(tPtr,path->localSocket(),path->address(),now,true);
 			path->sent(now);
@@ -203,8 +165,7 @@ void Peer::received(
 	// is done less frequently.
 	if (this->trustEstablished(now)) {
 		const int64_t sinceLastPush = now - _lastDirectPathPushSent;
-		if (sinceLastPush >= ((hops == 0) ? ZT_DIRECT_PATH_PUSH_INTERVAL_HAVEPATH : ZT_DIRECT_PATH_PUSH_INTERVAL)
-			|| (_localMultipathSupported && (sinceLastPush >= (ZT_DIRECT_PATH_PUSH_INTERVAL_MULTIPATH)))) {
+		if (sinceLastPush >= ((hops == 0) ? ZT_DIRECT_PATH_PUSH_INTERVAL_HAVEPATH : ZT_DIRECT_PATH_PUSH_INTERVAL)) {
 			_lastDirectPathPushSent = now;
 			std::vector<InetAddress> pathsToPush(RR->node->directPaths());
 			if (pathsToPush.size() > 0) {
@@ -249,189 +210,15 @@ void Peer::received(
 	}
 }
 
-void Peer::constructSetOfVirtualPaths()
-{
-	if (!_remoteMultipathSupported) {
-		return;
-	}
-	Mutex::Lock _l(_virtual_paths_m);
-
-	int64_t now = RR->node->now();
-	_virtualPathCount = 0;
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p && _paths[i].p->alive(now)) {
-			for(unsigned int j=0;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
-				if (_paths[j].p && _paths[j].p->alive(now)) {
-					int64_t localSocket = _paths[j].p->localSocket();
-					bool foundVirtualPath = false;
-					for (int k=0; k<_virtualPaths.size(); k++) {
-						if (_virtualPaths[k]->localSocket == localSocket && _virtualPaths[k]->p == _paths[i].p) {
-							foundVirtualPath = true;
-						}
-					}
-					if (!foundVirtualPath)
-					{
-						VirtualPath *np = new VirtualPath;
-						np->p = _paths[i].p;
-						np->localSocket = localSocket;
-						_virtualPaths.push_back(np);
-					}
-				}
-			}
-		}
-	}
-}
-
-void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
-	uint16_t payloadLength, const Packet::Verb verb, int64_t now)
-{
-	_freeRandomByte += (unsigned char)(packetId >> 8); // grab entropy to use in path selection logic for multipath
-	if (_canUseMultipath) {
-		path->recordOutgoingPacket(now, packetId, payloadLength, verb);
-	}
-}
-
-void Peer::recordIncomingPacket(void *tPtr, const SharedPtr<Path> &path, const uint64_t packetId,
-	uint16_t payloadLength, const Packet::Verb verb, int64_t now)
-{
-	if (_canUseMultipath) {
-		if (path->needsToSendAck(now)) {
-			sendACK(tPtr, path, path->localSocket(), path->address(), now);
-		}
-		path->recordIncomingPacket(now, packetId, payloadLength, verb);
-	}
-}
-
-void Peer::computeAggregateAllocation(int64_t now)
-{
-	float maxStability = 0;
-	float totalRelativeQuality = 0;
-	float maxThroughput = 1;
-	float maxScope = 0;
-	float relStability[ZT_MAX_PEER_NETWORK_PATHS];
-	float relThroughput[ZT_MAX_PEER_NETWORK_PATHS];
-	memset(&relStability, 0, sizeof(relStability));
-	memset(&relThroughput, 0, sizeof(relThroughput));
-	// Survey all paths
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p) {
-			relStability[i] = _paths[i].p->lastComputedStability();
-			relThroughput[i] = (float)_paths[i].p->maxLifetimeThroughput();
-			maxStability = relStability[i] > maxStability ? relStability[i] : maxStability;
-			maxThroughput = relThroughput[i] > maxThroughput ? relThroughput[i] : maxThroughput;
-			maxScope = _paths[i].p->ipScope() > maxScope ? _paths[i].p->ipScope() : maxScope;
-		}
-	}
-	// Convert to relative values
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p) {
-			relStability[i] /= maxStability ? maxStability : 1;
-			relThroughput[i] /= maxThroughput ? maxThroughput : 1;
-			float normalized_ma = Utils::normalize((float)_paths[i].p->ackAge(now), 0, ZT_PATH_MAX_AGE, 0, 10);
-			float age_contrib = exp((-1)*normalized_ma);
-			float relScope = ((float)(_paths[i].p->ipScope()+1) / (maxScope + 1));
-			float relQuality =
-				(relStability[i] * (float)ZT_PATH_CONTRIB_STABILITY)
-				+ (fmaxf(1.0f, relThroughput[i]) * (float)ZT_PATH_CONTRIB_THROUGHPUT)
-				+ relScope * (float)ZT_PATH_CONTRIB_SCOPE;
-			relQuality *= age_contrib;
-			// Clamp values
-			relQuality = relQuality > (1.00f / 100.0f) ? relQuality : 0.0f;
-			relQuality = relQuality < (99.0f / 100.0f) ? relQuality : 1.0f;
-			totalRelativeQuality += relQuality;
-			_paths[i].p->updateRelativeQuality(relQuality);
-		}
-	}
-	// Convert set of relative performances into an allocation set
-	for(uint16_t i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p) {
-			if (RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_RANDOM) {
-				_paths[i].p->updateComponentAllocationOfAggregateLink(((float)_pathChoiceHist.countValue(i) / (float)_pathChoiceHist.count()) * 255);
-			}
-			if (RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_DYNAMIC_OPAQUE) {
-				_paths[i].p->updateComponentAllocationOfAggregateLink((unsigned char)((_paths[i].p->relativeQuality() / totalRelativeQuality) * 255));
-			}
-		}
-	}
-}
-
-int Peer::computeAggregateLinkPacketDelayVariance()
-{
-	float pdv = 0.0;
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p) {
-			pdv += _paths[i].p->relativeQuality() * _paths[i].p->packetDelayVariance();
-		}
-	}
-	return (int)pdv;
-}
-
-int Peer::computeAggregateLinkMeanLatency()
-{
-	int ml = 0;
-	int pathCount = 0;
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p) {
-			pathCount++;
-			ml += (int)(_paths[i].p->relativeQuality() * _paths[i].p->meanLatency());
-		}
-	}
-	return ml / pathCount;
-}
-
-int Peer::aggregateLinkPhysicalPathCount()
+SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired, int32_t flowId)
 {
-	std::map<std::string, bool> ifnamemap;
-	int pathCount = 0;
-	int64_t now = RR->node->now();
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p && _paths[i].p->alive(now)) {
-			if (!ifnamemap[_paths[i].p->getName()]) {
-				ifnamemap[_paths[i].p->getName()] = true;
-				pathCount++;
-			}
-		}
-	}
-	return pathCount;
-}
-
-int Peer::aggregateLinkLogicalPathCount()
-{
-	int pathCount = 0;
-	int64_t now = RR->node->now();
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p && _paths[i].p->alive(now)) {
-			pathCount++;
-		}
-	}
-	return pathCount;
-}
-
-std::vector<SharedPtr<Path> > Peer::getAllPaths(int64_t now)
-{
-	Mutex::Lock _l(_virtual_paths_m); // FIXME: TX can now lock RX
-	std::vector<SharedPtr<Path> > paths;
-	for (int i=0; i<_virtualPaths.size(); i++) {
-		if (_virtualPaths[i]->p) {
-			paths.push_back(_virtualPaths[i]->p);
-		}
-	}
-	return paths;
-}
-
-SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired, int64_t flowId)
-{
-	Mutex::Lock _l(_paths_m);
-	SharedPtr<Path> selectedPath;
-	char curPathStr[128];
-	char newPathStr[128];
-	unsigned int bestPath = ZT_MAX_PEER_NETWORK_PATHS;
-
-	/**
-	 * Send traffic across the highest quality path only. This algorithm will still
-	 * use the old path quality metric from protocol version 9.
-	 */
-	if (!_canUseMultipath) {
+	if (!_bondToPeer) {
+		Mutex::Lock _l(_paths_m);
+		unsigned int bestPath = ZT_MAX_PEER_NETWORK_PATHS;
+		/**
+		 * Send traffic across the highest quality path only. This algorithm will still
+		 * use the old path quality metric from protocol version 9.
+		 */
 		long bestPathQuality = 2147483647;
 		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 			if (_paths[i].p) {
@@ -449,293 +236,7 @@ SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired, int64
 		}
 		return SharedPtr<Path>();
 	}
-
-	// Update path measurements
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p) {
-			_paths[i].p->processBackgroundPathMeasurements(now);
-		}
-	}
-	if (RR->sw->isFlowAware()) {
-		// Detect new flows and update existing records
-		if (_flows.count(flowId)) {
-			_flows[flowId]->lastSend = now;
-		}
-		else {
-			fprintf(stderr, "new flow %llx detected between this node and %llx (%lu active flow(s))\n",
-				flowId, this->_id.address().toInt(), (_flows.size()+1));
-			struct Flow *newFlow = new Flow(flowId, now);
-			_flows[flowId] = newFlow;
-			newFlow->assignedPath = nullptr;
-		}
-	}
-	// Construct set of virtual paths if needed
-	if (!_virtualPaths.size()) {
-		constructSetOfVirtualPaths();
-	}
-	if (!_virtualPaths.size()) {
-		fprintf(stderr, "no paths to send packet out on\n");
-		return SharedPtr<Path>();
-	}
-
-	/**
-	 * All traffic is sent on all paths.
-	 */
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_BROADCAST) {
-		// Not handled here. Handled in Switch::_trySend()
-	}
-
-	/**
-	 * Only one link is active. Fail-over is immediate.
-	 */
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_ACTIVE_BACKUP) {
-		bool bFoundHotPath = false;
-		if (!_activeBackupPath) {
-			/* Select the fist path that appears to still be active.
-			* This will eventually be user-configurable */
-			for (int i=0; i<ZT_MAX_PEER_NETWORK_PATHS; i++) {
-				if (_paths[i].p) {
-					if (_activeBackupPath.ptr() == _paths[i].p.ptr()) {
-						continue;
-					}
-					_activeBackupPath = _paths[i].p;
-					if ((now - _paths[i].p->lastIn()) < ZT_MULTIPATH_ACTIVE_BACKUP_RAPID_FAILOVER_PERIOD) {
-						bFoundHotPath = true;
-						_activeBackupPath = _paths[i].p;
-						_pathAssignmentIdx = i;
-						_activeBackupPath->address().toString(curPathStr);
-						fprintf(stderr, "selected %s as the primary active-backup path to %llx (idx=%d)\n",
-							curPathStr, this->_id.address().toInt(), _pathAssignmentIdx);
-						break;
-					}
-				}
-			}
-		}
-		else {
-			char what[128];
-			if ((now - _activeBackupPath->lastIn()) > ZT_MULTIPATH_ACTIVE_BACKUP_RAPID_FAILOVER_PERIOD) {
-				_activeBackupPath->address().toString(curPathStr); // Record path string for later debug trace
-				int16_t previousIdx = _pathAssignmentIdx;
-				SharedPtr<Path> nextAlternativePath;
-				// Search for a hot path, at the same time find the next path in
-				// a RR sequence that seems viable to use as an alternative
-				int searchCount = 0;
-				while (searchCount < ZT_MAX_PEER_NETWORK_PATHS) {
-					_pathAssignmentIdx++;
-					if (_pathAssignmentIdx == ZT_MAX_PEER_NETWORK_PATHS) {
-						_pathAssignmentIdx = 0;
-					}
-					searchCount++;
-					if (_paths[_pathAssignmentIdx].p) {
-						_paths[_pathAssignmentIdx].p->address().toString(what);
-						if (_activeBackupPath.ptr() == _paths[_pathAssignmentIdx].p.ptr()) {
-							continue;
-						}
-						if (!nextAlternativePath) { // Record the first viable alternative in the RR sequence
-							nextAlternativePath = _paths[_pathAssignmentIdx].p;
-						}
-						if ((now - _paths[_pathAssignmentIdx].p->lastIn()) < ZT_MULTIPATH_ACTIVE_BACKUP_RAPID_FAILOVER_PERIOD) {
-							bFoundHotPath = true;
-							_activeBackupPath = _paths[_pathAssignmentIdx].p;
-							_activeBackupPath->address().toString(newPathStr);
-							fprintf(stderr, "primary active-backup path %s to %llx appears to be dead, switched to %s\n",
-								curPathStr, this->_id.address().toInt(), newPathStr);
-							break;
-						}
-					}
-				}
-				if (!bFoundHotPath) {
-					if (nextAlternativePath) {
-						_activeBackupPath = nextAlternativePath;
-						_activeBackupPath->address().toString(curPathStr);
-						//fprintf(stderr, "no hot paths found to use as active-backup primary to %llx, using next best: %s\n",
-						//	this->_id.address().toInt(), curPathStr);
-					}
-					else {
-						// No change
-					}
-				}
-			}
-		}
-		if (!_activeBackupPath) {
-			return SharedPtr<Path>();
-		}
-		return _activeBackupPath;
-	}
-
-	/**
-	 * Traffic is randomly distributed among all active paths.
-	 */
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_RANDOM) {
-		int sz = _virtualPaths.size();
-		if (sz) {
-			int idx = _freeRandomByte % sz;
-			_pathChoiceHist.push(idx);
-			_virtualPaths[idx]->p->address().toString(curPathStr);
-			fprintf(stderr, "sending out: (%llx), idx=%d: path=%s, localSocket=%lld\n",
-				this->_id.address().toInt(), idx, curPathStr, _virtualPaths[idx]->localSocket);
-			return _virtualPaths[idx]->p;
-		}
-		// This call is algorithmically inert but gives us a value to show in the status output
-		computeAggregateAllocation(now);
-	}
-
-	/**
-	 * Packets are striped across all available paths.
-	 */
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_RR_OPAQUE) {
-		int16_t previousIdx = _roundRobinPathAssignmentIdx;
-		int cycleCount = 0;
-		int minLastIn = 0;
-		int bestAlternativeIdx = -1;
-		while (cycleCount < ZT_MAX_PEER_NETWORK_PATHS) {
-			if (_roundRobinPathAssignmentIdx < (_virtualPaths.size()-1)) {
-				_roundRobinPathAssignmentIdx++;
-			}
-			else {
-				_roundRobinPathAssignmentIdx = 0;
-			}
-			cycleCount++;
-			if (_virtualPaths[_roundRobinPathAssignmentIdx]->p) {
-				uint64_t lastIn = _virtualPaths[_roundRobinPathAssignmentIdx]->p->lastIn();
-				if (bestAlternativeIdx == -1) {
-					minLastIn = lastIn; // Initialization
-					bestAlternativeIdx = 0;
-				}
-				if (lastIn < minLastIn) {
-					minLastIn = lastIn;
-					bestAlternativeIdx = _roundRobinPathAssignmentIdx;
-				}
-				if ((now - lastIn) < 5000) {
-					selectedPath = _virtualPaths[_roundRobinPathAssignmentIdx]->p;
-				}
-			}
-		}
-		// If we can't find an appropriate path, try the most recently active one
-		if (!selectedPath) {
-			_roundRobinPathAssignmentIdx = bestAlternativeIdx;
-			selectedPath = _virtualPaths[bestAlternativeIdx]->p;
-			selectedPath->address().toString(curPathStr);
-			fprintf(stderr, "could not find good path, settling for next best %s\n",curPathStr);
-		}
-		selectedPath->address().toString(curPathStr);
-		fprintf(stderr, "sending packet out on path %s at index %d\n",
-			curPathStr, _roundRobinPathAssignmentIdx);
-		return selectedPath;
-	}
-
-	/**
-	 * Flows are striped across all available paths.
-	 */
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_RR_FLOW) {
-		// fprintf(stderr, "ZT_MULTIPATH_BALANCE_RR_FLOW\n");
-	}
-
-	/**
-	 * Flows are hashed across all available paths.
-	 */
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_XOR_FLOW) {
-		// fprintf(stderr, "ZT_MULTIPATH_BALANCE_XOR_FLOW (%llx) \n", flowId);
-		struct Flow *currFlow = NULL;
-		if (_flows.count(flowId)) {
-			currFlow = _flows[flowId];
-			if (!currFlow->assignedPath) {
-				int idx = abs((int)(currFlow->flowId % (_virtualPaths.size()-1)));
-				currFlow->assignedPath = _virtualPaths[idx];
-				_virtualPaths[idx]->p->address().toString(curPathStr);
-				fprintf(stderr, "assigning flow %llx between this node and peer %llx to path %s at index %d\n",
-					currFlow->flowId, this->_id.address().toInt(), curPathStr, idx);
-			}
-			else {
-				if (!currFlow->assignedPath->p->alive(now)) {
-					currFlow->assignedPath->p->address().toString(curPathStr);
-					// Re-assign
-					int idx = abs((int)(currFlow->flowId % (_virtualPaths.size()-1)));
-					currFlow->assignedPath = _virtualPaths[idx];
-					_virtualPaths[idx]->p->address().toString(newPathStr);
-					fprintf(stderr, "path %s assigned to flow %llx between this node and %llx appears to be dead, reassigning to path %s\n",
-						curPathStr, currFlow->flowId, this->_id.address().toInt(), newPathStr);
-				}
-			}
-			return currFlow->assignedPath->p;
-		}
-	}
-
-	/**
-	 * Proportionally allocate traffic according to dynamic path quality measurements.
-	 */
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_DYNAMIC_OPAQUE) {
-		if ((now - _lastAggregateAllocation) >= ZT_PATH_QUALITY_COMPUTE_INTERVAL) {
-			_lastAggregateAllocation = now;
-			computeAggregateAllocation(now);
-		}
-		// Randomly choose path according to their allocations
-		float rf = _freeRandomByte;
-		for(int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-			if (_paths[i].p) {
-				if (rf < _paths[i].p->allocation()) {
-					bestPath = i;
-					_pathChoiceHist.push(bestPath); // Record which path we chose
-					break;
-				}
-				rf -= _paths[i].p->allocation();
-			}
-		}
-		if (bestPath < ZT_MAX_PEER_NETWORK_PATHS) {
-			return _paths[bestPath].p;
-		}
-	}
-
-	/**
-	 * Flows are dynamically allocated across paths in proportion to link strength and load.
-	 */
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_DYNAMIC_FLOW) {
-	}
-
-	return SharedPtr<Path>();
-}
-
-char *Peer::interfaceListStr()
-{
-	std::map<std::string, int> ifnamemap;
-	char tmp[32];
-	const int64_t now = RR->node->now();
-	char *ptr = _interfaceListStr;
-	bool imbalanced = false;
-	memset(_interfaceListStr, 0, sizeof(_interfaceListStr));
-	int alivePathCount = aggregateLinkLogicalPathCount();
-	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-		if (_paths[i].p && _paths[i].p->alive(now)) {
-			int ipv = _paths[i].p->address().isV4();
-			// If this is acting as an aggregate link, check allocations
-			float targetAllocation = 1.0f / (float)alivePathCount;
-			float currentAllocation = 1.0f;
-			if (alivePathCount > 1) {
-				currentAllocation = (float)_pathChoiceHist.countValue(i) / (float)_pathChoiceHist.count();
-				if (fabs(targetAllocation - currentAllocation) > ZT_PATH_IMBALANCE_THRESHOLD) {
-					imbalanced = true;
-				}
-			}
-			char *ipvStr = ipv ? (char*)"ipv4" : (char*)"ipv6";
-			sprintf(tmp, "(%s, %s, %.3f)", _paths[i].p->getName(), ipvStr, currentAllocation);
-			// Prevent duplicates
-			if(ifnamemap[_paths[i].p->getName()] != ipv) {
-				memcpy(ptr, tmp, strlen(tmp));
-				ptr += strlen(tmp);
-				*ptr = ' ';
-				ptr++;
-				ifnamemap[_paths[i].p->getName()] = ipv;
-			}
-		}
-	}
-	ptr--; // Overwrite trailing space
-	if (imbalanced) {
-		sprintf(tmp, ", is asymmetrical");
-		memcpy(ptr, tmp, sizeof(tmp));
-	} else {
-		*ptr = '\0';
-	}
-	return _interfaceListStr;
+	return _bondToPeer->getAppropriatePath(now, flowId);
 }
 
 void Peer::introduce(void *const tPtr,const int64_t now,const SharedPtr<Peer> &other) const
@@ -859,87 +360,6 @@ void Peer::introduce(void *const tPtr,const int64_t now,const SharedPtr<Peer> &o
 	}
 }
 
-inline void Peer::processBackgroundPeerTasks(const int64_t now)
-{
-	// Determine current multipath compatibility with other peer
-	if ((now - _lastMultipathCompatibilityCheck) >= ZT_PATH_QUALITY_COMPUTE_INTERVAL) {
-		//
-		// Cache number of available paths so that we can short-circuit multipath logic elsewhere
-		//
-		// We also take notice of duplicate paths (same IP only) because we may have
-		// recently received a direct path push from a peer and our list might contain
-		// a dead path which hasn't been fully recognized as such. In this case we
-		// don't want the duplicate to trigger execution of multipath code prematurely.
-		//
-		// This is done to support the behavior of auto multipath enable/disable
-		// without user intervention.
-		//
-		int currAlivePathCount = 0;
-		int duplicatePathsFound = 0;
-		for (unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-			if (_paths[i].p) {
-				currAlivePathCount++;
-				for (unsigned int j=0;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
-					if (_paths[i].p && _paths[j].p && _paths[i].p->address().ipsEqual2(_paths[j].p->address()) && i != j) {
-						duplicatePathsFound+=1;
-						break;
-					}
-				}
-			}
-		}
-		_uniqueAlivePathCount = (currAlivePathCount - (duplicatePathsFound / 2));
-		_lastMultipathCompatibilityCheck = now;
-		_localMultipathSupported = ((RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) && (ZT_PROTO_VERSION > 9));
-		_remoteMultipathSupported = _vProto > 9;
-		// If both peers support multipath and more than one path exist, we can use multipath logic
-		_canUseMultipath = _localMultipathSupported && _remoteMultipathSupported && (_uniqueAlivePathCount > 1);
-	}
-
-	// Remove old flows
-	if (RR->sw->isFlowAware()) {
-		std::map<int64_t, struct Flow *>::iterator it = _flows.begin();
-		while (it != _flows.end()) {
-			if ((now - it->second->lastSend) > ZT_MULTIPATH_FLOW_EXPIRATION) {
-				fprintf(stderr, "forgetting flow %llx between this node and %llx (%lu active flow(s))\n",
-					it->first, this->_id.address().toInt(), _flows.size());
-				it = _flows.erase(it);
-			} else {
-				it++;
-			}
-		}
-	}
-}
-
-void Peer::sendACK(void *tPtr,const SharedPtr<Path> &path,const int64_t localSocket,const InetAddress &atAddress,int64_t now)
-{
-	Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ACK);
-	uint32_t bytesToAck = path->bytesToAck();
-	outp.append<uint32_t>(bytesToAck);
-	if (atAddress) {
-		outp.armor(_key,false);
-		RR->node->putPacket(tPtr,localSocket,atAddress,outp.data(),outp.size());
-	} else {
-		RR->sw->send(tPtr,outp,false);
-	}
-	path->sentAck(now);
-}
-
-void Peer::sendQOS_MEASUREMENT(void *tPtr,const SharedPtr<Path> &path,const int64_t localSocket,const InetAddress &atAddress,int64_t now)
-{
-	const int64_t _now = RR->node->now();
-	Packet outp(_id.address(),RR->identity.address(),Packet::VERB_QOS_MEASUREMENT);
-	char qosData[ZT_PATH_MAX_QOS_PACKET_SZ];
-	int16_t len = path->generateQoSPacket(_now,qosData);
-	outp.append(qosData,len);
-	if (atAddress) {
-		outp.armor(_key,false);
-		RR->node->putPacket(tPtr,localSocket,atAddress,outp.data(),outp.size());
-	} else {
-		RR->sw->send(tPtr,outp,false);
-	}
-	path->sentQoS(now);
-}
-
 void Peer::sendHELLO(void *tPtr,const int64_t localSocket,const InetAddress &atAddress,int64_t now)
 {
 	Packet outp(_id.address(),RR->identity.address(),Packet::VERB_HELLO);
@@ -1005,29 +425,57 @@ void Peer::tryMemorizedPath(void *tPtr,int64_t now)
 	}
 }
 
+void Peer::performMultipathStateCheck(int64_t now)
+{
+	/**
+	 * Check for conditions required for multipath bonding and create a bond
+	 * if allowed.
+	 */
+	_localMultipathSupported = ((RR->bc->inUse()) && (ZT_PROTO_VERSION > 9));
+	if (_localMultipathSupported) {
+		int currAlivePathCount = 0;
+		int duplicatePathsFound = 0;
+		for (unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p) {
+				currAlivePathCount++;
+				for (unsigned int j=0;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
+					if (_paths[i].p && _paths[j].p && _paths[i].p->address().ipsEqual2(_paths[j].p->address()) && i != j) {
+						duplicatePathsFound+=1;
+						break;
+					}
+				}
+			}
+		}
+		_uniqueAlivePathCount = (currAlivePathCount - (duplicatePathsFound / 2));
+		_remoteMultipathSupported = _vProto > 9;
+		_canUseMultipath = _localMultipathSupported && _remoteMultipathSupported && (_uniqueAlivePathCount > 1);
+	}
+	if (_canUseMultipath && !_bondToPeer) {
+		if (RR->bc) {
+			_bondToPeer = RR->bc->createTransportTriggeredBond(RR, this);
+			/**
+			 * Allow new bond to retroactively learn all paths known to this peer
+			 */
+			if (_bondToPeer) {
+				for (unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+					if (_paths[i].p) {
+						_bondToPeer->nominatePath(_paths[i].p, now);
+					}
+				}
+			}
+		}
+	}
+}
+
 unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
 {
 	unsigned int sent = 0;
 	Mutex::Lock _l(_paths_m);
 
-	processBackgroundPeerTasks(now);
+	performMultipathStateCheck(now);
 
-	// Emit traces regarding aggregate link status
-	if (_canUseMultipath) {
-		int alivePathCount = aggregateLinkPhysicalPathCount();
-		if ((now - _lastAggregateStatsReport) > ZT_PATH_AGGREGATE_STATS_REPORT_INTERVAL) {
-			_lastAggregateStatsReport = now;
-			if (alivePathCount) {
-				RR->t->peerLinkAggregateStatistics(NULL,*this);
-			}
-		} if (alivePathCount < 2 && _linkIsRedundant) {
-			_linkIsRedundant = !_linkIsRedundant;
-			RR->t->peerLinkNoLongerAggregate(NULL,*this);
-		} if (alivePathCount > 1 && !_linkIsRedundant) {
-			_linkIsRedundant = !_linkIsRedundant;
-			RR->t->peerLinkNoLongerAggregate(NULL,*this);
-		}
-	}
+	const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD);
+	_lastSentFullHello = now;
 
 	// Right now we only keep pinging links that have the maximum priority. The
 	// priority is used to track cluster redirections, meaning that when a cluster
@@ -1040,15 +488,13 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
 		else break;
 	}
 
-	const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD);
-	_lastSentFullHello = now;
-
 	unsigned int j = 0;
 	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 		if (_paths[i].p) {
 			// Clean expired and reduced priority paths
 			if ( ((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION) && (_paths[i].priority == maxPriority) ) {
-				if ((sendFullHello)||(_paths[i].p->needsHeartbeat(now))) {
+				if ((sendFullHello)||(_paths[i].p->needsHeartbeat(now))
+					|| (_canUseMultipath && _paths[i].p->needsGratuitousHeartbeat(now))) {
 					attemptToContactAt(tPtr,_paths[i].p->localSocket(),_paths[i].p->address(),now,sendFullHello);
 					_paths[i].p->sent(now);
 					sent |= (_paths[i].p->address().ss_family == AF_INET) ? 0x1 : 0x2;
@@ -1059,14 +505,6 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
 			}
 		} else break;
 	}
-	if (canUseMultipath()) {
-		while(j < ZT_MAX_PEER_NETWORK_PATHS) {
-			_paths[j].lr = 0;
-			_paths[j].p.zero();
-			_paths[j].priority = 1;
-			++j;
-		}
-	}
 	return sent;
 }
 
@@ -1133,4 +571,30 @@ void Peer::resetWithinScope(void *tPtr,InetAddress::IpScope scope,int inetAddres
 	}
 }
 
+void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
+	uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
+{
+	if (!_shouldCollectPathStatistics || !_bondToPeer) {
+		return;
+	}
+	_bondToPeer->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now);
+}
+
+void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path)
+{
+	if (!_shouldCollectPathStatistics || !_bondToPeer) {
+		return;
+	}
+	_bondToPeer->recordIncomingInvalidPacket(path);
+}
+
+void Peer::recordIncomingPacket(void *tPtr, const SharedPtr<Path> &path, const uint64_t packetId,
+	uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
+{
+	if (!_shouldCollectPathStatistics || !_bondToPeer) {
+		return;
+	}
+	_bondToPeer->recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now);
+}
+
 } // namespace ZeroTier
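
Note on the path-counting heuristic in performMultipathStateCheck() above: each duplicate address pair is encountered once from each of its two members in the inner loop, so halving the number of hits recovers the count of distinct alive addresses (exact as long as an address appears at most twice). A minimal standalone sketch, not part of the patch:

```cpp
// Illustrative only: unique-alive-path counting as done in Peer::performMultipathStateCheck().
#include <cstdio>
#include <string>

int main()
{
	const std::string addrs[] = { "10.0.0.1", "192.168.1.10", "10.0.0.1" }; // two distinct addresses
	const int n = 3;
	int alive = 0, duplicateHits = 0;
	for (int i = 0; i < n; ++i) {
		++alive;
		for (int j = 0; j < n; ++j) {
			if (i != j && addrs[i] == addrs[j]) { ++duplicateHits; break; }
		}
	}
	// Each duplicate pair is seen from both of its members, hence the division by two.
	std::printf("unique alive paths: %d\n", alive - (duplicateHits / 2)); // prints 2
	return 0;
}
```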

+ 84 - 211
node/Peer.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -15,8 +15,6 @@
 #define ZT_PEER_HPP
 
 #include <vector>
-#include <map>
-#include <queue>
 
 #include "../include/ZeroTierOne.h"
 
@@ -33,6 +31,8 @@
 #include "AtomicCounter.hpp"
 #include "Hashtable.hpp"
 #include "Mutex.hpp"
+#include "Bond.hpp"
+#include "BondController.hpp"
 
 #define ZT_PEER_MAX_SERIALIZED_STATE_SIZE (sizeof(Peer) + 32 + (sizeof(Path) * 2))
 
@@ -44,6 +44,9 @@ namespace ZeroTier {
 class Peer
 {
 	friend class SharedPtr<Peer>;
+	friend class SharedPtr<Bond>;
+	friend class Switch;
+	friend class Bond;
 
 private:
 	Peer() {} // disabled to prevent bugs -- should not be constructed uninitialized
@@ -97,7 +100,8 @@ public:
 		const uint64_t inRePacketId,
 		const Packet::Verb inReVerb,
 		const bool trustEstablished,
-		const uint64_t networkId);
+		const uint64_t networkId,
+		const int32_t flowId);
 
 	/**
 	 * Check whether we have an active path to this peer via the given address
@@ -136,94 +140,49 @@ public:
 		return false;
 	}
 
-	void constructSetOfVirtualPaths();
-
 	/**
-	 * Record statistics on outgoing packets
+	 * Record statistics on incoming packets
 	 *
-	 * @param path Path over which packet was sent
-	 * @param id Packet ID
-	 * @param len Length of packet payload
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 * @param path Path over which packet was received
+	 * @param packetId Packet ID
+	 * @param payloadLength Length of packet data payload
 	 * @param verb Packet verb
+	 * @param flowId Flow ID
 	 * @param now Current time
 	 */
-	void recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId, uint16_t payloadLength, const Packet::Verb verb, int64_t now);
+	void recordIncomingPacket(void *tPtr, const SharedPtr<Path> &path, const uint64_t packetId,
+		uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now);
 
 	/**
-	 * Record statistics on incoming packets
+	 * Record statistics on outgoing packets
 	 *
-	 * @param path Path over which packet was sent
-	 * @param id Packet ID
-	 * @param len Length of packet payload
+	 * @param path Path over which packet is being sent
+	 * @param packetId Packet ID
+	 * @param payloadLength Length of packet data payload
 	 * @param verb Packet verb
+	 * @param flowId Flow ID
 	 * @param now Current time
 	 */
-	void recordIncomingPacket(void *tPtr, const SharedPtr<Path> &path, const uint64_t packetId, uint16_t payloadLength, const Packet::Verb verb, int64_t now);
+	void recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
+		uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now);
 
 	/**
-	 * Send an ACK to peer for the most recent packets received
+	 * Record an invalid incoming packet. This packet failed
+	 * MAC/compression/cipher checks and will now contribute to a
+	 * Packet Error Ratio (PER).
 	 *
-	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
-	 * @param localSocket Raw socket the ACK packet will be sent over
-	 * @param atAddress Destination for the ACK packet
-	 * @param now Current time
-	 */
-	void sendACK(void *tPtr, const SharedPtr<Path> &path, const int64_t localSocket,const InetAddress &atAddress,int64_t now);
-
-	/**
-	 * Send a QoS packet to peer so that it can evaluate the quality of this link
-	 *
-	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
-	 * @param localSocket Raw socket the QoS packet will be sent over
-	 * @param atAddress Destination for the QoS packet
-	 * @param now Current time
-	 */
-	void sendQOS_MEASUREMENT(void *tPtr, const SharedPtr<Path> &path, const int64_t localSocket,const InetAddress &atAddress,int64_t now);
-
-	/**
-	 * Compute relative quality values and allocations for the components of the aggregate link
-	 *
-	 * @param now Current time
-	 */
-	void computeAggregateAllocation(int64_t now);
-
-	/**
-	 * @return The aggregate link Packet Delay Variance (PDV)
-	 */
-	int computeAggregateLinkPacketDelayVariance();
-
-	/**
-	 * @return The aggregate link mean latency
-	 */
-	int computeAggregateLinkMeanLatency();
-
-	/**
-	 * @return The number of currently alive "physical" paths in the aggregate link
-	 */
-	int aggregateLinkPhysicalPathCount();
-
-	/**
-	 * @return The number of currently alive "logical" paths in the aggregate link
+	 * @param path Path over which packet was received
 	 */
-	int aggregateLinkLogicalPathCount();
-
-	std::vector<SharedPtr<Path>> getAllPaths(int64_t now);
+	void recordIncomingInvalidPacket(const SharedPtr<Path>& path);
 
 	/**
 	 * Get the most appropriate direct path based on current multipath and QoS configuration
 	 *
 	 * @param now Current time
-	 * @param flowId Session-specific protocol flow identifier used for path allocation
 	 * @param includeExpired If true, include even expired paths
+	 * @param flowId Flow ID used by the bonding layer for path assignment (-1 = no flow)
 	 * @return Best current path or NULL if none
 	 */
-	SharedPtr<Path> getAppropriatePath(int64_t now, bool includeExpired, int64_t flowId = -1);
-
-	/**
-	 * Generate a human-readable string of interface names making up the aggregate link, also include
-	 * moving allocation and IP version number for each (for tracing)
-	 */
-	char *interfaceListStr();
+	SharedPtr<Path> getAppropriatePath(int64_t now, bool includeExpired, int32_t flowId = -1);
 
 	/**
 	 * Send VERB_RENDEZVOUS to this and another peer via the best common IP scope and path
@@ -265,6 +224,13 @@ public:
 	 */
 	void tryMemorizedPath(void *tPtr,int64_t now);
 
+	/**
+	 * A periodic check that determines whether multipath communication is possible with
+	 * this peer and, if so, creates a bond. It should be performed early in the peer's
+	 * life-cycle as well as whenever new paths are learned.
+	 *
+	 * @param now Current time
+	 */
+	void performMultipathStateCheck(int64_t now);
+
 	/**
 	 * Send pings or keepalives depending on configured timeouts
 	 *
@@ -277,16 +243,6 @@ public:
 	 */
 	unsigned int doPingAndKeepalive(void *tPtr,int64_t now);
 
-	/**
-	 * Clear paths whose localSocket(s) are in a CLOSED state or have an otherwise INVALID state.
-	 * This should be called frequently so that we can detect and remove unproductive or invalid paths.
-	 *
-	 * Under the hood this is done periodically based on ZT_CLOSED_PATH_PRUNING_INTERVAL.
-	 *
-	 * @return Number of paths that were pruned this round
-	 */
-	unsigned int prunePaths();
-
 	/**
 	 * Process a cluster redirect sent by this peer
 	 *
@@ -348,7 +304,7 @@ public:
 	inline unsigned int latency(const int64_t now)
 	{
 		if (_canUseMultipath) {
-			return (int)computeAggregateLinkMeanLatency();
+			return (int)_lastComputedAggregateMeanLatency;
 		} else {
 			SharedPtr<Path> bp(getAppropriatePath(now,false));
 			if (bp)
@@ -407,37 +363,6 @@ public:
 
 	inline bool remoteVersionKnown() const { return ((_vMajor > 0)||(_vMinor > 0)||(_vRevision > 0)); }
 
-	/**
-	 * Periodically update known multipath activation constraints. This is done so that we know when and when
-	 * not to use multipath logic. Doing this once every few seconds is sufficient.
-	 *
-	 * @param now Current time
-	 */
-	inline void processBackgroundPeerTasks(const int64_t now);
-
-	/**
-	 * Record that the remote peer does have multipath enabled. As is evident by the receipt of a VERB_ACK
-	 * or a VERB_QOS_MEASUREMENT packet at some point in the past. Until this flag is set, the local client
-	 * shall assume that multipath is not enabled and should only use classical Protocol 9 logic.
-	 */
-	inline void inferRemoteMultipathEnabled() { _remotePeerMultipathEnabled = true; }
-
-	/**
-	 * @return Whether the local client supports and is configured to use multipath
-	 */
-	inline bool localMultipathSupport() { return _localMultipathSupported; }
-
-	/**
-	 * @return Whether the remote peer supports and is configured to use multipath
-	 */
-	inline bool remoteMultipathSupport() { return _remoteMultipathSupported; }
-
-	/**
-	 * @return Whether this client can use multipath to communicate with this peer. True if both peers are using
-	 * the correct protocol and if both peers have multipath enabled. False if otherwise.
-	 */
-	inline bool canUseMultipath() { return _canUseMultipath; }
-
 	/**
 	 * @return True if peer has received a trust established packet (e.g. common network membership) in the past ZT_TRUST_EXPIRATION ms
 	 */
@@ -492,50 +417,35 @@ public:
 	}
 
 	/**
-	 * Rate limit gate for inbound ECHO requests
+	 * Rate limit gate for inbound ECHO requests. This rate limiter works by
+	 * draining a certain number of requests per unit time; each peer may
+	 * theoretically receive up to ZT_ECHO_CUTOFF_LIMIT requests per second.
+	 * Note that the drain logic below is currently commented out, so all
+	 * requests are admitted until it is reworked.
 	 */
 	inline bool rateGateEchoRequest(const int64_t now)
 	{
-		if ((now - _lastEchoRequestReceived) >= ZT_PEER_GENERAL_RATE_LIMIT) {
-			_lastEchoRequestReceived = now;
-			return true;
-		}
-		return false;
-	}
-
-	/**
-	 * Rate limit gate for VERB_ACK
-	 */
-	inline bool rateGateACK(const int64_t now)
-	{
-		if ((now - _lastACKWindowReset) >= ZT_PATH_QOS_ACK_CUTOFF_TIME) {
-			_lastACKWindowReset = now;
-			_ACKCutoffCount = 0;
-		} else {
-			++_ACKCutoffCount;
-		}
-		return (_ACKCutoffCount < ZT_PATH_QOS_ACK_CUTOFF_LIMIT);
-	}
-
-	/**
-	 * Rate limit gate for VERB_QOS_MEASUREMENT
-	 */
-	inline bool rateGateQoS(const int64_t now)
-	{
-		if ((now - _lastQoSWindowReset) >= ZT_PATH_QOS_ACK_CUTOFF_TIME) {
-			_lastQoSWindowReset = now;
-			_QoSCutoffCount = 0;
+		/*
+		// TODO: Rethink this
+		if (_canUseMultipath) {
+			_echoRequestCutoffCount++;
+			int numToDrain = (now - _lastEchoCheck) / ZT_ECHO_DRAINAGE_DIVISOR;
+			_lastEchoCheck = now;
+			fprintf(stderr, "ZT_ECHO_CUTOFF_LIMIT=%d, (now - _lastEchoCheck)=%d, numToDrain=%d, ZT_ECHO_DRAINAGE_DIVISOR=%d\n", ZT_ECHO_CUTOFF_LIMIT, (now - _lastEchoCheck), numToDrain, ZT_ECHO_DRAINAGE_DIVISOR);
+			if (_echoRequestCutoffCount > numToDrain) {
+				_echoRequestCutoffCount-=numToDrain;
+			}
+			else {
+				_echoRequestCutoffCount = 0;
+			}
+			return (_echoRequestCutoffCount < ZT_ECHO_CUTOFF_LIMIT);
 		} else {
-			++_QoSCutoffCount;
+			if ((now - _lastEchoRequestReceived) >= (ZT_PEER_GENERAL_RATE_LIMIT)) {
+				_lastEchoRequestReceived = now;
+				return true;
+			}
+			return false;
 		}
-		return (_QoSCutoffCount < ZT_PATH_QOS_ACK_CUTOFF_LIMIT);
-	}
-
-	/**
-	 * @return Whether this peer is reachable via an aggregate link
-	 */
-	inline bool hasAggregateLink() {
-		return _localMultipathSupported && _remoteMultipathSupported && _remotePeerMultipathEnabled;
+		*/
+		return true;
 	}
 
 	/**
@@ -610,6 +520,18 @@ public:
 		}
 	}
 
+	/**
+	 * @return The bond (if any) currently associated with this peer
+	 */
+	SharedPtr<Bond> bond() { return _bondToPeer; }
+
+	/**
+	 * @return The bonding policy currently in effect for this peer
+	 */
+	inline int8_t bondingPolicy() { return _bondingPolicy; }
+
 private:
 	struct _PeerPath
 	{
@@ -628,25 +550,16 @@ private:
 	int64_t _lastTriedMemorizedPath;
 	int64_t _lastDirectPathPushSent;
 	int64_t _lastDirectPathPushReceive;
+	int64_t _lastEchoRequestReceived;
 	int64_t _lastCredentialRequestSent;
 	int64_t _lastWhoisRequestReceived;
-	int64_t _lastEchoRequestReceived;
 	int64_t _lastCredentialsReceived;
 	int64_t _lastTrustEstablishedPacketReceived;
 	int64_t _lastSentFullHello;
-	int64_t _lastPathPrune;
-	int64_t _lastACKWindowReset;
-	int64_t _lastQoSWindowReset;
-	int64_t _lastMultipathCompatibilityCheck;
+	int64_t _lastEchoCheck;
 
 	unsigned char _freeRandomByte;
 
-	int _uniqueAlivePathCount;
-
-	bool _localMultipathSupported;
-	bool _remoteMultipathSupported;
-	bool _canUseMultipath;
-
 	uint16_t _vProto;
 	uint16_t _vMajor;
 	uint16_t _vMinor;
@@ -659,62 +572,22 @@ private:
 
 	unsigned int _directPathPushCutoffCount;
 	unsigned int _credentialsCutoffCount;
-	unsigned int _QoSCutoffCount;
-	unsigned int _ACKCutoffCount;
+	unsigned int _echoRequestCutoffCount;
 
 	AtomicCounter __refCount;
 
-	RingBuffer<int,ZT_MULTIPATH_PROPORTION_WIN_SZ> _pathChoiceHist;
-
-	bool _linkIsBalanced;
-	bool _linkIsRedundant;
 	bool _remotePeerMultipathEnabled;
+	int _uniqueAlivePathCount;
+	bool _localMultipathSupported;
+	bool _remoteMultipathSupported;
+	bool _canUseMultipath;
 
-	int64_t _lastAggregateStatsReport;
-	int64_t _lastAggregateAllocation;
-
-	char _interfaceListStr[256]; // 16 characters * 16 paths in a link
-
-	//
-	struct LinkPerformanceEntry
-	{
-		int64_t packetId;
-		struct VirtualPath *egressVirtualPath;
-		struct VirtualPath *ingressVirtualPath;
-	};
-
-	// Virtual paths
-	int _virtualPathCount;
-	Mutex _virtual_paths_m;
-	struct VirtualPath
-	{
-		SharedPtr<Path> p;
-		int64_t localSocket;
-		std::queue<struct LinkPerformanceEntry *> performanceEntries;
-	};
-	std::vector<struct VirtualPath*> _virtualPaths;
-
-	// Flows
-	struct Flow
-	{
-		Flow(int64_t fid, int64_t ls) :
-			flowId(fid),
-			lastSend(ls),
-			assignedPath(NULL)
-		{}
-
-		int64_t flowId;
-		int64_t bytesPerSecond;
-		int64_t lastSend;
-		struct VirtualPath *assignedPath;
-	};
-
-	std::map<int64_t, struct Flow *> _flows;
+	volatile bool _shouldCollectPathStatistics;
+	volatile int8_t _bondingPolicy;
 
-	int16_t _roundRobinPathAssignmentIdx;
+	int32_t _lastComputedAggregateMeanLatency;
 
-	SharedPtr<Path> _activeBackupPath;
-	int16_t _pathAssignmentIdx;
+	SharedPtr<Bond> _bondToPeer;
 };
 
 } // namespace ZeroTier

+ 19 - 4
node/RingBuffer.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -238,6 +238,21 @@ public:
 		return curr_cnt ? subtotal / (float)curr_cnt : 0;
 	}
 
+	/**
+	 * @return The sum of the contents of the buffer
+	 */
+	inline float sum()
+	{
+		size_t iterator = begin;
+		float total = 0;
+		size_t curr_cnt = count();
+		for (size_t i=0; i<curr_cnt; i++) {
+			iterator = (iterator + S - 1) % curr_cnt;
+			total += (float)*(buf + iterator);
+		}
+		return total;
+	}
+
 	/**
 	 * @return The sample standard deviation of element values
 	 */
@@ -306,10 +321,10 @@ public:
 		for (size_t i=0; i<S; i++) {
 			iterator = (iterator + S - 1) % S;
 			if (typeid(T) == typeid(int)) {
-				 //DEBUG_INFO("buf[%2zu]=%2d", iterator, (int)*(buf + iterator));
+				fprintf(stderr, "buf[%2zu]=%2d\n", iterator, (int)*(buf + iterator));
 			}
 			else {
-				 //DEBUG_INFO("buf[%2zu]=%2f", iterator, (float)*(buf + iterator));
+				fprintf(stderr, "buf[%2zu]=%2f\n", iterator, (float)*(buf + iterator));
 			}
 		}
 	}

+ 2 - 0
node/RuntimeEnvironment.hpp

@@ -30,6 +30,7 @@ class Multicaster;
 class NetworkController;
 class SelfAwareness;
 class Trace;
+class BondController;
 
 /**
  * Holds global state for an instance of ZeroTier::Node
@@ -75,6 +76,7 @@ public:
 	Multicaster *mc;
 	Topology *topology;
 	SelfAwareness *sa;
+	BondController *bc;
 
 	// This node's identity and string representations thereof
 	Identity identity;

+ 165 - 172
node/Switch.cpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -42,8 +42,38 @@ Switch::Switch(const RuntimeEnvironment *renv) :
 {
 }
 
+// Returns true if packet appears valid; pos and proto will be set
+static bool _ipv6GetPayload(const uint8_t *frameData,unsigned int frameLen,unsigned int &pos,unsigned int &proto)
+{
+	if (frameLen < 40)
+		return false;
+	pos = 40;
+	proto = frameData[6];
+	while (pos <= frameLen) {
+		switch(proto) {
+			case 0: // hop-by-hop options
+			case 43: // routing
+			case 60: // destination options
+			case 135: // mobility options
+				if ((pos + 8) > frameLen)
+					return false; // invalid!
+				proto = frameData[pos];
+				pos += ((unsigned int)frameData[pos + 1] * 8) + 8;
+				break;
+
+			//case 44: // fragment -- we currently can't parse these and they are deprecated in IPv6 anyway
+			//case 50:
+			//case 51: // IPSec ESP and AH -- we have to stop here since this is encrypted stuff
+			default:
+				return true;
+		}
+	}
+	return false; // overflow == invalid
+}
+
 void Switch::onRemotePacket(void *tPtr,const int64_t localSocket,const InetAddress &fromAddr,const void *data,unsigned int len)
 {
+	int32_t flowId = ZT_QOS_NO_FLOW;
 	try {
 		const int64_t now = RR->node->now();
 
@@ -112,6 +142,7 @@ void Switch::onRemotePacket(void *tPtr,const int64_t localSocket,const InetAddre
 						if (rq->packetId != fragmentPacketId) {
 							// No packet found, so we received a fragment without its head.
 
+							rq->flowId = flowId;
 							rq->timestamp = now;
 							rq->packetId = fragmentPacketId;
 							rq->frags[fragmentNumber - 1] = fragment;
@@ -130,7 +161,7 @@ void Switch::onRemotePacket(void *tPtr,const int64_t localSocket,const InetAddre
 								for(unsigned int f=1;f<totalFragments;++f)
 									rq->frag0.append(rq->frags[f - 1].payload(),rq->frags[f - 1].payloadLength());
 
-								if (rq->frag0.tryDecode(RR,tPtr)) {
+								if (rq->frag0.tryDecode(RR,tPtr,flowId)) {
 									rq->timestamp = 0; // packet decoded, free entry
 								} else {
 									rq->complete = true; // set complete flag but leave entry since it probably needs WHOIS or something
@@ -195,6 +226,7 @@ void Switch::onRemotePacket(void *tPtr,const int64_t localSocket,const InetAddre
 					if (rq->packetId != packetId) {
 						// If we have no other fragments yet, create an entry and save the head
 
+						rq->flowId = flowId;
 						rq->timestamp = now;
 						rq->packetId = packetId;
 						rq->frag0.init(data,len,path,now);
@@ -211,7 +243,7 @@ void Switch::onRemotePacket(void *tPtr,const int64_t localSocket,const InetAddre
 							for(unsigned int f=1;f<rq->totalFragments;++f)
 								rq->frag0.append(rq->frags[f - 1].payload(),rq->frags[f - 1].payloadLength());
 
-							if (rq->frag0.tryDecode(RR,tPtr)) {
+							if (rq->frag0.tryDecode(RR,tPtr,flowId)) {
 								rq->timestamp = 0; // packet decoded, free entry
 							} else {
 								rq->complete = true; // set complete flag but leave entry since it probably needs WHOIS or something
@@ -224,9 +256,10 @@ void Switch::onRemotePacket(void *tPtr,const int64_t localSocket,const InetAddre
 				} else {
 					// Packet is unfragmented, so just process it
 					IncomingPacket packet(data,len,path,now);
-					if (!packet.tryDecode(RR,tPtr)) {
+					if (!packet.tryDecode(RR,tPtr,flowId)) {
 						RXQueueEntry *const rq = _nextRXQueueEntry();
 						Mutex::Lock rql(rq->lock);
+						rq->flowId = flowId;
 						rq->timestamp = now;
 						rq->packetId = packet.packetId();
 						rq->frag0 = packet;
@@ -242,43 +275,6 @@ void Switch::onRemotePacket(void *tPtr,const int64_t localSocket,const InetAddre
 	} catch ( ... ) {} // sanity check, should be caught elsewhere
 }
 
-// Returns true if packet appears valid; pos and proto will be set
-static bool _ipv6GetPayload(const uint8_t *frameData,unsigned int frameLen,unsigned int &pos,unsigned int &proto)
-{
-	if (frameLen < 40)
-		return false;
-	pos = 40;
-	proto = frameData[6];
-	while (pos <= frameLen) {
-		switch(proto) {
-			case 0: // hop-by-hop options
-			case 43: // routing
-			case 60: // destination options
-			case 135: // mobility options
-				if ((pos + 8) > frameLen)
-					return false; // invalid!
-				proto = frameData[pos];
-				pos += ((unsigned int)frameData[pos + 1] * 8) + 8;
-				break;
-
-			//case 44: // fragment -- we currently can't parse these and they are deprecated in IPv6 anyway
-			//case 50:
-			//case 51: // IPSec ESP and AH -- we have to stop here since this is encrypted stuff
-			default:
-				return true;
-		}
-	}
-	return false; // overflow == invalid
-}
-
-bool Switch::isFlowAware()
-{
-	int mode = RR->node->getMultipathMode();
-	return (( mode == ZT_MULTIPATH_BALANCE_RR_FLOW)
-		|| (mode == ZT_MULTIPATH_BALANCE_XOR_FLOW)
-		|| (mode == ZT_MULTIPATH_BALANCE_DYNAMIC_FLOW));
-}
-
 void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const MAC &from,const MAC &to,unsigned int etherType,unsigned int vlanId,const void *data,unsigned int len)
 {
 	if (!network->hasConfig())
@@ -293,75 +289,73 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 		}
 	}
 
-	uint8_t qosBucket = ZT_QOS_DEFAULT_BUCKET;
+	uint8_t qosBucket = ZT_AQM_DEFAULT_BUCKET;
 
-	/* A pseudo-unique identifier used by the balancing and bonding policies to associate properties
-	 * of a specific protocol flow over time and to determine which virtual path this packet
-	 * shall be sent out on. This identifier consists of the source port and destination port
-	 * of the encapsulated frame.
+	/**
+	 * A pseudo-unique identifier used by balancing and bonding policies to
+	 * categorize individual flows/conversations for assignment to a specific
+	 * physical path. This identifier consists of the source port and
+	 * destination port of the encapsulated frame.
 	 *
-	 * A flowId of -1 will indicate that whatever packet we are about transmit has no
-	 * preferred virtual path and will be sent out according to what the multipath logic
-	 * deems appropriate. An example of this would be an ICMP packet.
+	 * A flowId of -1 will indicate that there is no preference for how this
+	 * packet shall be sent. An example of this would be an ICMP packet.
 	 */
 
-	int64_t flowId = -1;
-
-	if (isFlowAware()) {
-		if (etherType == ZT_ETHERTYPE_IPV4 && (len >= 20)) {
-			uint16_t srcPort = 0;
-			uint16_t dstPort = 0;
-			int8_t proto = (reinterpret_cast<const uint8_t *>(data)[9]);
-			const unsigned int headerLen = 4 * (reinterpret_cast<const uint8_t *>(data)[0] & 0xf);
-			switch(proto) {
-				case 0x01: // ICMP
-					flowId = 0x01;
-					break;
-				// All these start with 16-bit source and destination port in that order
-				case 0x06: // TCP
-				case 0x11: // UDP
-				case 0x84: // SCTP
-				case 0x88: // UDPLite
-					if (len > (headerLen + 4)) {
-						unsigned int pos = headerLen + 0;
-						srcPort = (reinterpret_cast<const uint8_t *>(data)[pos++]) << 8;
-						srcPort |= (reinterpret_cast<const uint8_t *>(data)[pos]);
-						pos++;
-						dstPort = (reinterpret_cast<const uint8_t *>(data)[pos++]) << 8;
-						dstPort |= (reinterpret_cast<const uint8_t *>(data)[pos]);
-						flowId = ((int64_t)srcPort << 48) | ((int64_t)dstPort << 32) | proto;
-					}
-					break;
-			}
+	int32_t flowId = ZT_QOS_NO_FLOW;
+
+	if (etherType == ZT_ETHERTYPE_IPV4 && (len >= 20)) {
+		uint16_t srcPort = 0;
+		uint16_t dstPort = 0;
+		uint8_t proto = (reinterpret_cast<const uint8_t *>(data)[9]);
+		const unsigned int headerLen = 4 * (reinterpret_cast<const uint8_t *>(data)[0] & 0xf);
+		switch(proto) {
+			case 0x01: // ICMP
+				//flowId = 0x01;
+				break;
+			// All these start with 16-bit source and destination port in that order
+			case 0x06: // TCP
+			case 0x11: // UDP
+			case 0x84: // SCTP
+			case 0x88: // UDPLite
+				if (len > (headerLen + 4)) {
+					unsigned int pos = headerLen + 0;
+					srcPort = (reinterpret_cast<const uint8_t *>(data)[pos++]) << 8;
+					srcPort |= (reinterpret_cast<const uint8_t *>(data)[pos]);
+					pos++;
+					dstPort = (reinterpret_cast<const uint8_t *>(data)[pos++]) << 8;
+					dstPort |= (reinterpret_cast<const uint8_t *>(data)[pos]);
+					flowId = dstPort ^ srcPort ^ proto;
+				}
+				break;
 		}
+	}
 
-		if (etherType == ZT_ETHERTYPE_IPV6 && (len >= 40)) {
-			uint16_t srcPort = 0;
-			uint16_t dstPort = 0;
-			unsigned int pos;
-			unsigned int proto;
-			_ipv6GetPayload((const uint8_t *)data, len, pos, proto);
-			switch(proto) {
-				case 0x3A: // ICMPv6
-					flowId = 0x3A;
-					break;
-				// All these start with 16-bit source and destination port in that order
-				case 0x06: // TCP
-				case 0x11: // UDP
-				case 0x84: // SCTP
-				case 0x88: // UDPLite
-					if (len > (pos + 4)) {
-						srcPort = (reinterpret_cast<const uint8_t *>(data)[pos++]) << 8;
-						srcPort |= (reinterpret_cast<const uint8_t *>(data)[pos]);
-						pos++;
-						dstPort = (reinterpret_cast<const uint8_t *>(data)[pos++]) << 8;
-						dstPort |= (reinterpret_cast<const uint8_t *>(data)[pos]);
-						flowId = ((int64_t)srcPort << 48) | ((int64_t)dstPort << 32) | proto;
-					}
-					break;
-				default:
-					break;
-			}
+	if (etherType == ZT_ETHERTYPE_IPV6 && (len >= 40)) {
+		uint16_t srcPort = 0;
+		uint16_t dstPort = 0;
+		unsigned int pos;
+		unsigned int proto;
+		_ipv6GetPayload((const uint8_t *)data, len, pos, proto);
+		switch(proto) {
+			case 0x3A: // ICMPv6
+				//flowId = 0x3A;
+				break;
+			// All these start with 16-bit source and destination port in that order
+			case 0x06: // TCP
+			case 0x11: // UDP
+			case 0x84: // SCTP
+			case 0x88: // UDPLite
+				if (len > (pos + 4)) {
+					srcPort = (reinterpret_cast<const uint8_t *>(data)[pos++]) << 8;
+					srcPort |= (reinterpret_cast<const uint8_t *>(data)[pos]);
+					pos++;
+					dstPort = (reinterpret_cast<const uint8_t *>(data)[pos++]) << 8;
+					dstPort |= (reinterpret_cast<const uint8_t *>(data)[pos]);
+					flowId = dstPort ^ srcPort ^ proto;
+				}
+				break;
+			default:
+				break;
 		}
 	}
 
@@ -595,7 +589,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 	}
 }
 
-void Switch::aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &packet,bool encrypt,int qosBucket,int64_t flowId)
+void Switch::aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &packet,bool encrypt,int qosBucket,int32_t flowId)
 {
 	if(!network->qosEnabled()) {
 		send(tPtr, packet, encrypt, flowId);
@@ -603,18 +597,16 @@ void Switch::aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &
 	}
 	NetworkQoSControlBlock *nqcb = _netQueueControlBlock[network->id()];
 	if (!nqcb) {
-		// DEBUG_INFO("creating network QoS control block (NQCB) for network %llx", network->id());
 		nqcb = new NetworkQoSControlBlock();
 		_netQueueControlBlock[network->id()] = nqcb;
 		// Initialize ZT_QOS_NUM_BUCKETS queues and place them in the INACTIVE list
 		// These queues will be shuffled between the new/old/inactive lists by the enqueue/dequeue algorithm
-		for (int i=0; i<ZT_QOS_NUM_BUCKETS; i++) {
+		for (int i=0; i<ZT_AQM_NUM_BUCKETS; i++) {
 			nqcb->inactiveQueues.push_back(new ManagedQueue(i));
 		}
 	}
 	// Don't apply QoS scheduling to ZT protocol traffic
 	if (packet.verb() != Packet::VERB_FRAME && packet.verb() != Packet::VERB_EXT_FRAME) {
-		// just send packet normally, no QoS for ZT protocol traffic
 		send(tPtr, packet, encrypt, flowId);
 	}
 
@@ -624,8 +616,9 @@ void Switch::aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &
 
 	const Address dest(packet.destination());
 	TXQueueEntry *txEntry = new TXQueueEntry(dest,RR->node->now(),packet,encrypt,flowId);
+
 	ManagedQueue *selectedQueue = nullptr;
-	for (size_t i=0; i<ZT_QOS_NUM_BUCKETS; i++) {
+	for (size_t i=0; i<ZT_AQM_NUM_BUCKETS; i++) {
 		if (i < nqcb->oldQueues.size()) { // search old queues first (I think this is best since old would imply most recent usage of the queue)
 			if (nqcb->oldQueues[i]->id == qosBucket) {
 				selectedQueue = nqcb->oldQueues[i];
@@ -638,7 +631,7 @@ void Switch::aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &
 			if (nqcb->inactiveQueues[i]->id == qosBucket) {
 				selectedQueue = nqcb->inactiveQueues[i];
 				// move queue to end of NEW queue list
-				selectedQueue->byteCredit = ZT_QOS_QUANTUM;
+				selectedQueue->byteCredit = ZT_AQM_QUANTUM;
 				// DEBUG_INFO("moving q=%p from INACTIVE to NEW list", selectedQueue);
 				nqcb->newQueues.push_back(selectedQueue);
 				nqcb->inactiveQueues.erase(nqcb->inactiveQueues.begin() + i);
@@ -657,11 +650,11 @@ void Switch::aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &
 
 	// Drop a packet if necessary
 	ManagedQueue *selectedQueueToDropFrom = nullptr;
-	if (nqcb->_currEnqueuedPackets > ZT_QOS_MAX_ENQUEUED_PACKETS)
+	if (nqcb->_currEnqueuedPackets > ZT_AQM_MAX_ENQUEUED_PACKETS)
 	{
 		// DEBUG_INFO("too many enqueued packets (%d), finding packet to drop", nqcb->_currEnqueuedPackets);
 		int maxQueueLength = 0;
-		for (size_t i=0; i<ZT_QOS_NUM_BUCKETS; i++) {
+		for (size_t i=0; i<ZT_AQM_NUM_BUCKETS; i++) {
 			if (i < nqcb->oldQueues.size()) {
 				if (nqcb->oldQueues[i]->byteLength > maxQueueLength) {
 					maxQueueLength = nqcb->oldQueues[i]->byteLength;
@@ -694,7 +687,7 @@ void Switch::aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &
 
 uint64_t Switch::control_law(uint64_t t, int count)
 {
-	return (uint64_t)(t + ZT_QOS_INTERVAL / sqrt(count));
+	return (uint64_t)(t + ZT_AQM_INTERVAL / sqrt(count));
 }
 
 Switch::dqr Switch::dodequeue(ManagedQueue *q, uint64_t now)
@@ -708,14 +701,14 @@ Switch::dqr Switch::dodequeue(ManagedQueue *q, uint64_t now)
 		return r;
 	}
 	uint64_t sojourn_time = now - r.p->creationTime;
-	if (sojourn_time < ZT_QOS_TARGET || q->byteLength <= ZT_DEFAULT_MTU) {
+	if (sojourn_time < ZT_AQM_TARGET || q->byteLength <= ZT_DEFAULT_MTU) {
 		// went below - stay below for at least interval
 		q->first_above_time = 0;
 	} else {
 		if (q->first_above_time == 0) {
 			// just went above from below. if still above at
 			// first_above_time, will say it's ok to drop.
-			q->first_above_time = now + ZT_QOS_INTERVAL;
+			q->first_above_time = now + ZT_AQM_INTERVAL;
 		} else if (now >= q->first_above_time) {
 			r.ok_to_drop = true;
 		}
@@ -747,7 +740,7 @@ Switch::TXQueueEntry * Switch::CoDelDequeue(ManagedQueue *q, bool isNew, uint64_
 		q->q.pop_front(); // drop
 		r = dodequeue(q, now);
 		q->dropping = true;
-		q->count = (q->count > 2 && now - q->drop_next < 8*ZT_QOS_INTERVAL)?
+		q->count = (q->count > 2 && now - q->drop_next < 8*ZT_AQM_INTERVAL)?
 		q->count - 2 : 1;
 		q->drop_next = control_law(now, q->count);
 	}
@@ -775,7 +768,7 @@ void Switch::aqm_dequeue(void *tPtr)
 		while (currQueues->size()) {
 			ManagedQueue *queueAtFrontOfList = currQueues->front();
 			if (queueAtFrontOfList->byteCredit < 0) {
-				queueAtFrontOfList->byteCredit += ZT_QOS_QUANTUM;
+				queueAtFrontOfList->byteCredit += ZT_AQM_QUANTUM;
 				// Move to list of OLD queues
 				// DEBUG_INFO("moving q=%p from NEW to OLD list", queueAtFrontOfList);
 				oldQueues->push_back(queueAtFrontOfList);
@@ -810,7 +803,7 @@ void Switch::aqm_dequeue(void *tPtr)
 		while (currQueues->size()) {
 			ManagedQueue *queueAtFrontOfList = currQueues->front();
 			if (queueAtFrontOfList->byteCredit < 0) {
-				queueAtFrontOfList->byteCredit += ZT_QOS_QUANTUM;
+				queueAtFrontOfList->byteCredit += ZT_AQM_QUANTUM;
 				oldQueues->push_back(queueAtFrontOfList);
 				currQueues->erase(currQueues->begin());
 			} else {
@@ -850,7 +843,7 @@ void Switch::removeNetworkQoSControlBlock(uint64_t nwid)
 	}
 }
 
-void Switch::send(void *tPtr,Packet &packet,bool encrypt,int64_t flowId)
+void Switch::send(void *tPtr,Packet &packet,bool encrypt,int32_t flowId)
 {
 	const Address dest(packet.destination());
 	if (dest == RR->identity.address())
@@ -883,7 +876,7 @@ void Switch::requestWhois(void *tPtr,const int64_t now,const Address &addr)
 
 	const SharedPtr<Peer> upstream(RR->topology->getUpstreamPeer());
 	if (upstream) {
-		int64_t flowId = -1;
+		int32_t flowId = ZT_QOS_NO_FLOW;
 		Packet outp(upstream->address(),RR->identity.address(),Packet::VERB_WHOIS);
 		addr.appendTo(outp);
 		RR->node->expectReplyTo(outp.packetId());
@@ -903,7 +896,7 @@ void Switch::doAnythingWaitingForPeer(void *tPtr,const SharedPtr<Peer> &peer)
 		RXQueueEntry *const rq = &(_rxQueue[ptr]);
 		Mutex::Lock rql(rq->lock);
 		if ((rq->timestamp)&&(rq->complete)) {
-			if ((rq->frag0.tryDecode(RR,tPtr))||((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT))
+			if ((rq->frag0.tryDecode(RR,tPtr,rq->flowId))||((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT))
 				rq->timestamp = 0;
 		}
 	}
@@ -954,7 +947,7 @@ unsigned long Switch::doTimerTasks(void *tPtr,int64_t now)
 		RXQueueEntry *const rq = &(_rxQueue[ptr]);
 		Mutex::Lock rql(rq->lock);
 		if ((rq->timestamp)&&(rq->complete)) {
-			if ((rq->frag0.tryDecode(RR,tPtr))||((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT)) {
+			if ((rq->frag0.tryDecode(RR,tPtr,rq->flowId))||((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT)) {
 				rq->timestamp = 0;
 			} else {
 				const Address src(rq->frag0.source());
@@ -1000,7 +993,7 @@ bool Switch::_shouldUnite(const int64_t now,const Address &source,const Address
 	return false;
 }
 
-bool Switch::_trySend(void *tPtr,Packet &packet,bool encrypt,int64_t flowId)
+bool Switch::_trySend(void *tPtr,Packet &packet,bool encrypt,int32_t flowId)
 {
 	SharedPtr<Path> viaPath;
 	const int64_t now = RR->node->now();
@@ -1008,8 +1001,18 @@ bool Switch::_trySend(void *tPtr,Packet &packet,bool encrypt,int64_t flowId)
 
 	const SharedPtr<Peer> peer(RR->topology->getPeer(tPtr,destination));
 	if (peer) {
-		if (RR->node->getMultipathMode() == ZT_MULTIPATH_BROADCAST) {
-			// Nothing here, we'll grab an entire set of paths to send out on below
+		if ((peer->bondingPolicy() == ZT_BONDING_POLICY_BROADCAST)
+			&& (packet.verb() == Packet::VERB_FRAME || packet.verb() == Packet::VERB_EXT_FRAME)) {
+			const SharedPtr<Peer> relay(RR->topology->getUpstreamPeer());
+			Mutex::Lock _l(peer->_paths_m);
+			for(int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+				if (peer->_paths[i].p && peer->_paths[i].p->alive(now)) {
+					_sendViaSpecificPath(tPtr,peer,peer->_paths[i].p,now,packet,encrypt,flowId);
+				}
+			}
+			return true;
 		}
 		else {
 			viaPath = peer->getAppropriatePath(now,false,flowId);
@@ -1021,61 +1024,51 @@ bool Switch::_trySend(void *tPtr,Packet &packet,bool encrypt,int64_t flowId)
 						return false;
 				}
 			}
+			if (viaPath) {
+				_sendViaSpecificPath(tPtr,peer,viaPath,now,packet,encrypt,flowId);
+				return true;
+			}
 		}
-	} else {
-		return false;
 	}
+	return false;
+}
 
-	// If sending on all paths, set viaPath to first path
-	int nextPathIdx = 0;
-	std::vector<SharedPtr<Path>> paths = peer->getAllPaths(now);
-	if (RR->node->getMultipathMode() == ZT_MULTIPATH_BROADCAST) {
-		if (paths.size()) {
-			viaPath = paths[nextPathIdx++];
-		}
-	}
+void Switch::_sendViaSpecificPath(void *tPtr,SharedPtr<Peer> peer,SharedPtr<Path> viaPath,int64_t now,Packet &packet,bool encrypt,int32_t flowId)
+{
+	unsigned int mtu = ZT_DEFAULT_PHYSMTU;
+	uint64_t trustedPathId = 0;
+	RR->topology->getOutboundPathInfo(viaPath->address(),mtu,trustedPathId);
 
-	while (viaPath) {
-		unsigned int mtu = ZT_DEFAULT_PHYSMTU;
-		uint64_t trustedPathId = 0;
-		RR->topology->getOutboundPathInfo(viaPath->address(),mtu,trustedPathId);
-		unsigned int chunkSize = std::min(packet.size(),mtu);
-		packet.setFragmented(chunkSize < packet.size());
-		peer->recordOutgoingPacket(viaPath, packet.packetId(), packet.payloadLength(), packet.verb(), now);
+	unsigned int chunkSize = std::min(packet.size(),mtu);
+	packet.setFragmented(chunkSize < packet.size());
 
-		if (trustedPathId) {
-			packet.setTrusted(trustedPathId);
-		} else {
-			packet.armor(peer->key(),encrypt);
-		}
+	peer->recordOutgoingPacket(viaPath, packet.packetId(), packet.payloadLength(), packet.verb(), flowId, now);
 
-		if (viaPath->send(RR,tPtr,packet.data(),chunkSize,now)) {
-			if (chunkSize < packet.size()) {
-				// Too big for one packet, fragment the rest
-				unsigned int fragStart = chunkSize;
-				unsigned int remaining = packet.size() - chunkSize;
-				unsigned int fragsRemaining = (remaining / (mtu - ZT_PROTO_MIN_FRAGMENT_LENGTH));
-				if ((fragsRemaining * (mtu - ZT_PROTO_MIN_FRAGMENT_LENGTH)) < remaining)
-					++fragsRemaining;
-				const unsigned int totalFragments = fragsRemaining + 1;
-
-				for(unsigned int fno=1;fno<totalFragments;++fno) {
-					chunkSize = std::min(remaining,(unsigned int)(mtu - ZT_PROTO_MIN_FRAGMENT_LENGTH));
-					Packet::Fragment frag(packet,fragStart,chunkSize,fno,totalFragments);
-					viaPath->send(RR,tPtr,frag.data(),frag.size(),now);
-					fragStart += chunkSize;
-					remaining -= chunkSize;
-				}
-			}
-		}
-		viaPath.zero();
-		if (RR->node->getMultipathMode() == ZT_MULTIPATH_BROADCAST) {
-			if (paths.size() > nextPathIdx) {
-				viaPath = paths[nextPathIdx++];
+	if (trustedPathId) {
+		packet.setTrusted(trustedPathId);
+	} else {
+		packet.armor(peer->key(),encrypt);
+	}
+
+	if (viaPath->send(RR,tPtr,packet.data(),chunkSize,now)) {
+		if (chunkSize < packet.size()) {
+			// Too big for one packet, fragment the rest
+			unsigned int fragStart = chunkSize;
+			unsigned int remaining = packet.size() - chunkSize;
+			unsigned int fragsRemaining = (remaining / (mtu - ZT_PROTO_MIN_FRAGMENT_LENGTH));
+			if ((fragsRemaining * (mtu - ZT_PROTO_MIN_FRAGMENT_LENGTH)) < remaining)
+				++fragsRemaining;
+			const unsigned int totalFragments = fragsRemaining + 1;
+
+			for(unsigned int fno=1;fno<totalFragments;++fno) {
+				chunkSize = std::min(remaining,(unsigned int)(mtu - ZT_PROTO_MIN_FRAGMENT_LENGTH));
+				Packet::Fragment frag(packet,fragStart,chunkSize,fno,totalFragments);
+				viaPath->send(RR,tPtr,frag.data(),frag.size(),now);
+				fragStart += chunkSize;
+				remaining -= chunkSize;
 			}
 		}
 	}
-	return true;
 }
 
 } // namespace ZeroTier
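
The IPv4 and IPv6 branches above both reduce a conversation to a 32-bit flow ID by XORing the transport-layer ports with the protocol number; the ID only needs to be stable per flow, not collision-free. A self-contained sketch of the IPv4 case, distilled from the hunk above (the helper name and the -1 stand-in for ZT_QOS_NO_FLOW are illustrative):

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative helper mirroring the IPv4 flow classification in Switch::onLocalEthernet().
static int32_t ipv4FlowId(const uint8_t *frame, unsigned int len)
{
	const int32_t noFlow = -1; // stand-in for ZT_QOS_NO_FLOW
	if (len < 20) return noFlow;
	const uint8_t proto = frame[9];
	const unsigned int headerLen = 4u * (frame[0] & 0x0f);
	switch (proto) {
		case 0x06: case 0x11: case 0x84: case 0x88: // TCP, UDP, SCTP, UDPLite
			if (len > (headerLen + 4)) {
				const uint16_t srcPort = (uint16_t)((frame[headerLen] << 8) | frame[headerLen + 1]);
				const uint16_t dstPort = (uint16_t)((frame[headerLen + 2] << 8) | frame[headerLen + 3]);
				return (int32_t)(srcPort ^ dstPort ^ proto);
			}
			break;
		default:
			break; // e.g. ICMP: no per-flow preference
	}
	return noFlow;
}

int main()
{
	uint8_t frame[28] = { 0x45 }; // IHL=5 (20-byte header), remaining bytes zeroed
	frame[9] = 0x11;                    // UDP
	frame[20] = 0xC0; frame[21] = 0x00; // source port 49152
	frame[22] = 0x00; frame[23] = 0x35; // destination port 53
	std::printf("flowId=%d\n", ipv4FlowId(frame, sizeof(frame)));
	return 0;
}
```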

+ 12 - 8
node/Switch.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -59,6 +59,8 @@ class Switch
 	struct ManagedQueue;
 	struct TXQueueEntry;
 
+	friend class SharedPtr<Peer>;
+
 	typedef struct {
 		TXQueueEntry *p;
 		bool ok_to_drop;
@@ -123,7 +125,7 @@ public:
 	 * @param encrypt Encrypt packet payload? (always true except for HELLO)
 	 * @param qosBucket Which bucket the rule-system determined this packet should fall into
 	 */
-	void aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &packet,bool encrypt,int qosBucket,int64_t flowId = -1);
+	void aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &packet,bool encrypt,int qosBucket,int32_t flowId = ZT_QOS_NO_FLOW);
 
 	/**
 	 * Performs a single AQM cycle and dequeues and transmits all eligible packets on all networks
@@ -169,7 +171,7 @@ public:
 	 * @param packet Packet to send (buffer may be modified)
 	 * @param encrypt Encrypt packet payload? (always true except for HELLO)
 	 */
-	void send(void *tPtr,Packet &packet,bool encrypt,int64_t flowId = -1);
+	void send(void *tPtr,Packet &packet,bool encrypt,int32_t flowId = ZT_QOS_NO_FLOW);
 
 	/**
 	 * Request WHOIS on a given address
@@ -204,7 +206,8 @@ public:
 
 private:
 	bool _shouldUnite(const int64_t now,const Address &source,const Address &destination);
-	bool _trySend(void *tPtr,Packet &packet,bool encrypt,int64_t flowId = -1); // packet is modified if return is true
+	bool _trySend(void *tPtr,Packet &packet,bool encrypt,int32_t flowId = ZT_QOS_NO_FLOW); // packet is modified if return is true
+	void _sendViaSpecificPath(void *tPtr,SharedPtr<Peer> peer,SharedPtr<Path> viaPath,int64_t now,Packet &packet,bool encrypt,int32_t flowId);
 
 	const RuntimeEnvironment *const RR;
 	int64_t _lastBeaconResponse;
@@ -225,6 +228,7 @@ private:
 		unsigned int totalFragments; // 0 if only frag0 received, waiting for frags
 		uint32_t haveFragments; // bit mask, LSB to MSB
 		volatile bool complete; // if true, packet is complete
+		volatile int32_t flowId;
 		Mutex lock;
 	};
 	RXQueueEntry _rxQueue[ZT_RX_QUEUE_SIZE];
@@ -253,7 +257,7 @@ private:
 	struct TXQueueEntry
 	{
 		TXQueueEntry() {}
-		TXQueueEntry(Address d,uint64_t ct,const Packet &p,bool enc,int64_t fid) :
+		TXQueueEntry(Address d,uint64_t ct,const Packet &p,bool enc,int32_t fid) :
 			dest(d),
 			creationTime(ct),
 			packet(p),
@@ -264,7 +268,7 @@ private:
 		uint64_t creationTime;
 		Packet packet; // unencrypted/unMAC'd packet -- this is done at send time
 		bool encrypt;
-		int64_t flowId;
+		int32_t flowId;
 	};
 	std::list< TXQueueEntry > _txQueue;
 	Mutex _txQueue_m;
@@ -296,7 +300,7 @@ private:
 	{
 		ManagedQueue(int id) :
 			id(id),
-			byteCredit(ZT_QOS_QUANTUM),
+			byteCredit(ZT_AQM_QUANTUM),
 			byteLength(0),
 			dropping(false)
 		{}

+ 7 - 10
node/Trace.cpp

@@ -94,29 +94,26 @@ void Trace::peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId,
 	}
 }
 
-void Trace::peerLinkNowAggregate(void *const tPtr,Peer &peer)
+void Trace::peerLinkNowRedundant(void *const tPtr,Peer &peer)
 {
-	if ((RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_RANDOM)) {
-		ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx is now a randomly-distributed aggregate link",peer.address().toInt());
-	}
-	if ((RR->node->getMultipathMode() == ZT_MULTIPATH_BALANCE_DYNAMIC_OPAQUE)) {
-		ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx is now a proportionally-balanced aggregate link",peer.address().toInt());
-	}
+	//ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx is fully redundant",peer.address().toInt());
 }
 
-void Trace::peerLinkNoLongerAggregate(void *const tPtr,Peer &peer)
+void Trace::peerLinkNoLongerRedundant(void *const tPtr,Peer &peer)
 {
-	ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx has degraded and is no longer an aggregate link",peer.address().toInt());
+	//ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx is no longer redundant",peer.address().toInt());
 }
 
 void Trace::peerLinkAggregateStatistics(void *const tPtr,Peer &peer)
 {
-	ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx is composed of (%d) physical paths %s, has PDV (%.0f ms), mean latency (%.0f ms)",
+	/*
+	ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx is composed of (%d) physical paths %s, has packet delay variance (%.0f ms), mean latency (%.0f ms)",
 		peer.address().toInt(),
 		peer.aggregateLinkPhysicalPathCount(),
 		peer.interfaceListStr(),
 		peer.computeAggregateLinkPacketDelayVariance(),
 		peer.computeAggregateLinkMeanLatency());
+	*/
 }
 
 void Trace::peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId)

+ 2 - 2
node/Trace.hpp

@@ -109,8 +109,8 @@ public:
 
 	void peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &path,const uint64_t packetId,const Packet::Verb verb);
 
-	void peerLinkNowAggregate(void *const tPtr,Peer &peer);
-	void peerLinkNoLongerAggregate(void *const tPtr,Peer &peer);
+	void peerLinkNowRedundant(void *const tPtr,Peer &peer);
+	void peerLinkNoLongerRedundant(void *const tPtr,Peer &peer);
 
 	void peerLinkAggregateStatistics(void *const tPtr,Peer &peer);
 

+ 6 - 5
node/Utils.hpp

@@ -214,12 +214,12 @@ public:
 		return l;
 	}
 
-	static inline float normalize(float value, int64_t bigMin, int64_t bigMax, int32_t targetMin, int32_t targetMax)
+	static inline float normalize(float value, float bigMin, float bigMax, float targetMin, float targetMax)
 	{
-		int64_t bigSpan = bigMax - bigMin;
-		int64_t smallSpan = targetMax - targetMin;
-		float valueScaled = (value - (float)bigMin) / (float)bigSpan;
-		return (float)targetMin + valueScaled * (float)smallSpan;
+		float bigSpan = bigMax - bigMin;
+		float smallSpan = targetMax - targetMin;
+		float valueScaled = (value - bigMin) / bigSpan;
+		return targetMin + valueScaled * smallSpan;
 	}
 
 	/**
@@ -253,6 +253,7 @@ public:
 	static inline int strToInt(const char *s) { return (int)strtol(s,(char **)0,10); }
 	static inline unsigned long strToULong(const char *s) { return strtoul(s,(char **)0,10); }
 	static inline long strToLong(const char *s) { return strtol(s,(char **)0,10); }
+	static inline double strToDouble(const char *s) { return strtod(s,NULL); }
 	static inline unsigned long long strToU64(const char *s)
 	{
 #ifdef __WINDOWS__
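
The all-float normalize() above is a plain linear rescale from [bigMin, bigMax] into [targetMin, targetMax]; a quick worked example (assumes compilation from the repository root):

```cpp
#include <cstdio>
#include "node/Utils.hpp"

int main()
{
	// value' = targetMin + (value - bigMin) * (targetMax - targetMin) / (bigMax - bigMin)
	std::printf("%.1f\n", ZeroTier::Utils::normalize(50.0f, 0.0f, 100.0f, 0.0f, 255.0f)); // 127.5
	return 0;
}
```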

+ 3 - 1
objects.mk

@@ -24,7 +24,9 @@ CORE_OBJS=\
 	node/Tag.o \
 	node/Topology.o \
 	node/Trace.o \
-	node/Utils.o
+	node/Utils.o \
+	node/Bond.o \
+	node/BondController.o
 
 ONE_OBJS=\
 	controller/EmbeddedNetworkController.o \

+ 27 - 2
osdep/Binder.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -347,6 +347,23 @@ public:
 			}
 		}
 
+		// Generate set of unique interface names (used for formation of logical slave set in multipath code)
+		for(std::map<InetAddress,std::string>::const_iterator ii(localIfAddrs.begin());ii!=localIfAddrs.end();++ii) {
+			slaveIfNames.insert(ii->second);
+		}
+		for (std::set<std::string>::iterator si(slaveIfNames.begin());si!=slaveIfNames.end();) {
+			bool bFoundMatch = false;
+			for(std::map<InetAddress,std::string>::const_iterator ii(localIfAddrs.begin());ii!=localIfAddrs.end();++ii) {
+				if (ii->second == *si) {
+					bFoundMatch = true;
+					break;
+				}
+			}
+			if (!bFoundMatch) {
+				slaveIfNames.erase(si++); // erase via post-increment so the iterator is advanced before invalidation
+			} else {
+				++si;
+			}
+		}
+
 		// Create new bindings for those not already bound
 		for(std::map<InetAddress,std::string>::const_iterator ii(localIfAddrs.begin());ii!=localIfAddrs.end();++ii) {
 			unsigned int bi = 0;
@@ -444,7 +461,15 @@ public:
 		return false;
 	}
 
+	inline std::set<std::string> getSlaveInterfaceNames()
+	{
+		Mutex::Lock _l(_lock);
+		return slaveIfNames;
+	}
+
 private:
+
+	std::set<std::string> slaveIfNames;
 	_Binding _bindings[ZT_BINDER_MAX_BINDINGS];
 	std::atomic<unsigned int> _bindingCount;
 	Mutex _lock;
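
A side note on the slaveIfNames pruning loop added above: erasing a std::set element invalidates only the iterator to that element, so the loop must advance its iterator before (or while) erasing rather than in the for-statement. A generic sketch of that idiom (not ZeroTier-specific):

```cpp
#include <cstdio>
#include <set>
#include <string>

int main()
{
	std::set<std::string> names = { "eth0", "wlan0", "tun0" };
	for (std::set<std::string>::iterator it(names.begin()); it != names.end();) {
		if (*it == "tun0")
			names.erase(it++); // post-increment: step past the element before it is destroyed
		else
			++it;
	}
	std::printf("%zu interface names remain\n", names.size()); // 2
	return 0;
}
```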

+ 0 - 16
osdep/LinuxNetLink.cpp

@@ -55,8 +55,6 @@ LinuxNetLink::LinuxNetLink()
 {
 	// set socket timeout to 1 sec so we're not permablocking recv() calls
 	_setSocketTimeout(_fd, 1);
-	int yes=1;
-	setsockopt(_fd,SOL_SOCKET,SO_REUSEADDR,(char*)&yes,sizeof(yes));
 
 	_la.nl_family = AF_NETLINK;
 	_la.nl_pid = 0; //getpid()+1;
@@ -430,8 +428,6 @@ void LinuxNetLink::_linkDeleted(struct nlmsghdr *nlp)
 void LinuxNetLink::_requestIPv4Routes()
 {
 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	int yes=1;
-	setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,(char*)&yes,sizeof(yes));
 	if (fd == -1) {
 		fprintf(stderr, "Error opening RTNETLINK socket: %s\n", strerror(errno));
 		return;
@@ -485,8 +481,6 @@ void LinuxNetLink::_requestIPv4Routes()
 void LinuxNetLink::_requestIPv6Routes()
 {
 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	int yes=1;
-	setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,(char*)&yes,sizeof(yes));
 	if (fd == -1) {
 		fprintf(stderr, "Error opening RTNETLINK socket: %s\n", strerror(errno));
 		return;
@@ -540,8 +534,6 @@ void LinuxNetLink::_requestIPv6Routes()
 void LinuxNetLink::_requestInterfaceList()
 {
 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	int yes=1;
-	setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,(char*)&yes,sizeof(yes));
 	if (fd == -1) {
 		fprintf(stderr, "Error opening RTNETLINK socket: %s\n", strerror(errno));
 		return;
@@ -595,8 +587,6 @@ void LinuxNetLink::addRoute(const InetAddress &target, const InetAddress &via, c
 	if (!target) return;
 
 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	int yes=1;
-	setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,(char*)&yes,sizeof(yes));
 	if (fd == -1) {
 		fprintf(stderr, "Error opening RTNETLINK socket: %s\n", strerror(errno));
 		return;
@@ -713,8 +703,6 @@ void LinuxNetLink::delRoute(const InetAddress &target, const InetAddress &via, c
 	if (!target) return;
 
 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	int yes=1;
-	setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,(char*)&yes,sizeof(yes));
 	if (fd == -1) {
 		fprintf(stderr, "Error opening RTNETLINK socket: %s\n", strerror(errno));
 		return;
@@ -828,8 +816,6 @@ void LinuxNetLink::delRoute(const InetAddress &target, const InetAddress &via, c
 void LinuxNetLink::addAddress(const InetAddress &addr, const char *iface)
 {
 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	int yes=1;
-	setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,(char*)&yes,sizeof(yes));
 	if (fd == -1) {
 		fprintf(stderr, "Error opening RTNETLINK socket: %s\n", strerror(errno));
 		return;
@@ -948,8 +934,6 @@ void LinuxNetLink::addAddress(const InetAddress &addr, const char *iface)
 void LinuxNetLink::removeAddress(const InetAddress &addr, const char *iface)
 {
 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	int yes=1;
-	setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,(char*)&yes,sizeof(yes));
 	if (fd == -1) {
 		fprintf(stderr, "Error opening RTNETLINK socket: %s\n", strerror(errno));
 		return;

+ 18 - 2
osdep/OSUtils.cpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -459,6 +459,22 @@ uint64_t OSUtils::jsonInt(const nlohmann::json &jv,const uint64_t dfl)
 	return dfl;
 }
 
+double OSUtils::jsonDouble(const nlohmann::json &jv,const double dfl)
+{
+	try {
+		if (jv.is_number()) {
+			return (double)jv;
+		}
+		else if (jv.is_string()) {
+			std::string s = jv;
+			return Utils::strToDouble(s.c_str());
+		} else if (jv.is_boolean()) {
+			return (double)jv;
+		}
+	} catch ( ... ) {}
+	return dfl;
+}
+
 uint64_t OSUtils::jsonIntHex(const nlohmann::json &jv,const uint64_t dfl)
 {
 	try {
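
A brief usage sketch for the new jsonDouble() helper (the keys are made up and the include path assumes the repository root): numbers, numeric strings, and booleans are all coerced to double, and the supplied default is returned for anything else.

```cpp
#include <cstdio>
#include "osdep/OSUtils.hpp"

int main()
{
	nlohmann::json j = nlohmann::json::parse("{\"alloc\":\"0.25\",\"speed\":1000}");
	std::printf("%f\n", ZeroTier::OSUtils::jsonDouble(j["alloc"], 0.0));   // 0.250000 (string parsed via strToDouble)
	std::printf("%f\n", ZeroTier::OSUtils::jsonDouble(j["speed"], 0.0));   // 1000.000000
	std::printf("%f\n", ZeroTier::OSUtils::jsonDouble(j["missing"], 1.0)); // 1.000000 (default)
	return 0;
}
```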

+ 3 - 2
osdep/OSUtils.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -277,6 +277,7 @@ public:
 	static nlohmann::json jsonParse(const std::string &buf);
 	static std::string jsonDump(const nlohmann::json &j,int indentation = 1);
 	static uint64_t jsonInt(const nlohmann::json &jv,const uint64_t dfl);
+	static double jsonDouble(const nlohmann::json &jv,const double dfl);
 	static uint64_t jsonIntHex(const nlohmann::json &jv,const uint64_t dfl);
 	static bool jsonBool(const nlohmann::json &jv,const bool dfl);
 	static std::string jsonString(const nlohmann::json &jv,const char *dfl);

+ 2 - 42
osdep/Phy.hpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -261,46 +261,6 @@ public:
 		}
 	}
 
-	/**
-	 * Whether or not the socket object is in a closed state
-	 *
-	 * @param s Socket object
-	 * @return true if socket is closed, false if otherwise
-	 */
-	inline bool isClosed(PhySocket *s)
-	{
-		PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
-		return sws->type == ZT_PHY_SOCKET_CLOSED;
-	}
-
-	/**
-	 * Get state of socket object
-	 *
-	 * @param s Socket object
-	 * @return State of socket
-	 */
-	inline int getState(PhySocket *s)
-	{
-		PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
-		return sws->type;
-	}
-
-	/**
-	 * In the event that this socket is erased, we need a way to convey to the multipath logic
-	 * that this path is no longer valid.
-	 *
-	 * @param s Socket object
-	 * @return Whether the state of this socket is within an acceptable range of values
-	 */
-	inline bool isValidState(PhySocket *s)
-	{
-		if (s) {
-			PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s));
-			return sws->type >= ZT_PHY_SOCKET_CLOSED && sws->type <= ZT_PHY_SOCKET_UNIX_LISTEN;
-		}
-		return false;
-	}
-
 	/**
 	 * Cause poll() to stop waiting immediately
 	 *

+ 238 - 0
osdep/Slave.hpp

@@ -0,0 +1,238 @@
+/*
+ * Copyright (c)2013-2020 ZeroTier, Inc.
+ *
+ * Use of this software is governed by the Business Source License included
+ * in the LICENSE.TXT file in the project's root directory.
+ *
+ * Change Date: 2024-01-01
+ *
+ * On the date above, in accordance with the Business Source License, use
+ * of this software will be governed by version 2.0 of the Apache License.
+ */
+/****/
+
+#ifndef ZT_SLAVE_HPP
+#define ZT_SLAVE_HPP
+
+#include <string>
+
+#include "../node/AtomicCounter.hpp"
+
+namespace ZeroTier {
+
+class Slave
+{
+	friend class SharedPtr<Slave>;
+
+public:
+
+	Slave() {}
+
+	/**
+	 * Constructs a slave from user-supplied (or default) parameters.
+	 *
+	 * @param ifnameStr System name of the underlying interface
+	 * @param ipvPref IP protocol version preference (0, 4, 6, 46, or 64)
+	 * @param speed User-specified speed of this slave (0 if unspecified)
+	 * @param slaveMonitorInterval Slave-specific path monitoring interval (0 to use the bond-wide interval)
+	 * @param upDelay How long a path must be up before it is considered usable
+	 * @param downDelay How long a path may be unresponsive before it is considered dead
+	 * @param enabled Whether this slave may be used by the bond
+	 * @param mode Whether this slave is a primary or a spare
+	 * @param failoverToSlaveStr Name of the interface to fail over to (empty if none)
+	 * @param userSpecifiedAlloc User-specified traffic allocation for this slave
+	 */
+	Slave(std::string& ifnameStr,
+			uint8_t ipvPref,
+			uint32_t speed,
+			uint32_t slaveMonitorInterval,
+			uint32_t upDelay,
+			uint32_t downDelay,
+			bool enabled,
+			uint8_t mode,
+			std::string failoverToSlaveStr,
+			float userSpecifiedAlloc) :
+		_ifnameStr(ifnameStr),
+		_ipvPref(ipvPref),
+		_speed(speed),
+		_relativeSpeed(0),
+		_slaveMonitorInterval(slaveMonitorInterval),
+		_upDelay(upDelay),
+		_downDelay(downDelay),
+		_enabled(enabled),
+		_mode(mode),
+		_failoverToSlaveStr(failoverToSlaveStr),
+		_userSpecifiedAlloc(userSpecifiedAlloc),
+		_isUserSpecified(false)
+	{}
+	
+	/**
+	 * @return The string representation of this slave's underlying interface's system name.
+	 */
+	inline std::string ifname() { return _ifnameStr; }
+
+	/**
+	 * @return Whether this slave is designated as a primary.
+	 */
+	inline bool primary() { return _mode == ZT_MULTIPATH_SLAVE_MODE_PRIMARY; }
+
+	/**
+	 * @return Whether this slave is designated as a spare.
+	 */
+	inline bool spare() { return _mode == ZT_MULTIPATH_SLAVE_MODE_SPARE; }
+
+	/**
+	 * @return The name of the slave interface that should be used in the event of a failure.
+	 */
+	inline std::string failoverToSlave() { return _failoverToSlaveStr; }
+
+	/**
+	 * @return Whether this slave interface was specified by the user or auto-detected.
+	 */
+	inline bool isUserSpecified() { return _isUserSpecified; }
+
+	/**
+	 * Signify that this slave was specified by the user and not the result of auto-detection.
+	 *
+	 * @param isUserSpecified
+	 */
+	inline void setAsUserSpecified(bool isUserSpecified) { _isUserSpecified = isUserSpecified; }
+
+	/**
+	 * @return Whether or not the user has specified failover instructions.
+	 */
+	inline bool userHasSpecifiedFailoverInstructions() { return _failoverToSlaveStr.length(); }
+
+	/**
+	 * @return The speed of the slave relative to others in the bond.
+	 */
+	inline uint8_t relativeSpeed() { return _relativeSpeed; }
+
+	/**
+	 * Sets the speed of the slave relative to others in the bond.
+	 *
+	 * @param relativeSpeed The speed relative to the rest of the slave interfaces.
+	 */
+	inline void setRelativeSpeed(uint8_t relativeSpeed) { _relativeSpeed = relativeSpeed; }
+
+	/**
+	 * Sets the interval at which paths on this slave are monitored (instead of the bond-wide default).
+	 *
+	 * @param interval Monitoring interval for paths on this slave
+	 */
+	inline void setMonitorInterval(uint32_t interval) { _slaveMonitorInterval = interval; }
+
+	/**
+	 * @return The monitoring interval for paths on this slave (as specified by the user.)
+	 */
+	inline uint32_t monitorInterval() { return _slaveMonitorInterval; }
+
+	/**
+	 * @return The absolute speed of the slave interface (as specified by the user.)
+	 */
+	inline uint32_t speed() { return _speed; }
+
+	/**
+	 * @return The address preference for this slave interface (as specified by the user.)
+	 */
+	inline uint8_t ipvPref() { return _ipvPref; }
+
+	/**
+	 * @return The mode (e.g. primary/spare) for this slave interface (as specified by the user.)
+	 */
+	inline uint8_t mode() { return _mode; }
+
+	/**
+	 * @return The upDelay parameter for all paths on this slave interface.
+	 */
+	inline uint32_t upDelay() { return _upDelay; }
+
+	/**
+	 * @return The downDelay parameter for all paths on this slave interface.
+	 */
+	inline uint32_t downDelay() { return _downDelay; }
+
+	/**
+	 * @return Whether this slave is enabled or disabled
+	 */
+	inline uint8_t enabled() { return _enabled; }
+
+private:
+
+	/**
+	 * String representation of underlying interface's system name
+	 */
+	std::string _ifnameStr;
+
+	/**
+	 * What preference (if any) a user has for IP protocol version used in
+	 * path aggregations. Preference is expressed in the order of the digits:
+	 * 
+	 *  0: no preference
+	 *  4: IPv4 only
+	 *  6: IPv6 only
+	 * 46: IPv4 over IPv6
+	 * 64: IPv6 over IPv4
+	 */
+	uint8_t _ipvPref;
+
+	/**
+	 * User-specified speed of this slave/link
+	 */
+	uint32_t _speed;
+
+	/**
+	 * Speed relative to other specified slaves/links (computed by Bond)
+	 */
+	uint8_t _relativeSpeed;
+
+	/**
+	 * User-specified interval for monitoring paths on this specific slave
+	 * instead of using the more generic interval specified for the entire
+	 * bond.
+	 */
+	uint32_t _slaveMonitorInterval;
+
+	/**
+	 * How long after coming online before a path is considered usable (when using
+	 * policies that support fail-over events).
+	 */
+	uint32_t _upDelay;
+
+	/**
+	 * How long before a path is considered to be dead (when using policies that
+	 * support fail-over events).
+	 */
+	uint32_t _downDelay;
+
+	/**
+	 * Whether this slave is enabled or disabled (a slave may be disabled due to a bad config)
+	 */
+	uint8_t _enabled;
+
+	/**
+	 * Whether this slave is designated as a primary, a spare, or no preference.
+	 */
+	uint8_t _mode;
+
+	/**
+	 * The specific name of the interface to be used in the event that this 
+	 * slave fails.
+	 */
+	std::string _failoverToSlaveStr;
+
+	/**
+	 * User-specified allocation
+	 */
+	float _userSpecifiedAlloc;
+
+	/**
+	 * Whether or not this slave was created as a result of manual user specification. This is
+	 * important to know because certain policy decisions depend on whether the user
+	 * intends to use a specific set of interfaces.
+	 */
+	bool _isUserSpecified;
+
+	AtomicCounter __refCount;
+
+};
+
+} // namespace ZeroTier
+
+#endif
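
For illustration, a hedged sketch of constructing a Slave the way OneService.cpp does further down in this commit when it parses a "slaves" entry. The interface names and delay values are made up, and the snippet assumes it is compiled inside the project tree with Slave.hpp available; in the service the pointer is handed straight to BondController::addCustomSlave(), which manages its lifetime:

	#include <string>
	#include "osdep/Slave.hpp"

	static ZeroTier::Slave *makeExampleSlave()
	{
		// A user-specified primary slave on eth0, IPv4-preferred, failing over to eth1.
		// Argument order follows the constructor declared above; delay values are illustrative.
		std::string ifnameStr("eth0");
		ZeroTier::Slave *s = new ZeroTier::Slave(
			ifnameStr,
			4,                               // ipvPref: IPv4 only
			0,                               // speed: unspecified, let the bond estimate it
			0,                               // slaveMonitorInterval: use the bond-wide interval
			500,                             // upDelay (illustrative)
			500,                             // downDelay (illustrative)
			true,                            // enabled
			ZT_MULTIPATH_SLAVE_MODE_PRIMARY, // mode
			std::string("eth1"),             // failoverToSlaveStr
			0.0f);                           // userSpecifiedAlloc: none
		s->setAsUserSpecified(true);
		return s;
	}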

+ 167 - 43
service/OneService.cpp

@@ -1,10 +1,10 @@
 /*
- * Copyright (c)2019 ZeroTier, Inc.
+ * Copyright (c)2013-2020 ZeroTier, Inc.
  *
  * Use of this software is governed by the Business Source License included
  * in the LICENSE.TXT file in the project's root directory.
  *
- * Change Date: 2023-01-01
+ * Change Date: 2024-01-01
  *
  * On the date above, in accordance with the Business Source License, use
  * of this software will be governed by version 2.0 of the Apache License.
@@ -39,6 +39,8 @@
 #include "../node/Salsa20.hpp"
 #include "../node/Poly1305.hpp"
 #include "../node/SHA512.hpp"
+#include "../node/Bond.hpp"
+#include "../node/Peer.hpp"
 
 #include "../osdep/Phy.hpp"
 #include "../osdep/Thread.hpp"
@@ -48,6 +50,7 @@
 #include "../osdep/Binder.hpp"
 #include "../osdep/ManagedRoute.hpp"
 #include "../osdep/BlockingQueue.hpp"
+#include "../osdep/Slave.hpp"
 
 #include "OneService.hpp"
 #include "SoftwareUpdater.hpp"
@@ -266,37 +269,43 @@ static void _peerToJson(nlohmann::json &pj,const ZT_Peer *peer)
 	pj["paths"] = pa;
 }
 
-static void _peerAggregateLinkToJson(nlohmann::json &pj,const ZT_Peer *peer)
+static void _peerBondToJson(nlohmann::json &pj,const ZT_Peer *peer)
 {
 	char tmp[256];
 	OSUtils::ztsnprintf(tmp,sizeof(tmp),"%.10llx",peer->address);
-	pj["aggregateLinkLatency"] = peer->latency;
+	//pj["aggregateLinkLatency"] = peer->latency;
+	std::string policyStr = BondController::getPolicyStrByCode(peer->bondingPolicy);
+	pj["policy"] = policyStr;
 
 	nlohmann::json pa = nlohmann::json::array();
 	for(unsigned int i=0;i<peer->pathCount;++i) {
 		int64_t lastSend = peer->paths[i].lastSend;
 		int64_t lastReceive = peer->paths[i].lastReceive;
 		nlohmann::json j;
-		j["address"] = reinterpret_cast<const InetAddress *>(&(peer->paths[i].address))->toString(tmp);
-		j["lastSend"] = (lastSend < 0) ? 0 : lastSend;
-		j["lastReceive"] = (lastReceive < 0) ? 0 : lastReceive;
+		j["ifname"] = std::string(peer->paths[i].ifname);
+		j["path"] = reinterpret_cast<const InetAddress *>(&(peer->paths[i].address))->toString(tmp);
+		j["lastTX"] = (lastSend < 0) ? 0 : lastSend;
+		j["lastRX"] = (lastReceive < 0) ? 0 : lastReceive;
+		j["lat"] = peer->paths[i].latencyMean;
+		j["pdv"] = peer->paths[i].latencyVariance;
+
 		//j["trustedPathId"] = peer->paths[i].trustedPathId;
 		//j["active"] = (bool)(peer->paths[i].expired == 0);
 		//j["expired"] = (bool)(peer->paths[i].expired != 0);
 		//j["preferred"] = (bool)(peer->paths[i].preferred != 0);
-		j["latency"] = peer->paths[i].latency;
-		j["pdv"] = peer->paths[i].packetDelayVariance;
-		//j["throughputDisturbCoeff"] = peer->paths[i].throughputDisturbCoeff;
-		//j["packetErrorRatio"] = peer->paths[i].packetErrorRatio;
-		//j["packetLossRatio"] = peer->paths[i].packetLossRatio;
-		j["stability"] = peer->paths[i].stability;
-		j["throughput"] = peer->paths[i].throughput;
-		//j["maxThroughput"] = peer->paths[i].maxThroughput;
-		j["allocation"] = peer->paths[i].allocation;
-		j["ifname"] = peer->paths[i].ifname;
+		//j["ltm"] = peer->paths[i].latencyMax;
+		//j["plr"] = peer->paths[i].packetLossRatio;
+		//j["per"] = peer->paths[i].packetErrorRatio;
+		//j["thr"] = peer->paths[i].throughputMean;
+		//j["thm"] = peer->paths[i].throughputMax;
+		//j["thv"] = peer->paths[i].throughputVariance;
+		//j["avl"] = peer->paths[i].availability;
+		//j["age"] = peer->paths[i].age;
+		//j["alloc"] = peer->paths[i].allocation;
+		//j["ifname"] = peer->paths[i].ifname;
 		pa.push_back(j);
 	}
-	pj["paths"] = pa;
+	pj["slaves"] = pa;
 }
 
 static void _moonToJson(nlohmann::json &mj,const World &world)
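
For reference, a hedged sketch of the per-peer JSON shape _peerBondToJson() now produces under "bonds". Key names are taken from the code above; the values, and the policy name string, are illustrative since getPolicyStrByCode() is not shown in this commit:

	#include <nlohmann/json.hpp>
	#include <iostream>

	int main()
	{
		nlohmann::json slave;
		slave["ifname"] = "eth0";
		slave["path"]   = "203.0.113.10/9993";
		slave["lastTX"] = 1588000000000;
		slave["lastRX"] = 1588000000500;
		slave["lat"]    = 14.2; // mean latency
		slave["pdv"]    = 1.7;  // packet (latency) delay variance

		nlohmann::json bond;
		bond["policy"] = "balance-aware"; // assumed string form of the peer's bonding policy
		bond["slaves"] = nlohmann::json::array({slave});
		std::cout << bond.dump(2) << std::endl;
		return 0;
	}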
@@ -429,7 +438,7 @@ public:
 	bool _updateAutoApply;
 	bool _allowTcpFallbackRelay;
 	bool _allowSecondaryPort;
-	unsigned int _multipathMode;
+
 	unsigned int _primaryPort;
 	unsigned int _secondaryPort;
 	unsigned int _tertiaryPort;
@@ -718,6 +727,7 @@ public:
 				}
 			}
 #endif
+
 			// Delete legacy iddb.d if present (cleanup)
 			OSUtils::rmDashRf((_homePath + ZT_PATH_SEPARATOR_S "iddb.d").c_str());
 
@@ -752,7 +762,6 @@ public:
 			int64_t lastTapMulticastGroupCheck = 0;
 			int64_t lastBindRefresh = 0;
 			int64_t lastUpdateCheck = clockShouldBe;
-			int64_t lastMultipathModeUpdate = 0;
 			int64_t lastCleanedPeersDb = 0;
 			int64_t lastLocalInterfaceAddressCheck = (clockShouldBe - ZT_LOCAL_INTERFACE_CHECK_INTERVAL) + 15000; // do this in 15s to give portmapper time to configure and other things time to settle
 			int64_t lastLocalConfFileCheck = OSUtils::now();
@@ -798,7 +807,7 @@ public:
 				}
 
 				// Refresh bindings in case device's interfaces have changed, and also sync routes to update any shadow routes (e.g. shadow default)
-				if (((now - lastBindRefresh) >= (_multipathMode ? ZT_BINDER_REFRESH_PERIOD / 8 : ZT_BINDER_REFRESH_PERIOD))||(restarted)) {
+				if (((now - lastBindRefresh) >= (_node->bondController()->inUse() ? ZT_BINDER_REFRESH_PERIOD / 4 : ZT_BINDER_REFRESH_PERIOD))||(restarted)) {
 					lastBindRefresh = now;
 					unsigned int p[3];
 					unsigned int pc = 0;
@@ -815,11 +824,6 @@ public:
 						}
 					}
 				}
-				// Update multipath mode (if needed)
-				if (((now - lastMultipathModeUpdate) >= ZT_BINDER_REFRESH_PERIOD / 8)||(restarted)) {
-					lastMultipathModeUpdate = now;
-					_node->setMultipathMode(_multipathMode);
-				}
 
 				// Run background task processor in core if it's time to do so
 				int64_t dl = _nextBackgroundTaskDeadline;
@@ -855,7 +859,7 @@ public:
 				}
 
 				// Sync information about physical network interfaces
-				if ((now - lastLocalInterfaceAddressCheck) >= (_multipathMode ? ZT_LOCAL_INTERFACE_CHECK_INTERVAL / 8 : ZT_LOCAL_INTERFACE_CHECK_INTERVAL)) {
+				if ((now - lastLocalInterfaceAddressCheck) >= (_node->bondController()->inUse() ? ZT_LOCAL_INTERFACE_CHECK_INTERVAL / 8 : ZT_LOCAL_INTERFACE_CHECK_INTERVAL)) {
 					lastLocalInterfaceAddressCheck = now;
 
 					_node->clearLocalInterfaceAddresses();
@@ -869,8 +873,9 @@ public:
 #endif
 
 					std::vector<InetAddress> boundAddrs(_binder.allBoundLocalInterfaceAddresses());
-					for(std::vector<InetAddress>::const_iterator i(boundAddrs.begin());i!=boundAddrs.end();++i)
+					for(std::vector<InetAddress>::const_iterator i(boundAddrs.begin());i!=boundAddrs.end();++i) {
 						_node->addLocalInterfaceAddress(reinterpret_cast<const struct sockaddr_storage *>(&(*i)));
+					}
 				}
 
 				// Clean peers.d periodically
@@ -1209,15 +1214,15 @@ public:
 					settings["primaryPort"] = OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
 					settings["allowTcpFallbackRelay"] = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],_allowTcpFallbackRelay);
 
-					if (_multipathMode) {
-						json &multipathConfig = res["multipath"];
+					if (_node->bondController()->inUse()) {
+						json &multipathConfig = res["bonds"];
 						ZT_PeerList *pl = _node->peers();
 						char peerAddrStr[256];
 						if (pl) {
 							for(unsigned long i=0;i<pl->peerCount;++i) {
-								if (pl->peers[i].hadAggregateLink) {
+								if (pl->peers[i].isBonded) {
 									nlohmann::json pj;
-									_peerAggregateLinkToJson(pj,&(pl->peers[i]));
+									_peerBondToJson(pj,&(pl->peers[i]));
 									OSUtils::ztsnprintf(peerAddrStr,sizeof(peerAddrStr),"%.10llx",pl->peers[i].address);
 									multipathConfig[peerAddrStr] = (pj);
 								}
@@ -1346,8 +1351,8 @@ public:
 							if (j.is_object()) {
 								seed = Utils::hexStrToU64(OSUtils::jsonString(j["seed"],"0").c_str());
 							}
-						} catch (std::exception &exc) {
 						} catch ( ... ) {
+							// discard invalid JSON
 						}
 
 						std::vector<World> moons(_node->moons());
@@ -1396,8 +1401,8 @@ public:
 											json &allowDefault = j["allowDefault"];
 											if (allowDefault.is_boolean()) localSettings.allowDefault = (bool)allowDefault;
 										}
-									} catch (std::exception &exc) {
 									} catch ( ... ) {
+										// discard invalid JSON
 									}
 
 									setNetworkSettings(nws->networks[i].nwid,localSettings);
@@ -1551,10 +1556,133 @@ public:
 
 		json &settings = lc["settings"];
 
+		if (!_node->bondController()->inUse()) {
+			// defaultBondingPolicy
+			std::string defaultBondingPolicyStr(OSUtils::jsonString(settings["defaultBondingPolicy"],""));
+			int defaultBondingPolicy = _node->bondController()->getPolicyCodeByStr(defaultBondingPolicyStr);
+			_node->bondController()->setBondingLayerDefaultPolicy(defaultBondingPolicy);
+			_node->bondController()->setBondingLayerDefaultPolicyStr(defaultBondingPolicyStr); // Used if custom policy
+			// Custom Policies
+			json &customBondingPolicies = settings["policies"];
+			for (json::iterator policyItr = customBondingPolicies.begin(); policyItr != customBondingPolicies.end();++policyItr) {
+				fprintf(stderr, "\n\n--- (%s)\n", policyItr.key().c_str());
+				// Custom Policy
+				std::string customPolicyStr(policyItr.key());
+				json &customPolicy = policyItr.value();
+				std::string basePolicyStr(OSUtils::jsonString(customPolicy["basePolicy"],""));
+				if (_node->bondController()->getPolicyCodeByStr(basePolicyStr) == ZT_BONDING_POLICY_NONE) {
+					fprintf(stderr, "error: custom policy (%s) is invalid, unknown base policy (%s).\n",
+						customPolicyStr.c_str(), basePolicyStr.c_str());
+					continue;
+				} if (_node->bondController()->getPolicyCodeByStr(customPolicyStr) != ZT_BONDING_POLICY_NONE) {
+					fprintf(stderr, "error: custom policy (%s) will be ignored, cannot use standard policy names for custom policies.\n",
+						customPolicyStr.c_str());
+					continue;
+				}
+				// New bond, used as a copy template for new instances
+				SharedPtr<Bond> newTemplateBond = new Bond(basePolicyStr, customPolicyStr, SharedPtr<Peer>());
+				// Acceptable ranges
+				newTemplateBond->setMaxAcceptableLatency(OSUtils::jsonInt(customPolicy["maxAcceptableLatency"],-1));
+				newTemplateBond->setMaxAcceptableMeanLatency(OSUtils::jsonInt(customPolicy["maxAcceptableMeanLatency"],-1));
+				newTemplateBond->setMaxAcceptablePacketDelayVariance(OSUtils::jsonInt(customPolicy["maxAcceptablePacketDelayVariance"],-1));
+				newTemplateBond->setMaxAcceptablePacketLossRatio((float)OSUtils::jsonDouble(customPolicy["maxAcceptablePacketLossRatio"],-1));
+				newTemplateBond->setMaxAcceptablePacketErrorRatio((float)OSUtils::jsonDouble(customPolicy["maxAcceptablePacketErrorRatio"],-1));
+				newTemplateBond->setMinAcceptableAllocation((float)OSUtils::jsonDouble(customPolicy["minAcceptableAllocation"],0));
+				// Quality weights
+				json &qualityWeights = customPolicy["qualityWeights"];
+				if (qualityWeights.size() == ZT_QOS_WEIGHT_SIZE) { // TODO: Generalize this
+					float weights[ZT_QOS_WEIGHT_SIZE];
+					weights[ZT_QOS_LAT_IDX] = (float)OSUtils::jsonDouble(qualityWeights["lat"],0.0);
+					weights[ZT_QOS_LTM_IDX] = (float)OSUtils::jsonDouble(qualityWeights["ltm"],0.0);
+					weights[ZT_QOS_PDV_IDX] = (float)OSUtils::jsonDouble(qualityWeights["pdv"],0.0);
+					weights[ZT_QOS_PLR_IDX] = (float)OSUtils::jsonDouble(qualityWeights["plr"],0.0);
+					weights[ZT_QOS_PER_IDX] = (float)OSUtils::jsonDouble(qualityWeights["per"],0.0);
+					weights[ZT_QOS_THR_IDX] = (float)OSUtils::jsonDouble(qualityWeights["thr"],0.0);
+					weights[ZT_QOS_THM_IDX] = (float)OSUtils::jsonDouble(qualityWeights["thm"],0.0);
+					weights[ZT_QOS_THV_IDX] = (float)OSUtils::jsonDouble(qualityWeights["thv"],0.0);
+					newTemplateBond->setUserQualityWeights(weights,ZT_QOS_WEIGHT_SIZE);
+				}
+				// Bond-specific properties
+				newTemplateBond->setUpDelay(OSUtils::jsonInt(customPolicy["upDelay"],-1));
+				newTemplateBond->setDownDelay(OSUtils::jsonInt(customPolicy["downDelay"],-1));
+				newTemplateBond->setFailoverInterval(OSUtils::jsonInt(customPolicy["failoverInterval"],(uint64_t)0));
+				newTemplateBond->setPacketsPerSlave(OSUtils::jsonInt(customPolicy["packetsPerSlave"],-1));
+				std::string slaveMonitorStrategyStr(OSUtils::jsonString(customPolicy["slaveMonitorStrategy"],""));
+				uint8_t slaveMonitorStrategy = ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DEFAULT;
+				if (slaveMonitorStrategyStr == "passive") { newTemplateBond->setSlaveMonitorStrategy(ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_PASSIVE); }
+				if (slaveMonitorStrategyStr == "active") { newTemplateBond->setSlaveMonitorStrategy(ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_ACTIVE); }
+				if (slaveMonitorStrategyStr == "dynamic") { newTemplateBond->setSlaveMonitorStrategy(ZT_MULTIPATH_SLAVE_MONITOR_STRATEGY_DYNAMIC); }
+				// Policy-Specific slave set
+				json &slaves = customPolicy["slaves"];
+				for (json::iterator slaveItr = slaves.begin(); slaveItr != slaves.end();++slaveItr) {
+					fprintf(stderr, "\t--- slave (%s)\n", slaveItr.key().c_str());
+					std::string slaveNameStr(slaveItr.key());
+					json &slave = slaveItr.value();
+
+					bool enabled = OSUtils::jsonInt(slave["enabled"],true);
+					uint32_t speed = OSUtils::jsonInt(slave["speed"],0);
+					float alloc = (float)OSUtils::jsonDouble(slave["alloc"],0);
+
+					if (speed && alloc) {
+						fprintf(stderr, "error: cannot specify both speed (%d) and alloc (%f) for slave (%s), pick one, slave disabled.\n",
+							speed, alloc, slaveNameStr.c_str());
+						enabled = false;
+					}
+					uint32_t upDelay = OSUtils::jsonInt(slave["upDelay"],-1);
+					uint32_t downDelay = OSUtils::jsonInt(slave["downDelay"],-1);
+					uint8_t ipvPref = OSUtils::jsonInt(slave["ipvPref"],0);
+					uint32_t slaveMonitorInterval = OSUtils::jsonInt(slave["monitorInterval"],(uint64_t)0);
+					std::string failoverToStr(OSUtils::jsonString(slave["failoverTo"],""));
+					// Mode
+					std::string slaveModeStr(OSUtils::jsonString(slave["mode"],"spare"));
+					uint8_t slaveMode = ZT_MULTIPATH_SLAVE_MODE_SPARE;
+					if (slaveModeStr == "primary") { slaveMode = ZT_MULTIPATH_SLAVE_MODE_PRIMARY; }
+					if (slaveModeStr == "spare") { slaveMode = ZT_MULTIPATH_SLAVE_MODE_SPARE; }
+					// ipvPref
+					if ((ipvPref != 0) && (ipvPref != 4) && (ipvPref != 6) && (ipvPref != 46) && (ipvPref != 64)) {
+						fprintf(stderr, "error: invalid ipvPref value (%d), slave disabled.\n", ipvPref);
+						enabled = false;
+					}
+					if (slaveMode == ZT_MULTIPATH_SLAVE_MODE_SPARE && failoverToStr.length()) {
+						fprintf(stderr, "error: cannot specify failover slaves for spares, slave disabled.\n");
+						failoverToStr = "";
+						enabled = false;
+					}
+					_node->bondController()->addCustomSlave(customPolicyStr, new Slave(slaveNameStr,ipvPref,speed,slaveMonitorInterval,upDelay,downDelay,enabled,slaveMode,failoverToStr,alloc));
+				}
+				// TODO: This is dumb
+				std::string slaveSelectMethodStr(OSUtils::jsonString(customPolicy["activeReselect"],"optimize"));
+				if (slaveSelectMethodStr == "always") { newTemplateBond->setSlaveSelectMethod(ZT_MULTIPATH_RESELECTION_POLICY_ALWAYS); }
+				if (slaveSelectMethodStr == "better") { newTemplateBond->setSlaveSelectMethod(ZT_MULTIPATH_RESELECTION_POLICY_BETTER); }
+				if (slaveSelectMethodStr == "failure") { newTemplateBond->setSlaveSelectMethod(ZT_MULTIPATH_RESELECTION_POLICY_FAILURE); }
+				if (slaveSelectMethodStr == "optimize") { newTemplateBond->setSlaveSelectMethod(ZT_MULTIPATH_RESELECTION_POLICY_OPTIMIZE); }
+				if (newTemplateBond->getSlaveSelectMethod() < 0 || newTemplateBond->getSlaveSelectMethod() > 3) {
+					fprintf(stderr, "warning: invalid value (%s) for slaveSelectMethod, assuming mode: always\n", slaveSelectMethodStr.c_str());
+				}
+				/*
+				newBond->setPolicy(_node->bondController()->getPolicyCodeByStr(basePolicyStr));
+				newBond->setFlowHashing((bool)OSUtils::jsonInt(userSpecifiedBondingPolicies[i]["allowFlowHashing"],(bool)allowFlowHashing));
+				newBond->setBondMonitorInterval((unsigned int)OSUtils::jsonInt(userSpecifiedBondingPolicies[i]["monitorInterval"],(uint64_t)0));
+				newBond->setAllowPathNegotiation((bool)OSUtils::jsonInt(userSpecifiedBondingPolicies[i]["allowPathNegotiation"],(bool)false));
+				*/
+				if (!_node->bondController()->addCustomPolicy(newTemplateBond)) {
+					fprintf(stderr, "error: a custom policy of this name (%s) already exists.\n", customPolicyStr.c_str());
+				}
+			}
+			// Peer-specific bonding
+			json &peerSpecificBonds = settings["peerSpecificBonds"];
+			for (json::iterator peerItr = peerSpecificBonds.begin(); peerItr != peerSpecificBonds.end();++peerItr) {
+				_node->bondController()->assignBondingPolicyToPeer(std::stoull(peerItr.key(),0,16), peerItr.value());
+			}
+			// Check settings
+			if (defaultBondingPolicyStr.length() && !defaultBondingPolicy && !_node->bondController()->inUse()) {
+				fprintf(stderr, "error: unknown policy (%s) specified by defaultBondingPolicy, bond disabled.\n", defaultBondingPolicyStr.c_str());
+			}
+		}
+
+		// bondingPolicy cannot be used with allowTcpFallbackRelay
+		_allowTcpFallbackRelay = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],true) && !(_node->bondController()->inUse());
 		_primaryPort = (unsigned int)OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
-		_multipathMode = (unsigned int)OSUtils::jsonInt(settings["multipathMode"],0);
-		// multipathMode cannot be used with allowTcpFallbackRelay
-		_allowTcpFallbackRelay = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],true) && !_multipathMode;
 		_allowSecondaryPort = OSUtils::jsonBool(settings["allowSecondaryPort"],true);
 		_secondaryPort = (unsigned int)OSUtils::jsonInt(settings["secondaryPort"],0);
 		_tertiaryPort = (unsigned int)OSUtils::jsonInt(settings["tertiaryPort"],0);
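
Putting the parsing above together, a hedged sketch of the kind of "settings" object it expects in local.conf, built with nlohmann::json so the key names stay checkable against the parser. The policy and interface names are illustrative; the built-in policy string "active-backup" and the value type of peerSpecificBonds entries are assumptions not shown in this hunk:

	#include <nlohmann/json.hpp>
	#include <iostream>

	int main()
	{
		using nlohmann::json;

		json policy;
		policy["basePolicy"] = "active-backup";    // must map to a built-in bonding policy
		policy["failoverInterval"] = 500;
		policy["upDelay"] = 1000;
		policy["downDelay"] = 1000;
		policy["slaveMonitorStrategy"] = "active"; // passive | active | dynamic
		policy["activeReselect"] = "better";       // always | better | failure | optimize

		json eth0, eth1;
		eth0["mode"] = "primary";
		eth0["ipvPref"] = 46;
		eth0["failoverTo"] = "eth1";
		eth1["mode"] = "spare";
		policy["slaves"] = json{{"eth0", eth0}, {"eth1", eth1}};

		json settings;
		settings["defaultBondingPolicy"] = "custom-lan-wan";
		settings["policies"] = json{{"custom-lan-wan", policy}};
		settings["peerSpecificBonds"] = json{{"a1b2c3d4e5", "custom-lan-wan"}};

		std::cout << json{{"settings", settings}}.dump(2) << std::endl;
		return 0;
	}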
@@ -1705,9 +1833,8 @@ public:
 				}
 			}
 #ifdef __SYNOLOGY__
-			if (!n.tap->addIps(newManagedIps)) {
+			if (!n.tap->addIpSyn(newManagedIps))
 				fprintf(stderr,"ERROR: unable to add ip addresses to ifcfg" ZT_EOL_S);
-			}
 #else
 			for(std::vector<InetAddress>::iterator ip(newManagedIps.begin());ip!=newManagedIps.end();++ip) {
 				if (std::find(n.managedIps.begin(),n.managedIps.end(),*ip) == n.managedIps.end()) {
@@ -2025,8 +2152,6 @@ public:
 					return;
 
 			}
-		} catch (std::exception &exc) {
-			_phy.close(sock);
 		} catch ( ... ) {
 			_phy.close(sock);
 		}
@@ -2135,8 +2260,6 @@ public:
 #endif
 						_nets.erase(nwid);
 						return -999;
-					} catch (int exc) {
-						return -999;
 					} catch ( ... ) {
 						return -999; // tap init failed
 					}
@@ -2743,6 +2866,7 @@ public:
 				if (!strncmp(p->c_str(),ifname,p->length()))
 					return false;
 			}
+			return _node->bondController()->allowedToBind(std::string(ifname));
 		}
 		{
 			// Check global blacklists