2
0
Эх сурвалжийг харах

Merge branch 'adamierymenko-dev' into windows-ui

Grant Limberg 9 жил өмнө
parent
commit
6471c1f4e2

+ 4 - 10
.gitignore

@@ -49,23 +49,17 @@ Thumbs.db
 *.rpm
 *.autosave
 *.tmp
+node_modules
 
-# Root topology build files, temporaries, and never check in secrets
-/root-topology/bin2c
-/root-topology/mktopology
-/root-topology/*.secret
-/root-topology/test/supernodes
-/root-topology/test/test-root-topology
+# cluster-geo stuff
+cluster-geo/cluster-geo/config.js
+cluster-geo/cluster-geo/cache.*
 
 # MacGap wrapper build files
 /ext/mac-ui-macgap1-wrapper/src/MacGap.xcodeproj/project.xcworkspace/xcuserdata/*
 /ext/mac-ui-macgap1-wrapper/src/MacGap.xcodeproj/xcuserdata/*
 /ext/mac-ui-macgap1-wrapper/src/build
 
-# Web UI dev temporaries
-/ui/.module-cache
-node_modules
-
 # Java/Android/JNI build droppings
 java/obj/
 java/libs/

+ 13 - 0
cluster-geo/cluster-geo.exe

@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Launcher for the cluster-geo NodeJS GeoIP lookup service.
+# Expects the NodeJS script files in a "cluster-geo" directory located next
+# to this launcher and replaces itself with "node index.js" run from there.
+
+export PATH=/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin
+
+# Quote the path so install locations containing spaces work, and stop if
+# the directory change fails instead of running from the wrong place.
+cd "$(dirname "$0")" || exit 1
+if [ ! -d cluster-geo ] || [ ! -f cluster-geo/index.js ]; then
+	echo 'Cannot find ./cluster-geo containing NodeJS script files.'
+	exit 1
+fi
+
+cd cluster-geo || exit 1
+
+exec node index.js

+ 7 - 0
cluster-geo/cluster-geo/config.js.sample

@@ -0,0 +1,7 @@
+// MaxMind GeoIP2 config
+//
+// Sample file: the values below are placeholders. index.js loads config.js
+// from its own directory and hands this object unchanged to geoip2ws.
+module.exports.maxmind = {
+	userId: 1234, // placeholder account ID
+	licenseKey: 'asdf', // placeholder license key
+	service: 'city',
+	requestTimeout: 1000 // presumably milliseconds -- confirm against geoip2ws documentation
+};

+ 94 - 0
cluster-geo/cluster-geo/index.js

@@ -0,0 +1,94 @@
+//
+// GeoIP lookup service
+//
+// Startup: load configuration, create the MaxMind client and open the
+// on-disk result cache. The process exits with status 1 when no MaxMind
+// configuration is present.
+//
+
+// GeoIP cache TTL in ms
+var CACHE_TTL = (60 * 60 * 24 * 60 * 1000); // 60 days
+
+// config.js lives next to this script (see config.js.sample)
+var config = require(__dirname + '/config.js');
+
+if (!config.maxmind) {
+	console.error('FATAL: only MaxMind GeoIP2 is currently supported and is not configured in config.js');
+	process.exit(1);
+}
+// Lookup function: geo(ip,callback(err,result))
+var geo = require('geoip2ws')(config.maxmind);
+
+// Persistent cache of lookup results, keyed by IP string
+var cache = require('levelup')(__dirname + '/cache.leveldb');
+
+function lookup(ip,callback)
+{
+	cache.get(ip,function(err,cachedEntryJson) {
+		if ((!err)&&(cachedEntryJson)) {
+			try {
+				var cachedEntry = JSON.parse(cachedEntryJson.toString());
+				if (cachedEntry) {
+					var ts = cachedEntry.ts;
+					var r = cachedEntry.r;
+					if ((ts)&&(r)) {
+						if ((Date.now() - ts) < CACHE_TTL) {
+							r._cached = true;
+							return callback(null,r);
+						}
+					}
+				}
+			} catch (e) {}
+		}
+
+		geo(ip,function(err,result) {
+			if (err)
+				return callback(err,null);
+			if ((!result)||(!result.location))
+				return callback(new Error('null result'),null);
+
+			cache.put(ip,JSON.stringify({
+				ts: Date.now(),
+				r: result
+			}),function(err) {
+				if (err)
+					console.error('Error saving to cache: '+err);
+				return callback(null,result);
+			});
+		});
+	});
+};
+
+var linebuf = '';
+process.stdin.on('readable',function() {
+	var chunk;
+	while (null !== (chunk = process.stdin.read())) {
+		for(var i=0;i<chunk.length;++i) {
+			var c = chunk[i];
+			if ((c == 0x0d)||(c == 0x0a)) {
+				if (linebuf.length > 0) {
+					var ip = linebuf;
+					lookup(ip,function(err,result) {
+						if ((err)||(!result)||(!result.location)) {
+							return process.stdout.write(ip+',0,0,0,0,0,0\n');
+						} else {
+							var lat = parseFloat(result.location.latitude);
+							var lon = parseFloat(result.location.longitude);
+
+							// Convert to X,Y,Z coordinates from Earth's origin, Earth-as-sphere approximation.
+							var latRadians = lat * 0.01745329251994; // PI / 180
+							var lonRadians = lon * 0.01745329251994; // PI / 180
+							var cosLat = Math.cos(latRadians);
+							var x = Math.round((-6371.0) * cosLat * Math.cos(lonRadians)); // 6371 == Earth's approximate radius in kilometers
+							var y = Math.round(6371.0 * Math.sin(latRadians));
+							var z = Math.round(6371.0 * cosLat * Math.sin(lonRadians));
+
+							return process.stdout.write(ip+',1,'+lat+','+lon+','+x+','+y+','+z+'\n');
+						}
+					});
+				}
+				linebuf = '';
+			} else {
+				linebuf += String.fromCharCode(c);
+			}
+		}
+	}
+});
+
+process.stdin.on('end',function() {
+	cache.close();
+	process.exit(0);
+});

+ 16 - 0
cluster-geo/cluster-geo/package.json

@@ -0,0 +1,16 @@
+{
+  "name": "cluster-geo",
+  "version": "1.0.0",
+  "description": "Cluster GEO-IP Query Service",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "ZeroTier, Inc.",
+  "license": "GPL-3.0",
+  "dependencies": {
+    "geoip2ws": "^1.7.1",
+    "leveldown": "^1.4.2",
+    "levelup": "^1.2.1"
+  }
+}

+ 120 - 1
include/ZeroTierOne.h

@@ -128,6 +128,16 @@ extern "C" {
  */
 #define ZT_CIRCUIT_TEST_MAX_HOP_BREADTH 256
 
+/**
+ * Maximum number of cluster members (and max member ID plus one)
+ */
+#define ZT_CLUSTER_MAX_MEMBERS 256
+
+/**
+ * Maximum allowed cluster message length in bytes
+ */
+#define ZT_CLUSTER_MAX_MESSAGE_LENGTH 65535
+
 /**
  * A null/empty sockaddr (all zero) to signify an unspecified socket address
  */
@@ -174,7 +184,17 @@ enum ZT_ResultCode
 	/**
 	 * Network ID not valid
 	 */
-	ZT_RESULT_ERROR_NETWORK_NOT_FOUND = 1000
+	ZT_RESULT_ERROR_NETWORK_NOT_FOUND = 1000,
+
+	/**
+	 * The requested operation is not supported on this version or build
+	 */
+	ZT_RESULT_ERROR_UNSUPPORTED_OPERATION = 1001,
+
+	/**
+	 * The requested operation was given a bad parameter or was called in an invalid state
+	 */
+	ZT_RESULT_ERROR_BAD_PARAMETER = 1002
 };
 
 /**
@@ -1320,6 +1340,105 @@ enum ZT_ResultCode ZT_Node_circuitTestBegin(ZT_Node *node,ZT_CircuitTest *test,v
  */
 void ZT_Node_circuitTestEnd(ZT_Node *node,ZT_CircuitTest *test);
 
+/**
+ * Initialize cluster operation
+ *
+ * This initializes the internal structures and state for cluster operation.
+ * It takes two function pointers. The first is to a function that can be
+ * used to send data to cluster peers (mechanism is not defined by Node),
+ * and the second is to a function that can be used to get the location of
+ * a physical address in X,Y,Z coordinate space (e.g. as cartesian coordinates
+ * projected from the center of the Earth).
+ *
+ * Send function takes an arbitrary pointer followed by the cluster member ID
+ * to send data to, a pointer to the data, and the length of the data. The
+ * maximum message length is ZT_CLUSTER_MAX_MESSAGE_LENGTH (65535). Messages
+ * must be delivered whole and may be dropped or transposed, though high
+ * failure rates are undesirable and can cause problems. Validity checking or
+ * CRC is also not required since the Node validates the authenticity of
+ * cluster messages using cryptographic methods and will silently drop invalid
+ * messages.
+ *
+ * Address to location function is optional and if NULL geo-handoff is not
+ * enabled (in this case x, y, and z in clusterInit are also unused). It
+ * takes an arbitrary pointer followed by a physical address and three result
+ * parameters for x, y, and z. It returns zero on failure or nonzero if these
+ * three coordinates have been set. Coordinate space is arbitrary and can be
+ * e.g. coordinates on Earth relative to Earth's center. These can be obtained
+ * from latitude and longitude with a spherical-to-Cartesian conversion.
+ *
+ * See: http://stackoverflow.com/questions/1185408/converting-from-longitude-latitude-to-cartesian-coordinates
+ *
+ * Neither the send nor the address to location function should block. If the
+ * address to location function does not have a location for an address, it
+ * should return zero and then look up the address for future use since it
+ * will be called again in (typically) 1-3 minutes.
+ *
+ * Note that both functions can be called from any thread from which the
+ * various Node functions are called, and so must be thread safe if multiple
+ * threads are being used.
+ *
+ * @param node Node instance
+ * @param myId My cluster member ID (must be less than ZT_CLUSTER_MAX_MEMBERS)
+ * @param zeroTierPhysicalEndpoints Preferred physical address(es) for ZeroTier clients to contact this cluster member (for peer redirect)
+ * @param numZeroTierPhysicalEndpoints Number of physical endpoints in zeroTierPhysicalEndpoints[] (max allowed: 255)
+ * @param x My cluster member's X location
+ * @param y My cluster member's Y location
+ * @param z My cluster member's Z location
+ * @param sendFunction Function to be called to send data to other cluster members
+ * @param sendFunctionArg First argument to sendFunction()
+ * @param addressToLocationFunction Function to be called to get the location of a physical address or NULL to disable geo-handoff
+ * @param addressToLocationFunctionArg First argument to addressToLocationFunction()
+ * @return OK or UNSUPPORTED_OPERATION if this Node was not built with cluster support
+ */
+enum ZT_ResultCode ZT_Node_clusterInit(
+	ZT_Node *node,
+	unsigned int myId,
+	const struct sockaddr_storage *zeroTierPhysicalEndpoints,
+	unsigned int numZeroTierPhysicalEndpoints,
+	int x,
+	int y,
+	int z,
+	void (*sendFunction)(void *,unsigned int,const void *,unsigned int),
+	void *sendFunctionArg,
+	int (*addressToLocationFunction)(void *,const struct sockaddr_storage *,int *,int *,int *),
+	void *addressToLocationFunctionArg);
+
+/**
+ * Add a member to this cluster
+ *
+ * Calling this without having called clusterInit() will do nothing.
+ *
+ * @param node Node instance
+ * @param memberId Member ID (must be less than ZT_CLUSTER_MAX_MEMBERS)
+ * @return OK or error if clustering is disabled, ID invalid, etc.
+ */
+enum ZT_ResultCode ZT_Node_clusterAddMember(ZT_Node *node,unsigned int memberId);
+
+/**
+ * Remove a member from this cluster
+ *
+ * Calling this without having called clusterInit() will do nothing.
+ *
+ * @param node Node instance
+ * @param memberId Member ID to remove (nothing happens if not present)
+ */
+void ZT_Node_clusterRemoveMember(ZT_Node *node,unsigned int memberId);
+
+/**
+ * Handle an incoming cluster state message
+ *
+ * The message itself contains cluster member IDs, and invalid or badly
+ * addressed messages will be silently discarded.
+ *
+ * Calling this without having called clusterInit() will do nothing.
+ *
+ * @param node Node instance
+ * @param msg Cluster message
+ * @param len Length of cluster message
+ */
+void ZT_Node_clusterHandleIncomingMessage(ZT_Node *node,const void *msg,unsigned int len);
+
 /**
  * Get ZeroTier One version
  *

+ 1 - 1
make-mac.mk

@@ -6,7 +6,7 @@ ifeq ($(origin CXX),default)
 endif
 
 INCLUDES=
-DEFS=
+DEFS=-DZT_ENABLE_CLUSTER
 LIBS=
 ARCH_FLAGS=-arch x86_64
 

+ 318 - 48
node/Cluster.cpp

@@ -31,10 +31,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <math.h>
 
 #include <algorithm>
 #include <utility>
 
+#include "../version.h"
+
 #include "Cluster.hpp"
 #include "RuntimeEnvironment.hpp"
 #include "MulticastGroup.hpp"
@@ -42,22 +45,44 @@
 #include "Salsa20.hpp"
 #include "Poly1305.hpp"
 #include "Packet.hpp"
+#include "Identity.hpp"
 #include "Peer.hpp"
 #include "Switch.hpp"
 #include "Node.hpp"
 
 namespace ZeroTier {
 
-Cluster::Cluster(const RuntimeEnvironment *renv,uint16_t id,DistanceAlgorithm da,int32_t x,int32_t y,int32_t z,void (*sendFunction)(void *,uint16_t,const void *,unsigned int),void *arg) :
+/**
+ * Straight-line (Euclidean) distance between two points in 3D space
+ *
+ * @return Distance between (x1,y1,z1) and (x2,y2,z2)
+ */
+static inline double _dist3d(int x1,int y1,int z1,int x2,int y2,int z2)
+	throw()
+{
+	// Each coordinate is converted to double before subtracting
+	const double deltaX = (double)x2 - (double)x1;
+	const double deltaY = (double)y2 - (double)y1;
+	const double deltaZ = (double)z2 - (double)z1;
+	const double sumOfSquares = (deltaX * deltaX) + (deltaY * deltaY) + (deltaZ * deltaZ);
+	return sqrt(sumOfSquares);
+}
+
+Cluster::Cluster(
+	const RuntimeEnvironment *renv,
+	uint16_t id,
+	const std::vector<InetAddress> &zeroTierPhysicalEndpoints,
+	int32_t x,
+	int32_t y,
+	int32_t z,
+	void (*sendFunction)(void *,unsigned int,const void *,unsigned int),
+	void *sendFunctionArg,
+	int (*addressToLocationFunction)(void *,const struct sockaddr_storage *,int *,int *,int *),
+	void *addressToLocationFunctionArg) :
 	RR(renv),
 	_sendFunction(sendFunction),
-	_arg(arg),
+	_sendFunctionArg(sendFunctionArg),
+	_addressToLocationFunction(addressToLocationFunction),
+	_addressToLocationFunctionArg(addressToLocationFunctionArg),
 	_x(x),
 	_y(y),
 	_z(z),
-	_da(da),
 	_id(id),
-	_members(new _Member[65536])
+	_zeroTierPhysicalEndpoints(zeroTierPhysicalEndpoints),
+	_members(new _Member[ZT_CLUSTER_MAX_MEMBERS])
 {
 	uint16_t stmp[ZT_SHA512_DIGEST_LEN / sizeof(uint16_t)];
 
@@ -114,16 +139,20 @@ void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len)
 		s20.decrypt12(reinterpret_cast<const char *>(msg) + 24,const_cast<void *>(dmsg.data()),dmsg.size());
 	}
 
-	if (dmsg.size() < 2)
+	if (dmsg.size() < 4)
 		return;
 	const uint16_t fromMemberId = dmsg.at<uint16_t>(0);
 	unsigned int ptr = 2;
+	if (fromMemberId == _id)
+		return;
+	const uint16_t toMemberId = dmsg.at<uint16_t>(ptr);
+	ptr += 2;
+	if (toMemberId != _id)
+		return;
 
 	_Member &m = _members[fromMemberId];
 	Mutex::Lock mlck(m.lock);
 
-	m.lastReceivedFrom = RR->node->now();
-
 	try {
 		while (ptr < dmsg.size()) {
 			const unsigned int mlen = dmsg.at<uint16_t>(ptr); ptr += 2;
@@ -143,31 +172,51 @@ void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len)
 						ptr += 8; // skip local clock, not used
 						m.load = dmsg.at<uint64_t>(ptr); ptr += 8;
 						ptr += 8; // skip flags, unused
-						m.physicalAddressCount = dmsg[ptr++];
-						if (m.physicalAddressCount > ZT_CLUSTER_MEMBER_MAX_PHYSICAL_ADDRS)
-							m.physicalAddressCount = ZT_CLUSTER_MEMBER_MAX_PHYSICAL_ADDRS;
-						for(unsigned int i=0;i<m.physicalAddressCount;++i)
-							ptr += m.physicalAddresses[i].deserialize(dmsg,ptr);
+#ifdef ZT_TRACE
+						std::string addrs;
+#endif
+						unsigned int physicalAddressCount = dmsg[ptr++];
+						for(unsigned int i=0;i<physicalAddressCount;++i) {
+							m.zeroTierPhysicalEndpoints.push_back(InetAddress());
+							ptr += m.zeroTierPhysicalEndpoints.back().deserialize(dmsg,ptr);
+							if (!(m.zeroTierPhysicalEndpoints.back())) {
+								m.zeroTierPhysicalEndpoints.pop_back();
+							}
+#ifdef ZT_TRACE
+							else {
+								if (addrs.length() > 0)
+									addrs.push_back(',');
+								addrs.append(m.zeroTierPhysicalEndpoints.back().toString());
+							}
+#endif
+						}
 						m.lastReceivedAliveAnnouncement = RR->node->now();
+#ifdef ZT_TRACE
+						TRACE("[%u] I'm alive! send me peers at %s",(unsigned int)fromMemberId,addrs.c_str());
+#endif
 					}	break;
 
 					case STATE_MESSAGE_HAVE_PEER: {
 						try {
 							Identity id;
 							ptr += id.deserialize(dmsg,ptr);
-							RR->topology->saveIdentity(id);
-
-							{	// Add or update peer affinity entry
-								_PeerAffinity pa(id.address(),fromMemberId,RR->node->now());
-								Mutex::Lock _l2(_peerAffinities_m);
-								std::vector<_PeerAffinity>::iterator i(std::lower_bound(_peerAffinities.begin(),_peerAffinities.end(),pa)); // O(log(n))
-								if ((i != _peerAffinities.end())&&(i->key == pa.key)) {
-									i->timestamp = pa.timestamp;
-								} else {
-									_peerAffinities.push_back(pa);
-									std::sort(_peerAffinities.begin(),_peerAffinities.end()); // probably a more efficient way to insert but okay for now
-								}
-	 						}
+							if (id) {
+								RR->topology->saveIdentity(id);
+
+								{	// Add or update peer affinity entry
+									_PeerAffinity pa(id.address(),fromMemberId,RR->node->now());
+									Mutex::Lock _l2(_peerAffinities_m);
+									std::vector<_PeerAffinity>::iterator i(std::lower_bound(_peerAffinities.begin(),_peerAffinities.end(),pa)); // O(log(n))
+									if ((i != _peerAffinities.end())&&(i->key == pa.key)) {
+										i->timestamp = pa.timestamp;
+									} else {
+										_peerAffinities.push_back(pa);
+										std::sort(_peerAffinities.begin(),_peerAffinities.end()); // probably a more efficient way to insert but okay for now
+									}
+		 						}
+
+		 						TRACE("[%u] has %s",(unsigned int)fromMemberId,id.address().toString().c_str());
+		 					}
 						} catch ( ... ) {
 							// ignore invalid identities
 						}
@@ -179,10 +228,15 @@ void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len)
 						const MAC mac(dmsg.field(ptr,6),6); ptr += 6;
 						const uint32_t adi = dmsg.at<uint32_t>(ptr); ptr += 4;
 						RR->mc->add(RR->node->now(),nwid,MulticastGroup(mac,adi),address);
+						TRACE("[%u] %s likes %s/%u on %.16llu",(unsigned int)fromMemberId,address.toString().c_str(),mac.toString().c_str(),(unsigned int)adi,nwid);
 					}	break;
 
 					case STATE_MESSAGE_COM: {
-						// TODO: not used yet
+						CertificateOfMembership com;
+						ptr += com.deserialize(dmsg,ptr);
+						if (com) {
+							TRACE("[%u] COM for %s on %.16llu rev %llu",(unsigned int)fromMemberId,com.issuedTo().toString().c_str(),com.networkId(),com.revision());
+						}
 					}	break;
 
 					case STATE_MESSAGE_RELAY: {
@@ -195,6 +249,8 @@ void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len)
 
 						if (packetLen >= ZT_PROTO_MIN_FRAGMENT_LENGTH) { // ignore anything too short to contain a dest address
 							const Address destinationAddress(reinterpret_cast<const char *>(packet) + 8,ZT_ADDRESS_LENGTH);
+							TRACE("[%u] relay %u bytes to %s (%u remote paths included)",(unsigned int)fromMemberId,packetLen,destinationAddress.toString().c_str(),numRemotePeerPaths);
+
 							SharedPtr<Peer> destinationPeer(RR->topology->getPeer(destinationAddress));
 							if (destinationPeer) {
 								if (
@@ -232,8 +288,6 @@ void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len)
 									remotePeerAddress.appendTo(rendezvousForDest);
 
 									Buffer<2048> rendezvousForOtherEnd;
-									rendezvousForOtherEnd.addSize(2); // leave room for payload size
-									rendezvousForOtherEnd.append((uint8_t)STATE_MESSAGE_PROXY_SEND);
 									remotePeerAddress.appendTo(rendezvousForOtherEnd);
 									rendezvousForOtherEnd.append((uint8_t)Packet::VERB_RENDEZVOUS);
 									const unsigned int rendezvousForOtherEndPayloadSizePtr = rendezvousForOtherEnd.size();
@@ -267,9 +321,8 @@ void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len)
 									}
 
 									if (haveMatch) {
+										_send(fromMemberId,STATE_MESSAGE_PROXY_SEND,rendezvousForOtherEnd.data(),rendezvousForOtherEnd.size());
 										RR->sw->send(rendezvousForDest,true,0);
-										rendezvousForOtherEnd.setAt<uint16_t>(0,(uint16_t)(rendezvousForOtherEnd.size() - 2));
-										_send(fromMemberId,rendezvousForOtherEnd.data(),rendezvousForOtherEnd.size());
 									}
 								}
 							}
@@ -283,6 +336,7 @@ void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len)
 						Packet outp(rcpt,RR->identity.address(),verb);
 						outp.append(dmsg.field(ptr,len),len);
 						RR->sw->send(outp,true,0);
+						TRACE("[%u] proxy send %s to %s length %u",(unsigned int)fromMemberId,Packet::verbString(verb),rcpt.toString().c_str(),len);
 					}	break;
 				}
 			} catch ( ... ) {
@@ -298,37 +352,172 @@ void Cluster::handleIncomingStateMessage(const void *msg,unsigned int len)
 	}
 }
 
-void Cluster::replicateHavePeer(const Address &peerAddress)
+/**
+ * Relay a packet or fragment through the cluster member that most recently
+ * announced having the destination peer.
+ *
+ * The relay message carries up to two physical addresses of the source peer
+ * (when the source is given and known), followed by a 16-bit length and the
+ * packet data.
+ *
+ * @param fromPeerAddress Source peer address, or a null address if unknown
+ * @param toPeerAddress Destination peer address
+ * @param data Packet or packet fragment data
+ * @param len Length of data in bytes (more than 16384 is rejected)
+ * @return True if the data was queued for another member, false if no other member has announced this peer recently enough
+ */
+bool Cluster::sendViaCluster(const Address &fromPeerAddress,const Address &toPeerAddress,const void *data,unsigned int len)
+{
+	if (len > 16384) // sanity check
+		return false;
+
+	uint64_t mostRecentTimestamp = 0;
+	uint16_t canHasPeer = 0;
+
+	{	// Anyone got this peer?
+		Mutex::Lock _l2(_peerAffinities_m);
+		std::vector<_PeerAffinity>::iterator i(std::lower_bound(_peerAffinities.begin(),_peerAffinities.end(),_PeerAffinity(toPeerAddress,0,0))); // O(log(n))
+		while ((i != _peerAffinities.end())&&(i->address() == toPeerAddress)) {
+			uint16_t mid = i->clusterMemberId();
+			if ((mid != _id)&&(i->timestamp > mostRecentTimestamp)) {
+				mostRecentTimestamp = i->timestamp;
+				canHasPeer = mid;
+			}
+			++i; // advance to the next entry; without this the loop never terminates
+		}
+	}
+
+	// With no matching entry mostRecentTimestamp stays 0 and this check fails
+	const uint64_t now = RR->node->now();
+	if ((now - mostRecentTimestamp) < ZT_PEER_ACTIVITY_TIMEOUT) {
+		Buffer<16384> buf;
+
+		InetAddress v4,v6;
+		if (fromPeerAddress) {
+			SharedPtr<Peer> fromPeer(RR->topology->getPeer(fromPeerAddress));
+			if (fromPeer)
+				fromPeer->getBestActiveAddresses(now,v4,v6);
+		}
+		// Number of source addresses that follow (0, 1 or 2)
+		buf.append((uint8_t)( (v4) ? ((v6) ? 2 : 1) : ((v6) ? 1 : 0) ));
+		if (v4)
+			v4.serialize(buf);
+		if (v6)
+			v6.serialize(buf);
+		buf.append((uint16_t)len);
+		buf.append(data,len);
+
+		{
+			Mutex::Lock _l2(_members[canHasPeer].lock);
+			_send(canHasPeer,STATE_MESSAGE_RELAY,buf.data(),buf.size());
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * Tell every cluster member that we have this peer
+ *
+ * Our own entry in the peer affinity table doubles as the record of when
+ * the peer was last announced: nothing is sent if the previous announcement
+ * is younger than ZT_CLUSTER_HAVE_PEER_ANNOUNCE_PERIOD.
+ *
+ * @param peerId Identity of peer that we have
+ */
+void Cluster::replicateHavePeer(const Identity &peerId)
+{
+	{
+		_PeerAffinity mine(peerId.address(),_id,RR->node->now());
+		Mutex::Lock affinityLock(_peerAffinities_m);
+		std::vector<_PeerAffinity>::iterator existing(std::lower_bound(_peerAffinities.begin(),_peerAffinities.end(),mine)); // O(log(n))
+		const bool alreadyTracked = ((existing != _peerAffinities.end())&&(existing->key == mine.key));
+		if (!alreadyTracked) {
+			// First time we see this peer: record it and announce
+			_peerAffinities.push_back(mine);
+			std::sort(_peerAffinities.begin(),_peerAffinities.end()); // probably a more efficient way to insert but okay for now
+		} else if ((mine.timestamp - existing->timestamp) < ZT_CLUSTER_HAVE_PEER_ANNOUNCE_PERIOD) {
+			// Announced recently enough, nothing to do
+			return;
+		} else {
+			// Due for a re-announcement
+			existing->timestamp = mine.timestamp;
+		}
+	}
+
+	// Queue a HAVE_PEER message containing the serialized identity for each member
+	Buffer<4096> announcement;
+	peerId.serialize(announcement,false);
+
+	Mutex::Lock memberIdsLock(_memberIds_m);
+	for(std::vector<uint16_t>::size_type k=0;k<_memberIds.size();++k) {
+		const uint16_t target = _memberIds[k];
+		Mutex::Lock memberLock(_members[target].lock);
+		_send(target,STATE_MESSAGE_HAVE_PEER,announcement.data(),announcement.size());
+	}
+}
 
 void Cluster::replicateMulticastLike(uint64_t nwid,const Address &peerAddress,const MulticastGroup &group)
 {
+	// Payload layout: network ID, peer address, group MAC, group ADI
+	Buffer<4096> like;
+	like.append((uint64_t)nwid);
+	peerAddress.appendTo(like);
+	group.mac().appendTo(like);
+	like.append((uint32_t)group.adi());
+
+	// Queue the same payload for every known cluster member
+	Mutex::Lock memberIdsLock(_memberIds_m);
+	for(std::vector<uint16_t>::size_type k=0;k<_memberIds.size();++k) {
+		const uint16_t target = _memberIds[k];
+		Mutex::Lock memberLock(_members[target].lock);
+		_send(target,STATE_MESSAGE_MULTICAST_LIKE,like.data(),like.size());
+	}
 }
 
 void Cluster::replicateCertificateOfNetworkMembership(const CertificateOfMembership &com)
 {
+	Buffer<4096> buf;
+	com.serialize(buf);
+	{
+		Mutex::Lock _l(_memberIds_m);
+		for(std::vector<uint16_t>::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) {
+			Mutex::Lock _l2(_members[*mid].lock);
+			_send(*mid,STATE_MESSAGE_COM,buf.data(),buf.size());
+		}
+	}
 }
 
 void Cluster::doPeriodicTasks()
 {
-	// Go ahead and flush whenever possible right now
+	const uint64_t now = RR->node->now();
+
 	{
 		Mutex::Lock _l(_memberIds_m);
 		for(std::vector<uint16_t>::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) {
 			Mutex::Lock _l2(_members[*mid].lock);
-			_flush(*mid);
+
+			if ((now - _members[*mid].lastAnnouncedAliveTo) >= ((ZT_CLUSTER_TIMEOUT / 2) - 1000)) {
+				Buffer<2048> alive;
+				alive.append((uint16_t)ZEROTIER_ONE_VERSION_MAJOR);
+				alive.append((uint16_t)ZEROTIER_ONE_VERSION_MINOR);
+				alive.append((uint16_t)ZEROTIER_ONE_VERSION_REVISION);
+				alive.append((uint8_t)ZT_PROTO_VERSION);
+				if (_addressToLocationFunction) {
+					alive.append((int32_t)_x);
+					alive.append((int32_t)_y);
+					alive.append((int32_t)_z);
+				} else {
+					alive.append((int32_t)0);
+					alive.append((int32_t)0);
+					alive.append((int32_t)0);
+				}
+				alive.append((uint64_t)now);
+				alive.append((uint64_t)0); // TODO: compute and send load average
+				alive.append((uint64_t)0); // unused/reserved flags
+				alive.append((uint8_t)_zeroTierPhysicalEndpoints.size());
+				for(std::vector<InetAddress>::const_iterator pe(_zeroTierPhysicalEndpoints.begin());pe!=_zeroTierPhysicalEndpoints.end();++pe)
+					pe->serialize(alive);
+				_send(*mid,STATE_MESSAGE_ALIVE,alive.data(),alive.size());
+				_members[*mid].lastAnnouncedAliveTo = now;
+			}
+
+			_flush(*mid); // does nothing if nothing to flush
 		}
 	}
 }
 
 void Cluster::addMember(uint16_t memberId)
 {
+	if (memberId >= ZT_CLUSTER_MAX_MEMBERS)
+		return;
+
 	Mutex::Lock _l2(_members[memberId].lock);
 
-	Mutex::Lock _l(_memberIds_m);
-	_memberIds.push_back(memberId);
-	std::sort(_memberIds.begin(),_memberIds.end());
+	{
+		Mutex::Lock _l(_memberIds_m);
+		if (std::find(_memberIds.begin(),_memberIds.end(),memberId) != _memberIds.end())
+			return;
+		_memberIds.push_back(memberId);
+		std::sort(_memberIds.begin(),_memberIds.end());
+	}
+
+	_members[memberId].clear();
 
 	// Generate this member's message key from the master and its ID
 	uint16_t stmp[ZT_SHA512_DIGEST_LEN / sizeof(uint16_t)];
@@ -346,27 +535,107 @@ void Cluster::addMember(uint16_t memberId)
 	_members[memberId].q.append(iv,16);
 	_members[memberId].q.addSize(8); // room for MAC
 	_members[memberId].q.append((uint16_t)_id);
+	_members[memberId].q.append((uint16_t)memberId);
 }
 
-void Cluster::_send(uint16_t memberId,const void *msg,unsigned int len)
+void Cluster::removeMember(uint16_t memberId)
 {
-	_Member &m = _members[memberId];
-	// assumes m.lock is locked!
-	for(;;) {
-		if ((m.q.size() + len) > ZT_CLUSTER_MAX_MESSAGE_LENGTH)
-			_flush(memberId);
-		else {
-			m.q.append(msg,len);
-			break;
+	Mutex::Lock _l(_memberIds_m);
+	std::vector<uint16_t> newMemberIds;
+	for(std::vector<uint16_t>::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) {
+		if (*mid != memberId)
+			newMemberIds.push_back(*mid);
+	}
+	_memberIds = newMemberIds;
+}
+
+/**
+ * Try to hand a peer off to the cluster member closest to it
+ *
+ * The peer's location is obtained from the address-to-location function.
+ * Every member that is alive, has announced a non-zero location and at
+ * least one physical endpoint is considered, and the one with the smallest
+ * distance to the peer wins. Without offload a member must be closer than
+ * we are; with offload any qualifying member is accepted. The peer is then
+ * sent VERB_RENDEZVOUS packets pointing at the winner's endpoints.
+ *
+ * @param peerAddress ZeroTier address of the peer
+ * @param peerPhysicalAddress Physical address the peer is reachable at
+ * @param offload If true, redirect even when no member is closer than we are
+ * @return True if redirect packets were sent, false if the peer stays put
+ */
+bool Cluster::redirectPeer(const Address &peerAddress,const InetAddress &peerPhysicalAddress,bool offload)
+{
+	if (!peerPhysicalAddress) // sanity check
+		return false;
+	if (_addressToLocationFunction) {
+		// Pick based on location if it can be determined
+		int px = 0,py = 0,pz = 0;
+		if (_addressToLocationFunction(_addressToLocationFunctionArg,reinterpret_cast<const struct sockaddr_storage *>(&peerPhysicalAddress),&px,&py,&pz) == 0) {
+			// No geo-info so no change
+			return false;
 		}
+
+		// Find member closest to this peer
+		const uint64_t now = RR->node->now();
+		std::vector<InetAddress> best; // initial "best" is for peer to stay put
+		const double currentDistance = _dist3d(_x,_y,_z,px,py,pz);
+		double bestDistance = (offload ? 2147483648.0 : currentDistance);
+		unsigned int bestMember = _id;
+		{
+			Mutex::Lock _l(_memberIds_m);
+			for(std::vector<uint16_t>::const_iterator mid(_memberIds.begin());mid!=_memberIds.end();++mid) {
+				_Member &m = _members[*mid];
+				Mutex::Lock _ml(m.lock);
+
+				// Consider member if it's alive and has sent us a location and one or more physical endpoints to send peers to
+				if ( ((now - m.lastReceivedAliveAnnouncement) < ZT_CLUSTER_TIMEOUT) && ((m.x != 0)||(m.y != 0)||(m.z != 0)) && (m.zeroTierPhysicalEndpoints.size() > 0) ) {
+					double mdist = _dist3d(m.x,m.y,m.z,px,py,pz);
+					if (mdist < bestDistance) {
+						// Remember the new minimum so later members must beat it, not just the starting threshold
+						bestDistance = mdist;
+						bestMember = *mid;
+						best = m.zeroTierPhysicalEndpoints;
+					}
+				}
+			}
+		}
+
+		if (best.size() > 0) {
+			TRACE("peer %s is at [%d,%d,%d], distance to us is %f, sending to %u instead for better distance %f",peerAddress.toString().c_str(),px,py,pz,currentDistance,bestMember,bestDistance);
+
+			/* if (peer->remoteVersionProtocol() >= 5) {
+				// If it's a newer peer send VERB_PUSH_DIRECT_PATHS which is more idiomatic
+			} else { */
+				// Otherwise send VERB_RENDEZVOUS for ourselves, which will trick peers into trying other endpoints for us even if they're too old for PUSH_DIRECT_PATHS
+				for(std::vector<InetAddress>::const_iterator a(best.begin());a!=best.end();++a) {
+					if ((a->ss_family == AF_INET)||(a->ss_family == AF_INET6)) {
+						Packet outp(peerAddress,RR->identity.address(),Packet::VERB_RENDEZVOUS);
+						outp.append((uint8_t)0); // no flags
+						RR->identity.address().appendTo(outp); // HACK: rendezvous with ourselves! with really old peers this will only work if I'm a root server!
+						outp.append((uint16_t)a->port());
+						if (a->ss_family == AF_INET) {
+							outp.append((uint8_t)4);
+							outp.append(a->rawIpData(),4);
+						} else {
+							outp.append((uint8_t)16);
+							outp.append(a->rawIpData(),16);
+						}
+						RR->sw->send(outp,true,0);
+					}
+				}
+			//}
+
+			return true;
+		} else {
+			TRACE("peer %s is at [%d,%d,%d], distance to us is %f and this seems to be the best",peerAddress.toString().c_str(),px,py,pz,currentDistance);
+			return false;
+		}
+	} else {
+		// TODO: pick based on load if no location info?
+		return false;
 	}
 }
 
+/**
+ * Queue one state message for a cluster member
+ *
+ * The message is framed as a 16-bit length (type byte plus payload), the
+ * one-byte message type, then the payload. If the framed message would push
+ * the member's queue past ZT_CLUSTER_MAX_MESSAGE_LENGTH the queue is flushed
+ * first. The caller must already hold the member's lock.
+ *
+ * @param memberId Destination cluster member ID
+ * @param type State message type
+ * @param msg Message payload
+ * @param len Length of payload in bytes
+ */
+void Cluster::_send(uint16_t memberId,StateMessageType type,const void *msg,unsigned int len)
+{
+	// assumes _members[memberId].lock is locked!
+	_Member &member = _members[memberId];
+
+	const bool wouldOverflow = ((member.q.size() + len + 3) > ZT_CLUSTER_MAX_MESSAGE_LENGTH);
+	if (wouldOverflow)
+		_flush(memberId);
+
+	member.q.append((uint16_t)(len + 1)); // length covers the type byte and the payload
+	member.q.append((uint8_t)type);
+	member.q.append(msg,len);
+}
+
 void Cluster::_flush(uint16_t memberId)
 {
 	_Member &m = _members[memberId];
 	// assumes m.lock is locked!
-	if (m.q.size() > 26) { // 16-byte IV + 8-byte MAC + 2-byte cluster member ID (latter two bytes are inside crypto envelope)
+	if (m.q.size() > (24 + 2 + 2)) { // 16-byte IV + 8-byte MAC + 2 byte from-member-ID + 2 byte to-member-ID
 		// Create key from member's key and IV
 		char keytmp[32];
 		memcpy(keytmp,m.key,32);
@@ -389,7 +658,7 @@ void Cluster::_flush(uint16_t memberId)
 		memcpy(m.q.field(16,8),mac,8);
 
 		// Send!
-		_sendFunction(_arg,memberId,m.q.data(),m.q.size());
+		_sendFunction(_sendFunctionArg,memberId,m.q.data(),m.q.size());
 
 		// Prepare for more
 		m.q.clear();
@@ -397,7 +666,8 @@ void Cluster::_flush(uint16_t memberId)
 		Utils::getSecureRandom(iv,16);
 		m.q.append(iv,16);
 		m.q.addSize(8); // room for MAC
-		m.q.append((uint16_t)_id);
+		m.q.append((uint16_t)_id); // from member ID
+		m.q.append((uint16_t)memberId); // to member ID
 	}
 }
 

+ 67 - 66
node/Cluster.hpp

@@ -34,43 +34,38 @@
 #include <algorithm>
 
 #include "Constants.hpp"
+#include "../include/ZeroTierOne.h"
 #include "Address.hpp"
 #include "InetAddress.hpp"
 #include "SHA512.hpp"
 #include "Utils.hpp"
 #include "Buffer.hpp"
 #include "Mutex.hpp"
+#include "SharedPtr.hpp"
+#include "Hashtable.hpp"
 
 /**
  * Timeout for cluster members being considered "alive"
  */
-#define ZT_CLUSTER_TIMEOUT ZT_PEER_ACTIVITY_TIMEOUT
+#define ZT_CLUSTER_TIMEOUT 30000
 
 /**
- * Maximum cluster message length in bytes
- *
- * Cluster nodes speak via TCP, with data encapsulated into individually
- * encrypted and authenticated messages. The maximum message size is
- * 65535 (0xffff) since the TCP stream uses 16-bit message size headers
- * (and this is a reasonable chunk size anyway).
- */
-#define ZT_CLUSTER_MAX_MESSAGE_LENGTH 65535
-
-/**
- * Maximum number of physical addresses we will cache for a cluster member
+ * How often should we announce that we have a peer?
  */
-#define ZT_CLUSTER_MEMBER_MAX_PHYSICAL_ADDRS 8
+#define ZT_CLUSTER_HAVE_PEER_ANNOUNCE_PERIOD 60000
 
 /**
- * How frequently should doPeriodicTasks() be ideally called? (ms)
+ * Desired period between doPeriodicTasks() in milliseconds
  */
-#define ZT_CLUSTER_PERIODIC_TASK_DEADLINE 10
+#define ZT_CLUSTER_PERIODIC_TASK_PERIOD 50
 
 namespace ZeroTier {
 
 class RuntimeEnvironment;
 class CertificateOfMembership;
 class MulticastGroup;
+class Peer;
+class Identity;
 
 /**
  * Multi-homing cluster state replication and packet relaying
@@ -95,22 +90,6 @@ class MulticastGroup;
 class Cluster
 {
 public:
-	/**
-	 * Which distance algorithm is this cluster using?
-	 */
-	enum DistanceAlgorithm
-	{
-		/**
-		 * Simple linear distance in three dimensions
-		 */
-		DISTANCE_SIMPLE = 0,
-
-		/**
-		 * Haversine formula using X,Y as lat,long and ignoring Z
-		 */
-		DISTANCE_HAVERSINE = 1
-	};
-
 	/**
 	 * State message types
 	 */
@@ -184,25 +163,18 @@ public:
 
 	/**
 	 * Construct a new cluster
-	 *
-	 * @param renv Runtime environment
-	 * @param id This member's ID in the cluster
-	 * @param da Distance algorithm this cluster uses to compute distance and hand off peers
-	 * @param x My X
-	 * @param y My Y
-	 * @param z My Z
-	 * @param sendFunction Function to call to send messages to other cluster members
-	 * @param arg First argument to sendFunction
 	 */
 	Cluster(
 		const RuntimeEnvironment *renv,
 		uint16_t id,
-		DistanceAlgorithm da,
+		const std::vector<InetAddress> &zeroTierPhysicalEndpoints,
 		int32_t x,
 		int32_t y,
 		int32_t z,
-		void (*sendFunction)(void *,uint16_t,const void *,unsigned int),
-		void *arg);
+		void (*sendFunction)(void *,unsigned int,const void *,unsigned int),
+		void *sendFunctionArg,
+		int (*addressToLocationFunction)(void *,const struct sockaddr_storage *,int *,int *,int *),
+		void *addressToLocationFunctionArg);
 
 	~Cluster();
 
@@ -219,12 +191,23 @@ public:
 	 */
 	void handleIncomingStateMessage(const void *msg,unsigned int len);
 
+	/**
+	 * Send this packet via another node in this cluster if another node has this peer
+	 *
+	 * @param fromPeerAddress Source peer address if known; pass a null (default-constructed) Address for fragments
+	 * @param toPeerAddress Destination peer address
+	 * @param data Packet or packet fragment data
+	 * @param len Length of packet or fragment
+	 * @return True if this data was sent via another cluster member, false if none have this peer
+	 */
+	bool sendViaCluster(const Address &fromPeerAddress,const Address &toPeerAddress,const void *data,unsigned int len);
+
 	/**
 	 * Advertise to the cluster that we have this peer
 	 *
-	 * @param peerAddress Peer address that we have
+	 * @param peerId Identity of peer that we have
 	 */
-	void replicateHavePeer(const Address &peerAddress);
+	void replicateHavePeer(const Identity &peerId);
 
 	/**
 	 * Advertise a multicast LIKE to the cluster
@@ -243,7 +226,7 @@ public:
 	void replicateCertificateOfNetworkMembership(const CertificateOfMembership &com);
 
 	/**
-	 * Call every ~ZT_CLUSTER_PERIODIC_TASK_DEADLINE milliseconds.
+	 * Call every ~ZT_CLUSTER_PERIODIC_TASK_PERIOD milliseconds.
 	 */
 	void doPeriodicTasks();
 
@@ -254,52 +237,70 @@ public:
 	 */
 	void addMember(uint16_t memberId);
 
+	/**
+	 * Remove a member ID from this cluster
+	 *
+	 * @param memberId Member ID to remove
+	 */
+	void removeMember(uint16_t memberId);
+
+	/**
+	 * Redirect this peer to a better cluster member if needed
+	 *
+	 * @param peerAddress Peer to (possibly) redirect
+	 * @param peerPhysicalAddress Physical address of peer's current best path (where packet was most recently received or getBestPath()->address())
+	 * @param offload Always redirect if possible -- can be used to offload peers during shutdown
+	 * @return True if peer was redirected
+	 */
+	bool redirectPeer(const Address &peerAddress,const InetAddress &peerPhysicalAddress,bool offload);
+
 private:
-	void _send(uint16_t memberId,const void *msg,unsigned int len);
+	void _send(uint16_t memberId,StateMessageType type,const void *msg,unsigned int len);
 	void _flush(uint16_t memberId);
 
 	// These are initialized in the constructor and remain static
 	uint16_t _masterSecret[ZT_SHA512_DIGEST_LEN / sizeof(uint16_t)];
 	unsigned char _key[ZT_PEER_SECRET_KEY_LENGTH];
 	const RuntimeEnvironment *RR;
-	void (*_sendFunction)(void *,uint16_t,const void *,unsigned int);
-	void *_arg;
+	void (*_sendFunction)(void *,unsigned int,const void *,unsigned int);
+	void *_sendFunctionArg;
+	int (*_addressToLocationFunction)(void *,const struct sockaddr_storage *,int *,int *,int *);
+	void *_addressToLocationFunctionArg;
 	const int32_t _x;
 	const int32_t _y;
 	const int32_t _z;
-	const DistanceAlgorithm _da;
 	const uint16_t _id;
+	const std::vector<InetAddress> _zeroTierPhysicalEndpoints;
 
 	struct _Member
 	{
 		unsigned char key[ZT_PEER_SECRET_KEY_LENGTH];
 
-		uint64_t lastReceivedFrom;
 		uint64_t lastReceivedAliveAnnouncement;
-		uint64_t lastSentTo;
 		uint64_t lastAnnouncedAliveTo;
 
 		uint64_t load;
 		int32_t x,y,z;
 
-		InetAddress physicalAddresses[ZT_CLUSTER_MEMBER_MAX_PHYSICAL_ADDRS];
-		unsigned int physicalAddressCount;
+		std::vector<InetAddress> zeroTierPhysicalEndpoints;
 
 		Buffer<ZT_CLUSTER_MAX_MESSAGE_LENGTH> q;
 
 		Mutex lock;
 
-		_Member() :
-			lastReceivedFrom(0),
-			lastReceivedAliveAnnouncement(0),
-			lastSentTo(0),
-			lastAnnouncedAliveTo(0),
-			load(0),
-			x(0),
-			y(0),
-			z(0),
-			physicalAddressCount(0) {}
-
+		inline void clear()
+		{
+			lastReceivedAliveAnnouncement = 0;
+			lastAnnouncedAliveTo = 0;
+			load = 0;
+			x = 0;
+			y = 0;
+			z = 0;
+			zeroTierPhysicalEndpoints.clear();
+			q.clear();
+		}
+
+		_Member() { this->clear(); }
 		~_Member() { Utils::burn(key,sizeof(key)); }
 	};
 
@@ -308,7 +309,7 @@ private:
 	std::vector<uint16_t> _memberIds;
 	Mutex _memberIds_m;
 
-	// Record tracking which members have which peers and how recently they claimed this
+	// Record tracking which members have which peers and how recently they claimed this -- also used to track our last claimed time
 	struct _PeerAffinity
 	{
 		_PeerAffinity(const Address &a,uint16_t mid,uint64_t ts) :

+ 36 - 7
node/IncomingPacket.cpp

@@ -43,6 +43,7 @@
 #include "Salsa20.hpp"
 #include "SHA512.hpp"
 #include "World.hpp"
+#include "Cluster.hpp"
 
 namespace ZeroTier {
 
@@ -272,7 +273,6 @@ bool IncomingPacket::_doHELLO(const RuntimeEnvironment *RR)
 				TRACE("rejected HELLO from %s(%s): packet failed authentication",id.address().toString().c_str(),_remoteAddress.toString().c_str());
 				return true;
 			}
-
 			peer = RR->topology->addPeer(newPeer);
 
 			// Continue at // VALID
@@ -406,6 +406,10 @@ bool IncomingPacket::_doOK(const RuntimeEnvironment *RR,const SharedPtr<Peer> &p
 					CertificateOfMembership com;
 					offset += com.deserialize(*this,ZT_PROTO_VERB_MULTICAST_FRAME__OK__IDX_COM_AND_GATHER_RESULTS);
 					peer->validateAndSetNetworkMembershipCertificate(RR,nwid,com);
+#ifdef ZT_ENABLE_CLUSTER
+					if (RR->cluster)
+						RR->cluster->replicateCertificateOfNetworkMembership(com);
+#endif
 				}
 
 				if ((flags & 0x02) != 0) {
@@ -533,6 +537,10 @@ bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,const SharedPtr<P
 					CertificateOfMembership com;
 					comLen = com.deserialize(*this,ZT_PROTO_VERB_EXT_FRAME_IDX_COM);
 					peer->validateAndSetNetworkMembershipCertificate(RR,network->id(),com);
+#ifdef ZT_ENABLE_CLUSTER
+					if (RR->cluster)
+						RR->cluster->replicateCertificateOfNetworkMembership(com);
+#endif
 				}
 
 				if (!network->isAllowed(peer)) {
@@ -613,8 +621,15 @@ bool IncomingPacket::_doMULTICAST_LIKE(const RuntimeEnvironment *RR,const Shared
 		const uint64_t now = RR->node->now();
 
 		// Iterate through 18-byte network,MAC,ADI tuples
-		for(unsigned int ptr=ZT_PACKET_IDX_PAYLOAD;ptr<size();ptr+=18)
-			RR->mc->add(now,at<uint64_t>(ptr),MulticastGroup(MAC(field(ptr + 8,6),6),at<uint32_t>(ptr + 14)),peer->address());
+		for(unsigned int ptr=ZT_PACKET_IDX_PAYLOAD;ptr<size();ptr+=18) {
+			const uint64_t nwid(at<uint64_t>(ptr)); // network ID is the full 8-byte field of the tuple; a 32-bit local would truncate it
+			const MulticastGroup group(MAC(field(ptr + 8,6),6),at<uint32_t>(ptr + 14));
+			RR->mc->add(now,nwid,group,peer->address());
+#ifdef ZT_ENABLE_CLUSTER
+			if (RR->cluster)
+				RR->cluster->replicateMulticastLike(nwid,peer->address(),group);
+#endif
+		}
 
 		peer->received(RR,_localAddress,_remoteAddress,hops(),packetId(),Packet::VERB_MULTICAST_LIKE,0,Packet::VERB_NOP);
 	} catch ( ... ) {
@@ -632,6 +647,10 @@ bool IncomingPacket::_doNETWORK_MEMBERSHIP_CERTIFICATE(const RuntimeEnvironment
 		while (ptr < size()) {
 			ptr += com.deserialize(*this,ptr);
 			peer->validateAndSetNetworkMembershipCertificate(RR,com.networkId(),com);
+#ifdef ZT_ENABLE_CLUSTER
+			if (RR->cluster)
+				RR->cluster->replicateCertificateOfNetworkMembership(com);
+#endif
 		}
 
 		peer->received(RR,_localAddress,_remoteAddress,hops(),packetId(),Packet::VERB_NETWORK_MEMBERSHIP_CERTIFICATE,0,Packet::VERB_NOP);
@@ -787,6 +806,10 @@ bool IncomingPacket::_doMULTICAST_FRAME(const RuntimeEnvironment *RR,const Share
 				CertificateOfMembership com;
 				offset += com.deserialize(*this,ZT_PROTO_VERB_MULTICAST_FRAME_IDX_COM);
 				peer->validateAndSetNetworkMembershipCertificate(RR,nwid,com);
+#ifdef ZT_ENABLE_CLUSTER
+				if (RR->cluster)
+					RR->cluster->replicateCertificateOfNetworkMembership(com);
+#endif
 			}
 
 			// Check membership after we've read any included COM, since
@@ -871,6 +894,8 @@ bool IncomingPacket::_doPUSH_DIRECT_PATHS(const RuntimeEnvironment *RR,const Sha
 		}
 		peer->setLastDirectPathPushReceived(now);
 
+		const RemotePath *currentBest = peer->getBestPath(now);
+
 		unsigned int count = at<uint16_t>(ZT_PACKET_IDX_PAYLOAD);
 		unsigned int ptr = ZT_PACKET_IDX_PAYLOAD + 2;
 		unsigned int v4Count = 0,v6Count = 0;
@@ -889,16 +914,20 @@ bool IncomingPacket::_doPUSH_DIRECT_PATHS(const RuntimeEnvironment *RR,const Sha
 					InetAddress a(field(ptr,4),4,at<uint16_t>(ptr + 4));
 					if ( ((flags & 0x01) == 0) && (Path::isAddressValidForPath(a)) ) {
 						TRACE("attempting to contact %s at pushed direct path %s",peer->address().toString().c_str(),a.toString().c_str());
-						if (v4Count++ < ZT_PUSH_DIRECT_PATHS_MAX_ENDPOINTS_PER_TYPE)
-							peer->attemptToContactAt(RR,_localAddress,a,RR->node->now());
+						if (v4Count++ < ZT_PUSH_DIRECT_PATHS_MAX_ENDPOINTS_PER_TYPE) {
+							if ((!currentBest)||(currentBest->address() != a))
+								peer->attemptToContactAt(RR,_localAddress,a,RR->node->now());
+						}
 					}
 				}	break;
 				case 6: {
 					InetAddress a(field(ptr,16),16,at<uint16_t>(ptr + 16));
 					if ( ((flags & 0x01) == 0) && (Path::isAddressValidForPath(a)) ) {
 						TRACE("attempting to contact %s at pushed direct path %s",peer->address().toString().c_str(),a.toString().c_str());
-						if (v6Count++ < ZT_PUSH_DIRECT_PATHS_MAX_ENDPOINTS_PER_TYPE)
-							peer->attemptToContactAt(RR,_localAddress,a,RR->node->now());
+						if (v6Count++ < ZT_PUSH_DIRECT_PATHS_MAX_ENDPOINTS_PER_TYPE) {
+							if ((!currentBest)||(currentBest->address() != a))
+								peer->attemptToContactAt(RR,_localAddress,a,RR->node->now());
+						}
 					}
 				}	break;
 			}

+ 149 - 6
node/Node.cpp

@@ -46,6 +46,7 @@
 #include "Address.hpp"
 #include "Identity.hpp"
 #include "SelfAwareness.hpp"
+#include "Cluster.hpp"
 
 const struct sockaddr_storage ZT_SOCKADDR_NULL = {0};
 
@@ -135,6 +136,9 @@ Node::~Node()
 	delete RR->antiRec;
 	delete RR->mc;
 	delete RR->sw;
+#ifdef ZT_ENABLE_CLUSTER
+	delete RR->cluster;
+#endif
 }
 
 ZT_ResultCode Node::processWirePacket(
@@ -329,7 +333,18 @@ ZT_ResultCode Node::processBackgroundTasks(uint64_t now,volatile uint64_t *nextB
 	}
 
 	try {
-		*nextBackgroundTaskDeadline = now + (uint64_t)std::max(std::min(timeUntilNextPingCheck,RR->sw->doTimerTasks(now)),(unsigned long)ZT_CORE_TIMER_TASK_GRANULARITY);
+#ifdef ZT_ENABLE_CLUSTER
+		// If clustering is enabled we have to call cluster->doPeriodicTasks() very often, so we override normal timer deadline behavior
+		if (RR->cluster) {
+			RR->sw->doTimerTasks(now);
+			RR->cluster->doPeriodicTasks();
+			*nextBackgroundTaskDeadline = now + ZT_CLUSTER_PERIODIC_TASK_PERIOD; // this is really short so just tick at this rate
+		} else {
+#endif
+			*nextBackgroundTaskDeadline = now + (uint64_t)std::max(std::min(timeUntilNextPingCheck,RR->sw->doTimerTasks(now)),(unsigned long)ZT_CORE_TIMER_TASK_GRANULARITY);
+#ifdef ZT_ENABLE_CLUSTER
+		}
+#endif
 	} catch ( ... ) {
 		return ZT_RESULT_FATAL_ERROR_INTERNAL;
 	}
@@ -554,6 +569,62 @@ void Node::circuitTestEnd(ZT_CircuitTest *test)
 	}
 }
 
+ZT_ResultCode Node::clusterInit(
+	unsigned int myId,
+	const struct sockaddr_storage *zeroTierPhysicalEndpoints,
+	unsigned int numZeroTierPhysicalEndpoints,
+	int x,
+	int y,
+	int z,
+	void (*sendFunction)(void *,unsigned int,const void *,unsigned int),
+	void *sendFunctionArg,
+	int (*addressToLocationFunction)(void *,const struct sockaddr_storage *,int *,int *,int *),
+	void *addressToLocationFunctionArg)
+{
+#ifdef ZT_ENABLE_CLUSTER
+	if (RR->cluster)
+		return ZT_RESULT_ERROR_BAD_PARAMETER;
+
+	std::vector<InetAddress> eps;
+	for(unsigned int i=0;i<numZeroTierPhysicalEndpoints;++i)
+		eps.push_back(InetAddress(zeroTierPhysicalEndpoints[i]));
+	std::sort(eps.begin(),eps.end());
+	RR->cluster = new Cluster(RR,myId,eps,x,y,z,sendFunction,sendFunctionArg,addressToLocationFunction,addressToLocationFunctionArg);
+
+	return ZT_RESULT_OK;
+#else
+	return ZT_RESULT_ERROR_UNSUPPORTED_OPERATION;
+#endif
+}
+
+ZT_ResultCode Node::clusterAddMember(unsigned int memberId)
+{
+#ifdef ZT_ENABLE_CLUSTER
+	if (!RR->cluster)
+		return ZT_RESULT_ERROR_BAD_PARAMETER;
+	RR->cluster->addMember((uint16_t)memberId);
+	return ZT_RESULT_OK;
+#else
+	return ZT_RESULT_ERROR_UNSUPPORTED_OPERATION;
+#endif
+}
+
+void Node::clusterRemoveMember(unsigned int memberId)
+{
+#ifdef ZT_ENABLE_CLUSTER
+	if (RR->cluster)
+		RR->cluster->removeMember((uint16_t)memberId);
+#endif
+}
+
+void Node::clusterHandleIncomingMessage(const void *msg,unsigned int len)
+{
+#ifdef ZT_ENABLE_CLUSTER
+	if (RR->cluster)
+		RR->cluster->handleIncomingStateMessage(msg,len);
+#endif
+}
+
 /****************************************************************************/
 /* Node methods used only within node/                                      */
 /****************************************************************************/
@@ -806,6 +877,22 @@ void ZT_Node_freeQueryResult(ZT_Node *node,void *qr)
 	} catch ( ... ) {}
 }
 
+int ZT_Node_addLocalInterfaceAddress(ZT_Node *node,const struct sockaddr_storage *addr,int metric, enum ZT_LocalInterfaceAddressTrust trust)
+{
+	try {
+		return reinterpret_cast<ZeroTier::Node *>(node)->addLocalInterfaceAddress(addr,metric,trust);
+	} catch ( ... ) {
+		return 0;
+	}
+}
+
+void ZT_Node_clearLocalInterfaceAddresses(ZT_Node *node)
+{
+	try {
+		reinterpret_cast<ZeroTier::Node *>(node)->clearLocalInterfaceAddresses();
+	} catch ( ... ) {}
+}
+
 void ZT_Node_setNetconfMaster(ZT_Node *node,void *networkControllerInstance)
 {
 	try {
@@ -829,19 +916,75 @@ void ZT_Node_circuitTestEnd(ZT_Node *node,ZT_CircuitTest *test)
 	} catch ( ... ) {}
 }
 
-int ZT_Node_addLocalInterfaceAddress(ZT_Node *node,const struct sockaddr_storage *addr,int metric, enum ZT_LocalInterfaceAddressTrust trust)
+enum ZT_ResultCode ZT_Node_clusterInit(
+	ZT_Node *node,
+	unsigned int myId,
+	const struct sockaddr_storage *zeroTierPhysicalEndpoints,
+	unsigned int numZeroTierPhysicalEndpoints,
+	int x,
+	int y,
+	int z,
+	void (*sendFunction)(void *,unsigned int,const void *,unsigned int),
+	void *sendFunctionArg,
+	int (*addressToLocationFunction)(void *,const struct sockaddr_storage *,int *,int *,int *),
+	void *addressToLocationFunctionArg)
 {
 	try {
-		return reinterpret_cast<ZeroTier::Node *>(node)->addLocalInterfaceAddress(addr,metric,trust);
+		return reinterpret_cast<ZeroTier::Node *>(node)->clusterInit(myId,zeroTierPhysicalEndpoints,numZeroTierPhysicalEndpoints,x,y,z,sendFunction,sendFunctionArg,addressToLocationFunction,addressToLocationFunctionArg);
 	} catch ( ... ) {
-		return 0;
+		return ZT_RESULT_FATAL_ERROR_INTERNAL;
 	}
 }
 
-void ZT_Node_clearLocalInterfaceAddresses(ZT_Node *node)
+/**
+ * Add a member to this cluster
+ *
+ * Calling this without having called clusterInit() adds nothing and returns an error.
+ *
+ * @param node Node instance
+ * @param memberId Member ID (must be less than or equal to ZT_CLUSTER_MAX_MEMBERS)
+ * @return OK or error if clustering is disabled, ID invalid, etc.
+ */
+enum ZT_ResultCode ZT_Node_clusterAddMember(ZT_Node *node,unsigned int memberId)
 {
 	try {
-		reinterpret_cast<ZeroTier::Node *>(node)->clearLocalInterfaceAddresses();
+		return reinterpret_cast<ZeroTier::Node *>(node)->clusterAddMember(memberId);
+	} catch ( ... ) {
+		return ZT_RESULT_FATAL_ERROR_INTERNAL;
+	}
+}
+
+/**
+ * Remove a member from this cluster
+ *
+ * Calling this without having called clusterInit() will do nothing.
+ *
+ * @param node Node instance
+ * @param memberId Member ID to remove (nothing happens if not present)
+ */
+void ZT_Node_clusterRemoveMember(ZT_Node *node,unsigned int memberId)
+{
+	try {
+		reinterpret_cast<ZeroTier::Node *>(node)->clusterRemoveMember(memberId);
+	} catch ( ... ) {}
+}
+
+/**
+ * Handle an incoming cluster state message
+ *
+ * The message itself contains cluster member IDs, and invalid or badly
+ * addressed messages will be silently discarded.
+ *
+ * Calling this without having called clusterInit() will do nothing.
+ *
+ * @param node Node instance
+ * @param msg Cluster message
+ * @param len Length of cluster message
+ */
+void ZT_Node_clusterHandleIncomingMessage(ZT_Node *node,const void *msg,unsigned int len)
+{
+	try {
+		reinterpret_cast<ZeroTier::Node *>(node)->clusterHandleIncomingMessage(msg,len);
 	} catch ( ... ) {}
 }
 

+ 14 - 0
node/Node.hpp

@@ -110,6 +110,20 @@ public:
 	void setNetconfMaster(void *networkControllerInstance);
 	ZT_ResultCode circuitTestBegin(ZT_CircuitTest *test,void (*reportCallback)(ZT_Node *,ZT_CircuitTest *,const ZT_CircuitTestReport *));
 	void circuitTestEnd(ZT_CircuitTest *test);
+	ZT_ResultCode clusterInit(
+		unsigned int myId,
+		const struct sockaddr_storage *zeroTierPhysicalEndpoints,
+		unsigned int numZeroTierPhysicalEndpoints,
+		int x,
+		int y,
+		int z,
+		void (*sendFunction)(void *,unsigned int,const void *,unsigned int),
+		void *sendFunctionArg,
+		int (*addressToLocationFunction)(void *,const struct sockaddr_storage *,int *,int *,int *),
+		void *addressToLocationFunctionArg);
+	ZT_ResultCode clusterAddMember(unsigned int memberId);
+	void clusterRemoveMember(unsigned int memberId);
+	void clusterHandleIncomingMessage(const void *msg,unsigned int len);
 
 	// Internal functions ------------------------------------------------------
 

+ 16 - 3
node/Peer.cpp

@@ -34,6 +34,7 @@
 #include "Network.hpp"
 #include "AntiRecursion.hpp"
 #include "SelfAwareness.hpp"
+#include "Cluster.hpp"
 
 #include <algorithm>
 
@@ -81,6 +82,7 @@ void Peer::received(
 {
 	const uint64_t now = RR->node->now();
 	bool needMulticastGroupAnnounce = false;
+	bool pathIsConfirmed = false;
 
 	{
 		Mutex::Lock _l(_lock);
@@ -88,8 +90,6 @@ void Peer::received(
 		_lastReceive = now;
 
 		if (!hops) {
-			bool pathIsConfirmed = false;
-
 			/* Learn new paths from direct (hops == 0) packets */
 			{
 				unsigned int np = _numPaths;
@@ -107,7 +107,6 @@ void Peer::received(
 						// Learn paths if they've been confirmed via a HELLO or an ECHO
 						RemotePath *slot = (RemotePath *)0;
 						if (np < ZT_MAX_PEER_NETWORK_PATHS) {
-							// Add new path
 							slot = &(_paths[np++]);
 						} else {
 							uint64_t slotLRmin = 0xffffffffffffffffULL;
@@ -154,6 +153,14 @@ void Peer::received(
 			_lastMulticastFrame = now;
 	}
 
+#ifdef ZT_ENABLE_CLUSTER
+	if ((pathIsConfirmed)&&(RR->cluster)) {
+		// Either shuttle this peer off somewhere else or report to other members that we have it
+		if (!RR->cluster->redirectPeer(_id.address(),remoteAddr,false))
+			RR->cluster->replicateHavePeer(_id);
+	}
+#endif
+
 	if (needMulticastGroupAnnounce) {
 		const std::vector< SharedPtr<Network> > networks(RR->node->allNetworks());
 		for(std::vector< SharedPtr<Network> >::const_iterator n(networks.begin());n!=networks.end();++n)
@@ -213,6 +220,12 @@ bool Peer::doPingAndKeepalive(const RuntimeEnvironment *RR,uint64_t now,int inet
 
 void Peer::pushDirectPaths(const RuntimeEnvironment *RR,RemotePath *path,uint64_t now,bool force)
 {
+#ifdef ZT_ENABLE_CLUSTER
+	// Cluster mode disables normal PUSH_DIRECT_PATHS in favor of cluster-based peer redirection
+	if (RR->cluster)
+		return;
+#endif
+
 	Mutex::Lock _l(_lock);
 
 	if (((now - _lastDirectPathPushSent) >= ZT_DIRECT_PATH_PUSH_INTERVAL)||(force)) {

+ 16 - 8
node/RuntimeEnvironment.hpp

@@ -43,6 +43,7 @@ class Multicaster;
 class AntiRecursion;
 class NetworkController;
 class SelfAwareness;
+class Cluster;
 
 /**
  * Holds global state for an instance of ZeroTier::Node
@@ -51,14 +52,17 @@ class RuntimeEnvironment
 {
 public:
 	RuntimeEnvironment(Node *n) :
-		node(n),
-		identity(),
-		localNetworkController((NetworkController *)0),
-		sw((Switch *)0),
-		mc((Multicaster *)0),
-		antiRec((AntiRecursion *)0),
-		topology((Topology *)0),
-		sa((SelfAwareness *)0)
+		node(n)
+		,identity()
+		,localNetworkController((NetworkController *)0)
+		,sw((Switch *)0)
+		,mc((Multicaster *)0)
+		,antiRec((AntiRecursion *)0)
+		,topology((Topology *)0)
+		,sa((SelfAwareness *)0)
+#ifdef ZT_ENABLE_CLUSTER
+		,cluster((Cluster *)0)
+#endif
 	{
 	}
 
@@ -86,6 +90,10 @@ public:
 	AntiRecursion *antiRec;
 	Topology *topology;
 	SelfAwareness *sa;
+
+#ifdef ZT_ENABLE_CLUSTER
+	Cluster *cluster;
+#endif
 };
 
 } // namespace ZeroTier

+ 11 - 1
node/Switch.cpp

@@ -45,6 +45,7 @@
 #include "AntiRecursion.hpp"
 #include "SelfAwareness.hpp"
 #include "Packet.hpp"
+#include "Cluster.hpp"
 
 namespace ZeroTier {
 
@@ -567,6 +568,11 @@ void Switch::_handleRemotePacketFragment(const InetAddress &localAddr,const Inet
 			// It wouldn't hurt anything, just redundant and unnecessary.
 			SharedPtr<Peer> relayTo = RR->topology->getPeer(destination);
 			if ((!relayTo)||(!relayTo->send(RR,fragment.data(),fragment.size(),RR->node->now()))) {
+#ifdef ZT_ENABLE_CLUSTER
+				if ((RR->cluster)&&(RR->cluster->sendViaCluster(Address(),destination,fragment.data(),fragment.size())))
+					return; // sent by way of another member of this cluster
+#endif
+
 				// Don't know peer or no direct path -- so relay via root server
 				relayTo = RR->topology->getBestRoot();
 				if (relayTo)
@@ -642,7 +648,11 @@ void Switch::_handleRemotePacketHead(const InetAddress &localAddr,const InetAddr
 			if ((relayTo)&&((relayTo->send(RR,packet->data(),packet->size(),RR->node->now())))) {
 				unite(source,destination,false);
 			} else {
-				// Don't know peer or no direct path -- so relay via root server
+#ifdef ZT_ENABLE_CLUSTER
+				if ((RR->cluster)&&(RR->cluster->sendViaCluster(source,destination,packet->data(),packet->size())))
+					return; // sent by way of another member of this cluster
+#endif
+
 				relayTo = RR->topology->getBestRoot(&source,1,true);
 				if (relayTo)
 					relayTo->send(RR,packet->data(),packet->size(),RR->node->now());

+ 15 - 12
node/Topology.cpp

@@ -122,18 +122,22 @@ Topology::~Topology()
 SharedPtr<Peer> Topology::addPeer(const SharedPtr<Peer> &peer)
 {
 	if (peer->address() == RR->identity.address()) {
-		TRACE("BUG: addNewPeer() caught and ignored attempt to add peer for self");
+		TRACE("BUG: addPeer() caught and ignored attempt to add peer for self");
 		throw std::logic_error("cannot add peer for self");
 	}
 
-	const uint64_t now = RR->node->now();
-	Mutex::Lock _l(_lock);
-
-	SharedPtr<Peer> &p = _peers.set(peer->address(),peer);
-	p->use(now);
-	saveIdentity(p->identity());
+	SharedPtr<Peer> np;
+	{
+		Mutex::Lock _l(_lock);
+		SharedPtr<Peer> &hp = _peers[peer->address()];
+		if (!hp)
+			hp = peer;
+		np = hp;
+	}
+	np->use(RR->node->now());
+	saveIdentity(np->identity());
 
-	return p;
+	return np;
 }
 
 SharedPtr<Peer> Topology::getPeer(const Address &zta)
@@ -143,13 +147,12 @@ SharedPtr<Peer> Topology::getPeer(const Address &zta)
 		return SharedPtr<Peer>();
 	}
 
-	const uint64_t now = RR->node->now();
 	Mutex::Lock _l(_lock);
 
 	SharedPtr<Peer> &ap = _peers[zta];
 
 	if (ap) {
-		ap->use(now);
+		ap->use(RR->node->now());
 		return ap;
 	}
 
@@ -157,13 +160,13 @@ SharedPtr<Peer> Topology::getPeer(const Address &zta)
 	if (id) {
 		try {
 			ap = SharedPtr<Peer>(new Peer(RR->identity,id));
-			ap->use(now);
+			ap->use(RR->node->now());
 			return ap;
 		} catch ( ... ) {} // invalid identity?
 	}
 
+	// If we get here it means we read an invalid cache identity or had some other error
 	_peers.erase(zta);
-
 	return SharedPtr<Peer>();
 }