Pārlūkot izejas kodu

Make Peer bootstrap field an Endpoint even though we only support InetAddress right now, and a bunch more stuff.

Adam Ierymenko 5 gadi atpakaļ
vecāks
revīzija
53b7c8f725

+ 1 - 1
go/pkg/zerotier/endpoint.go → go/attic/endpoint.go

@@ -1,4 +1,4 @@
-package zerotier
+package attic
 
 import (
 	"encoding/binary"

+ 1 - 1
go/pkg/zerotier/locator.go → go/attic/locator.go

@@ -1,4 +1,4 @@
-package zerotier
+package attic
 
 import (
 	"encoding/binary"

+ 0 - 13
go/pkg/zerotier/root.go

@@ -1,13 +0,0 @@
-package zerotier
-
-// Root nodes are long-lived nodes at stable physical addresses that can help locate other nodes.
-type Root struct {
-	// Identity is this root's address and public key(s).
-	Identity Identity `json:"identity"`
-
-	// Locator describes the endpoints where this root may be found.
-	Locator  Locator `json:"locator,omitempty"`
-
-	// Bootstrap is an array of IP/port locations where this root might be found if a locator is not known.
-	Bootstrap []InetAddress `json:"bootstrap,omitempty"`
-}

+ 36 - 15
include/ZeroTierCore.h

@@ -1032,6 +1032,11 @@ typedef struct
 	 */
 	const ZT_Identity *identity;
 
+	/**
+	 * Hash of identity public key(s)
+	 */
+	uint8_t identityHash[48];
+
 	/**
 	 * Remote major version or -1 if not known
 	 */
@@ -1057,6 +1062,15 @@ typedef struct
 	 */
 	enum ZT_PeerRole role;
 
+	/**
+	 * Bootstrap address
+	 *
+	 * This is a memo-ized recently valid address that can be saved and used
+	 * to attempt rapid reconnection with this peer. If the ss_family field
+	 * is 0 this field is considered null/empty.
+	 */
+	struct sockaddr_storage bootstrap;
+
 	/**
 	 * Number of paths (size of paths[])
 	 */
@@ -1090,7 +1104,7 @@ enum ZT_StateObjectType
 	/**
 	 * Public address and public key
 	 *
-	 * Object ID: this node's address if known, or 0 if unknown (first query)
+	 * Object ID: (unused)
 	 * Canonical path: <HOME>/identity.public
    * Persistence: required
 	 */
@@ -1099,7 +1113,7 @@ enum ZT_StateObjectType
 	/**
 	 * Full identity with secret key
 	 *
-	 * Object ID: this node's address if known, or 0 if unknown (first query)
+	 * Object ID: (unused)
 	 * Canonical path: <HOME>/identity.secret
    * Persistence: required, should be stored with restricted permissions e.g. mode 0600 on *nix
 	 */
@@ -1108,7 +1122,7 @@ enum ZT_StateObjectType
 	/**
 	 * This node's locator
 	 *
-	 * Object ID: 0
+	 * Object ID: (unused)
 	 * Canonical path: <HOME>/locator
 	 * Persistence: optional
 	 */
@@ -1126,7 +1140,7 @@ enum ZT_StateObjectType
 	/**
 	 * Network configuration
 	 *
-	 * Object ID: peer address
+	 * Object ID: network ID
 	 * Canonical path: <HOME>/networks.d/<NETWORKID>.conf (16-digit hex ID)
 	 * Persistence: required if network memberships should persist
 	 */
@@ -1135,7 +1149,7 @@ enum ZT_StateObjectType
 	/**
 	 * Root list
 	 *
-	 * Object ID: 0
+	 * Object ID: (unused)
 	 * Canonical path: <HOME>/roots
 	 * Persistence: required if root settings should persist
 	 */
@@ -1235,8 +1249,10 @@ typedef void (*ZT_StatePutFunction)(
  * Callback for retrieving stored state information
  *
  * This function should return the number of bytes actually stored to the
- * buffer or -1 if the state object was not found or the buffer was too
- * small to store it.
+ * buffer or -1 if the state object was not found. The buffer itself should
+ * be set to point to the data, and the last result parameter must point to
+ * a function that will be used to free the buffer when the core is done
+ * with it. This is very often just a pointer to free().
  */
 typedef int (*ZT_StateGetFunction)(
 	ZT_Node *,                             /* Node */
@@ -1244,8 +1260,8 @@ typedef int (*ZT_StateGetFunction)(
 	void *,                                /* Thread ptr */
 	enum ZT_StateObjectType,               /* State object type */
 	const uint64_t [2],                    /* State object ID (if applicable) */
-	void *,                                /* Buffer to store state object data */
-	unsigned int);                         /* Length of data buffer in bytes */
+	void **,                               /* Result parameter: data */
+	void (**)(void *));                    /* Result parameter: data free function */
 
 /**
  * Function to send a ZeroTier packet out over the physical wire (L2/L3)
@@ -1288,8 +1304,9 @@ typedef int (*ZT_WirePacketSendFunction)(
  *  (1) Node
  *  (2) User pointer
  *  (3) ZeroTier address or 0 for none/any
- *  (4) Local socket or -1 if unknown
- *  (5) Remote address
+ *  (4) Full identity or NULL for none/any
+ *  (5) Local socket or -1 if unknown
+ *  (6) Remote address
  *
  * This function must return nonzero (true) if the path should be used.
  *
@@ -1307,6 +1324,7 @@ typedef int (*ZT_PathCheckFunction)(
 	void *,                           /* User ptr */
 	void *,                           /* Thread ptr */
 	uint64_t,                         /* ZeroTier address */
+	const ZT_Identity *,              /* Full identity of node */
 	int64_t,                          /* Local socket or -1 if unknown */
 	const struct sockaddr_storage *); /* Remote address */
 
@@ -1563,10 +1581,12 @@ ZT_SDK_API enum ZT_ResultCode ZT_Node_multicastUnsubscribe(ZT_Node *node,uint64_
  * Add a root server (has no effect if already added)
  *
  * @param node Node instance
- * @param identity Identity of this root server in string format
+ * @param tptr Thread pointer to pass to functions/callbacks resulting from this call
+ * @param identity Identity of this root server
+ * @param bootstrap Optional bootstrap address for initial contact
  * @return OK (0) or error code if a fatal error condition has occurred
  */
-ZT_SDK_API enum ZT_ResultCode ZT_Node_addRoot(ZT_Node *node,const char *identity);
+ZT_SDK_API enum ZT_ResultCode ZT_Node_addRoot(ZT_Node *node,void *tptr,const ZT_Identity *identity,const struct sockaddr_storage *bootstrap);
 
 /**
  * Remove a root server
@@ -1575,10 +1595,11 @@ ZT_SDK_API enum ZT_ResultCode ZT_Node_addRoot(ZT_Node *node,const char *identity
  * from communicating with it or close active paths to it.
  *
  * @param node Node instance
- * @param identity Identity in string format
+ * @param tptr Thread pointer to pass to functions/callbacks resulting from this call
+ * @param identity Identity to remove
  * @return OK (0) or error code if a fatal error condition has occurred
  */
-ZT_SDK_API enum ZT_ResultCode ZT_Node_removeRoot(ZT_Node *node,const char *identity);
+ZT_SDK_API enum ZT_ResultCode ZT_Node_removeRoot(ZT_Node *node,void *tptr,const ZT_Identity *identity);
 
 /**
  * Get this node's 40-bit ZeroTier address

+ 386 - 5
node/AES.cpp

@@ -192,11 +192,11 @@ typedef unsigned uint128_t __attribute__((mode(TI)));
 
 static inline void s_bmul64(const uint64_t x,const uint64_t y,uint64_t &r_high,uint64_t &r_low)
 {
-	static uint128_t m1 = (uint128_t)0x2108421084210842ULL << 64 | 0x1084210842108421ULL;
-	static uint128_t m2 = (uint128_t)0x4210842108421084ULL << 64 | 0x2108421084210842ULL;
-	static uint128_t m3 = (uint128_t)0x8421084210842108ULL << 64 | 0x4210842108421084ULL;
-	static uint128_t m4 = (uint128_t)0x0842108421084210ULL << 64 | 0x8421084210842108ULL;
-	static uint128_t m5 = (uint128_t)0x1084210842108421ULL << 64 | 0x0842108421084210ULL;
+	static uint128_t m1 = (uint128_t)0x2108421084210842ULL << 64U | 0x1084210842108421ULL;
+	static uint128_t m2 = (uint128_t)0x4210842108421084ULL << 64U | 0x2108421084210842ULL;
+	static uint128_t m3 = (uint128_t)0x8421084210842108ULL << 64U | 0x4210842108421084ULL;
+	static uint128_t m4 = (uint128_t)0x0842108421084210ULL << 64U | 0x8421084210842108ULL;
+	static uint128_t m5 = (uint128_t)0x1084210842108421ULL << 64U | 0x0842108421084210ULL;
 	uint128_t x1 = x & m1;
 	uint128_t y1 = y & m1;
 	uint128_t x2 = x & m2;
@@ -378,4 +378,385 @@ void AES::_gmacSW(const uint8_t iv[12],const uint8_t *in,unsigned int len,uint8_
 #endif
 }
 
+#ifdef ZT_AES_AESNI
+
+static ZT_ALWAYS_INLINE inline __m128i _mult_block_aesni(__m128i shuf,__m128i h,__m128i y)
+{
+	y = _mm_shuffle_epi8(y,shuf);
+	__m128i t1 = _mm_clmulepi64_si128(h,y,0x00);
+	__m128i t2 = _mm_clmulepi64_si128(h,y,0x01);
+	__m128i t3 = _mm_clmulepi64_si128(h,y,0x10);
+	__m128i t4 = _mm_clmulepi64_si128(h,y,0x11);
+	t2 = _mm_xor_si128(t2,t3);
+	t3 = _mm_slli_si128(t2,8);
+	t2 = _mm_srli_si128(t2,8);
+	t1 = _mm_xor_si128(t1,t3);
+	t4 = _mm_xor_si128(t4,t2);
+	__m128i t5 = _mm_srli_epi32(t1,31);
+	t1 = _mm_slli_epi32(t1,1);
+	__m128i t6 = _mm_srli_epi32(t4,31);
+	t4 = _mm_slli_epi32(t4,1);
+	t3 = _mm_srli_si128(t5,12);
+	t6 = _mm_slli_si128(t6,4);
+	t5 = _mm_slli_si128(t5,4);
+	t1 = _mm_or_si128(t1,t5);
+	t4 = _mm_or_si128(t4,t6);
+	t4 = _mm_or_si128(t4,t3);
+	t5 = _mm_slli_epi32(t1,31);
+	t6 = _mm_slli_epi32(t1,30);
+	t3 = _mm_slli_epi32(t1,25);
+	t5 = _mm_xor_si128(t5,t6);
+	t5 = _mm_xor_si128(t5,t3);
+	t6 = _mm_srli_si128(t5,4);
+	t4 = _mm_xor_si128(t4,t6);
+	t5 = _mm_slli_si128(t5,12);
+	t1 = _mm_xor_si128(t1,t5);
+	t4 = _mm_xor_si128(t4,t1);
+	t5 = _mm_srli_epi32(t1,1);
+	t2 = _mm_srli_epi32(t1,2);
+	t3 = _mm_srli_epi32(t1,7);
+	t4 = _mm_xor_si128(t4,t2);
+	t4 = _mm_xor_si128(t4,t3);
+	t4 = _mm_xor_si128(t4,t5);
+	return _mm_shuffle_epi8(t4,shuf);
+}
+static ZT_ALWAYS_INLINE __m128i _ghash_aesni(__m128i shuf,__m128i h,__m128i y,__m128i x)
+{
+	return _mult_block_aesni(shuf,h,_mm_xor_si128(y,x));
+}
+
+static ZT_ALWAYS_INLINE __m128i _init256_1_aesni(__m128i a,__m128i b)
+{
+	__m128i x,y;
+	b = _mm_shuffle_epi32(b,0xff);
+	y = _mm_slli_si128(a,0x04);
+	x = _mm_xor_si128(a,y);
+	y = _mm_slli_si128(y,0x04);
+	x = _mm_xor_si128(x,y);
+	y = _mm_slli_si128(y,0x04);
+	x = _mm_xor_si128(x,y);
+	x = _mm_xor_si128(x,b);
+	return x;
+}
+static ZT_ALWAYS_INLINE __m128i _init256_2_aesni(__m128i a,__m128i b)
+{
+	__m128i x,y,z;
+	y = _mm_aeskeygenassist_si128(a,0x00);
+	z = _mm_shuffle_epi32(y,0xaa);
+	y = _mm_slli_si128(b,0x04);
+	x = _mm_xor_si128(b,y);
+	y = _mm_slli_si128(y,0x04);
+	x = _mm_xor_si128(x,y);
+	y = _mm_slli_si128(y,0x04);
+	x = _mm_xor_si128(x,y);
+	x = _mm_xor_si128(x,z);
+	return x;
+}
+
+void AES::_init_aesni(const uint8_t key[32])
+{
+	__m128i t1,t2;
+	_k.ni.k[0] = t1 = _mm_loadu_si128((const __m128i *)key);
+	_k.ni.k[1] = t2 = _mm_loadu_si128((const __m128i *)(key+16));
+	_k.ni.k[2] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x01));
+	_k.ni.k[3] = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[4] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x02));
+	_k.ni.k[5] = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[6] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x04));
+	_k.ni.k[7] = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[8] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x08));
+	_k.ni.k[9] = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[10] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x10));
+	_k.ni.k[11] = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[12] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x20));
+	_k.ni.k[13] = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[14] = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x40));
+
+	__m128i h = _mm_xor_si128(_mm_setzero_si128(),_k.ni.k[0]);
+	h = _mm_aesenc_si128(h,_k.ni.k[1]);
+	h = _mm_aesenc_si128(h,_k.ni.k[2]);
+	h = _mm_aesenc_si128(h,_k.ni.k[3]);
+	h = _mm_aesenc_si128(h,_k.ni.k[4]);
+	h = _mm_aesenc_si128(h,_k.ni.k[5]);
+	h = _mm_aesenc_si128(h,_k.ni.k[6]);
+	h = _mm_aesenc_si128(h,_k.ni.k[7]);
+	h = _mm_aesenc_si128(h,_k.ni.k[8]);
+	h = _mm_aesenc_si128(h,_k.ni.k[9]);
+	h = _mm_aesenc_si128(h,_k.ni.k[10]);
+	h = _mm_aesenc_si128(h,_k.ni.k[11]);
+	h = _mm_aesenc_si128(h,_k.ni.k[12]);
+	h = _mm_aesenc_si128(h,_k.ni.k[13]);
+	h = _mm_aesenclast_si128(h,_k.ni.k[14]);
+
+	const __m128i shuf = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+	__m128i hswap = _mm_shuffle_epi8(h,shuf);
+	__m128i hh = _mult_block_aesni(shuf,hswap,h);
+	__m128i hhh = _mult_block_aesni(shuf,hswap,hh);
+	__m128i hhhh = _mult_block_aesni(shuf,hswap,hhh);
+	_k.ni.h = hswap;
+	_k.ni.hh = _mm_shuffle_epi8(hh,shuf);
+	_k.ni.hhh = _mm_shuffle_epi8(hhh,shuf);
+	_k.ni.hhhh = _mm_shuffle_epi8(hhhh,shuf);
+}
+
+void AES::_gmac_aesni(const uint8_t iv[12],const uint8_t *in,const unsigned int len,uint8_t out[16]) const
+{
+	const __m128i *const ab = (const __m128i *)in;
+	const unsigned int blocks = len / 16;
+	const unsigned int pblocks = blocks - (blocks % 4);
+	const unsigned int rem = len % 16;
+
+	const __m128i shuf = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+	__m128i y = _mm_setzero_si128();
+	unsigned int i = 0;
+	for (;i<pblocks;i+=4) {
+		__m128i d1 = _mm_shuffle_epi8(_mm_xor_si128(y,_mm_loadu_si128(ab + i + 0)),shuf);
+		__m128i d2 = _mm_shuffle_epi8(_mm_loadu_si128(ab + i + 1),shuf);
+		__m128i d3 = _mm_shuffle_epi8(_mm_loadu_si128(ab + i + 2),shuf);
+		__m128i d4 = _mm_shuffle_epi8(_mm_loadu_si128(ab + i + 3),shuf);
+		_mm_prefetch(ab + i + 4,_MM_HINT_T0);
+		__m128i t0 = _mm_clmulepi64_si128(_k.ni.hhhh,d1,0x00);
+		__m128i t1 = _mm_clmulepi64_si128(_k.ni.hhh,d2,0x00);
+		__m128i t2 = _mm_clmulepi64_si128(_k.ni.hh,d3,0x00);
+		__m128i t3 = _mm_clmulepi64_si128(_k.ni.h,d4,0x00);
+		__m128i t8 = _mm_xor_si128(t0,t1);
+		t8 = _mm_xor_si128(t8,t2);
+		t8 = _mm_xor_si128(t8,t3);
+		__m128i t4 = _mm_clmulepi64_si128(_k.ni.hhhh,d1,0x11);
+		__m128i t5 = _mm_clmulepi64_si128(_k.ni.hhh,d2,0x11);
+		__m128i t6 = _mm_clmulepi64_si128(_k.ni.hh,d3,0x11);
+		__m128i t7 = _mm_clmulepi64_si128(_k.ni.h,d4,0x11);
+		__m128i t9 = _mm_xor_si128(t4,t5);
+		t9 = _mm_xor_si128(t9,t6);
+		t9 = _mm_xor_si128(t9,t7);
+		t0 = _mm_shuffle_epi32(_k.ni.hhhh,78);
+		t4 = _mm_shuffle_epi32(d1,78);
+		t0 = _mm_xor_si128(t0,_k.ni.hhhh);
+		t4 = _mm_xor_si128(t4,d1);
+		t1 = _mm_shuffle_epi32(_k.ni.hhh,78);
+		t5 = _mm_shuffle_epi32(d2,78);
+		t1 = _mm_xor_si128(t1,_k.ni.hhh);
+		t5 = _mm_xor_si128(t5,d2);
+		t2 = _mm_shuffle_epi32(_k.ni.hh,78);
+		t6 = _mm_shuffle_epi32(d3,78);
+		t2 = _mm_xor_si128(t2,_k.ni.hh);
+		t6 = _mm_xor_si128(t6,d3);
+		t3 = _mm_shuffle_epi32(_k.ni.h,78);
+		t7 = _mm_shuffle_epi32(d4,78);
+		t3 = _mm_xor_si128(t3,_k.ni.h);
+		t7 = _mm_xor_si128(t7,d4);
+		t0 = _mm_clmulepi64_si128(t0,t4,0x00);
+		t1 = _mm_clmulepi64_si128(t1,t5,0x00);
+		t2 = _mm_clmulepi64_si128(t2,t6,0x00);
+		t3 = _mm_clmulepi64_si128(t3,t7,0x00);
+		t0 = _mm_xor_si128(t0,t8);
+		t0 = _mm_xor_si128(t0,t9);
+		t0 = _mm_xor_si128(t1,t0);
+		t0 = _mm_xor_si128(t2,t0);
+		t0 = _mm_xor_si128(t3,t0);
+		t4 = _mm_slli_si128(t0,8);
+		t0 = _mm_srli_si128(t0,8);
+		t3 = _mm_xor_si128(t4,t8);
+		t6 = _mm_xor_si128(t0,t9);
+		t7 = _mm_srli_epi32(t3,31);
+		t8 = _mm_srli_epi32(t6,31);
+		t3 = _mm_slli_epi32(t3,1);
+		t6 = _mm_slli_epi32(t6,1);
+		t9 = _mm_srli_si128(t7,12);
+		t8 = _mm_slli_si128(t8,4);
+		t7 = _mm_slli_si128(t7,4);
+		t3 = _mm_or_si128(t3,t7);
+		t6 = _mm_or_si128(t6,t8);
+		t6 = _mm_or_si128(t6,t9);
+		t7 = _mm_slli_epi32(t3,31);
+		t8 = _mm_slli_epi32(t3,30);
+		t9 = _mm_slli_epi32(t3,25);
+		t7 = _mm_xor_si128(t7,t8);
+		t7 = _mm_xor_si128(t7,t9);
+		t8 = _mm_srli_si128(t7,4);
+		t7 = _mm_slli_si128(t7,12);
+		t3 = _mm_xor_si128(t3,t7);
+		t2 = _mm_srli_epi32(t3,1);
+		t4 = _mm_srli_epi32(t3,2);
+		t5 = _mm_srli_epi32(t3,7);
+		t2 = _mm_xor_si128(t2,t4);
+		t2 = _mm_xor_si128(t2,t5);
+		t2 = _mm_xor_si128(t2,t8);
+		t3 = _mm_xor_si128(t3,t2);
+		t6 = _mm_xor_si128(t6,t3);
+		y = _mm_shuffle_epi8(t6,shuf);
+	}
+
+	for (;i<blocks;++i)
+		y = _ghash_aesni(shuf,_k.ni.h,y,_mm_loadu_si128(ab + i));
+
+	if (rem) {
+		__m128i last = _mm_setzero_si128();
+		memcpy(&last,ab + blocks,rem);
+		y = _ghash_aesni(shuf,_k.ni.h,y,last);
+	}
+
+	y = _ghash_aesni(shuf,_k.ni.h,y,_mm_set_epi64((__m64)0LL,(__m64)Utils::hton((uint64_t)len * (uint64_t)8)));
+
+	__m128i t = _mm_xor_si128(_mm_set_epi32(0x01000000,(int)*((const uint32_t *)(iv+8)),(int)*((const uint32_t *)(iv+4)),(int)*((const uint32_t *)(iv))),_k.ni.k[0]);
+	t = _mm_aesenc_si128(t,_k.ni.k[1]);
+	t = _mm_aesenc_si128(t,_k.ni.k[2]);
+	t = _mm_aesenc_si128(t,_k.ni.k[3]);
+	t = _mm_aesenc_si128(t,_k.ni.k[4]);
+	t = _mm_aesenc_si128(t,_k.ni.k[5]);
+	t = _mm_aesenc_si128(t,_k.ni.k[6]);
+	t = _mm_aesenc_si128(t,_k.ni.k[7]);
+	t = _mm_aesenc_si128(t,_k.ni.k[8]);
+	t = _mm_aesenc_si128(t,_k.ni.k[9]);
+	t = _mm_aesenc_si128(t,_k.ni.k[10]);
+	t = _mm_aesenc_si128(t,_k.ni.k[11]);
+	t = _mm_aesenc_si128(t,_k.ni.k[12]);
+	t = _mm_aesenc_si128(t,_k.ni.k[13]);
+	t = _mm_aesenclast_si128(t,_k.ni.k[14]);
+	_mm_storeu_si128((__m128i *)out,_mm_xor_si128(y,t));
+}
+
+#define ZT_AES_CTR_AESNI_ROUND(kk) c0 = _mm_aesenc_si128(c0,kk); c1 = _mm_aesenc_si128(c1,kk); c2 = _mm_aesenc_si128(c2,kk); c3 = _mm_aesenc_si128(c3,kk);
+void AES::_ctr_aesni(const __m128i key[14],const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out)
+{
+	/* Because our CTR supports full 128-bit nonces, we must do a full 128-bit (big-endian)
+	 * increment to be compatible with canonical NIST-certified CTR implementations. That's
+	 * because it's possible to have a lot of bit saturation in the least significant 64
+	 * bits, which could on rare occasions actually cause a 64-bit wrap. If this happened
+	 * without carry it would result in incompatibility and quietly dropped packets. The
+	 * probability is low, so this would be a one in billions packet loss bug that would
+	 * probably never be found.
+	 *
+	 * This crazy code does a branch-free 128-bit increment by adding a one or a zero to
+	 * the most significant 64 bits of the 128-bit vector based on whether the add we want
+	 * to do to the least significant 64 bits would overflow. This can be computed by
+	 * NOTing those bits and comparing with what we want to add, since NOT is the same
+	 * as subtracting from uint64_max. This generates branch-free ASM on x64 with most
+	 * good compilers. */
+	__m128i swap128 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+	__m128i ctr0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)iv),swap128);
+	uint64_t notctr0msq = ~((uint64_t)_mm_extract_epi64(ctr0,0));
+	__m128i ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 1ULL),1LL)),swap128);
+	__m128i ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 2ULL),2LL)),swap128);
+	__m128i ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 3ULL),3LL)),swap128);
+	ctr0 = _mm_shuffle_epi8(ctr0,swap128);
+
+	__m128i k0 = key[0];
+	__m128i k1 = key[1];
+
+	while (len >= 64) {
+		__m128i ka = key[2];
+		__m128i c0 = _mm_xor_si128(ctr0,k0);
+		__m128i c1 = _mm_xor_si128(ctr1,k0);
+		__m128i c2 = _mm_xor_si128(ctr2,k0);
+		__m128i c3 = _mm_xor_si128(ctr3,k0);
+		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
+		notctr0msq = ~((uint64_t)_mm_extract_epi64(ctr0,0));
+		ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 5ULL),5LL)),swap128);
+		ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 6ULL),6LL)),swap128);
+		ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 7ULL),7LL)),swap128);
+		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 4ULL),4LL)),swap128);
+		__m128i kb = key[3];
+		ZT_AES_CTR_AESNI_ROUND(k1);
+		__m128i kc = key[4];
+		ZT_AES_CTR_AESNI_ROUND(ka);
+		__m128i kd = key[5];
+		ZT_AES_CTR_AESNI_ROUND(kb);
+		ka = key[6];
+		ZT_AES_CTR_AESNI_ROUND(kc);
+		kb = key[7];
+		ZT_AES_CTR_AESNI_ROUND(kd);
+		kc = key[8];
+		ZT_AES_CTR_AESNI_ROUND(ka);
+		kd = key[9];
+		ZT_AES_CTR_AESNI_ROUND(kb);
+		ka = key[10];
+		ZT_AES_CTR_AESNI_ROUND(kc);
+		kb = key[11];
+		ZT_AES_CTR_AESNI_ROUND(kd);
+		kc = key[12];
+		ZT_AES_CTR_AESNI_ROUND(ka);
+		kd = key[13];
+		ZT_AES_CTR_AESNI_ROUND(kb);
+		ka = key[14];
+		ZT_AES_CTR_AESNI_ROUND(kc);
+		ZT_AES_CTR_AESNI_ROUND(kd);
+		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,ka)));
+		_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,ka)));
+		_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,ka)));
+		_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,ka)));
+		in += 64;
+		out += 64;
+		len -= 64;
+	}
+
+	__m128i k2 = key[2];
+	__m128i k3 = key[3];
+	__m128i k4 = key[4];
+	__m128i k5 = key[5];
+	__m128i k6 = key[6];
+	__m128i k7 = key[7];
+
+	while (len >= 16) {
+		__m128i c0 = _mm_xor_si128(ctr0,k0);
+		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
+		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 1ULL),1LL)),swap128);
+		c0 = _mm_aesenc_si128(c0,k1);
+		c0 = _mm_aesenc_si128(c0,k2);
+		c0 = _mm_aesenc_si128(c0,k3);
+		c0 = _mm_aesenc_si128(c0,k4);
+		c0 = _mm_aesenc_si128(c0,k5);
+		c0 = _mm_aesenc_si128(c0,k6);
+		__m128i ka = key[8];
+		c0 = _mm_aesenc_si128(c0,k7);
+		__m128i kb = key[9];
+		c0 = _mm_aesenc_si128(c0,ka);
+		ka = key[10];
+		c0 = _mm_aesenc_si128(c0,kb);
+		kb = key[11];
+		c0 = _mm_aesenc_si128(c0,ka);
+		ka = key[12];
+		c0 = _mm_aesenc_si128(c0,kb);
+		kb = key[13];
+		c0 = _mm_aesenc_si128(c0,ka);
+		ka = key[14];
+		c0 = _mm_aesenc_si128(c0,kb);
+		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,ka)));
+		in += 16;
+		out += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		__m128i c0 = _mm_xor_si128(ctr0,k0);
+		k0 = key[8];
+		c0 = _mm_aesenc_si128(c0,k1);
+		c0 = _mm_aesenc_si128(c0,k2);
+		k1 = key[9];
+		c0 = _mm_aesenc_si128(c0,k3);
+		c0 = _mm_aesenc_si128(c0,k4);
+		k2 = key[10];
+		c0 = _mm_aesenc_si128(c0,k5);
+		c0 = _mm_aesenc_si128(c0,k6);
+		k3 = key[11];
+		c0 = _mm_aesenc_si128(c0,k7);
+		c0 = _mm_aesenc_si128(c0,k0);
+		k0 = key[12];
+		c0 = _mm_aesenc_si128(c0,k1);
+		c0 = _mm_aesenc_si128(c0,k2);
+		k1 = key[13];
+		c0 = _mm_aesenc_si128(c0,k3);
+		c0 = _mm_aesenc_si128(c0,k0);
+		k2 = key[14];
+		c0 = _mm_aesenc_si128(c0,k1);
+		c0 = _mm_aesenclast_si128(c0,k2);
+		uint8_t tmp[16];
+		_mm_storeu_si128((__m128i *)tmp,c0);
+		for(unsigned int i=0;i<len;++i)
+			out[i] = in[i] ^ tmp[i];
+	}
+}
+
+#endif // ZT_AES_AESNI
+
 } // namespace ZeroTier

+ 8 - 372
node/AES.hpp

@@ -292,16 +292,21 @@ private:
 	/**************************************************************************/
 	union {
 #ifdef ZT_AES_ARMNEON
+		// ARM NEON key and GMAC parameters
 		struct {
 			uint32x4_t k[15];
 		} neon;
 #endif
+
 #ifdef ZT_AES_AESNI
+		// AES-NI key and GMAC parameters
 		struct {
 			__m128i k[15];
 			__m128i h,hh,hhh,hhhh;
 		} ni;
 #endif
+
+		// Software mode key and GMAC parameters
 		struct {
 			uint64_t h[2];
 			uint32_t ek[60];
@@ -393,78 +398,7 @@ private:
 #endif /*********************************************************************/
 
 #ifdef ZT_AES_AESNI /********************************************************/
-	static ZT_ALWAYS_INLINE __m128i _init256_1_aesni(__m128i a,__m128i b)
-	{
-		__m128i x,y;
-		b = _mm_shuffle_epi32(b,0xff);
-		y = _mm_slli_si128(a,0x04);
-		x = _mm_xor_si128(a,y);
-		y = _mm_slli_si128(y,0x04);
-		x = _mm_xor_si128(x,y);
-		y = _mm_slli_si128(y,0x04);
-		x = _mm_xor_si128(x,y);
-		x = _mm_xor_si128(x,b);
-		return x;
-	}
-	static ZT_ALWAYS_INLINE __m128i _init256_2_aesni(__m128i a,__m128i b)
-	{
-		__m128i x,y,z;
-		y = _mm_aeskeygenassist_si128(a,0x00);
-		z = _mm_shuffle_epi32(y,0xaa);
-		y = _mm_slli_si128(b,0x04);
-		x = _mm_xor_si128(b,y);
-		y = _mm_slli_si128(y,0x04);
-		x = _mm_xor_si128(x,y);
-		y = _mm_slli_si128(y,0x04);
-		x = _mm_xor_si128(x,y);
-		x = _mm_xor_si128(x,z);
-		return x;
-	}
-	ZT_ALWAYS_INLINE void _init_aesni(const uint8_t key[32])
-	{
-		__m128i t1,t2;
-		_k.ni.k[0] = t1 = _mm_loadu_si128((const __m128i *)key);
-		_k.ni.k[1] = t2 = _mm_loadu_si128((const __m128i *)(key+16));
-		_k.ni.k[2] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x01));
-		_k.ni.k[3] = t2 = _init256_2_aesni(t1,t2);
-		_k.ni.k[4] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x02));
-		_k.ni.k[5] = t2 = _init256_2_aesni(t1,t2);
-		_k.ni.k[6] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x04));
-		_k.ni.k[7] = t2 = _init256_2_aesni(t1,t2);
-		_k.ni.k[8] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x08));
-		_k.ni.k[9] = t2 = _init256_2_aesni(t1,t2);
-		_k.ni.k[10] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x10));
-		_k.ni.k[11] = t2 = _init256_2_aesni(t1,t2);
-		_k.ni.k[12] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x20));
-		_k.ni.k[13] = t2 = _init256_2_aesni(t1,t2);
-		_k.ni.k[14] = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x40));
-
-		__m128i h = _mm_xor_si128(_mm_setzero_si128(),_k.ni.k[0]);
-		h = _mm_aesenc_si128(h,_k.ni.k[1]);
-		h = _mm_aesenc_si128(h,_k.ni.k[2]);
-		h = _mm_aesenc_si128(h,_k.ni.k[3]);
-		h = _mm_aesenc_si128(h,_k.ni.k[4]);
-		h = _mm_aesenc_si128(h,_k.ni.k[5]);
-		h = _mm_aesenc_si128(h,_k.ni.k[6]);
-		h = _mm_aesenc_si128(h,_k.ni.k[7]);
-		h = _mm_aesenc_si128(h,_k.ni.k[8]);
-		h = _mm_aesenc_si128(h,_k.ni.k[9]);
-		h = _mm_aesenc_si128(h,_k.ni.k[10]);
-		h = _mm_aesenc_si128(h,_k.ni.k[11]);
-		h = _mm_aesenc_si128(h,_k.ni.k[12]);
-		h = _mm_aesenc_si128(h,_k.ni.k[13]);
-		h = _mm_aesenclast_si128(h,_k.ni.k[14]);
-
-		const __m128i shuf = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
-		__m128i hswap = _mm_shuffle_epi8(h,shuf);
-		__m128i hh = _mult_block_aesni(shuf,hswap,h);
-		__m128i hhh = _mult_block_aesni(shuf,hswap,hh);
-		__m128i hhhh = _mult_block_aesni(shuf,hswap,hhh);
-		_k.ni.h = hswap;
-		_k.ni.hh = _mm_shuffle_epi8(hh,shuf);
-		_k.ni.hhh = _mm_shuffle_epi8(hhh,shuf);
-		_k.ni.hhhh = _mm_shuffle_epi8(hhhh,shuf);
-	}
+	void _init_aesni(const uint8_t key[32]);
 
 	ZT_ALWAYS_INLINE void _encrypt_aesni(const void *in,void *out) const
 	{
@@ -487,306 +421,8 @@ private:
 		_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14]));
 	}
 
-	static ZT_ALWAYS_INLINE inline __m128i _mult_block_aesni(__m128i shuf,__m128i h,__m128i y)
-	{
-		y = _mm_shuffle_epi8(y,shuf);
-		__m128i t1 = _mm_clmulepi64_si128(h,y,0x00);
-		__m128i t2 = _mm_clmulepi64_si128(h,y,0x01);
-		__m128i t3 = _mm_clmulepi64_si128(h,y,0x10);
-		__m128i t4 = _mm_clmulepi64_si128(h,y,0x11);
-		t2 = _mm_xor_si128(t2,t3);
-		t3 = _mm_slli_si128(t2,8);
-		t2 = _mm_srli_si128(t2,8);
-		t1 = _mm_xor_si128(t1,t3);
-		t4 = _mm_xor_si128(t4,t2);
-		__m128i t5 = _mm_srli_epi32(t1,31);
-		t1 = _mm_slli_epi32(t1,1);
-		__m128i t6 = _mm_srli_epi32(t4,31);
-		t4 = _mm_slli_epi32(t4,1);
-		t3 = _mm_srli_si128(t5,12);
-		t6 = _mm_slli_si128(t6,4);
-		t5 = _mm_slli_si128(t5,4);
-		t1 = _mm_or_si128(t1,t5);
-		t4 = _mm_or_si128(t4,t6);
-		t4 = _mm_or_si128(t4,t3);
-		t5 = _mm_slli_epi32(t1,31);
-		t6 = _mm_slli_epi32(t1,30);
-		t3 = _mm_slli_epi32(t1,25);
-		t5 = _mm_xor_si128(t5,t6);
-		t5 = _mm_xor_si128(t5,t3);
-		t6 = _mm_srli_si128(t5,4);
-		t4 = _mm_xor_si128(t4,t6);
-		t5 = _mm_slli_si128(t5,12);
-		t1 = _mm_xor_si128(t1,t5);
-		t4 = _mm_xor_si128(t4,t1);
-		t5 = _mm_srli_epi32(t1,1);
-		t2 = _mm_srli_epi32(t1,2);
-		t3 = _mm_srli_epi32(t1,7);
-		t4 = _mm_xor_si128(t4,t2);
-		t4 = _mm_xor_si128(t4,t3);
-		t4 = _mm_xor_si128(t4,t5);
-		return _mm_shuffle_epi8(t4,shuf);
-	}
-	static inline __m128i _ghash_aesni(__m128i shuf,__m128i h,__m128i y,__m128i x) { return _mult_block_aesni(shuf,h,_mm_xor_si128(y,x)); }
-
-	ZT_ALWAYS_INLINE void _gmac_aesni(const uint8_t iv[12],const uint8_t *in,const unsigned int len,uint8_t out[16]) const
-	{
-		const __m128i *const ab = (const __m128i *)in;
-		const unsigned int blocks = len / 16;
-		const unsigned int pblocks = blocks - (blocks % 4);
-		const unsigned int rem = len % 16;
-
-		const __m128i shuf = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
-		__m128i y = _mm_setzero_si128();
-		unsigned int i = 0;
-		for (;i<pblocks;i+=4) {
-			__m128i d1 = _mm_shuffle_epi8(_mm_xor_si128(y,_mm_loadu_si128(ab + i + 0)),shuf);
-			__m128i d2 = _mm_shuffle_epi8(_mm_loadu_si128(ab + i + 1),shuf);
-			__m128i d3 = _mm_shuffle_epi8(_mm_loadu_si128(ab + i + 2),shuf);
-			__m128i d4 = _mm_shuffle_epi8(_mm_loadu_si128(ab + i + 3),shuf);
-			_mm_prefetch(ab + i + 4,_MM_HINT_T0);
-			__m128i t0 = _mm_clmulepi64_si128(_k.ni.hhhh,d1,0x00);
-			__m128i t1 = _mm_clmulepi64_si128(_k.ni.hhh,d2,0x00);
-			__m128i t2 = _mm_clmulepi64_si128(_k.ni.hh,d3,0x00);
-			__m128i t3 = _mm_clmulepi64_si128(_k.ni.h,d4,0x00);
-			__m128i t8 = _mm_xor_si128(t0,t1);
-			t8 = _mm_xor_si128(t8,t2);
-			t8 = _mm_xor_si128(t8,t3);
-			__m128i t4 = _mm_clmulepi64_si128(_k.ni.hhhh,d1,0x11);
-			__m128i t5 = _mm_clmulepi64_si128(_k.ni.hhh,d2,0x11);
-			__m128i t6 = _mm_clmulepi64_si128(_k.ni.hh,d3,0x11);
-			__m128i t7 = _mm_clmulepi64_si128(_k.ni.h,d4,0x11);
-			__m128i t9 = _mm_xor_si128(t4,t5);
-			t9 = _mm_xor_si128(t9,t6);
-			t9 = _mm_xor_si128(t9,t7);
-			t0 = _mm_shuffle_epi32(_k.ni.hhhh,78);
-			t4 = _mm_shuffle_epi32(d1,78);
-			t0 = _mm_xor_si128(t0,_k.ni.hhhh);
-			t4 = _mm_xor_si128(t4,d1);
-			t1 = _mm_shuffle_epi32(_k.ni.hhh,78);
-			t5 = _mm_shuffle_epi32(d2,78);
-			t1 = _mm_xor_si128(t1,_k.ni.hhh);
-			t5 = _mm_xor_si128(t5,d2);
-			t2 = _mm_shuffle_epi32(_k.ni.hh,78);
-			t6 = _mm_shuffle_epi32(d3,78);
-			t2 = _mm_xor_si128(t2,_k.ni.hh);
-			t6 = _mm_xor_si128(t6,d3);
-			t3 = _mm_shuffle_epi32(_k.ni.h,78);
-			t7 = _mm_shuffle_epi32(d4,78);
-			t3 = _mm_xor_si128(t3,_k.ni.h);
-			t7 = _mm_xor_si128(t7,d4);
-			t0 = _mm_clmulepi64_si128(t0,t4,0x00);
-			t1 = _mm_clmulepi64_si128(t1,t5,0x00);
-			t2 = _mm_clmulepi64_si128(t2,t6,0x00);
-			t3 = _mm_clmulepi64_si128(t3,t7,0x00);
-			t0 = _mm_xor_si128(t0,t8);
-			t0 = _mm_xor_si128(t0,t9);
-			t0 = _mm_xor_si128(t1,t0);
-			t0 = _mm_xor_si128(t2,t0);
-			t0 = _mm_xor_si128(t3,t0);
-			t4 = _mm_slli_si128(t0,8);
-			t0 = _mm_srli_si128(t0,8);
-			t3 = _mm_xor_si128(t4,t8);
-			t6 = _mm_xor_si128(t0,t9);
-			t7 = _mm_srli_epi32(t3,31);
-			t8 = _mm_srli_epi32(t6,31);
-			t3 = _mm_slli_epi32(t3,1);
-			t6 = _mm_slli_epi32(t6,1);
-			t9 = _mm_srli_si128(t7,12);
-			t8 = _mm_slli_si128(t8,4);
-			t7 = _mm_slli_si128(t7,4);
-			t3 = _mm_or_si128(t3,t7);
-			t6 = _mm_or_si128(t6,t8);
-			t6 = _mm_or_si128(t6,t9);
-			t7 = _mm_slli_epi32(t3,31);
-			t8 = _mm_slli_epi32(t3,30);
-			t9 = _mm_slli_epi32(t3,25);
-			t7 = _mm_xor_si128(t7,t8);
-			t7 = _mm_xor_si128(t7,t9);
-			t8 = _mm_srli_si128(t7,4);
-			t7 = _mm_slli_si128(t7,12);
-			t3 = _mm_xor_si128(t3,t7);
-			t2 = _mm_srli_epi32(t3,1);
-			t4 = _mm_srli_epi32(t3,2);
-			t5 = _mm_srli_epi32(t3,7);
-			t2 = _mm_xor_si128(t2,t4);
-			t2 = _mm_xor_si128(t2,t5);
-			t2 = _mm_xor_si128(t2,t8);
-			t3 = _mm_xor_si128(t3,t2);
-			t6 = _mm_xor_si128(t6,t3);
-			y = _mm_shuffle_epi8(t6,shuf);
-		}
-
-		for (;i<blocks;++i)
-			y = _ghash_aesni(shuf,_k.ni.h,y,_mm_loadu_si128(ab + i));
-
-		if (rem) {
-			__m128i last = _mm_setzero_si128();
-			memcpy(&last,ab + blocks,rem);
-			y = _ghash_aesni(shuf,_k.ni.h,y,last);
-		}
-
-		y = _ghash_aesni(shuf,_k.ni.h,y,_mm_set_epi64((__m64)0LL,(__m64)Utils::hton((uint64_t)len * (uint64_t)8)));
-
-		__m128i t = _mm_xor_si128(_mm_set_epi32(0x01000000,(int)*((const uint32_t *)(iv+8)),(int)*((const uint32_t *)(iv+4)),(int)*((const uint32_t *)(iv))),_k.ni.k[0]);
-		t = _mm_aesenc_si128(t,_k.ni.k[1]);
-		t = _mm_aesenc_si128(t,_k.ni.k[2]);
-		t = _mm_aesenc_si128(t,_k.ni.k[3]);
-		t = _mm_aesenc_si128(t,_k.ni.k[4]);
-		t = _mm_aesenc_si128(t,_k.ni.k[5]);
-		t = _mm_aesenc_si128(t,_k.ni.k[6]);
-		t = _mm_aesenc_si128(t,_k.ni.k[7]);
-		t = _mm_aesenc_si128(t,_k.ni.k[8]);
-		t = _mm_aesenc_si128(t,_k.ni.k[9]);
-		t = _mm_aesenc_si128(t,_k.ni.k[10]);
-		t = _mm_aesenc_si128(t,_k.ni.k[11]);
-		t = _mm_aesenc_si128(t,_k.ni.k[12]);
-		t = _mm_aesenc_si128(t,_k.ni.k[13]);
-		t = _mm_aesenclast_si128(t,_k.ni.k[14]);
-		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(y,t));
-	}
-
-#define ZT_AES_CTR_AESNI_ROUND(kk) c0 = _mm_aesenc_si128(c0,kk); c1 = _mm_aesenc_si128(c1,kk); c2 = _mm_aesenc_si128(c2,kk); c3 = _mm_aesenc_si128(c3,kk);
-
-	static ZT_ALWAYS_INLINE void _ctr_aesni(const __m128i key[14],const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out)
-	{
-		/* Because our CTR supports full 128-bit nonces, we must do a full 128-bit (big-endian)
-		 * increment to be compatible with canonical NIST-certified CTR implementations. That's
-		 * because it's possible to have a lot of bit saturation in the least significant 64
-		 * bits, which could on rare occasions actually cause a 64-bit wrap. If this happened
-		 * without carry it would result in incompatibility and quietly dropped packets. The
-		 * probability is low, so this would be a one in billions packet loss bug that would
-		 * probably never be found.
-		 *
-		 * This crazy code does a branch-free 128-bit increment by adding a one or a zero to
-		 * the most significant 64 bits of the 128-bit vector based on whether the add we want
-		 * to do to the least significant 64 bits would overflow. This can be computed by
-		 * NOTing those bits and comparing with what we want to add, since NOT is the same
-		 * as subtracting from uint64_max. This generates branch-free ASM on x64 with most
-		 * good compilers. */
-		__m128i swap128 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
-		__m128i ctr0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)iv),swap128);
-		uint64_t notctr0msq = ~((uint64_t)_mm_extract_epi64(ctr0,0));
-		__m128i ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 1ULL),1LL)),swap128);
-		__m128i ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 2ULL),2LL)),swap128);
-		__m128i ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 3ULL),3LL)),swap128);
-		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
-
-		__m128i k0 = key[0];
-		__m128i k1 = key[1];
-
-		while (len >= 64) {
-			__m128i ka = key[2];
-			__m128i c0 = _mm_xor_si128(ctr0,k0);
-			__m128i c1 = _mm_xor_si128(ctr1,k0);
-			__m128i c2 = _mm_xor_si128(ctr2,k0);
-			__m128i c3 = _mm_xor_si128(ctr3,k0);
-			ctr0 = _mm_shuffle_epi8(ctr0,swap128);
-			notctr0msq = ~((uint64_t)_mm_extract_epi64(ctr0,0));
-			ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 5ULL),5LL)),swap128);
-			ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 6ULL),6LL)),swap128);
-			ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 7ULL),7LL)),swap128);
-			ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 4ULL),4LL)),swap128);
-			__m128i kb = key[3];
-			ZT_AES_CTR_AESNI_ROUND(k1);
-			__m128i kc = key[4];
-			ZT_AES_CTR_AESNI_ROUND(ka);
-			__m128i kd = key[5];
-			ZT_AES_CTR_AESNI_ROUND(kb);
-			ka = key[6];
-			ZT_AES_CTR_AESNI_ROUND(kc);
-			kb = key[7];
-			ZT_AES_CTR_AESNI_ROUND(kd);
-			kc = key[8];
-			ZT_AES_CTR_AESNI_ROUND(ka);
-			kd = key[9];
-			ZT_AES_CTR_AESNI_ROUND(kb);
-			ka = key[10];
-			ZT_AES_CTR_AESNI_ROUND(kc);
-			kb = key[11];
-			ZT_AES_CTR_AESNI_ROUND(kd);
-			kc = key[12];
-			ZT_AES_CTR_AESNI_ROUND(ka);
-			kd = key[13];
-			ZT_AES_CTR_AESNI_ROUND(kb);
-			ka = key[14];
-			ZT_AES_CTR_AESNI_ROUND(kc);
-			ZT_AES_CTR_AESNI_ROUND(kd);
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,ka)));
-			_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,ka)));
-			_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,ka)));
-			_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,ka)));
-			in += 64;
-			out += 64;
-			len -= 64;
-		}
-
-		__m128i k2 = key[2];
-		__m128i k3 = key[3];
-		__m128i k4 = key[4];
-		__m128i k5 = key[5];
-		__m128i k6 = key[6];
-		__m128i k7 = key[7];
-
-		while (len >= 16) {
-			__m128i c0 = _mm_xor_si128(ctr0,k0);
-			ctr0 = _mm_shuffle_epi8(ctr0,swap128);
-			ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 1ULL),1LL)),swap128);
-			c0 = _mm_aesenc_si128(c0,k1);
-			c0 = _mm_aesenc_si128(c0,k2);
-			c0 = _mm_aesenc_si128(c0,k3);
-			c0 = _mm_aesenc_si128(c0,k4);
-			c0 = _mm_aesenc_si128(c0,k5);
-			c0 = _mm_aesenc_si128(c0,k6);
-			__m128i ka = key[8];
-			c0 = _mm_aesenc_si128(c0,k7);
-			__m128i kb = key[9];
-			c0 = _mm_aesenc_si128(c0,ka);
-			ka = key[10];
-			c0 = _mm_aesenc_si128(c0,kb);
-			kb = key[11];
-			c0 = _mm_aesenc_si128(c0,ka);
-			ka = key[12];
-			c0 = _mm_aesenc_si128(c0,kb);
-			kb = key[13];
-			c0 = _mm_aesenc_si128(c0,ka);
-			ka = key[14];
-			c0 = _mm_aesenc_si128(c0,kb);
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,ka)));
-			in += 16;
-			out += 16;
-			len -= 16;
-		}
-
-		if (len) {
-			__m128i c0 = _mm_xor_si128(ctr0,k0);
-			k0 = key[8];
-			c0 = _mm_aesenc_si128(c0,k1);
-			c0 = _mm_aesenc_si128(c0,k2);
-			k1 = key[9];
-			c0 = _mm_aesenc_si128(c0,k3);
-			c0 = _mm_aesenc_si128(c0,k4);
-			k2 = key[10];
-			c0 = _mm_aesenc_si128(c0,k5);
-			c0 = _mm_aesenc_si128(c0,k6);
-			k3 = key[11];
-			c0 = _mm_aesenc_si128(c0,k7);
-			c0 = _mm_aesenc_si128(c0,k0);
-			k0 = key[12];
-			c0 = _mm_aesenc_si128(c0,k1);
-			c0 = _mm_aesenc_si128(c0,k2);
-			k1 = key[13];
-			c0 = _mm_aesenc_si128(c0,k3);
-			c0 = _mm_aesenc_si128(c0,k0);
-			k2 = key[14];
-			c0 = _mm_aesenc_si128(c0,k1);
-			c0 = _mm_aesenclast_si128(c0,k2);
-			uint8_t tmp[16];
-			_mm_storeu_si128((__m128i *)tmp,c0);
-			for(unsigned int i=0;i<len;++i)
-				out[i] = in[i] ^ tmp[i];
-		}
-	}
+	void _gmac_aesni(const uint8_t iv[12],const uint8_t *in,const unsigned int len,uint8_t out[16]) const;
+	static void _ctr_aesni(const __m128i key[14],const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out);
 #endif /* ZT_AES_AESNI ******************************************************/
 };
 

+ 7 - 0
node/C25519.cpp

@@ -2366,6 +2366,13 @@ static inline void get_hram(unsigned char *hram, const unsigned char *sm, const
 
 namespace ZeroTier {
 
+void C25519::generate(uint8_t pub[ZT_C25519_PUBLIC_KEY_LEN],uint8_t priv[ZT_C25519_PRIVATE_KEY_LEN])
+{
+	Utils::getSecureRandom(priv,ZT_C25519_PRIVATE_KEY_LEN);
+	_calcPubDH(pub,priv);
+	_calcPubED(pub,priv);
+}
+
 void C25519::agree(const uint8_t mine[ZT_C25519_PRIVATE_KEY_LEN],const uint8_t their[ZT_C25519_PUBLIC_KEY_LEN],uint8_t rawkey[32])
 {
 	crypto_scalarmult(rawkey,mine,their);

+ 3 - 7
node/C25519.hpp

@@ -14,6 +14,7 @@
 #ifndef ZT_C25519_HPP
 #define ZT_C25519_HPP
 
+#include "Constants.hpp"
 #include "Utils.hpp"
 
 namespace ZeroTier {
@@ -32,12 +33,7 @@ public:
 	/**
 	 * Generate a C25519 elliptic curve key pair
 	 */
-	static inline void generate(uint8_t pub[ZT_C25519_PUBLIC_KEY_LEN],uint8_t priv[ZT_C25519_PRIVATE_KEY_LEN])
-	{
-		Utils::getSecureRandom(priv,ZT_C25519_PRIVATE_KEY_LEN);
-		_calcPubDH(pub,priv);
-		_calcPubED(pub,priv);
-	}
+	static void generate(uint8_t pub[ZT_C25519_PUBLIC_KEY_LEN],uint8_t priv[ZT_C25519_PRIVATE_KEY_LEN]);
 
 	/**
 	 * Generate a key pair satisfying a condition
@@ -53,7 +49,7 @@ public:
 	 * @tparam F Type of 'cond'
 	 */
 	template<typename F>
-	static inline void generateSatisfying(F cond,uint8_t pub[ZT_C25519_PUBLIC_KEY_LEN],uint8_t priv[ZT_C25519_PRIVATE_KEY_LEN])
+	static ZT_ALWAYS_INLINE void generateSatisfying(F cond,uint8_t pub[ZT_C25519_PUBLIC_KEY_LEN],uint8_t priv[ZT_C25519_PRIVATE_KEY_LEN])
 	{
 		Utils::getSecureRandom(priv,ZT_C25519_PRIVATE_KEY_LEN);
 		_calcPubED(pub,priv); // do Ed25519 key -- bytes 32-63 of pub and priv

+ 87 - 13
node/Endpoint.hpp

@@ -49,23 +49,97 @@ public:
 		UNRECOGNIZED = 255 // Unrecognized endpoint type encountered in stream
 	};
 
-	ZT_ALWAYS_INLINE Endpoint() { memset(reinterpret_cast<void *>(this),0,sizeof(Endpoint)); }
+	ZT_ALWAYS_INLINE Endpoint()
+	{
+		memset(reinterpret_cast<void *>(this),0,sizeof(Endpoint));
+	}
+
+	ZT_ALWAYS_INLINE Endpoint(const Endpoint &ep)
+	{
+		memcpy(reinterpret_cast<void *>(this),&ep,sizeof(Endpoint));
+	}
+
+	explicit ZT_ALWAYS_INLINE Endpoint(const InetAddress &sa) :
+		_t(INETADDR)
+	{
+		_v.sa = sa;
+	}
+
+	ZT_ALWAYS_INLINE Endpoint(const Address &zt,const uint8_t identityHash[ZT_IDENTITY_HASH_SIZE]) :
+		_t(ZEROTIER)
+	{
+		_v.zt.a = zt.toInt();
+		memcpy(_v.zt.idh,identityHash,ZT_IDENTITY_HASH_SIZE);
+	}
+
+	ZT_ALWAYS_INLINE Endpoint(const char *name,const int port) :
+		_t(DNSNAME)
+	{
+		_v.dns.port = port;
+		Utils::scopy(_v.dns.name,sizeof(_v.dns.name),name);
+	}
 
-	explicit ZT_ALWAYS_INLINE Endpoint(const InetAddress &sa) : _t(INETADDR) { _v.sa = sa; }
-	ZT_ALWAYS_INLINE Endpoint(const Address &zt,const uint8_t identityHash[ZT_IDENTITY_HASH_SIZE]) : _t(ZEROTIER) { _v.zt.a = zt.toInt(); memcpy(_v.zt.idh,identityHash,ZT_IDENTITY_HASH_SIZE); }
-	ZT_ALWAYS_INLINE Endpoint(const char *name,const int port) : _t(DNSNAME) { Utils::scopy(_v.dns.name,sizeof(_v.dns.name),name); _v.dns.port = port; }
-	explicit ZT_ALWAYS_INLINE Endpoint(const char *url) : _t(URL) { Utils::scopy(_v.url,sizeof(_v.url),url); }
+	explicit ZT_ALWAYS_INLINE Endpoint(const char *url) :
+		_t(URL)
+	{
+		Utils::scopy(_v.url,sizeof(_v.url),url);
+	}
+
+	ZT_ALWAYS_INLINE Endpoint &operator=(const Endpoint &ep)
+	{
+		memcpy(reinterpret_cast<void *>(this),&ep,sizeof(Endpoint));
+		return *this;
+	}
 
-	ZT_ALWAYS_INLINE const InetAddress *inetAddr() const { return (_t == INETADDR) ? reinterpret_cast<const InetAddress *>(&_v.sa) : nullptr; }
-	ZT_ALWAYS_INLINE const char *dnsName() const { return (_t == DNSNAME) ? _v.dns.name : nullptr; }
+	ZT_ALWAYS_INLINE Endpoint &operator=(const InetAddress &sa)
+	{
+		_t = INETADDR;
+		_v.sa = sa;
+		return *this;
+	}
+
+	/**
+	 * @return InetAddress or NIL if not of this type
+	 */
+	ZT_ALWAYS_INLINE const InetAddress &inetAddr() const { return (_t == INETADDR) ? *reinterpret_cast<const InetAddress *>(&_v.sa) : InetAddress::NIL; }
+
+	/**
+	 * @return DNS name or empty string if not of this type
+	 */
+	ZT_ALWAYS_INLINE const char *dnsName() const { return (_t == DNSNAME) ? _v.dns.name : ""; }
+
+	/**
+	 * @return Port associated with DNS name or -1 if not of this type
+	 */
 	ZT_ALWAYS_INLINE int dnsPort() const { return (_t == DNSNAME) ? _v.dns.port : -1; }
+
+	/**
+	 * @return ZeroTier address or NIL if not of this type
+	 */
 	ZT_ALWAYS_INLINE Address ztAddress() const { return Address((_t == ZEROTIER) ? _v.zt.a : (uint64_t)0); }
+
+	/**
+	 * @return 384-bit hash of identity keys or NULL if not of this type
+	 */
 	ZT_ALWAYS_INLINE const uint8_t *ztIdentityHash() const { return (_t == ZEROTIER) ? _v.zt.idh : nullptr; }
-	ZT_ALWAYS_INLINE const char *url() const { return (_t == URL) ? _v.url : nullptr; }
+
+	/**
+	 * @return URL or empty string if not of this type
+	 */
+	ZT_ALWAYS_INLINE const char *url() const { return (_t == URL) ? _v.url : ""; }
+
+	/**
+	 * @return Ethernet address or NIL if not of this type
+	 */
 	ZT_ALWAYS_INLINE MAC ethernet() const { return (_t == ETHERNET) ? MAC(_v.eth) : MAC(); }
 
+	/**
+	 * @return Endpoint type or NIL if unset/empty
+	 */
 	ZT_ALWAYS_INLINE Type type() const { return _t; }
 
+	explicit ZT_ALWAYS_INLINE operator bool() const { return _t != NIL; }
+
 	bool operator==(const Endpoint &ep) const;
 	ZT_ALWAYS_INLINE bool operator!=(const Endpoint &ep) const { return (!(*this == ep)); }
 	bool operator<(const Endpoint &ep) const;
@@ -82,14 +156,14 @@ private:
 	int _l[3]; // X,Y,Z location in kilometers from the nearest gravitational center of mass
 	union {
 		struct sockaddr_storage sa;
-		struct {
-			char name[ZT_ENDPOINT_MAX_NAME_SIZE];
+		ZT_PACKED_STRUCT(struct {
 			uint16_t port;
-		} dns;
-		struct {
+			char name[ZT_ENDPOINT_MAX_NAME_SIZE];
+		}) dns;
+		ZT_PACKED_STRUCT(struct {
 			uint64_t a;
 			uint8_t idh[ZT_IDENTITY_HASH_SIZE];
-		} zt;
+		}) zt;
 		char url[ZT_ENDPOINT_MAX_NAME_SIZE];
 		uint64_t eth;
 	} _v;

+ 72 - 50
node/Node.cpp

@@ -37,13 +37,13 @@ namespace ZeroTier {
 /* Public Node interface (C++, exposed via CAPI bindings)                   */
 /****************************************************************************/
 
-Node::Node(void *uPtr, void *tPtr, const struct ZT_Node_Callbacks *callbacks, int64_t now) :
+Node::Node(void *uPtr,void *tPtr,const struct ZT_Node_Callbacks *callbacks,int64_t now) :
 	_RR(this),
 	RR(&_RR),
 	_cb(*callbacks),
 	_uPtr(uPtr),
 	_networks(),
-	_networksMask(255),
+	_networksMask(63),
 	_now(now),
 	_lastPing(0),
 	_lastHousekeepingRun(0),
@@ -51,68 +51,58 @@ Node::Node(void *uPtr, void *tPtr, const struct ZT_Node_Callbacks *callbacks, in
 	_lastPathKeepaliveCheck(0),
 	_online(false)
 {
-	_networks.resize(256); // _networksMask + 1, must be power of two
+	_networks.resize(64); // _networksMask + 1, must be power of two
 
 	memset((void *)_expectingRepliesToBucketPtr,0,sizeof(_expectingRepliesToBucketPtr));
 	memset((void *)_expectingRepliesTo,0,sizeof(_expectingRepliesTo));
 	memset((void *)_lastIdentityVerification,0,sizeof(_lastIdentityVerification));
 
-	uint64_t idtmp[2];
-	idtmp[0] = 0; idtmp[1] = 0;
-	char tmp[2048];
-	int n = stateObjectGet(tPtr, ZT_STATE_OBJECT_IDENTITY_SECRET, idtmp, tmp, sizeof(tmp) - 1);
-	if (n > 0) {
-		tmp[n] = (char)0;
-		if (RR->identity.fromString(tmp)) {
+	uint64_t idtmp[2]; idtmp[0] = 0; idtmp[1] = 0;
+	std::vector<uint8_t> data(stateObjectGet(tPtr,ZT_STATE_OBJECT_IDENTITY_SECRET,idtmp));
+	bool haveIdentity = false;
+	if (!data.empty()) {
+		data.push_back(0); // zero-terminate string
+		if (RR->identity.fromString((const char *)data.data())) {
 			RR->identity.toString(false,RR->publicIdentityStr);
 			RR->identity.toString(true,RR->secretIdentityStr);
-		} else {
-			n = -1;
+			haveIdentity = true;
 		}
 	}
 
-	if (n <= 0) {
+	if (!haveIdentity) {
 		RR->identity.generate(Identity::C25519);
 		RR->identity.toString(false,RR->publicIdentityStr);
 		RR->identity.toString(true,RR->secretIdentityStr);
 		idtmp[0] = RR->identity.address().toInt(); idtmp[1] = 0;
-		stateObjectPut(tPtr, ZT_STATE_OBJECT_IDENTITY_SECRET, idtmp, RR->secretIdentityStr, (unsigned int)strlen(RR->secretIdentityStr));
-		stateObjectPut(tPtr, ZT_STATE_OBJECT_IDENTITY_PUBLIC, idtmp, RR->publicIdentityStr, (unsigned int)strlen(RR->publicIdentityStr));
+		stateObjectPut(tPtr,ZT_STATE_OBJECT_IDENTITY_SECRET,idtmp,RR->secretIdentityStr,(unsigned int)strlen(RR->secretIdentityStr));
+		stateObjectPut(tPtr,ZT_STATE_OBJECT_IDENTITY_PUBLIC,idtmp,RR->publicIdentityStr,(unsigned int)strlen(RR->publicIdentityStr));
 	} else {
 		idtmp[0] = RR->identity.address().toInt(); idtmp[1] = 0;
-		n = stateObjectGet(tPtr, ZT_STATE_OBJECT_IDENTITY_PUBLIC, idtmp, tmp, sizeof(tmp) - 1);
-		if ((n > 0)&&(n < (int)sizeof(RR->publicIdentityStr))&&(n < (int)sizeof(tmp))) {
-			if (memcmp(tmp,RR->publicIdentityStr,n) != 0)
-				stateObjectPut(tPtr, ZT_STATE_OBJECT_IDENTITY_PUBLIC, idtmp, RR->publicIdentityStr, (unsigned int)strlen(RR->publicIdentityStr));
-		}
+		data = stateObjectGet(tPtr,ZT_STATE_OBJECT_IDENTITY_PUBLIC,idtmp);
+		if ((data.empty())||(memcmp(data.data(),RR->publicIdentityStr,strlen(RR->publicIdentityStr)) != 0))
+			stateObjectPut(tPtr,ZT_STATE_OBJECT_IDENTITY_PUBLIC,idtmp,RR->publicIdentityStr,(unsigned int)strlen(RR->publicIdentityStr));
 	}
 
-	char *m = (char *)0;
+	char *m = nullptr;
 	try {
-		const unsigned long ts = sizeof(Trace) + (((sizeof(Trace) & 0xf) != 0) ? (16 - (sizeof(Trace) & 0xf)) : 0);
-		const unsigned long sws = sizeof(Switch) + (((sizeof(Switch) & 0xf) != 0) ? (16 - (sizeof(Switch) & 0xf)) : 0);
-		const unsigned long topologys = sizeof(Topology) + (((sizeof(Topology) & 0xf) != 0) ? (16 - (sizeof(Topology) & 0xf)) : 0);
-		const unsigned long sas = sizeof(SelfAwareness) + (((sizeof(SelfAwareness) & 0xf) != 0) ? (16 - (sizeof(SelfAwareness) & 0xf)) : 0);
-
-		m = reinterpret_cast<char *>(malloc(16 + ts + sws + topologys + sas));
+		m = reinterpret_cast<char *>(malloc(16 + sizeof(Trace) + sizeof(Switch) + sizeof(Topology) + sizeof(SelfAwareness)));
 		if (!m)
 			throw std::bad_alloc();
 		RR->rtmem = m;
 		while (((uintptr_t)m & 0xfU) != 0) ++m;
-
 		RR->t = new (m) Trace(RR);
-		m += ts;
+		m += sizeof(Trace);
 		RR->sw = new (m) Switch(RR);
-		m += sws;
-		RR->topology = new (m) Topology(RR,RR->identity);
-		m += topologys;
+		m += sizeof(Switch);
+		RR->topology = new (m) Topology(RR,RR->identity,tPtr);
+		m += sizeof(Topology);
 		RR->sa = new (m) SelfAwareness(RR);
 	} catch ( ... ) {
 		if (RR->sa) RR->sa->~SelfAwareness();
 		if (RR->topology) RR->topology->~Topology();
 		if (RR->sw) RR->sw->~Switch();
 		if (RR->t) RR->t->~Trace();
-		::free(m);
+		if (m) ::free(m);
 		throw;
 	}
 
@@ -123,7 +113,8 @@ Node::~Node()
 {
 	{
 		RWMutex::Lock _l(_networks_m);
-		_networks.clear(); // destroy all networks before shutdown
+		for(std::vector< SharedPtr<Network> >::iterator i(_networks.begin());i!=_networks.end();++i)
+			i->zero();
 	}
 	if (RR->sa) RR->sa->~SelfAwareness();
 	if (RR->topology) RR->topology->~Topology();
@@ -375,25 +366,22 @@ ZT_ResultCode Node::multicastUnsubscribe(uint64_t nwid,uint64_t multicastGroup,u
 	} else return ZT_RESULT_ERROR_NETWORK_NOT_FOUND;
 }
 
-ZT_ResultCode Node::addRoot(const char *identity)
+ZT_ResultCode Node::addRoot(void *tptr,const ZT_Identity *identity,const sockaddr_storage *bootstrap)
 {
 	if (!identity)
 		return ZT_RESULT_ERROR_BAD_PARAMETER;
-	Identity id;
-	if (!id.fromString(identity))
-		return ZT_RESULT_ERROR_BAD_PARAMETER;
-	RR->topology->addRoot(id);
+	InetAddress a;
+	if (bootstrap)
+		a = bootstrap;
+	RR->topology->addRoot(tptr,*reinterpret_cast<const Identity *>(identity),a);
 	return ZT_RESULT_OK;
 }
 
-ZT_ResultCode Node::removeRoot(const char *identity)
+ZT_ResultCode Node::removeRoot(void *tptr,const ZT_Identity *identity)
 {
 	if (!identity)
 		return ZT_RESULT_ERROR_BAD_PARAMETER;
-	Identity id;
-	if (!id.fromString(identity))
-		return ZT_RESULT_ERROR_BAD_PARAMETER;
-	RR->topology->removeRoot(id);
+	RR->topology->removeRoot(*reinterpret_cast<const Identity *>(identity));
 	return ZT_RESULT_OK;
 }
 
@@ -434,6 +422,7 @@ ZT_PeerList *Node::peers() const
 		p->address = (*pi)->address().toInt();
 		identities[pl->peerCount] = (*pi)->identity(); // need to make a copy in case peer gets deleted
 		p->identity = &identities[pl->peerCount];
+		(*pi)->identity().hash(p->identityHash,false);
 		if ((*pi)->remoteVersionKnown()) {
 			p->versionMajor = (int)(*pi)->remoteVersionMajor();
 			p->versionMinor = (int)(*pi)->remoteVersionMinor();
@@ -447,6 +436,7 @@ ZT_PeerList *Node::peers() const
 		if (p->latency >= 0xffff)
 			p->latency = -1;
 		p->role = RR->topology->isRoot((*pi)->identity()) ? ZT_PEER_ROLE_ROOT : ZT_PEER_ROLE_LEAF;
+		memcpy(&p->bootstrap,&((*pi)->bootstrap()),sizeof(sockaddr_storage));
 
 		std::vector< SharedPtr<Path> > paths;
 		(*pi)->getAllPaths(paths);
@@ -561,7 +551,29 @@ void Node::setController(void *networkControllerInstance)
 /* Node methods used only within node/                                      */
 /****************************************************************************/
 
-bool Node::shouldUsePathForZeroTierTraffic(void *tPtr,const Address &ztaddr,const int64_t localSocket,const InetAddress &remoteAddress)
+std::vector<uint8_t> Node::stateObjectGet(void *const tPtr,ZT_StateObjectType type,const uint64_t id[2])
+{
+	std::vector<uint8_t> r;
+	if (_cb.stateGetFunction) {
+		void *data = 0;
+		void (*freeFunc)(void *) = 0;
+		int l = _cb.stateGetFunction(
+			reinterpret_cast<ZT_Node *>(this),
+			_uPtr,
+			tPtr,
+			type,
+			id,
+			&data,
+			&freeFunc);
+		if ((l > 0)&&(data)&&(freeFunc)) {
+			r.assign(reinterpret_cast<const uint8_t *>(data),reinterpret_cast<const uint8_t *>(data) + l);
+			freeFunc(data);
+		}
+	}
+	return r;
+}
+
+bool Node::shouldUsePathForZeroTierTraffic(void *tPtr,const Identity &id,const int64_t localSocket,const InetAddress &remoteAddress)
 {
 	if (Path::isAddressValidForPath(remoteAddress)) {
 		RWMutex::RLock l(_networks_m);
@@ -576,7 +588,17 @@ bool Node::shouldUsePathForZeroTierTraffic(void *tPtr,const Address &ztaddr,cons
 	} else {
 		return false;
 	}
-	return ((_cb.pathCheckFunction) ? (_cb.pathCheckFunction(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,ztaddr.toInt(),localSocket,reinterpret_cast<const struct sockaddr_storage *>(&remoteAddress)) != 0) : true);
+	if (_cb.pathCheckFunction) {
+		return (_cb.pathCheckFunction(
+			reinterpret_cast<ZT_Node *>(this),
+			_uPtr,
+			tPtr,
+			id.address().toInt(),
+			(const ZT_Identity *)&id,
+			localSocket,
+			reinterpret_cast<const struct sockaddr_storage *>(&remoteAddress)) != 0);
+	}
+	return true;
 }
 
 bool Node::externalPathLookup(void *tPtr,const Identity &id,int family,InetAddress &addr)
@@ -840,10 +862,10 @@ enum ZT_ResultCode ZT_Node_multicastUnsubscribe(ZT_Node *node,uint64_t nwid,uint
 	}
 }
 
-enum ZT_ResultCode ZT_Node_addRoot(ZT_Node *node,const char *identity)
+enum ZT_ResultCode ZT_Node_addRoot(ZT_Node *node,void *tptr,const ZT_Identity *identity,const struct sockaddr_storage *bootstrap)
 {
 	try {
-		return reinterpret_cast<ZeroTier::Node *>(node)->addRoot(identity);
+		return reinterpret_cast<ZeroTier::Node *>(node)->addRoot(tptr,identity,bootstrap);
 	} catch (std::bad_alloc &exc) {
 		return ZT_RESULT_FATAL_ERROR_OUT_OF_MEMORY;
 	} catch ( ... ) {
@@ -851,10 +873,10 @@ enum ZT_ResultCode ZT_Node_addRoot(ZT_Node *node,const char *identity)
 	}
 }
 
-enum ZT_ResultCode ZT_Node_removeRoot(ZT_Node *node,const char *identity)
+enum ZT_ResultCode ZT_Node_removeRoot(ZT_Node *node,void *tptr,const ZT_Identity *identity)
 {
 	try {
-		return reinterpret_cast<ZeroTier::Node *>(node)->removeRoot(identity);
+		return reinterpret_cast<ZeroTier::Node *>(node)->removeRoot(tptr,identity);
 	} catch (std::bad_alloc &exc) {
 		return ZT_RESULT_FATAL_ERROR_OUT_OF_MEMORY;
 	} catch ( ... ) {

+ 24 - 12
node/Node.hpp

@@ -95,8 +95,8 @@ public:
 	ZT_ResultCode leave(uint64_t nwid,void **uptr,void *tptr);
 	ZT_ResultCode multicastSubscribe(void *tptr,uint64_t nwid,uint64_t multicastGroup,unsigned long multicastAdi);
 	ZT_ResultCode multicastUnsubscribe(uint64_t nwid,uint64_t multicastGroup,unsigned long multicastAdi);
-	ZT_ResultCode addRoot(const char *identity);
-	ZT_ResultCode removeRoot(const char *identity);
+	ZT_ResultCode addRoot(void *tptr,const ZT_Identity *identity,const sockaddr_storage *bootstrap);
+	ZT_ResultCode removeRoot(void *tptr,const ZT_Identity *identity);
 	uint64_t address() const;
 	void status(ZT_NodeStatus *status) const;
 	ZT_PeerList *peers() const;
@@ -194,7 +194,10 @@ public:
 	 * @param ev Event object
 	 * @param md Event data or NULL if none
 	 */
-	ZT_ALWAYS_INLINE void postEvent(void *tPtr,ZT_Event ev,const void *md = (const void *)0) { _cb.eventCallback(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,ev,md); }
+	ZT_ALWAYS_INLINE void postEvent(void *tPtr,ZT_Event ev,const void *md = (const void *)0)
+	{
+		_cb.eventCallback(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,ev,md);
+	}
 
 	/**
 	 * Post network port configuration via external callback
@@ -205,7 +208,10 @@ public:
 	 * @param op Config operation or event type
 	 * @param nc Network config info
 	 */
-	ZT_ALWAYS_INLINE void configureVirtualNetworkPort(void *tPtr,uint64_t nwid,void **nuptr,ZT_VirtualNetworkConfigOperation op,const ZT_VirtualNetworkConfig *nc) { _cb.virtualNetworkConfigFunction(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,nwid,nuptr,op,nc); }
+	ZT_ALWAYS_INLINE void configureVirtualNetworkPort(void *tPtr,uint64_t nwid,void **nuptr,ZT_VirtualNetworkConfigOperation op,const ZT_VirtualNetworkConfig *nc)
+	{
+		_cb.virtualNetworkConfigFunction(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,nwid,nuptr,op,nc);
+	}
 
 	/**
 	 * @return True if node appears online
@@ -218,11 +224,9 @@ public:
 	 * @param tPtr Thread pointer
 	 * @param type Object type to get
 	 * @param id Object ID
-	 * @param data Data buffer
-	 * @param maxlen Maximum data length
-	 * @return Number of bytes actually read or 0 if not found
+	 * @return Vector containing data or empty vector if not found or empty
 	 */
-	ZT_ALWAYS_INLINE int stateObjectGet(void *const tPtr,ZT_StateObjectType type,const uint64_t id[2],void *const data,const unsigned int maxlen) { return _cb.stateGetFunction(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,type,id,data,maxlen); }
+	std::vector<uint8_t> stateObjectGet(void *const tPtr,ZT_StateObjectType type,const uint64_t id[2]);
 
 	/**
 	 * Store a state object
@@ -233,7 +237,11 @@ public:
 	 * @param data Data to store
 	 * @param len Length of data
 	 */
-	ZT_ALWAYS_INLINE void stateObjectPut(void *const tPtr,ZT_StateObjectType type,const uint64_t id[2],const void *const data,const unsigned int len) { _cb.statePutFunction(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,type,id,data,(int)len); }
+	ZT_ALWAYS_INLINE void stateObjectPut(void *const tPtr,ZT_StateObjectType type,const uint64_t id[2],const void *const data,const unsigned int len)
+	{
+		if (_cb.statePutFunction)
+			_cb.statePutFunction(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,type,id,data,(int)len);
+	}
 
 	/**
 	 * Delete a state object
@@ -242,7 +250,11 @@ public:
 	 * @param type Object type to delete
 	 * @param id Object ID
 	 */
-	ZT_ALWAYS_INLINE void stateObjectDelete(void *const tPtr,ZT_StateObjectType type,const uint64_t id[2]) { _cb.statePutFunction(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,type,id,(const void *)0,-1); }
+	ZT_ALWAYS_INLINE void stateObjectDelete(void *const tPtr,ZT_StateObjectType type,const uint64_t id[2])
+	{
+		if (_cb.statePutFunction)
+			_cb.statePutFunction(reinterpret_cast<ZT_Node *>(this),_uPtr,tPtr,type,id,(const void *)0,-1);
+	}
 
 	/**
 	 * Check whether a path should be used for ZeroTier traffic
@@ -250,12 +262,12 @@ public:
 	 * This performs internal checks and also calls out to an external callback if one is defined.
 	 *
 	 * @param tPtr Thread pointer
-	 * @param ztaddr ZeroTier address
+	 * @param id Identity of peer
 	 * @param localSocket Local socket or -1 if unknown
 	 * @param remoteAddress Remote address
 	 * @return True if path should be used
 	 */
-	bool shouldUsePathForZeroTierTraffic(void *tPtr,const Address &ztaddr,const int64_t localSocket,const InetAddress &remoteAddress);
+	bool shouldUsePathForZeroTierTraffic(void *tPtr,const Identity &id,const int64_t localSocket,const InetAddress &remoteAddress);
 
 	/**
 	 * Query callback for a physical address for a peer

+ 3 - 0
node/OS.hpp

@@ -145,6 +145,9 @@
 #define __CPP11__
 #endif
 #endif
+#ifndef __CPP11__
+#define nullptr (0)
+#endif
 
 #ifdef SOCKET
 #define ZT_SOCKET SOCKET

+ 8 - 8
node/Peer.cpp

@@ -114,7 +114,7 @@ void Peer::received(
 			_prioritizePaths(now);
 			RR->t->peerLearnedNewPath(tPtr,networkId,*this,path,packetId);
 		} else {
-			if (RR->node->shouldUsePathForZeroTierTraffic(tPtr,_id.address(),path->localSocket(),path->address())) {
+			if (RR->node->shouldUsePathForZeroTierTraffic(tPtr,_id,path->localSocket(),path->address())) {
 				sendHELLO(tPtr,path->localSocket(),path->address(),now);
 				path->sent(now);
 				RR->t->peerConfirmingUnknownPath(tPtr,networkId,*this,path,packetId,verb);
@@ -128,10 +128,10 @@ path_check_done:
 		_lastAttemptedP2PInit = now;
 
 		InetAddress addr;
-		if (_bootstrap)
-			sendHELLO(tPtr,-1,_bootstrap,now);
+		if (_bootstrap.type() == Endpoint::INETADDR)
+			sendHELLO(tPtr,-1,_bootstrap.inetAddr(),now);
 		if (RR->node->externalPathLookup(tPtr,_id,-1,addr)) {
-			if (RR->node->shouldUsePathForZeroTierTraffic(tPtr,_id.address(),-1,addr))
+			if (RR->node->shouldUsePathForZeroTierTraffic(tPtr,_id,-1,addr))
 				sendHELLO(tPtr,-1,addr,now);
 		}
 
@@ -212,7 +212,7 @@ bool Peer::shouldTryPath(void *tPtr,int64_t now,const SharedPtr<Peer> &suggested
 			}
 		}
 	}
-	return ( ((int)addr.ipScope() > maxHaveScope) && RR->node->shouldUsePathForZeroTierTraffic(tPtr,_id.address(),-1,addr) );
+	return ( ((int)addr.ipScope() > maxHaveScope) && RR->node->shouldUsePathForZeroTierTraffic(tPtr,_id,-1,addr) );
 }
 
 void Peer::sendHELLO(void *tPtr,const int64_t localSocket,const InetAddress &atAddress,int64_t now)
@@ -254,11 +254,11 @@ void Peer::ping(void *tPtr,int64_t now,const bool pingAllAddressTypes)
 		return;
 	}
 
-	if (_bootstrap)
-		sendHELLO(tPtr,-1,_bootstrap,now);
+	if (_bootstrap.type() == Endpoint::INETADDR)
+		sendHELLO(tPtr,-1,_bootstrap.inetAddr(),now);
 
 	SharedPtr<Peer> r(RR->topology->root());
-	if (r) {
+	if ((r)&&(r.ptr() != this)) {
 		SharedPtr<Path> rp(r->path(now));
 		if (rp) {
 			sendHELLO(tPtr,rp->localSocket(),rp->address(),now);

+ 19 - 1
node/Peer.hpp

@@ -27,6 +27,7 @@
 #include "AtomicCounter.hpp"
 #include "Hashtable.hpp"
 #include "Mutex.hpp"
+#include "Endpoint.hpp"
 #include "Locator.hpp"
 
 #include <vector>
@@ -162,6 +163,23 @@ public:
 	 */
 	void updateLatency(unsigned int l);
 
+	/**
+	 * @return Bootstrap address or NULL if none
+	 */
+	ZT_ALWAYS_INLINE const Endpoint &bootstrap() const { return _bootstrap; }
+
+	/**
+	 * Set bootstrap endpoint
+	 *
+	 * @param ep Bootstrap endpoint
+	 */
+	ZT_ALWAYS_INLINE void setBootstrap(const Endpoint &ep)
+	{
+		_lock.lock();
+		_bootstrap = ep;
+		_lock.unlock();
+	}
+
 	/**
 	 * @return Time of last receive of anything, whether direct or relayed
 	 */
@@ -299,7 +317,7 @@ private:
 
 	Identity _id;
 	Locator _locator;
-	InetAddress _bootstrap;
+	Endpoint _bootstrap; // right now only InetAddress endpoints are supported for bootstrap
 
 	uint16_t _vProto;
 	uint16_t _vMajor;

+ 40 - 8
node/SelfAwareness.cpp

@@ -103,33 +103,65 @@ void SelfAwareness::clean(int64_t now)
 {
 	Mutex::Lock l(_phy_l);
 	Hashtable< PhySurfaceKey,PhySurfaceEntry >::Iterator i(_phy);
-	PhySurfaceKey *k = (PhySurfaceKey *)0;
-	PhySurfaceEntry *e = (PhySurfaceEntry *)0;
+	PhySurfaceKey *k = nullptr;
+	PhySurfaceEntry *e = nullptr;
 	while (i.next(k,e)) {
 		if ((now - e->ts) >= ZT_SELFAWARENESS_ENTRY_TIMEOUT)
 			_phy.erase(*k);
 	}
 }
 
+bool SelfAwareness::symmetricNat(const int64_t now) const
+{
+	Hashtable< InetAddress,std::pair< std::set<int>,std::set<int64_t> > > ipToPortsAndLocalSockets(16);
+
+	{
+		Mutex::Lock l(_phy_l);
+		Hashtable<PhySurfaceKey,PhySurfaceEntry>::Iterator i(const_cast<SelfAwareness *>(this)->_phy);
+		PhySurfaceKey *k = nullptr;
+		PhySurfaceEntry *e = nullptr;
+		while (i.next(k,e)) {
+			if ((now - e->ts) < ZT_SELFAWARENESS_ENTRY_TIMEOUT) {
+				std::pair< std::set<int>,std::set<int64_t> > &ii = ipToPortsAndLocalSockets[e->mySurface.ipOnly()];
+				ii.first.insert(e->mySurface.port());
+				if (k->receivedOnLocalSocket != -1)
+					ii.second.insert(k->receivedOnLocalSocket);
+			}
+		}
+	}
+
+	Hashtable< InetAddress,std::pair< std::set<int>,std::set<int64_t> > >::Iterator i(ipToPortsAndLocalSockets);
+	InetAddress *k = nullptr;
+	std::pair< std::set<int>,std::set<int64_t> > *v = nullptr;
+	while (i.next(k,v)) {
+		if (v->first.size() > v->second.size()) // more external ports than local sockets for a given external IP
+			return true;
+	}
+	return false;
+}
+
 std::multimap<unsigned long,InetAddress> SelfAwareness::externalAddresses(const int64_t now) const
 {
-	Hashtable<InetAddress,unsigned long> counts;
+	std::multimap<unsigned long,InetAddress> r;
+	Hashtable<InetAddress,unsigned long> counts(16);
+
 	{
 		Mutex::Lock l(_phy_l);
 		Hashtable<PhySurfaceKey,PhySurfaceEntry>::Iterator i(const_cast<SelfAwareness *>(this)->_phy);
-		PhySurfaceKey *k = (PhySurfaceKey *)0;
-		PhySurfaceEntry *e = (PhySurfaceEntry *)0;
+		PhySurfaceKey *k = nullptr;
+		PhySurfaceEntry *e = nullptr;
 		while (i.next(k,e)) {
 			if ((now - e->ts) < ZT_SELFAWARENESS_ENTRY_TIMEOUT)
 				++counts[e->mySurface];
 		}
 	}
-	std::multimap<unsigned long,InetAddress> r;
+
 	Hashtable<InetAddress,unsigned long>::Iterator i(counts);
-	InetAddress *k = (InetAddress *)0;
-	unsigned long *c = (unsigned long *)0;
+	InetAddress *k = nullptr;
+	unsigned long *c = nullptr;
 	while (i.next(k,c))
 		r.insert(std::pair<unsigned long,InetAddress>(*c,*k));
+
 	return r;
 }
 

+ 11 - 1
node/SelfAwareness.hpp

@@ -27,7 +27,9 @@ namespace ZeroTier {
 class RuntimeEnvironment;
 
 /**
- * Tracks changes to this peer's real world addresses
+ * SelfAwareness manages awareness of this peer's external address(es) and NAT situation.
+ *
+ * This code should not be capable of achieving sentience and triggering the Terminator wars.
  */
 class SelfAwareness
 {
@@ -54,6 +56,14 @@ public:
 	 */
 	void clean(int64_t now);
 
+	/**
+	 * Check whether this node appears to be behind a symmetric NAT
+	 *
+	 * @param now Current time
+	 * @return True if it looks like we're behind a symmetric NAT
+	 */
+	bool symmetricNat(int64_t now) const;
+
 	/**
 	 * Get external address consensus, which is the statistical "mode" of external addresses.
 	 *

+ 14 - 18
node/Str.hpp

@@ -20,15 +20,14 @@
 #include "MAC.hpp"
 #include "InetAddress.hpp"
 
-#include <string>
-
-#define ZT_STR_CAPACITY 1021
-
 namespace ZeroTier {
 
 /**
  * A short non-allocating replacement for std::string
+ *
+ * @tparam C Maximum capacity (default: 1021 to make total size 1024)
  */
+template<unsigned long C = 1021>
 class Str
 {
 public:
@@ -47,10 +46,7 @@ public:
 		_s[0] = 0;
 		(*this) << s;
 	}
-	ZT_ALWAYS_INLINE Str(const std::string &s)
-	{
-		*this = s;
-	}
+	ZT_ALWAYS_INLINE Str(const std::string &s) { *this = s; }
 
 	ZT_ALWAYS_INLINE Str &operator=(const Str &s)
 	{
@@ -66,7 +62,7 @@ public:
 	}
 	ZT_ALWAYS_INLINE Str &operator=(const std::string &s)
 	{
-		if (s.length() > ZT_STR_CAPACITY) {
+		if (s.length() > C) {
 			_l = 0;
 			_s[0] = 0;
 			throw ZT_EXCEPTION_OUT_OF_BOUNDS;
@@ -99,9 +95,9 @@ public:
 		if (likely(s != (const char *)0)) {
 			unsigned long l = _l;
 			while (*s) {
-				if (unlikely(l >= ZT_STR_CAPACITY)) {
-					_s[ZT_STR_CAPACITY] = 0;
-					_l = ZT_STR_CAPACITY;
+				if (unlikely(l >= C)) {
+					_s[C] = 0;
+					_l = C;
 					throw ZT_EXCEPTION_OUT_OF_BOUNDS;
 				}
 				_s[l++] = *(s++);
@@ -114,8 +110,8 @@ public:
 	ZT_ALWAYS_INLINE Str &operator<<(const Str &s) { return ((*this) << s._s); }
 	ZT_ALWAYS_INLINE Str &operator<<(const char c)
 	{
-		if (unlikely(_l >= ZT_STR_CAPACITY)) {
-			_s[ZT_STR_CAPACITY] = 0;
+		if (unlikely(_l >= C)) {
+			_s[C] = 0;
 			throw ZT_EXCEPTION_OUT_OF_BOUNDS;
 		}
 		_s[(unsigned long)(_l++)] = c;
@@ -157,9 +153,9 @@ public:
 			unsigned int c = 0;
 			while (*s) {
 				if (c++ >= max) break;
-				if (unlikely(l >= ZT_STR_CAPACITY)) {
-					_s[ZT_STR_CAPACITY] = 0;
-					_l = ZT_STR_CAPACITY;
+				if (unlikely(l >= C)) {
+					_s[C] = 0;
+					_l = C;
 					throw ZT_EXCEPTION_OUT_OF_BOUNDS;
 				}
 				_s[l++] = *s;
@@ -191,7 +187,7 @@ public:
 
 private:
 	uint16_t _l;
-	char _s[ZT_STR_CAPACITY+1];
+	char _s[C+1];
 };
 
 } // namespace ZeroTier

+ 1 - 1
node/Switch.hpp

@@ -58,7 +58,7 @@ public:
 	 * @param data Packet data
 	 * @param len Packet length
 	 */
-	void onRemotePacket(void *tPtr,const int64_t localSocket,const InetAddress &fromAddr,const void *data,unsigned int len);
+	void onRemotePacket(void *tPtr,int64_t localSocket,const InetAddress &fromAddr,const void *data,unsigned int len);
 
 	/**
 	 * Called when a packet comes from a local Ethernet tap

+ 65 - 3
node/Topology.cpp

@@ -15,6 +15,7 @@
 
 namespace ZeroTier {
 
+// Sorts roots so as to put the lowest latency alive root first.
 struct _RootSortComparisonOperator
 {
 	ZT_ALWAYS_INLINE _RootSortComparisonOperator(const int64_t now) : _now(now) {}
@@ -26,18 +27,43 @@ struct _RootSortComparisonOperator
 				return (a->latency() < b->latency());
 			return true;
 		}
-		return false;
+		return a->bootstrap() >= b->bootstrap();
 	}
 	const int64_t _now;
 };
 
-Topology::Topology(const RuntimeEnvironment *renv,const Identity &myId) :
+Topology::Topology(const RuntimeEnvironment *renv,const Identity &myId,void *tPtr) :
 	RR(renv),
 	_myIdentity(myId),
 	_numConfiguredPhysicalPaths(0),
 	_peers(128),
 	_paths(256)
 {
+	uint64_t idtmp[2]; idtmp[0] = 0; idtmp[1] = 0;
+	std::vector<uint8_t> data(RR->node->stateObjectGet(tPtr,ZT_STATE_OBJECT_ROOTS,idtmp));
+	if (!data.empty()) {
+		uint8_t *dptr = data.data();
+		int drem = (int)data.size();
+		while (drem > 0) {
+			Identity id;
+			int l = id.unmarshal(dptr,drem);
+			if (l > 0) {
+				_roots.insert(id);
+				dptr += l;
+				drem -= l;
+			}
+		}
+	}
+
+	for(std::set<Identity>::const_iterator r(_roots.begin());r!=_roots.end();++r) {
+		SharedPtr<Peer> p;
+		_loadCached(tPtr,r->address(),p);
+		if ((!p)||(p->identity() != *r)) {
+			p.set(new Peer(RR));
+			p->init(myId,*r);
+		}
+		_rootPeers.push_back(p);
+	}
 }
 
 Topology::~Topology()
@@ -103,7 +129,7 @@ void Topology::setPhysicalPathConfiguration(const struct sockaddr_storage *pathN
 	}
 }
 
-void Topology::addRoot(const Identity &id)
+void Topology::addRoot(void *tPtr,const Identity &id,const InetAddress &bootstrap)
 {
 	if (id == _myIdentity) return; // sanity check
 	RWMutex::Lock l1(_peers_l);
@@ -113,8 +139,25 @@ void Topology::addRoot(const Identity &id)
 		if (!p) {
 			p.set(new Peer(RR));
 			p->init(_myIdentity,id);
+			if (bootstrap)
+				p->setBootstrap(Endpoint(bootstrap));
 		}
 		_rootPeers.push_back(p);
+
+		uint8_t *const roots = (uint8_t *)malloc(ZT_IDENTITY_MARSHAL_SIZE_MAX * _roots.size());
+		if (roots) {
+			int p = 0;
+			for(std::set<Identity>::const_iterator i(_roots.begin());i!=_roots.end();++i) {
+				int pp = i->marshal(roots + p,false);
+				if (pp > 0)
+					p += pp;
+			}
+			uint64_t id[2];
+			id[0] = 0;
+			id[1] = 0;
+			RR->node->stateObjectPut(tPtr,ZT_STATE_OBJECT_ROOTS,id,roots,(unsigned int)p);
+			free(roots);
+		}
 	}
 }
 
@@ -182,6 +225,25 @@ void Topology::saveAll(void *tPtr)
 
 void Topology::_loadCached(void *tPtr,const Address &zta,SharedPtr<Peer> &peer)
 {
+	uint64_t id[2];
+	id[0] = zta.toInt();
+	id[1] = 0;
+	std::vector<uint8_t> data(RR->node->stateObjectGet(tPtr,ZT_STATE_OBJECT_PEER,id));
+	if (!data.empty()) {
+		const uint8_t *d = data.data();
+		int dl = (int)data.size();
+		for(;;) {
+			Peer *const p = new Peer(RR);
+			int n = p->unmarshal(d,dl);
+			if (n > 0) {
+				// TODO: will eventually handle multiple peers
+				peer.set(p);
+				return;
+			} else {
+				delete p;
+			}
+		}
+	}
 }
 
 } // namespace ZeroTier

+ 4 - 2
node/Topology.hpp

@@ -45,7 +45,7 @@ class RuntimeEnvironment;
 class Topology
 {
 public:
-	Topology(const RuntimeEnvironment *renv,const Identity &myId);
+	Topology(const RuntimeEnvironment *renv,const Identity &myId,void *tPtr);
 	~Topology();
 
 	/**
@@ -280,9 +280,11 @@ public:
 	/**
 	 * Add a root server's identity to the root server set
 	 *
+	 * @param tPtr Thread pointer
 	 * @param id Root server identity
+	 * @param bootstrap If non-NULL, a bootstrap address to attempt to find this root
 	 */
-	void addRoot(const Identity &id);
+	void addRoot(void *tPtr,const Identity &id,const InetAddress &bootstrap);
 
 	/**
 	 * Remove a root server's identity from the root server set

+ 18 - 21
node/Utils.cpp

@@ -101,16 +101,16 @@ char *decimal(unsigned long n,char s[24])
 
 char *hex10(uint64_t i,char s[11])
 {
-	s[0] = HEXCHARS[(i >> 36) & 0xf];
-	s[1] = HEXCHARS[(i >> 32) & 0xf];
-	s[2] = HEXCHARS[(i >> 28) & 0xf];
-	s[3] = HEXCHARS[(i >> 24) & 0xf];
-	s[4] = HEXCHARS[(i >> 20) & 0xf];
-	s[5] = HEXCHARS[(i >> 16) & 0xf];
-	s[6] = HEXCHARS[(i >> 12) & 0xf];
-	s[7] = HEXCHARS[(i >> 8) & 0xf];
-	s[8] = HEXCHARS[(i >> 4) & 0xf];
-	s[9] = HEXCHARS[i & 0xf];
+	s[0] = HEXCHARS[(i >> 36U) & 0xfU];
+	s[1] = HEXCHARS[(i >> 32U) & 0xfU];
+	s[2] = HEXCHARS[(i >> 28U) & 0xfU];
+	s[3] = HEXCHARS[(i >> 24U) & 0xfU];
+	s[4] = HEXCHARS[(i >> 20U) & 0xfU];
+	s[5] = HEXCHARS[(i >> 16U) & 0xfU];
+	s[6] = HEXCHARS[(i >> 12U) & 0xfU];
+	s[7] = HEXCHARS[(i >> 8U) & 0xfU];
+	s[8] = HEXCHARS[(i >> 4U) & 0xfU];
+	s[9] = HEXCHARS[i & 0xfU];
 	s[10] = (char)0;
 	return s;
 }
@@ -120,8 +120,8 @@ char *hex(const void *d,unsigned int l,char *s)
 	char *const save = s;
 	for(unsigned int i=0;i<l;++i) {
 		const unsigned int b = reinterpret_cast<const uint8_t *>(d)[i];
-		*(s++) = HEXCHARS[b >> 4];
-		*(s++) = HEXCHARS[b & 0xf];
+		*(s++) = HEXCHARS[b >> 4U];
+		*(s++) = HEXCHARS[b & 0xfU];
 	}
 	*s = (char)0;
 	return save;
@@ -166,7 +166,7 @@ void getSecureRandom(void *buf,unsigned int bytes)
 	static Mutex globalLock;
 	static bool initialized = false;
 	static uint64_t randomState[4];
-	static uint8_t randomBuf[16384];
+	static uint8_t randomBuf[65536];
 	static unsigned long randomPtr = sizeof(randomBuf);
 
 	Mutex::Lock gl(globalLock);
@@ -225,10 +225,7 @@ void getSecureRandom(void *buf,unsigned int bytes)
 #endif
 			}
 
-			for(unsigned int k=0;k<4;++k) { // treat random state like a 256-bit counter; endian-ness is irrelevant since we just want random
-				if (++randomState[k] != 0)
-					break;
-			}
+			for(int k=0;k<4;++k) { if (++randomState[k] != 0) break; }
 			uint8_t h[48];
 			HMACSHA384((const uint8_t *)randomState,randomBuf,sizeof(randomBuf),h); // compute HMAC on random buffer using state as secret key
 			AES c(h);
@@ -253,8 +250,8 @@ int b32e(const uint8_t *data,int length,char *result,int bufSize)
     while (count < bufSize && (bitsLeft > 0 || next < length)) {
       if (bitsLeft < 5) {
         if (next < length) {
-          buffer <<= 8;
-          buffer |= data[next++] & 0xFF;
+          buffer <<= 8U;
+          buffer |= data[next++] & 0xffU;
           bitsLeft += 8;
         } else {
           int pad = 5 - bitsLeft;
@@ -262,7 +259,7 @@ int b32e(const uint8_t *data,int length,char *result,int bufSize)
           bitsLeft += pad;
         }
       }
-      int index = 0x1F & (buffer >> (bitsLeft - 5));
+      int index = 0x1f & (buffer >> (unsigned int)(bitsLeft - 5));
       bitsLeft -= 5;
       result[count++] = "abcdefghijklmnopqrstuvwxyz234567"[index];
     }
@@ -296,7 +293,7 @@ int b32d(const char *encoded,uint8_t *result,int bufSize)
     }
 
     if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) {
-      ch = (ch & 0x1F) - 1;
+      ch = (ch & 0x1f) - 1;
     } else if (ch >= '2' && ch <= '7') {
       ch -= '2' - 26;
     } else {

+ 14 - 2
node/Utils.hpp

@@ -26,7 +26,6 @@
 #include <immintrin.h>
 #endif
 
-#include <string>
 #include <stdexcept>
 #include <vector>
 #include <map>
@@ -64,7 +63,12 @@ extern const char HEXCHARS[16];
 bool secureEq(const void *a,const void *b,unsigned int len);
 
 /**
- * Zero memory, ensuring to avoid any compiler optimizations or other things that may stop this.
+ * Be absolutely sure to zero memory
+ *
+ * This uses some hacks to be totally sure the compiler does not optimize it out.
+ *
+ * @param ptr Memory to zero
+ * @param len Length of memory in bytes
  */
 void burn(void *ptr,unsigned int len);
 
@@ -174,6 +178,14 @@ uint64_t random();
  */
 bool scopy(char *dest,unsigned int len,const char *src);
 
+/**
+ * Wrapper around reentrant strtok functions, which differ in name by platform
+ *
+ * @param str String to tokenize or NULL for subsequent calls
+ * @param delim Delimiter
+ * @param saveptr Pointer to pointer where function can save state
+ * @return Next token or NULL if none
+ */
 static ZT_ALWAYS_INLINE char *stok(char *str,const char *delim,char **saveptr)
 {
 #ifdef __WINDOWS__