Browse Source

Some perf stuff, docs, HELLO design tweaks for ephemeral keys.

Adam Ierymenko 5 years ago
parent
commit
369df245e3
3 changed files with 147 additions and 243 deletions
  1. BIN
      doc/2015-GCM-SIV.pdf
  2. 81 41
      node/Protocol.hpp
  3. 66 202
      node/Utils.hpp

BIN
doc/2015-GCM-SIV.pdf


+ 81 - 41
node/Protocol.hpp

@@ -252,17 +252,17 @@
 /**
  * HELLO exchange meta-data: ephemeral C25519 public key
  */
-#define ZT_PROTO_HELLO_NODE_META_EPHEMERAL_KEY_C25519 "e0"
+#define ZT_PROTO_HELLO_NODE_META_EPHEMERAL_C25519 "e0"
 
 /**
  * HELLO exchange meta-data: ephemeral NIST P-384 public key
  */
-#define ZT_PROTO_HELLO_NODE_META_EPHEMERAL_KEY_P384 "e1"
+#define ZT_PROTO_HELLO_NODE_META_EPHEMERAL_P384 "e1"
 
 /**
  * HELLO exchange meta-data: address(es) of nodes to whom this node will relay
  */
-#define ZT_PROTO_HELLO_NODE_META_WILL_RELAY_TO "wr"
+#define ZT_PROTO_HELLO_NODE_META_NEIGHBORS "wr"
 
 /**
  * HELLO exchange meta-data: X coordinate of your node (sent in OK(HELLO))
@@ -304,53 +304,93 @@ enum Verb
 	/**
 	 * Announcement of a node's existence and vitals:
 	 *   <[1] protocol version>
-	 *   <[1] software major version>
-	 *   <[1] software minor version>
-	 *   <[2] software revision>
-	 *   <[8] timestamp for determining latency>
+	 *   <[1] software major version (LEGACY)>
+	 *   <[1] software minor version (LEGACY)>
+	 *   <[2] software revision (LEGACY)>
+	 *   <[8] timestamp for determining latency (LEGACY)>
 	 *   <[...] binary serialized identity>
-	 *   <[...] physical destination address of packet>
-	 *   [... begin encrypted region ...]
-	 *   <[2] 16-bit reserved (legacy) field, always 0>
-	 *   <[2] 16-bit length of meta-data dictionary>
-	 *   <[...] meta-data dictionary>
-	 *   <[2] 16-bit length of any additional fields>
-	 *   [... end encrypted region ...]
-	 *   <[48] HMAC-SHA384 of packet (with hops field masked to 0)>
+	 *   <[...] physical destination address of packet (LEGACY)>
+	 *   <[2] 16-bit reserved "encrypted zero" field (LEGACY)>
+	 *   <[...] encrypted dictionary>
+	 *   <[2] 16-bit length of preceding encrypted dictionary>
+	 *   <[48] HMAC-SHA384 of plaintext packet (with hops masked to 0)>
+	 *
+	 * HELLO is sent to initiate a new pairing between two nodes.
+	 *
+	 * HELLO is the only packet ever sent without normal payload encryption,
+	 * though an inner encrypted envelope exists to obscure all fields that
+	 * do not need to be sent in the clear. HELLO's MAC field contains a
+	 * Poly1305 MAC for backward compatibility, and v2.x adds an additional
+	 * HMAC-SHA384 at the end for stronger authentication of sessions. HELLO
+	 * authentication is performed using the long-lived identity key only,
+	 * and the encryption of the inner dictionary field is done using a key
+	 * derived from this identity key explicitly for this purpose.
+	 *
+	 * The main payload of HELLO is the protocol version and the full identity
+	 * of the sender, which includes the sender's public key(s). An encrypted
+	 * dictionary (key/value store) is also included for additional information.
+	 * This is encrypted using AES-CTR with a derived key and using the final
+	 * 96 bits of the packet's HMAC-SHA384 as the CTR IV. (The HMAC authenticates
+	 * the packet prior to this field being encrypted, making this a SIV
+	 * construction much like AES-GMAC-SIV.)
+	 *
+	 * The length of the dictionary field is included immediately after it so
+	 * that it can be decrypted and the HMAC validated without performing any
+	 * parsing of anything else, since it's a good idea to authenticate any
+	 * message as early as possible in any secure protocol.
+	 *
+	 * V1.x will ignore the HMAC and dictionary fields as it doesn't understand
+	 * them, but the packet is constructed so that 1.x nodes will parse what
+	 * they need to communicate with 2.x nodes (without forward secrecy) as long
+	 * as we wish to support this.
+	 *
+	 * Several legacy fields are present as well for the benefit of 1.x nodes.
+	 * These will go away and become simple reserved space once 1.x is no longer
+	 * supported. Some are self-explanatory. The "encrypted zero" is rather
+	 * strange. It's a 16-bit zero value encrypted using Salsa20/12 and the
+	 * long-lived identity key shared by the two peers. It tells 1.x that an
+	 * old encrypted field is no longer there and that it should stop parsing
+	 * the packet at that point.
+	 *
+	 * The following fields are nearly always present and must exist to support
+	 * forward secrecy (in the case of the instance ID, keys, and key revision)
+	 * or federated root membership (in the case of the locator).
 	 *
-	 * HELLO is sent using the POLY1305_NONE cipher setting (MAC but
-	 * no encryption) and as of protocol version 11 contains an extra
-	 * HMAC-SHA384 MAC for additional authentication hardening.
+	 *   TIMESTAMP - node's timestamp in milliseconds (supersedes legacy field)
+	 *   INSTANCE_ID - a 64-bit unique value generated on each node start
+	 *   EPHEMERAL_C25519 - an ephemeral Curve25519 public key
+	 *   EPHEMERAL_P384 - an ephemeral NIST P-384 public key
+	 *   EPHEMERAL_REVISION - 64-bit monotonically increasing per-instance counter
+	 *   LOCATOR - signed record enumerating this node's trusted contact points
 	 *
-	 * The physical desgination address is the raw InetAddress to which the
-	 * packet was sent, regardless of any relaying used.
+	 * The following optional fields may also be present:
 	 *
-	 * HELLO packets have an encrypted section that is encrypted with
-	 * Salsa20/12 using the two peers' long-term negotiated keys and with
-	 * the packet ID (with least significant 3 bits masked to 0 for legacy
-	 * reasons) as the Salsa20/12 IV. This encryption is technically not
-	 * necessary but serves to protect the privacy of locators and other
-	 * fields for a little added defense in depth. Note to auditors: for FIPS
-	 * or other auditing purposes this crypto can be ignored as its
-	 * compromise poses no risk to peer or network authentication or transport
-	 * data privacy. HMAC is computed after this encryption is performed and
-	 * is verified before decryption is performed.
+	 *   NAME - arbitrary short user-defined name for this node
+	 *   CONTACT - arbitrary short contact information string for this node
+	 *   NEIGHBORS - addresses of node(s) to whom we'll relay (mesh-like routing)
+	 *   LOC_X, LOC_Y, LOC_Z - location relative to the nearest large center of mass
+	 *   PEER_LOC_X, PEER_LOC_Y, PEER_LOC_Z - where sender thinks peer is located
+	 *   SOFTWARE_VENDOR - short name or description of vendor, such as a URL
+	 *   SOFTWARE_VERSION - major, minor, revision, and build as 16-bit integers
+	 *   PHYSICAL_DEST - serialized Endpoint to which this message was sent
+	 *   VIRTUAL_DEST - ZeroTier address of first hop (if first hop wasn't destination)
+	 *   COMPLIANCE - bit mask containing bits for e.g. a FIPS-compliant node
 	 *
 	 * A valid and successfully authenticated HELLO will generate the following
-	 * OK response which contains much of the same information about the
-	 * responding peer.
+	 * OK response. It contains an echo of the timestamp supplied by the
+	 * initiating peer, the protocol version, and a dictionary containing
+	 * the same information about the responding peer as the originating peer
+	 * sent.
 	 *
 	 * OK payload:
-	 *   <[8] timestamp echoed from original HELLO packet>
+	 *   <[8] timestamp echoed from original HELLO>
 	 *   <[1] protocol version>
-	 *   <[1] software major version>
-	 *   <[1] software minor version>
-	 *   <[2] software revision>
-	 *   <[...] physical destination address of packet>
-	 *   <[2] 16-bit reserved (legacy) field, currently must be 0>
-	 *   <[2] 16-bit length of meta-data dictionary>
-	 *   <[...] meta-data dictionary>
-	 *   <[2] 16-bit length of any additional fields>
+	 *   <[1] software major version (LEGACY)>
+	 *   <[1] software minor version (LEGACY)>
+	 *   <[2] software revision (LEGACY)>
+	 *   <[...] physical destination address of packet (LEGACY)>
+	 *   <[2] 16-bit reserved zero field (LEGACY)>
+	 *   <[...] dictionary>
 	 *   <[48] HMAC-SHA384 of plaintext packet (with hops masked to 0)>
 	 */
 	VERB_HELLO = 0x01,

+ 66 - 202
node/Utils.hpp

@@ -583,99 +583,18 @@ static ZT_INLINE void storeLittleEndian(void *const p,const I i) noexcept
 #endif
 }
 
-template<unsigned int L>
-static ZT_INLINE void copy(void *dest,const void *src) noexcept;
-template<>
-ZT_INLINE void copy<64>(void *const dest,const void *const src) noexcept
-{
-#ifdef ZT_ARCH_X64
-	__m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
-	__m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 1);
-	__m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 2);
-	__m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 3);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest),a);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1,b);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2,c);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 3,d);
-#else
-	uint64_t a = reinterpret_cast<const uint64_t *>(src)[0];
-	uint64_t b = reinterpret_cast<const uint64_t *>(src)[1];
-	uint64_t c = reinterpret_cast<const uint64_t *>(src)[2];
-	uint64_t d = reinterpret_cast<const uint64_t *>(src)[3];
-	uint64_t e = reinterpret_cast<const uint64_t *>(src)[4];
-	uint64_t f = reinterpret_cast<const uint64_t *>(src)[5];
-	uint64_t g = reinterpret_cast<const uint64_t *>(src)[6];
-	uint64_t h = reinterpret_cast<const uint64_t *>(src)[7];
-	reinterpret_cast<uint64_t *>(dest)[0] = a;
-	reinterpret_cast<uint64_t *>(dest)[1] = b;
-	reinterpret_cast<uint64_t *>(dest)[2] = c;
-	reinterpret_cast<uint64_t *>(dest)[3] = d;
-	reinterpret_cast<uint64_t *>(dest)[4] = e;
-	reinterpret_cast<uint64_t *>(dest)[5] = f;
-	reinterpret_cast<uint64_t *>(dest)[6] = g;
-	reinterpret_cast<uint64_t *>(dest)[7] = h;
-#endif
-}
-template<>
-ZT_INLINE void copy<32>(void *const dest,const void *const src) noexcept
-{
-#ifdef ZT_ARCH_X64
-	__m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
-	__m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 1);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest),a);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1,b);
-#else
-	uint64_t a = reinterpret_cast<const uint64_t *>(src)[0];
-	uint64_t b = reinterpret_cast<const uint64_t *>(src)[1];
-	uint64_t c = reinterpret_cast<const uint64_t *>(src)[2];
-	uint64_t d = reinterpret_cast<const uint64_t *>(src)[3];
-	reinterpret_cast<uint64_t *>(dest)[0] = a;
-	reinterpret_cast<uint64_t *>(dest)[1] = b;
-	reinterpret_cast<uint64_t *>(dest)[2] = c;
-	reinterpret_cast<uint64_t *>(dest)[3] = d;
-#endif
-}
-template<>
-ZT_INLINE void copy<16>(void *const dest,const void *const src) noexcept
-{
-#ifdef ZT_ARCH_X64
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest),_mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
-#else
-	uint64_t a = reinterpret_cast<const uint64_t *>(src)[0];
-	uint64_t b = reinterpret_cast<const uint64_t *>(src)[1];
-	reinterpret_cast<uint64_t *>(dest)[0] = a;
-	reinterpret_cast<uint64_t *>(dest)[1] = b;
-#endif
-}
-template<>
-ZT_INLINE void copy<8>(void *const dest,const void *const src) noexcept
-{
-	*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
-}
-template<>
-ZT_INLINE void copy<4>(void *const dest,const void *const src) noexcept
-{
-	*reinterpret_cast<uint32_t *>(dest) = *reinterpret_cast<const uint32_t *>(src);
-}
-template<>
-ZT_INLINE void copy<2>(void *const dest,const void *const src) noexcept
-{
-	*reinterpret_cast<uint16_t *>(dest) = *reinterpret_cast<const uint16_t *>(src);
-}
-template<>
-ZT_INLINE void copy<1>(void *const dest,const void *const src) noexcept
-{
-	*reinterpret_cast<uint8_t *>(dest) = *reinterpret_cast<const uint8_t *>(src);
-}
-template<>
-ZT_INLINE void copy<0>(void *const dest,const void *const src) noexcept
-{
-}
+/**
+ * Copy memory block whose size is known at compile time
+ *
+ * @tparam L Size of memory in bytes
+ * @param dest Destination memory
+ * @param src Source memory
+ */
 template<unsigned int L>
 static ZT_INLINE void copy(void *const dest,const void *const src) noexcept
 {
 #ifdef ZT_NO_UNALIGNED_ACCESS
-	if ((((uintptr_t)dest | (uintptr_t)src) & 7U) != 0) {
+	if ((((uintptr_t)dest | (uintptr_t)src) & (sizeof(uintptr_t) - 1)) != 0) {
 		memcpy(dest,src,L);
 		return;
 	}
@@ -684,154 +603,99 @@ static ZT_INLINE void copy(void *const dest,const void *const src) noexcept
 	uint8_t *d = reinterpret_cast<uint8_t *>(dest);
 	const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
 
+#ifdef ZT_ARCH_X64
 	for(unsigned int i=0;i<(L / 64U);++i) {
-		copy<64>(d,s);
+		__m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
+		__m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s) + 1);
+		__m128i x2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s) + 2);
+		__m128i x3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s) + 3);
+		_mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0);
+		_mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 1,x1);
+		_mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 2,x2);
+		_mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 3,x3);
 		d += 64;
 		s += 64;
 	}
 	if ((L & 63U) >= 32U) {
-		copy<32>(d,s);
+		__m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
+		__m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s) + 1);
+		_mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0);
+		_mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 1,x1);
 		d += 32;
 		s += 32;
 	}
 	if ((L & 31U) >= 16U) {
-		copy<16>(d,s);
+		__m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
+		_mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0);
 		d += 16;
 		s += 16;
 	}
 	if ((L & 15U) >= 8U) {
-		copy<8>(d,s);
+		*reinterpret_cast<uint64_t *>(d) = *reinterpret_cast<const uint64_t *>(s);
 		d += 8;
 		s += 8;
 	}
 	if ((L & 7U) >= 4U) {
-		copy<4>(d,s);
+		*reinterpret_cast<uint32_t *>(d) = *reinterpret_cast<const uint32_t *>(s);
 		d += 4;
 		s += 4;
 	}
 	if ((L & 3U) >= 2U) {
-		copy<2>(d,s);
+		*reinterpret_cast<uint16_t *>(d) = *reinterpret_cast<const uint16_t *>(s);
 		d += 2;
 		s += 2;
 	}
 	if ((L & 1U) != 0U) {
-		copy<1>(d,s);
+		*d = *s;
 	}
-}
-static ZT_INLINE void copy(void *const dest,const void *const src,const unsigned int len) noexcept
-{
-	memcpy(dest,src,len);
-}
-
-template<unsigned int L>
-static ZT_INLINE void zero(void *dest) noexcept;
-template<>
-ZT_INLINE void zero<64>(void *const dest) noexcept
-{
-#ifdef ZT_ARCH_X64
-	const __m128i z = _mm_setzero_si128();
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest),z);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1,z);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2,z);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 3,z);
 #else
-	const uint64_t z = 0;
-	reinterpret_cast<uint64_t *>(dest)[0] = z;
-	reinterpret_cast<uint64_t *>(dest)[1] = z;
-	reinterpret_cast<uint64_t *>(dest)[2] = z;
-	reinterpret_cast<uint64_t *>(dest)[3] = z;
-	reinterpret_cast<uint64_t *>(dest)[4] = z;
-	reinterpret_cast<uint64_t *>(dest)[5] = z;
-	reinterpret_cast<uint64_t *>(dest)[6] = z;
-	reinterpret_cast<uint64_t *>(dest)[7] = z;
-#endif
-}
-template<>
-ZT_INLINE void zero<32>(void *const dest) noexcept
-{
-#ifdef ZT_ARCH_X64
-	const __m128i z = _mm_setzero_si128();
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest),z);
-	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1,z);
-#else
-	const uint64_t z = 0;
-	reinterpret_cast<uint64_t *>(dest)[0] = z;
-	reinterpret_cast<uint64_t *>(dest)[1] = z;
-	reinterpret_cast<uint64_t *>(dest)[2] = z;
-	reinterpret_cast<uint64_t *>(dest)[3] = z;
+	for(unsigned int i=0;i<(L / (sizeof(uintptr_t) * 4));++i) {
+		uintptr_t x0 = reinterpret_cast<const uintptr_t *>(s)[0];
+		uintptr_t x1 = reinterpret_cast<const uintptr_t *>(s)[1];
+		uintptr_t x2 = reinterpret_cast<const uintptr_t *>(s)[2];
+		uintptr_t x3 = reinterpret_cast<const uintptr_t *>(s)[3];
+		reinterpret_cast<uintptr_t *>(d)[0] = x0;
+		reinterpret_cast<uintptr_t *>(d)[1] = x1;
+		reinterpret_cast<uintptr_t *>(d)[2] = x2;
+		reinterpret_cast<uintptr_t *>(d)[3] = x3;
+		s += (sizeof(uintptr_t) * 4);
+		d += (sizeof(uintptr_t) * 4);
+	}
+	for(unsigned int i=0;i<(L & ((sizeof(uintptr_t) * 4) - 1));++i)
+		d[i] = s[i];
 #endif
 }
-template<>
-ZT_INLINE void zero<16>(void *const dest) noexcept
-{
-	const uint64_t z = 0;
-	reinterpret_cast<uint64_t *>(dest)[0] = z;
-	reinterpret_cast<uint64_t *>(dest)[1] = z;
-}
-template<>
-ZT_INLINE void zero<8>(void *const dest) noexcept
-{
-	*reinterpret_cast<uint64_t *>(dest) = 0;
-}
-template<>
-ZT_INLINE void zero<4>(void *const dest) noexcept
-{
-	*reinterpret_cast<uint32_t *>(dest) = 0;
-}
-template<>
-ZT_INLINE void zero<2>(void *const dest) noexcept
-{
-	*reinterpret_cast<uint16_t *>(dest) = 0;
-}
-template<>
-ZT_INLINE void zero<1>(void *const dest) noexcept
-{
-	*reinterpret_cast<uint8_t *>(dest) = 0;
-}
-template<>
-ZT_INLINE void zero<0>(void *const dest) noexcept
+
+/**
+ * Copy memory block whose size is known at run time
+ *
+ * @param dest Destination memory
+ * @param src Source memory
+ * @param len Bytes to copy
+ */
+static ZT_INLINE void copy(void *const dest,const void *const src,unsigned int len) noexcept
 {
+	memcpy(dest,src,len);
 }
+
+/**
+ * Zero memory block whose size is known at compile time
+ *
+ * @tparam L Size in bytes
+ * @param dest Memory to zero
+ */
 template<unsigned int L>
 static ZT_INLINE void zero(void *const dest) noexcept
 {
-#ifdef ZT_NO_UNALIGNED_ACCESS
-	if ((((uintptr_t)dest | (uintptr_t)src) & 7U) != 0) {
-		memset(dest,0,L);
-		return;
-	}
-#endif
-
-	uint8_t *d = reinterpret_cast<uint8_t *>(dest);
-
-	for(unsigned int i=0;i<(L / 64U);++i) {
-		zero<64>(d);
-		d += 64;
-	}
-	if ((L & 63U) >= 32U) {
-		zero<32>(d);
-		d += 32;
-	}
-	if ((L & 31U) >= 16U) {
-		zero<16>(d);
-		d += 16;
-	}
-	if ((L & 15U) >= 8U) {
-		zero<8>(d);
-		d += 8;
-	}
-	if ((L & 7U) >= 4U) {
-		zero<4>(d);
-		d += 4;
-	}
-	if ((L & 3U) >= 2U) {
-		zero<2>(d);
-		d += 2;
-	}
-	if ((L & 1U) != 0U) {
-		zero<1>(d);
-	}
+	memset(dest,0,L);
 }
+
+/**
+ * Zero memory block whose size is known at run time
+ *
+ * @param dest Memory to zero
+ * @param len Size in bytes
+ */
 static ZT_INLINE void zero(void *const dest,const unsigned int len) noexcept
 {
 	memset(dest,0,len);