
Some cleanup and optimization.

Adam Ierymenko committed 5 years ago
commit bd0299f392
3 changed files with 60 additions and 9 deletions:

  1. core/OS.hpp (+1, -0)
  2. core/SHA512.cpp (+3, -3)
  3. core/Utils.hpp (+56, -6)

core/OS.hpp (+1, -0)

@@ -39,6 +39,7 @@
 #include <WinSock2.h>
 #include <ws2tcpip.h>
 #include <Windows.h>
+#include <memoryapi.h>
 #include <shlwapi.h>
 #include <Shlobj.h>
 #include <sys/param.h>

core/SHA512.cpp (+3, -3)

@@ -122,7 +122,7 @@ static void sha512_process(sha512_state *const md,const uint8_t *in,unsigned lon
 			inlen          -= 128;
 		} else {
 			unsigned long n = std::min(inlen,(128 - md->curlen));
-			memcpy(md->buf + md->curlen,in,n);
+			Utils::copy(md->buf + md->curlen,in,n);
 			md->curlen += n;
 			in             += n;
 			inlen          -= n;
@@ -179,7 +179,7 @@ void SHA384(void *digest,const void *data,unsigned int len)
 	sha384_init(&state);
 	sha512_process(&state,(uint8_t *)data,(unsigned long)len);
 	sha512_done(&state,tmp);
-	memcpy(digest,tmp,48);
+	Utils::copy<48>(digest,tmp);
 }
 
 void SHA384(void *digest,const void *data0,unsigned int len0,const void *data1,unsigned int len1)
@@ -190,7 +190,7 @@ void SHA384(void *digest,const void *data0,unsigned int len0,const void *data1,u
 	sha512_process(&state,(uint8_t *)data0,(unsigned long)len0);
 	sha512_process(&state,(uint8_t *)data1,(unsigned long)len1);
 	sha512_done(&state,tmp);
-	memcpy(digest,tmp,48);
+	Utils::copy<48>(digest,tmp);
 }
 
 #endif // !ZT_HAVE_NATIVE_SHA512
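
As an aside (not part of the commit), a minimal check that the fixed-size Utils::copy<48>() now used here produces the same bytes as the memcpy() call it replaces; the include path and the ZeroTier namespace are assumed from the repository layout:

    #include <cassert>
    #include <cstring>

    #include "core/Utils.hpp"  // assumed include path for the ZeroTier core headers

    int main()
    {
        unsigned char tmp[64], a[48], b[48];
        for (unsigned int i = 0; i < sizeof(tmp); ++i)
            tmp[i] = (unsigned char)i;

        memcpy(a, tmp, 48);                  // previous behavior
        ZeroTier::Utils::copy<48>(b, tmp);   // fixed-size copy introduced by this commit
        assert(memcmp(a, b, 48) == 0);       // both must yield identical digest bytes
        return 0;
    }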

core/Utils.hpp (+56, -6)

@@ -55,7 +55,6 @@ namespace Utils {
 #define ZT_ROL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
 
 #ifdef ZT_ARCH_X64
-
 struct CPUIDRegisters
 {
 	CPUIDRegisters() noexcept;
@@ -70,7 +69,6 @@ struct CPUIDRegisters
 	bool sha;
 	bool fsrm;
 };
-
 extern const CPUIDRegisters CPUID;
 #endif
 
@@ -104,7 +102,9 @@ extern const uint64_t s_mapNonce;
  */
 static ZT_INLINE void memoryLock(const void *const p, const unsigned int l) noexcept
 {
-#ifndef __WINDOWS__
+#ifdef __WINDOWS__
+	VirtualLock(p, l);
+#else
 	mlock(p, l);
 #endif
 }
@@ -117,7 +117,9 @@ static ZT_INLINE void memoryLock(const void *const p, const unsigned int l) noex
  */
 static ZT_INLINE void memoryUnlock(const void *const p, const unsigned int l) noexcept
 {
-#ifndef __WINDOWS__
+#ifdef __WINDOWS__
+	VirtualUnlock(p, l);
+#else
 	munlock(p, l);
 #endif
 }
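
For illustration (not part of the commit), a standalone sketch of the platform split added above, combining the new <memoryapi.h> include from core/OS.hpp with the VirtualLock/VirtualUnlock calls; the function names here are hypothetical:

    #include <cstddef>
    #ifdef _WIN32
    #include <Windows.h>
    #include <memoryapi.h>   // VirtualLock / VirtualUnlock, as added to core/OS.hpp above
    #else
    #include <sys/mman.h>    // mlock / munlock
    #endif

    // Best-effort pinning of a buffer (e.g. key material) into physical RAM so it
    // is not swapped to disk; return values are ignored, as in the Utils.hpp versions.
    static inline void lockSensitive(void *p, std::size_t l) noexcept
    {
    #ifdef _WIN32
        VirtualLock(p, l);
    #else
        mlock(p, l);
    #endif
    }

    static inline void unlockSensitive(void *p, std::size_t l) noexcept
    {
    #ifdef _WIN32
        VirtualUnlock(p, l);
    #else
        munlock(p, l);
    #endif
    }
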
@@ -695,6 +697,23 @@ static ZT_INLINE void storeLittleEndian(void *const p, const I i) noexcept
 #endif
 }
 
+/*
+ * Note on copy() and zero():
+ *
+ * On X64, rep/movsb and rep/stosb are almost always faster for small memory
+ * regions on all but the oldest microarchitectures (and even there the
+ * difference is not large). While more aggressive memcpy() implementations
+ * may be faster in micro-benchmarks, these fail to account for real world
+ * context such as instruction cache and pipeline pressure. A simple
+ * instruction like rep/movsb takes up only a few spots in caches and pipelines
+ * and requires no branching or function calls. Specialized memcpy() can still
+ * be faster for large memory regions, but ZeroTier doesn't copy anything
+ * much larger than 16KiB.
+ *
+ * A templated version for statically known sizes is provided since this can
+ * allow some nice optimizations in some cases.
+ */
+
 /**
  * Copy memory block whose size is known at compile time.
  *
@@ -706,13 +725,44 @@ template< unsigned long L >
 static ZT_INLINE void copy(void *dest, const void *src) noexcept
 {
 #if defined(ZT_ARCH_X64) && defined(__GNUC__)
-	unsigned long l = L;
+	uintptr_t l = L;
 	asm volatile ("cld ; rep movsb" : "+c"(l), "+S"(src), "+D"(dest));
 #else
 	memcpy(dest, src, L);
 #endif
 }
 
+// Avoid rep/movsb startup time for some small common sizes.
+template<>
+ZT_INLINE void copy<4>(void *dest, const void *src) noexcept
+{
+	*reinterpret_cast<uint32_t *>(dest) = *reinterpret_cast<const uint32_t *>(src);
+}
+template<>
+ZT_INLINE void copy<8>(void *dest, const void *src) noexcept
+{
+	*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
+}
+template<>
+ZT_INLINE void copy<12>(void *dest, const void *src) noexcept
+{
+	*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
+	*reinterpret_cast<uint32_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint32_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
+}
+template<>
+ZT_INLINE void copy<16>(void *dest, const void *src) noexcept
+{
+	*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
+	*reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
+}
+template<>
+ZT_INLINE void copy<24>(void *dest, const void *src) noexcept
+{
+	*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
+	*reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
+	*reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 16) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 16);
+}
+
 /**
  * Copy memory block whose size is known at run time
  *
@@ -739,7 +789,7 @@ template< unsigned long L >
 static ZT_INLINE void zero(void *dest) noexcept
 {
 #if defined(ZT_ARCH_X64) && defined(__GNUC__)
-	unsigned long l = L;
+	uintptr_t l = L;
 	asm volatile ("cld ; rep stosb" :"+c" (l), "+D" (dest) : "a" (0));
 #else
 	memset(dest, 0, L);
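
To make the rep/movsb note above concrete (illustration only, not part of the commit), here is a self-contained sketch of the same approach with the ZeroTier macros replaced by plain equivalents; the names are invented, and a "memory" clobber is included here for strictness:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Fixed-size copy via rep/movsb on x86-64 with GCC/Clang, plain memcpy elsewhere.
    // Mirrors the shape of Utils::copy<L>() added in this commit.
    template <unsigned long L>
    static inline void copy_fixed(void *dest, const void *src) noexcept
    {
    #if defined(__x86_64__) && defined(__GNUC__)
        std::uintptr_t l = L;
        // rep movsb copies RCX bytes from [RSI] to [RDI]; cld forces forward direction.
        asm volatile ("cld ; rep movsb" : "+c"(l), "+S"(src), "+D"(dest) : : "memory");
    #else
        std::memcpy(dest, src, L);
    #endif
    }

    // Fixed-size zeroing via rep/stosb, mirroring Utils::zero<L>().
    template <unsigned long L>
    static inline void zero_fixed(void *dest) noexcept
    {
    #if defined(__x86_64__) && defined(__GNUC__)
        std::uintptr_t l = L;
        // rep stosb stores AL into RCX bytes starting at [RDI].
        asm volatile ("cld ; rep stosb" : "+c"(l), "+D"(dest) : "a"(0) : "memory");
    #else
        std::memset(dest, 0, L);
    #endif
    }

    int main()
    {
        char src[48] = "forty-eight byte block, e.g. a SHA-384 digest";
        char dst[48];
        copy_fixed<48>(dst, src);
        std::printf("%s\n", dst);   // prints the copied string
        zero_fixed<48>(dst);        // dst is now all zero bytes
        return 0;
    }

The commit also adds direct-load specializations of copy<L>() for 4, 8, 12, 16, and 24 bytes (shown in the diff above) so those common sizes skip the rep/movsb startup cost entirely.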