Ver código fonte

Starting on AES ARM intrinsics work.

Adam Ierymenko 5 anos atrás
pai
commit
9a501a76d1
3 arquivos alterados com 73 adições e 0 exclusões
  1. 44 0
      core/AES.cpp
  2. 23 0
      core/AES.hpp
  3. 6 0
      core/OS.hpp

+ 44 - 0
core/AES.cpp

@@ -1257,4 +1257,48 @@ void AES::_decrypt_aesni(const void *in, void *out) const noexcept
 
 #endif // ZT_AES_AESNI
 
+#ifdef ZT_ARCH_ARM_HAS_NEON
+
+void AES::_encrypt_armneon_crypto(const void *const in, void *const out) const noexcept
+{
+	uint8x16_t tmp = vld1q_u8(reinterpret_cast<const uint8_t *>(in));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[0]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[1]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[2]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[3]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[4]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[5]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[6]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[7]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[8]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[9]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[10]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[11]));
+	tmp = vaesmcq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[12]));
+	tmp = veorq_u8(vaeseq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[13]), reinterpret_cast<const uint8x16_t *>(_k.sw.ek)[14]);
+	vst1q_u8(reinterpret_cast<uint8_t *>(out), tmp);
+}
+
+void AES::_decrypt_armneon_crypto(const void *const in, void *const out) const noexcept
+{
+	uint8x16_t tmp = vld1q_u8(reinterpret_cast<const uint8_t *>(in));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[0]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[1]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[2]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[3]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[4]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[5]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[6]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[7]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[8]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[9]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[10]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[11]));
+	tmp = vaesimcq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[12]));
+	tmp = veorq_u8(vaesdq_u8(tmp, reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[13]), reinterpret_cast<const uint8x16_t *>(_k.sw.dk)[14]);
+	vst1q_u8(reinterpret_cast<uint8_t *>(out), tmp);
+}
+
+#endif // ZT_ARCH_ARM_HAS_NEON
+
 } // namespace ZeroTier

+ 23 - 0
core/AES.hpp

@@ -91,6 +91,12 @@ public:
 			_encrypt_aesni(in, out);
 			return;
 		}
+#endif
+#ifdef ZT_ARCH_ARM_HAS_NEON
+		if (true) {
+			_encrypt_armneon_crypto(in, out);
+			return;
+		}
 #endif
 		_encryptSW(reinterpret_cast<const uint8_t *>(in), reinterpret_cast<uint8_t *>(out));
 	}
@@ -108,6 +114,12 @@ public:
 			_decrypt_aesni(in, out);
 			return;
 		}
+#endif
+#ifdef ZT_ARCH_ARM_HAS_NEON
+		if (true) {
+			_decrypt_armneon_crypto(in, out);
+			return;
+		}
 #endif
 		_decryptSW(reinterpret_cast<const uint8_t *>(in), reinterpret_cast<uint8_t *>(out));
 	}
@@ -506,7 +518,13 @@ private:
 		struct
 		{
 			uint64_t h[2];
+#if defined(ZT_ARCH_ARM_HAS_NEON) && !defined(_MSC_VER) && !defined(ZT_AES_NO_ACCEL)
+			__attribute__((aligned(16)))
+#endif
 			uint32_t ek[60];
+#if defined(ZT_ARCH_ARM_HAS_NEON) && !defined(_MSC_VER) && !defined(ZT_AES_NO_ACCEL)
+			__attribute__((aligned(16)))
+#endif
 			uint32_t dk[60];
 		} sw;
 	} _k;
@@ -516,6 +534,11 @@ private:
 	void _encrypt_aesni(const void *const in, void *const out) const noexcept;
 	void _decrypt_aesni(const void *in, void *out) const noexcept;
 #endif
+
+#ifdef ZT_ARCH_ARM_HAS_NEON
+	void _encrypt_armneon_crypto(const void *const in, void *const out) const noexcept;
+	void _decrypt_armneon_crypto(const void *const in, void *const out) const noexcept;
+#endif
 };
 
 } // namespace ZeroTier

+ 6 - 0
core/OS.hpp

@@ -115,6 +115,12 @@
 #include <mmintrin.h>
 #endif
 
+#if (defined(__ARM_NEON) || defined(__ARM_NEON__))
+#define ZT_ARCH_ARM_HAS_NEON 1
+#include <arm_neon.h>
+/*#include <arm_acle.h>*/
+#endif
+
 #if defined(ZT_ARCH_X64) || defined(i386) || defined(__i386) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_) || defined(__I86__) || defined(__INTEL__) || defined(__386)
 #define ZT_ARCH_X86 1
 #endif