  1. // Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
  2. // All rights reserved.
  3. // Code licensed under the BSD License.
  4. // http://www.anki3d.org/LICENSE
  5. #pragma once
  6. #include <AnKi/Util/StdTypes.h>
  7. #if ANKI_SIMD_SSE
  8. # include <smmintrin.h>
  9. #elif ANKI_SIMD_NEON
  10. # include <arm_neon.h>
  11. #elif !ANKI_SIMD_NONE
  12. # error "See file"
  13. #endif
  14. namespace anki {
  15. /// Template class that holds SIMD info for the math classes.
  16. template<typename T, U32 N>
  17. class MathSimd
  18. {
  19. public:
  20. using Type = T[N];
  21. static constexpr U32 kAlignment = alignof(T);
  22. };
#if ANKI_SIMD_SSE
// Specialize for F32: 4 packed floats live in a single XMM register.
template<>
class MathSimd<F32, 4>
{
public:
	// __m128 is the SSE 128-bit register type (4 x F32).
	using Type = __m128;

	// Aligned SSE load/store intrinsics require 16-byte alignment.
	static constexpr U32 kAlignment = 16;
};
#elif ANKI_SIMD_NEON
// Specialize for F32: 4 packed floats live in a single NEON Q register.
template<>
class MathSimd<F32, 4>
{
public:
	// float32x4_t is the NEON 128-bit register type (4 x F32).
	using Type = float32x4_t;

	// Same 16-byte alignment as the SSE path.
	static constexpr U32 kAlignment = 16;
};
#endif
// Shuffle NEON vectors. Code adapted from Jolt.
#if ANKI_SIMD_NEON
// Constructing NEON values.
// MSVC declares the NEON vector types as unions over two 64-bit halves, so the per-lane values have to be packed into
// two 64-bit integers by hand. GCC/Clang accept a plain per-lane brace initializer instead.
# if ANKI_COMPILER_MSVC
// Pack four signed 32-bit lanes into the two 64-bit halves MSVC expects.
// NOTE(review): I64(v) sign-extends a negative lane value into the upper 32 bits, which would corrupt the
// neighbouring lane; looks like callers only pass non-negative constants here -- confirm before relying on negatives.
# define ANKI_NEON_INT32x4(v1, v2, v3, v4) \
{ \
I64(v1) + (I64(v2) << 32), I64(v3) + (I64(v4) << 32) \
}
// Pack four unsigned 32-bit lanes into two 64-bit halves.
# define ANKI_NEON_UINT32x4(v1, v2, v3, v4) \
{ \
U64(v1) + (U64(v2) << 32), U64(v3) + (U64(v4) << 32) \
}
// Pack sixteen signed 8-bit lanes into two 64-bit halves (8 lanes per half).
# define ANKI_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) \
{ \
I64(v1) + (I64(v2) << 8) + (I64(v3) << 16) + (I64(v4) << 24) + (I64(v5) << 32) + (I64(v6) << 40) + (I64(v7) << 48) \
+ (I64(v8) << 56), \
I64(v9) + (I64(v10) << 8) + (I64(v11) << 16) + (I64(v12) << 24) + (I64(v13) << 32) + (I64(v14) << 40) + (I64(v15) << 48) \
+ (I64(v16) << 56) \
}
// Pack sixteen unsigned 8-bit lanes into two 64-bit halves (8 lanes per half).
# define ANKI_NEON_UINT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) \
{ \
U64(v1) + (U64(v2) << 8) + (U64(v3) << 16) + (U64(v4) << 24) + (U64(v5) << 32) + (U64(v6) << 40) + (U64(v7) << 48) \
+ (U64(v8) << 56), \
U64(v9) + (U64(v10) << 8) + (U64(v11) << 16) + (U64(v12) << 24) + (U64(v13) << 32) + (U64(v14) << 40) + (U64(v15) << 48) \
+ (U64(v16) << 56) \
}
# else
// GCC/Clang: the NEON vector types take a straightforward per-lane brace initializer.
# define ANKI_NEON_INT32x4(v1, v2, v3, v4) \
{ \
v1, v2, v3, v4 \
}
# define ANKI_NEON_UINT32x4(v1, v2, v3, v4) \
{ \
v1, v2, v3, v4 \
}
# define ANKI_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) \
{ \
v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 \
}
# define ANKI_NEON_UINT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) \
{ \
v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 \
}
# endif
  86. // MSVC and GCC prior to version 12 don't define __builtin_shufflevector
  87. # if ANKI_COMPILER_MSVC || (ANKI_COMPILER_GCC && __GNUC__ < 12)
  88. // Generic shuffle vector template
  89. template<unsigned I1, unsigned I2, unsigned I3, unsigned I4>
  90. inline float32x4_t neonSuffleFloat32x4(float32x4_t inV1, float32x4_t inV2)
  91. {
  92. float32x4_t ret;
  93. ret = vmovq_n_f32(vgetq_lane_f32(I1 >= 4 ? inV2 : inV1, I1 & 0b11));
  94. ret = vsetq_lane_f32(vgetq_lane_f32(I2 >= 4 ? inV2 : inV1, I2 & 0b11), ret, 1);
  95. ret = vsetq_lane_f32(vgetq_lane_f32(I3 >= 4 ? inV2 : inV1, I3 & 0b11), ret, 2);
  96. ret = vsetq_lane_f32(vgetq_lane_f32(I4 >= 4 ? inV2 : inV1, I4 & 0b11), ret, 3);
  97. return ret;
  98. }
// Specializations: common shuffle patterns mapped to cheaper dedicated NEON instructions.
// <0, 1, 2, 2>: keep the low half, broadcast lane 2 into the high half.
template<>
inline float32x4_t neonSuffleFloat32x4<0, 1, 2, 2>(float32x4_t inV1, [[maybe_unused]] float32x4_t inV2)
{
	return vcombine_f32(vget_low_f32(inV1), vdup_lane_f32(vget_high_f32(inV1), 0));
}
// <0, 1, 3, 3>: keep the low half, broadcast lane 3 into the high half.
template<>
inline float32x4_t neonSuffleFloat32x4<0, 1, 3, 3>(float32x4_t inV1, [[maybe_unused]] float32x4_t inV2)
{
	return vcombine_f32(vget_low_f32(inV1), vdup_lane_f32(vget_high_f32(inV1), 1));
}
// <0, 1, 2, 3>: identity, no work needed.
template<>
inline float32x4_t neonSuffleFloat32x4<0, 1, 2, 3>(float32x4_t inV1, [[maybe_unused]] float32x4_t inV2)
{
	return inV1;
}
// <1, 0, 3, 2>: swap the lanes inside each 64-bit half.
template<>
inline float32x4_t neonSuffleFloat32x4<1, 0, 3, 2>(float32x4_t inV1, [[maybe_unused]] float32x4_t inV2)
{
	return vcombine_f32(vrev64_f32(vget_low_f32(inV1)), vrev64_f32(vget_high_f32(inV1)));
}
// <2, 2, 1, 0>: broadcast lane 2 into the low half, reversed low half into the high half.
template<>
inline float32x4_t neonSuffleFloat32x4<2, 2, 1, 0>(float32x4_t inV1, [[maybe_unused]] float32x4_t inV2)
{
	return vcombine_f32(vdup_lane_f32(vget_high_f32(inV1), 0), vrev64_f32(vget_low_f32(inV1)));
}
// <2, 3, 0, 1>: swap the two 64-bit halves.
template<>
inline float32x4_t neonSuffleFloat32x4<2, 3, 0, 1>(float32x4_t inV1, [[maybe_unused]] float32x4_t inV2)
{
	return vcombine_f32(vget_high_f32(inV1), vget_low_f32(inV1));
}
  130. // Used extensively by cross product
  131. template<>
  132. inline float32x4_t neonSuffleFloat32x4<1, 2, 0, 0>(float32x4_t inV1, [[maybe_unused]] float32x4_t inV2)
  133. {
  134. static uint8x16_t table = ANKI_NEON_UINT8x16(0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03);
  135. return vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(inV1), table));
  136. }
// Map the public shuffle macros onto the template fallback above.
# define ANKI_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) neonSuffleFloat32x4<index1, index2, index3, index4>(vec1, vec2)
// U32 vectors are shuffled by reinterpreting to F32, shuffling and reinterpreting back. Only lane moves are involved,
// so the bit patterns survive the round trip.
# define ANKI_NEON_SHUFFLE_U32x4(vec1, vec2, index1, index2, index3, index4) \
vreinterpretq_u32_f32((neonSuffleFloat32x4<index1, index2, index3, index4>(vreinterpretq_f32_u32(vec1), vreinterpretq_f32_u32(vec2))))
# else
// Clang and GCC >= 12: use the compiler builtin directly.
# define ANKI_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) \
__builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
# define ANKI_NEON_SHUFFLE_U32x4(vec1, vec2, index1, index2, index3, index4) \
__builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
# endif
  146. #endif // ANKI_SIMD_NEON
  147. } // end namespace anki