ARMNeon.h 3.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. // Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
  2. // SPDX-FileCopyrightText: 2022 Jorrit Rouwe
  3. // SPDX-License-Identifier: MIT
  4. #pragma once
  5. #ifdef JPH_USE_NEON
  6. #ifdef JPH_COMPILER_MSVC
  7. JPH_NAMESPACE_BEGIN
  8. // Constructing NEON values
  9. #define JPH_NEON_INT32x4(v1, v2, v3, v4) { int64_t(v1) + (int64_t(v2) << 32), int64_t(v3) + (int64_t(v4) << 32) }
  10. #define JPH_NEON_UINT32x4(v1, v2, v3, v4) { uint64_t(v1) + (uint64_t(v2) << 32), uint64_t(v3) + (uint64_t(v4) << 32) }
  11. #define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { int64_t(v1) + (int64_t(v2) << 8) + (int64_t(v3) << 16) + (int64_t(v4) << 24) + (int64_t(v5) << 32) + (int64_t(v6) << 40) + (int64_t(v7) << 48) + (int64_t(v8) << 56), int64_t(v9) + (int64_t(v10) << 8) + (int64_t(v11) << 16) + (int64_t(v12) << 24) + (int64_t(v13) << 32) + (int64_t(v14) << 40) + (int64_t(v15) << 48) + (int64_t(v16) << 56) }
  12. // Generic shuffle vector template
  13. template <unsigned I1, unsigned I2, unsigned I3, unsigned I4>
  14. JPH_INLINE float32x4_t NeonShuffleFloat32x4(float32x4_t inV1, float32x4_t inV2)
  15. {
  16. float32x4_t ret;
  17. ret = vmovq_n_f32(vgetq_lane_f32(I1 >= 4? inV2 : inV1, I1 & 0b11));
  18. ret = vsetq_lane_f32(vgetq_lane_f32(I2 >= 4? inV2 : inV1, I2 & 0b11), ret, 1);
  19. ret = vsetq_lane_f32(vgetq_lane_f32(I3 >= 4? inV2 : inV1, I3 & 0b11), ret, 2);
  20. ret = vsetq_lane_f32(vgetq_lane_f32(I4 >= 4? inV2 : inV1, I4 & 0b11), ret, 3);
  21. return ret;
  22. }
  23. // Specializations
  24. template <>
  25. JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 2>(float32x4_t inV1, float32x4_t inV2)
  26. {
  27. return vcombine_f32(vget_low_f32(inV1), vdup_lane_s32(vget_high_f32(inV1), 0));
  28. }
  29. template <>
  30. JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 3, 3>(float32x4_t inV1, float32x4_t inV2)
  31. {
  32. return vcombine_f32(vget_low_f32(inV1), vdup_lane_s32(vget_high_f32(inV1), 1));
  33. }
  34. template <>
  35. JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 3>(float32x4_t inV1, float32x4_t inV2)
  36. {
  37. return inV1;
  38. }
  39. template <>
  40. JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 3, 2>(float32x4_t inV1, float32x4_t inV2)
  41. {
  42. return vcombine_f32(vrev64_f32(vget_low_f32(inV1)), vrev64_f32(vget_high_f32(inV1)));
  43. }
  44. template <>
  45. JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 0>(float32x4_t inV1, float32x4_t inV2)
  46. {
  47. return vcombine_f32(vdup_lane_s32(vget_high_f32(inV1), 0), vrev64_f32(vget_low_f32(inV1)));
  48. }
  49. template <>
  50. JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 0, 1>(float32x4_t inV1, float32x4_t inV2)
  51. {
  52. return vcombine_f32(vget_high_f32(inV1), vget_low_f32(inV1));
  53. }
  54. // Used extensively by cross product
  55. template <>
  56. JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 0>(float32x4_t inV1, float32x4_t inV2)
  57. {
  58. static int8x16_t table = JPH_NEON_INT8x16(0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03);
  59. return vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(inV1), table));
  60. }
  61. // Shuffle a vector
  62. #define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) NeonShuffleFloat32x4<index1, index2, index3, index4>(vec1, vec2)
  63. JPH_NAMESPACE_END
  64. #else
  65. // Constructing NEON values
  66. #define JPH_NEON_INT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
  67. #define JPH_NEON_UINT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
  68. #define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 }
  69. // Shuffle a vector
  70. #define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) __builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
  71. #endif
  72. #endif // JPH_USE_NEON