neon.h 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. /// @ref simd_neon
  2. /// @file glm/simd/neon.h
  3. #pragma once
  4. #if GLM_ARCH & GLM_ARCH_NEON_BIT
  5. #include <arm_neon.h>
  6. namespace glm {
  7. namespace neon {
  8. static inline float32x4_t dupq_lane(float32x4_t vsrc, int lane) {
  9. switch(lane) {
  10. #if GLM_ARCH & GLM_ARCH_ARMV8_BIT
  11. case 0: return vdupq_laneq_f32(vsrc, 0);
  12. case 1: return vdupq_laneq_f32(vsrc, 1);
  13. case 2: return vdupq_laneq_f32(vsrc, 2);
  14. case 3: return vdupq_laneq_f32(vsrc, 3);
  15. #else
  16. case 0: return vdupq_n_f32(vgetq_lane_f32(vsrc, 0));
  17. case 1: return vdupq_n_f32(vgetq_lane_f32(vsrc, 1));
  18. case 2: return vdupq_n_f32(vgetq_lane_f32(vsrc, 2));
  19. case 3: return vdupq_n_f32(vgetq_lane_f32(vsrc, 3));
  20. #endif
  21. }
  22. assert(false); //Unreachable code executed!
  23. return vdupq_n_f32(0.0f);
  24. }
  25. static inline float32x2_t dup_lane(float32x4_t vsrc, int lane) {
  26. switch(lane) {
  27. #if GLM_ARCH & GLM_ARCH_ARMV8_BIT
  28. case 0: return vdup_laneq_f32(vsrc, 0);
  29. case 1: return vdup_laneq_f32(vsrc, 1);
  30. case 2: return vdup_laneq_f32(vsrc, 2);
  31. case 3: return vdup_laneq_f32(vsrc, 3);
  32. #else
  33. case 0: return vdup_n_f32(vgetq_lane_f32(vsrc, 0));
  34. case 1: return vdup_n_f32(vgetq_lane_f32(vsrc, 1));
  35. case 2: return vdup_n_f32(vgetq_lane_f32(vsrc, 2));
  36. case 3: return vdup_n_f32(vgetq_lane_f32(vsrc, 3));
  37. #endif
  38. }
  39. assert(false); //Unreachable code executed!
  40. return vdup_n_f32(0.0f);
  41. }
  42. static inline float32x4_t copy_lane(float32x4_t vdst, int dlane, float32x4_t vsrc, int slane) {
  43. #if GLM_ARCH & GLM_ARCH_ARMV8_BIT
  44. switch(dlane) {
  45. case 0:
  46. switch(slane) {
  47. case 0: return vcopyq_laneq_f32(vdst, 0, vsrc, 0);
  48. case 1: return vcopyq_laneq_f32(vdst, 0, vsrc, 1);
  49. case 2: return vcopyq_laneq_f32(vdst, 0, vsrc, 2);
  50. case 3: return vcopyq_laneq_f32(vdst, 0, vsrc, 3);
  51. }
  52. assert(false); //Unreachable code executed!
  53. break;
  54. case 1:
  55. switch(slane) {
  56. case 0: return vcopyq_laneq_f32(vdst, 1, vsrc, 0);
  57. case 1: return vcopyq_laneq_f32(vdst, 1, vsrc, 1);
  58. case 2: return vcopyq_laneq_f32(vdst, 1, vsrc, 2);
  59. case 3: return vcopyq_laneq_f32(vdst, 1, vsrc, 3);
  60. }
  61. assert(false); //Unreachable code executed!
  62. break;
  63. case 2:
  64. switch(slane) {
  65. case 0: return vcopyq_laneq_f32(vdst, 2, vsrc, 0);
  66. case 1: return vcopyq_laneq_f32(vdst, 2, vsrc, 1);
  67. case 2: return vcopyq_laneq_f32(vdst, 2, vsrc, 2);
  68. case 3: return vcopyq_laneq_f32(vdst, 2, vsrc, 3);
  69. }
  70. assert(false); //Unreachable code executed!
  71. break;
  72. case 3:
  73. switch(slane) {
  74. case 0: return vcopyq_laneq_f32(vdst, 3, vsrc, 0);
  75. case 1: return vcopyq_laneq_f32(vdst, 3, vsrc, 1);
  76. case 2: return vcopyq_laneq_f32(vdst, 3, vsrc, 2);
  77. case 3: return vcopyq_laneq_f32(vdst, 3, vsrc, 3);
  78. }
  79. assert(false); //Unreachable code executed!
  80. break;
  81. }
  82. #else
  83. float l;
  84. switch(slane) {
  85. case 0: l = vgetq_lane_f32(vsrc, 0); break;
  86. case 1: l = vgetq_lane_f32(vsrc, 1); break;
  87. case 2: l = vgetq_lane_f32(vsrc, 2); break;
  88. case 3: l = vgetq_lane_f32(vsrc, 3); break;
  89. default:
  90. assert(false); //Unreachable code executed!
  91. }
  92. switch(dlane) {
  93. case 0: return vsetq_lane_f32(l, vdst, 0);
  94. case 1: return vsetq_lane_f32(l, vdst, 1);
  95. case 2: return vsetq_lane_f32(l, vdst, 2);
  96. case 3: return vsetq_lane_f32(l, vdst, 3);
  97. }
  98. #endif
  99. assert(false); //Unreachable code executed!
  100. return vdupq_n_f32(0.0f);
  101. }
  102. static inline float32x4_t mul_lane(float32x4_t v, float32x4_t vlane, int lane) {
  103. #if GLM_ARCH & GLM_ARCH_ARMV8_BIT
  104. switch(lane) {
  105. case 0: return vmulq_laneq_f32(v, vlane, 0); break;
  106. case 1: return vmulq_laneq_f32(v, vlane, 1); break;
  107. case 2: return vmulq_laneq_f32(v, vlane, 2); break;
  108. case 3: return vmulq_laneq_f32(v, vlane, 3); break;
  109. default:
  110. assert(false); //Unreachable code executed!
  111. }
  112. assert(false); //Unreachable code executed!
  113. return vdupq_n_f32(0.0f);
  114. #else
  115. return vmulq_f32(v, dupq_lane(vlane, lane));
  116. #endif
  117. }
  118. static inline float32x4_t madd_lane(float32x4_t acc, float32x4_t v, float32x4_t vlane, int lane) {
  119. #if GLM_ARCH & GLM_ARCH_ARMV8_BIT
  120. #ifdef GLM_CONFIG_FORCE_FMA
  121. # define FMADD_LANE(acc, x, y, L) do { asm volatile ("fmla %0.4s, %1.4s, %2.4s" : "+w"(acc) : "w"(x), "w"(dup_lane(y, L))); } while(0)
  122. #else
  123. # define FMADD_LANE(acc, x, y, L) do { acc = vmlaq_laneq_f32(acc, x, y, L); } while(0)
  124. #endif
  125. switch(lane) {
  126. case 0:
  127. FMADD_LANE(acc, v, vlane, 0);
  128. return acc;
  129. case 1:
  130. FMADD_LANE(acc, v, vlane, 1);
  131. return acc;
  132. case 2:
  133. FMADD_LANE(acc, v, vlane, 2);
  134. return acc;
  135. case 3:
  136. FMADD_LANE(acc, v, vlane, 3);
  137. return acc;
  138. default:
  139. assert(false); //Unreachable code executed!
  140. }
  141. assert(false); //Unreachable code executed!
  142. return vdupq_n_f32(0.0f);
  143. # undef FMADD_LANE
  144. #else
  145. return vaddq_f32(acc, vmulq_f32(v, dupq_lane(vlane, lane)));
  146. #endif
  147. }
  148. } //namespace neon
  149. } // namespace glm
  150. #endif // GLM_ARCH & GLM_ARCH_NEON_BIT