// Vec4.inl
  1. // SPDX-FileCopyrightText: 2021 Jorrit Rouwe
  2. // SPDX-License-Identifier: MIT
  3. #include <Math/Vec3.h>
  4. #include <Math/UVec4.h>
  5. namespace JPH {
  6. // Constructor
  7. Vec4::Vec4(Vec3Arg inRHS) :
  8. mValue(inRHS.mValue)
  9. {
  10. }
  11. Vec4::Vec4(Vec3Arg inRHS, float inW)
  12. {
  13. #if defined(JPH_USE_SSE)
  14. mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
  15. #elif defined(JPH_USE_NEON)
  16. mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
  17. #else
  18. #error Undefined CPU architecture
  19. #endif
  20. }
  21. Vec4::Vec4(float inX, float inY, float inZ, float inW)
  22. {
  23. #if defined(JPH_USE_SSE)
  24. mValue = _mm_set_ps(inW, inZ, inY, inX);
  25. #elif defined(JPH_USE_NEON)
  26. uint32x2_t xy = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inX)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inY)) << 32));
  27. uint32x2_t zw = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32* >(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inW)) << 32));
  28. mValue = vcombine_f32(xy, zw);
  29. #else
  30. #error Undefined CPU architecture
  31. #endif
  32. }
/// Shuffle the components of this vector into a new vector.
/// Each template parameter selects the source lane (0 = X .. 3 = W) for the
/// corresponding output component.
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
	static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
	static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
	static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
	static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
#if defined(JPH_USE_SSE)
	// _MM_SHUFFLE takes selectors high-to-low, hence the reversed argument order
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
	return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
#error Unsupported CPU architecture
#endif
}
/// Vector with all components set to zero.
Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(0);
#else
#error Unsupported CPU architecture
#endif
}

/// Vector with all four components set to inV.
Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(inV);
#else
#error Unsupported CPU architecture
#endif
}

/// Vector with all components set to quiet NaN (useful for poisoning uninitialized data).
Vec4 Vec4::sNaN()
{
	return sReplicate(numeric_limits<float>::quiet_NaN());
}

/// Load 4 floats from memory (no alignment requirement).
Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
#error Unsupported CPU architecture
#endif
}

/// Load 4 floats from 16-byte aligned memory. On SSE this uses the aligned load;
/// on NEON vld1q_f32 has no alignment form, so it is identical to sLoadFloat4.
Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
#error Unsupported CPU architecture
#endif
}
  92. template <const int Scale>
  93. Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
  94. {
  95. #if defined(JPH_USE_SSE)
  96. #ifdef JPH_USE_AVX2
  97. return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
  98. #else
  99. const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
  100. Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
  101. Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
  102. Type xy = _mm_unpacklo_ps(x, y);
  103. Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
  104. Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
  105. Type zw = _mm_unpacklo_ps(z, w);
  106. return _mm_movelh_ps(xy, zw);
  107. #endif
  108. #else
  109. const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
  110. float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
  111. float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
  112. float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
  113. float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
  114. return Vec4(x, y, z, w);
  115. #endif
  116. }
/// Component-wise minimum of two vectors.
/// NOTE(review): when a lane holds NaN, _mm_min_ps returns its second operand —
/// callers relying on NaN propagation should confirm this is intended.
Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise maximum of two vectors (same NaN caveat as sMin).
Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}
/// Component-wise compare inV1 == inV2; each result lane is all-ones when true, zero when false.
UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vceqq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise compare inV1 < inV2, returning a per-lane all-ones/zero mask.
UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcltq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise compare inV1 <= inV2, returning a per-lane all-ones/zero mask.
UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcleq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise compare inV1 > inV2, returning a per-lane all-ones/zero mask.
UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise compare inV1 >= inV2, returning a per-lane all-ones/zero mask.
UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}
/// Compute inMul1 * inMul2 + inAdd per component.
/// NOTE(review): with JPH_USE_FMADD (or NEON vmla on targets that fuse it) the
/// multiply-add is single-rounded; the mul+add fallback rounds twice, so results
/// may differ in the last ulp between configurations.
Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
	return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
	return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
#error Unsupported CPU architecture
#endif
}
  201. Vec4 Vec4::sSelect(Vec4Arg inV1, Vec4Arg inV2, UVec4Arg inControl)
  202. {
  203. #if defined(JPH_USE_SSE)
  204. return _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
  205. #elif defined(JPH_USE_NEON)
  206. return vbslq_f32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
  207. #else
  208. #error Unsupported CPU architecture
  209. #endif
  210. }
  211. Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
  212. {
  213. #if defined(JPH_USE_SSE)
  214. return _mm_or_ps(inV1.mValue, inV2.mValue);
  215. #elif defined(JPH_USE_NEON)
  216. return vorrq_s32(inV1.mValue, inV2.mValue);
  217. #else
  218. #error Unsupported CPU architecture
  219. #endif
  220. }
  221. Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
  222. {
  223. #if defined(JPH_USE_SSE)
  224. return _mm_xor_ps(inV1.mValue, inV2.mValue);
  225. #elif defined(JPH_USE_NEON)
  226. return veorq_s32(inV1.mValue, inV2.mValue);
  227. #else
  228. #error Unsupported CPU architecture
  229. #endif
  230. }
  231. Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
  232. {
  233. #if defined(JPH_USE_SSE)
  234. return _mm_and_ps(inV1.mValue, inV2.mValue);
  235. #elif defined(JPH_USE_NEON)
  236. return vandq_s32(inV1.mValue, inV2.mValue);
  237. #else
  238. #error Unsupported CPU architecture
  239. #endif
  240. }
  241. void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
  242. {
  243. // Pass 1, test 1st vs 3rd, 2nd vs 4th
  244. Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
  245. UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
  246. UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
  247. ioValue = sSelect(ioValue, v1, c1);
  248. ioIndex = UVec4::sSelect(ioIndex, i1, c1);
  249. // Pass 2, test 1st vs 2nd, 3rd vs 4th
  250. Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
  251. UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
  252. UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
  253. ioValue = sSelect(ioValue, v2, c2);
  254. ioIndex = UVec4::sSelect(ioIndex, i2, c2);
  255. // Pass 3, test 2nd vs 3rd component
  256. Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
  257. UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
  258. UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
  259. ioValue = sSelect(ioValue, v3, c3);
  260. ioIndex = UVec4::sSelect(ioIndex, i3, c3);
  261. }
  262. void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
  263. {
  264. // Pass 1, test 1st vs 3rd, 2nd vs 4th
  265. Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
  266. UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
  267. UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
  268. ioValue = sSelect(ioValue, v1, c1);
  269. ioIndex = UVec4::sSelect(ioIndex, i1, c1);
  270. // Pass 2, test 1st vs 2nd, 3rd vs 4th
  271. Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
  272. UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
  273. UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
  274. ioValue = sSelect(ioValue, v2, c2);
  275. ioIndex = UVec4::sSelect(ioIndex, i2, c2);
  276. // Pass 3, test 2nd vs 3rd component
  277. Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
  278. UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
  279. UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
  280. ioValue = sSelect(ioValue, v3, c3);
  281. ioIndex = UVec4::sSelect(ioIndex, i3, c3);
  282. }
/// Exact per-component equality (no tolerance; NaN lanes compare unequal).
bool Vec4::operator == (Vec4Arg inV2) const
{
	return sEquals(*this, inV2).TestAllTrue();
}

/// Test if two vectors are close: squared distance between them is at most inMaxDistSq.
bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
	return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

/// Test if the vector is normalized: |LengthSq() - 1| <= inTolerance.
bool Vec4::IsNormalized(float inTolerance) const
{
	return abs(LengthSq() - 1.0f) <= inTolerance;
}

/// Test if any component is NaN.
bool Vec4::IsNaN() const
{
#if defined(JPH_USE_SSE)
	// cmpunord sets a lane when either operand is NaN; any set sign bit means a NaN lane
	return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
	// Sum of the 4 top bits is 4 only when every lane compared equal (no NaNs)
	return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
#error Unsupported CPU architecture
#endif
}
/// Component-wise multiply of two vectors.
Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Multiply every component by a scalar.
Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(mValue, inV2);
#else
#error Unsupported CPU architecture
#endif
}

/// Multiply vector with float (free function so the scalar can appear on the left)
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(inV2.mValue, inV1);
#else
#error Unsupported CPU architecture
#endif
}

/// Divide every component by a scalar (no zero check; division by 0 follows IEEE rules).
Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
#error Unsupported CPU architecture
#endif
}
/// Multiply every component by a scalar in place.
Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vmulq_n_f32(mValue, inV2);
#else
#error Unsupported CPU architecture
#endif
	return *this;
}

/// Component-wise multiply by another vector in place.
Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vmulq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
	return *this;
}

/// Divide every component by a scalar in place.
Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
#error Unsupported CPU architecture
#endif
	return *this;
}
/// Component-wise add of two vectors.
Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise add in place.
Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
	return *this;
}

/// Negate all components.
/// NOTE(review): for a +0.0 component SSE's 0 - x yields +0.0 while NEON's
/// vnegq flips the sign bit to -0.0 — confirm this platform difference is acceptable.
Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
	return vnegq_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise subtract of two vectors.
Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise subtract in place.
Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
	return *this;
}
/// Component-wise divide (no zero check; division by 0 follows IEEE rules).
Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}
/// Vector with all components set to this vector's X component.
Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
#error Unsupported CPU architecture
#endif
}

/// Vector with all components set to this vector's Y component.
Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
#error Unsupported CPU architecture
#endif
}

/// Vector with all components set to this vector's Z component.
Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
#error Unsupported CPU architecture
#endif
}

/// Vector with all components set to this vector's W component.
Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 3);
#else
#error Unsupported CPU architecture
#endif
}
/// Component-wise absolute value.
Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_SSE)
	// abs(x) = max(-x, x); computes -x as 0 - x
	// NOTE(review): for NaN lanes max(-x, x) returns the second operand per
	// _mm_max_ps semantics — confirm NaN handling parity with NEON vabsq is not required
	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
	return vabsq_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Component-wise reciprocal: 1 / component (full-precision divide, not the
/// approximate reciprocal instruction). Relies on an implicit Vec4(Type)
/// conversion for mValue — presumably declared in the header; verify.
Vec4 Vec4::Reciprocal() const
{
	return sReplicate(1.0f) / mValue;
}
/// Dot product of all 4 components, with the result replicated into every lane.
Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	// 0xff: multiply all 4 lanes and broadcast the sum to all 4 lanes
	return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vdupq_n_f32(vaddvq_f32(mul)); // horizontal add, then broadcast
#else
#error Unsupported CPU architecture
#endif
}

/// Dot product of all 4 components, returned as a scalar.
float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vaddvq_f32(mul);
#else
#error Unsupported CPU architecture
#endif
}

/// Squared length of the vector (dot product with itself).
float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	return vaddvq_f32(mul);
#else
#error Unsupported CPU architecture
#endif
}

/// Length of the vector: sqrt(LengthSq()).
float Vec4::Length() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul)); // broadcast the horizontal sum so vsqrt can run on a 64-bit register
	return vget_lane_f32(vsqrt_f32(sum), 0);
#else
#error Unsupported CPU architecture
#endif
}
/// Component-wise square root (negative components produce NaN per IEEE rules).
Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}
  551. Vec4 Vec4::GetSign() const
  552. {
  553. #if defined(JPH_USE_SSE)
  554. Type minus_one = _mm_set1_ps(-1.0f);
  555. Type one = _mm_set1_ps(1.0f);
  556. return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
  557. #elif defined(JPH_USE_NEON)
  558. Type minus_one = vdupq_n_f32(-1.0f);
  559. Type one = vdupq_n_f32(1.0f);
  560. return vorrq_s32(vandq_s32(mValue, minus_one), one);
  561. #else
  562. #error Unsupported CPU architecture
  563. #endif
  564. }
/// Return a unit-length copy of this vector (divides by Length; a zero vector
/// produces NaN/inf components — no zero check is performed).
Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul)); // LengthSq broadcast to all lanes
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
#error Unsupported CPU architecture
#endif
}

/// Store the 4 components to memory (no alignment requirement).
void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
	vst1q_f32(&outV->x, mValue);
#else
#error Unsupported CPU architecture
#endif
}
/// Convert each component to an integer (truncation toward zero).
/// NOTE(review): SSE uses a signed truncating convert (_mm_cvttps_epi32) while
/// NEON converts to unsigned (vcvtq_u32_f32) — negative and > INT_MAX inputs
/// behave differently per platform; confirm callers only pass values in the
/// common range.
UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}

/// Reinterpret the raw bits of the 4 floats as 4 integers (no value conversion).
UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
	return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
	return vreinterpretq_u32_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}
  607. int Vec4::GetSignBits() const
  608. {
  609. #if defined(JPH_USE_SSE)
  610. return _mm_movemask_ps(mValue);
  611. #elif defined(JPH_USE_NEON)
  612. int32x4_t shift = { 0, 1, 2, 3 };
  613. return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
  614. #else
  615. #error Unsupported CPU architecture
  616. #endif
  617. }
  618. float Vec4::ReduceMin() const
  619. {
  620. Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
  621. v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
  622. return v.GetX();
  623. }
  624. float Vec4::ReduceMax() const
  625. {
  626. Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
  627. v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
  628. return v.GetX();
  629. }
  630. } // JPH