Vec4.inl

// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/UVec4.h>

namespace JPH {

// Constructor
Vec4::Vec4(Vec3Arg inRHS) :
	mValue(inRHS.mValue)
{
}

Vec4::Vec4(Vec3Arg inRHS, float inW)
{
#if defined(JPH_USE_SSE4_1)
	mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
#elif defined(JPH_USE_NEON)
	mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
#else
	for (int i = 0; i < 3; i++)
		mF32[i] = inRHS.mF32[i];
	mF32[3] = inW;
#endif
}
Vec4::Vec4(float inX, float inY, float inZ, float inW)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_ps(inW, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
	// vcreate_f32 yields a float32x2_t, so the halves must be declared as such
	float32x2_t xy = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inX)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inY)) << 32));
	float32x2_t zw = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inW)) << 32));
	mValue = vcombine_f32(xy, zw);
#else
#error Unsupported CPU architecture
#endif
}
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
	static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
	static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
	static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
	static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
	return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
#error Unsupported CPU architecture
#endif
}
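// Example (illustrative sketch, assuming SWIZZLE_X .. SWIZZLE_W map to lane
// indices 0 .. 3): reversing the component order.
//
//	Vec4 v(1.0f, 2.0f, 3.0f, 4.0f);
//	Vec4 r = v.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(); // (4, 3, 2, 1)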
Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(0);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(inV);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sNaN()
{
	return sReplicate(numeric_limits<float>::quiet_NaN());
}

Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
#error Unsupported CPU architecture
#endif
}
template <const int Scale>
Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_AVX2
	return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
#else
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
	Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
	Type xy = _mm_unpacklo_ps(x, y);
	Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
	Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
	Type zw = _mm_unpacklo_ps(z, w);
	return _mm_movelh_ps(xy, zw);
#endif
#else
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
	float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
	float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
	float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
	return Vec4(x, y, z, w);
#endif
}
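// Example (illustrative sketch, assuming UVec4 has the analogous
// four-component constructor): Scale is a byte multiplier applied to each
// offset, so gathering from a plain float array uses Scale = sizeof(float)
// with element indices as the offsets.
//
//	float data[] = { 10.0f, 11.0f, 12.0f, 13.0f, 14.0f };
//	Vec4 v = Vec4::sGatherFloat4<sizeof(float)>(data, UVec4(4, 2, 0, 1)); // (14, 12, 10, 11)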
Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vceqq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcltq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcleq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}
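// Note (illustrative, summarizing the comparisons above): each returns a
// UVec4 mask whose lanes are all ones (0xffffffff) where the comparison holds
// and all zeros where it does not, ready to be fed to sSelect, e.g.:
//
//	UVec4 m = Vec4::sEquals(Vec4(1.0f, 2.0f, 3.0f, 4.0f), Vec4(1.0f, 0.0f, 3.0f, 0.0f)); // (0xffffffff, 0, 0xffffffff, 0)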
Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
	return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
	return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
#error Unsupported CPU architecture
#endif
}
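// Example (illustrative): computes inMul1 * inMul2 + inAdd per lane, using a
// single fused instruction when the target supports it and a separate
// multiply and add otherwise.
//
//	Vec4 r = Vec4::sFusedMultiplyAdd(Vec4::sReplicate(2.0f), Vec4::sReplicate(3.0f), Vec4::sReplicate(1.0f)); // (7, 7, 7, 7)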
Vec4 Vec4::sSelect(Vec4Arg inV1, Vec4Arg inV2, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1)
	return _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
#elif defined(JPH_USE_NEON)
	// Arithmetic shift replicates the sign bit of each control lane across the lane
	return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inV2.mValue, inV1.mValue);
#else
	Vec4 result;
	for (int i = 0; i < 4; i++)
		result.mF32[i] = inControl.mU32[i] ? inV2.mF32[i] : inV1.mF32[i];
	return result;
#endif
}
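// Note (illustrative): the vector paths select on the sign bit (bit 31) of
// each control lane, so inControl is normally an all-ones/all-zeros mask as
// produced by the comparison functions, e.g. a per-lane minimum:
//
//	Vec4 lo = Vec4::sSelect(b, a, Vec4::sLess(a, b)); // picks a where a < b, else b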
Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	// Bitwise ops are only defined on integer vectors, so reinterpret the lanes
	return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
#error Unsupported CPU architecture
#endif
}
void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
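// Example (illustrative sketch, assuming UVec4 has the analogous
// four-component constructor): sorts the four lanes ascending with a small
// sorting network and applies the same permutation to ioIndex so the original
// lane of each value can be recovered.
//
//	Vec4 value(3.0f, 1.0f, 4.0f, 2.0f);
//	UVec4 index(0, 1, 2, 3);
//	Vec4::sSort4(value, index); // value = (1, 2, 3, 4), index = (1, 3, 0, 2)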
void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
bool Vec4::operator == (Vec4Arg inV2) const
{
	return sEquals(*this, inV2).TestAllTrue();
}

bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
	return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

bool Vec4::IsNormalized(float inTolerance) const
{
	return abs(LengthSq() - 1.0f) <= inTolerance;
}

bool Vec4::IsNaN() const
{
#if defined(JPH_USE_SSE)
	return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
	return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
#error Unsupported CPU architecture
#endif
}
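// Note (illustrative): both paths rely on the IEEE rule that a NaN compares
// unequal to itself; any lane failing the self-comparison marks the vector as
// containing a NaN.
//
//	Vec4::sNaN().IsNaN();                 // true
//	Vec4(0.0f, 0.0f, 0.0f, 0.0f).IsNaN(); // false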
Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(mValue, inV2);
#else
#error Unsupported CPU architecture
#endif
}

/// Multiply vector with float
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(inV2.mValue, inV1);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
#error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vmulq_n_f32(mValue, inV2);
#else
#error Unsupported CPU architecture
#endif
	return *this;
}

Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vmulq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
	return *this;
}

Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
#error Unsupported CPU architecture
#endif
	return *this;
}

Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
	return *this;
}

Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
	return vnegq_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
	return *this;
}

Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}
Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 3);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
	return vabsq_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}
Vec4 Vec4::Reciprocal() const
{
	return sReplicate(1.0f) / mValue;
}

Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	float dot = 0.0f;
	for (int i = 0; i < 4; i++)
		dot += mF32[i] * inV2.mF32[i];
	return Vec4::sReplicate(dot);
#endif
}

float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vaddvq_f32(mul);
#else
	float dot = 0.0f;
	for (int i = 0; i < 4; i++)
		dot += mF32[i] * inV2.mF32[i];
	return dot;
#endif
}
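// Example (illustrative): DotV broadcasts the 4-component dot product to all
// lanes while Dot returns it as a scalar.
//
//	Vec4 a(1.0f, 2.0f, 3.0f, 4.0f);
//	float d = a.Dot(Vec4::sReplicate(1.0f)); // 10.0f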
float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	return vaddvq_f32(mul);
#else
	float len_sq = 0.0f;
	for (int i = 0; i < 4; i++)
		len_sq += mF32[i] * mF32[i];
	return len_sq;
#endif
}

float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
	return vget_lane_f32(vsqrt_f32(sum), 0);
#else
	return sqrt(LengthSq());
#endif
}

Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}
Vec4 Vec4::GetSign() const
{
#if defined(JPH_USE_SSE)
	Type minus_one = _mm_set1_ps(-1.0f);
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
	// Bitwise ops are only defined on integer vectors, so reinterpret the lanes
	Type minus_one = vdupq_n_f32(-1.0f);
	Type one = vdupq_n_f32(1.0f);
	return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
#error Unsupported CPU architecture
#endif
}
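// Note (illustrative): copies only the sign bit of each lane onto 1.0f, so
// every lane becomes exactly +1.0f or -1.0f; +0.0f maps to +1.0f and -0.0f
// maps to -1.0f.
//
//	Vec4(-2.0f, 3.0f, 0.0f, -0.0f).GetSign(); // (-1, 1, 1, -1)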
Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
	return *this / Length();
#endif
}

void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
	vst1q_f32(&outV->x, mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
	return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
	return vreinterpretq_u32_f32(mValue);
#else
#error Unsupported CPU architecture
#endif
}

int Vec4::GetSignBits() const
{
#if defined(JPH_USE_SSE)
	return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
	int32x4_t shift = { 0, 1, 2, 3 };
	return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#else
#error Unsupported CPU architecture
#endif
}
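// Example (illustrative): packs the sign bit of each lane into the low four
// bits of the result, X in bit 0 through W in bit 3.
//
//	Vec4(-1.0f, 1.0f, -1.0f, 1.0f).GetSignBits(); // 0b0101 = 5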
float Vec4::ReduceMin() const
{
	Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
	v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}

float Vec4::ReduceMax() const
{
	Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
	v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}
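// Example (illustrative): ReduceMin and ReduceMax fold the four lanes with
// two swizzled min/max steps.
//
//	Vec4(3.0f, 1.0f, 4.0f, 2.0f).ReduceMin(); // 1.0f
//	Vec4(3.0f, 1.0f, 4.0f, 2.0f).ReduceMax(); // 4.0f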

} // JPH