// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#include <Jolt/Math/Trigonometry.h>
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/UVec4.h>

JPH_NAMESPACE_BEGIN

// Constructor
Vec4::Vec4(Vec3Arg inRHS) :
    mValue(inRHS.mValue)
{
}
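
// Note: the constructor above copies the underlying SIMD register wholesale, so the W lane receives
// whatever the Vec3 happened to carry in its fourth lane; treat W as unspecified until it is set
// explicitly, e.g. via the Vec4(Vec3Arg, float) constructor below.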

Vec4::Vec4(Vec3Arg inRHS, float inW)
{
#if defined(JPH_USE_SSE4_1)
    mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
#elif defined(JPH_USE_NEON)
    mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
#else
    for (int i = 0; i < 3; i++)
        mF32[i] = inRHS.mF32[i];
    mF32[3] = inW;
#endif
}

Vec4::Vec4(float inX, float inY, float inZ, float inW)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_set_ps(inW, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
    // Pack the bit patterns of the four floats pairwise into 64-bit halves, then combine
    float32x2_t xy = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inX)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inY)) << 32));
    float32x2_t zw = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inW)) << 32));
    mValue = vcombine_f32(xy, zw);
#else
    #error Unsupported CPU architecture
#endif
}

template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
    return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
    #error Unsupported CPU architecture
#endif
}
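
// Usage sketch: the template arguments are the SWIZZLE_* lane constants (values 0..3), e.g.
//   Vec4 v(1.0f, 2.0f, 3.0f, 4.0f);
//   Vec4 yxwz = v.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(); // (2, 1, 4, 3)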

Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
    return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
    return vdupq_n_f32(0);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
    return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
    return vdupq_n_f32(inV);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sNaN()
{
    return sReplicate(numeric_limits<float>::quiet_NaN());
}

Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
    return vld1q_f32(&inV->x);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
    return vld1q_f32(&inV->x);
#else
    #error Unsupported CPU architecture
#endif
}

template <const int Scale>
Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_AVX2
    return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
#else
    const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
    Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
    Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
    Type xy = _mm_unpacklo_ps(x, y);
    Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
    Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
    Type zw = _mm_unpacklo_ps(z, w);
    return _mm_movelh_ps(xy, zw);
#endif
#else
    const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
    float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
    float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
    float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
    float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
    return Vec4(x, y, z, w);
#endif
}
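
// Usage sketch: each offset is multiplied by Scale (in bytes) before being added to inBase, so
// gathering four floats by element index could look like this (data and indices are placeholder
// names for a float array and a UVec4 of indices into it):
//   Vec4 v = Vec4::sGatherFloat4<sizeof(float)>(data, indices); // v[i] = data[indices[i]]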

Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vminq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vceqq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcltq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcleq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
    return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
    return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
    return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}
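
// Note: _mm_fmadd_ps rounds once while the mul + add fallback rounds twice, so the last bit of
// the result can differ between platforms; callers should not rely on bit-exact equality here.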

Vec4 Vec4::sSelect(Vec4Arg inV1, Vec4Arg inV2, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
#elif defined(JPH_USE_NEON)
    // Arithmetic shift broadcasts the sign bit of each control lane into a full select mask
    return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inV2.mValue, inV1.mValue);
#else
    Vec4 result;
    for (int i = 0; i < 4; i++)
        result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inV2.mF32[i] : inV1.mF32[i];
    return result;
#endif
}
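
// Note: selection is based on the highest bit of each lane of inControl (the sign bit); the
// comparison functions above produce all-ones or all-zero lanes, for which this is equivalent.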

Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    #error Unsupported CPU architecture
#endif
}

void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
{
    // Pass 1, test 1st vs 3rd, 2nd vs 4th
    Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v1, c1);
    ioIndex = UVec4::sSelect(ioIndex, i1, c1);

    // Pass 2, test 1st vs 2nd, 3rd vs 4th
    Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v2, c2);
    ioIndex = UVec4::sSelect(ioIndex, i2, c2);

    // Pass 3, test 2nd vs 3rd component
    Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v3, c3);
    ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}

void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
{
    // Pass 1, test 1st vs 3rd, 2nd vs 4th
    Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v1, c1);
    ioIndex = UVec4::sSelect(ioIndex, i1, c1);

    // Pass 2, test 1st vs 2nd, 3rd vs 4th
    Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v2, c2);
    ioIndex = UVec4::sSelect(ioIndex, i2, c2);

    // Pass 3, test 2nd vs 3rd component
    Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v3, c3);
    ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
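
// Usage sketch for the two sorting routines above: ioIndex travels with ioValue through the same
// compare-and-swap network (5 comparisons in 3 vectorized passes), so it can carry the original
// lane positions along:
//   Vec4 values = ...;
//   UVec4 index(0, 1, 2, 3);
//   Vec4::sSort4(values, index); // values ascending, index[i] = original position of values[i]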

bool Vec4::operator == (Vec4Arg inV2) const
{
    return sEquals(*this, inV2).TestAllTrue();
}

bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
    return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

bool Vec4::IsNormalized(float inTolerance) const
{
    return abs(LengthSq() - 1.0f) <= inTolerance;
}

bool Vec4::IsNaN() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
    uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
    return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    return vmulq_n_f32(mValue, inV2);
#else
    #error Unsupported CPU architecture
#endif
}

/// Multiply vector with float
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_n_f32(inV2.mValue, inV1);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    mValue = vmulq_n_f32(mValue, inV2);
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vmulq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vaddq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vaddq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
    return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
    return vnegq_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vsubq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vsubq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vdivq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 0);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 1);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 2);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 3);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_AVX512)
    return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
    return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
    return vabsq_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}
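
// Note on the AVX512 path of Abs(): _mm_range_ps with imm8 = 0b1000 selects the source value
// (bits 1:0 = 00, min of the two identical operands) with the sign bit forced to zero
// (bits 3:2 = 10), i.e. |x| per lane; the SSE fallback computes max(-x, x) instead.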

Vec4 Vec4::Reciprocal() const
{
    return sReplicate(1.0f) / mValue;
}
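
// Note: Reciprocal() performs a true division, not the low-precision _mm_rcp_ps / vrecpeq_f32
// hardware estimate, so it is accurate to full float precision at the cost of a division.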

Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
    return vdupq_n_f32(vaddvq_f32(mul));
#else
    float dot = 0.0f;
    for (int i = 0; i < 4; i++)
        dot += mF32[i] * inV2.mF32[i];
    return Vec4::sReplicate(dot);
#endif
}

float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
    return vaddvq_f32(mul);
#else
    float dot = 0.0f;
    for (int i = 0; i < 4; i++)
        dot += mF32[i] * inV2.mF32[i];
    return dot;
#endif
}
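
// Note: in the SSE4.1 paths above, the 0xff mask for _mm_dp_ps means "multiply all four lanes
// (high nibble) and broadcast the sum to all four output lanes (low nibble)"; DotV returns that
// broadcast, while Dot just extracts the lowest lane.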

float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    return vaddvq_f32(mul);
#else
    float len_sq = 0.0f;
    for (int i = 0; i < 4; i++)
        len_sq += mF32[i] * mF32[i];
    return len_sq;
#endif
}

float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
    return vget_lane_f32(vsqrt_f32(sum), 0);
#else
    return sqrt(LengthSq());
#endif
}

Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
    return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
    return vsqrtq_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::GetSign() const
{
#if defined(JPH_USE_AVX512)
    return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
    Type minus_one = _mm_set1_ps(-1.0f);
    Type one = _mm_set1_ps(1.0f);
    return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
    Type minus_one = vdupq_n_f32(-1.0f);
    Type one = vdupq_n_f32(1.0f);
    return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
    #error Unsupported CPU architecture
#endif
}
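
// How the SSE / NEON bit trick above works: the bit pattern of -1.0f is the sign bit plus the bit
// pattern of 1.0f, so (x & -1.0f) keeps x's sign bit (and at most the bits of 1.0f), and OR-ing
// with 1.0f then yields exactly +1.0f or -1.0f depending on the sign of x (including for -0.0f).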

Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
    return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
    return *this / Length();
#endif
}

void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
    vst1q_f32(&outV->x, mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
    return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
    return vcvtq_u32_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
    return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
    return vreinterpretq_u32_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}

int Vec4::GetSignBits() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
    int32x4_t shift = { 0, 1, 2, 3 };
    return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#else
    #error Unsupported CPU architecture
#endif
}
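
// Usage sketch: GetSignBits() packs the sign bit of each lane into bits 0..3 of the result, e.g.
//   Vec4(-1.0f, 2.0f, -3.0f, 4.0f).GetSignBits() == 0b0101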

float Vec4::ReduceMin() const
{
    Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
    v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
    return v.GetX();
}

float Vec4::ReduceMax() const
{
    Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
    v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
    return v.GetX();
}
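
// Note: both reductions above work in two steps: the first swizzled min/max leaves min(x, y) in
// lane X and min(z, w) in lane Z, the second combines those two, so only the X lane of the final
// result is meaningful.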

void Vec4::SinCos(Vec4 &outSin, Vec4 &outCos) const
{
    // Implementation based on sinf.c from the cephes library, combines sinf and cosf in a single function and vectorizes it
    // Original implementation by Stephen L. Moshier (See: http://www.netlib.org/cephes/)

    // Make argument positive and remember sign (highest bit set is negative)
    UVec4 sin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
    Vec4 x = Vec4::sXor(*this, sin_sign.ReinterpretAsFloat());

    // Integer part of x / (PI / 4)
    UVec4 int_val = (1.27323954473516f * x).ToInt();
    Vec4 y = int_val.ToFloat();

    // Integer and fractional part modulo one octant, map zeros to origin
    // if (int_val & 1) int_val++, y += 1;
    UVec4 and_1 = int_val.LogicalShiftLeft<31>().ArithmeticShiftRight<31>();
    int_val += UVec4::sAnd(and_1, UVec4::sReplicate(1));
    y += Vec4::sAnd(and_1.ReinterpretAsFloat(), Vec4::sReplicate(1.0f));

    // Extended precision modular arithmetic
    x = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f) - y * 3.77489497744594108e-8f;

    // Calculate both results
    Vec4 z = x * x;
    Vec4 y1 = ((2.443315711809948e-5f * z - Vec4::sReplicate(1.388731625493765e-3f)) * z + Vec4::sReplicate(4.166664568298827e-2f)) * z * z - 0.5f * z + Vec4::sReplicate(1.0f);
    Vec4 y2 = ((-1.9515295891e-4f * z + Vec4::sReplicate(8.3321608736e-3f)) * z - Vec4::sReplicate(1.6666654611e-1f)) * z * x + x;

    // From here we deviate from the original cephes code, we would have to write:
    //
    // j &= 7;
    //
    // if (j > 3)
    // {
    //     j -= 4;
    //     sin_sign = -sin_sign;
    //     cos_sign = -cos_sign;
    // }
    //
    // if (j > 1)
    //     cos_sign = -cos_sign;
    //
    // ...
    //
    // if (j == 1 || j == 2) // condition
    //     ...
    //
    // j     sin_sign  cos_sign  condition
    // 000b     1         1         0
    // 001b     1         1         1
    // 010b     1        -1         1
    // 011b     1        -1         0
    // 100b    -1        -1         0
    // 101b    -1        -1         1
    // 110b    -1         1         1
    // 111b    -1         1         0
    //
    // So: sin_sign = bit3, cos_sign = bit2 ^ bit3, condition = bit1 ^ bit2
    UVec4 bit1 = int_val.LogicalShiftLeft<31>();
    UVec4 bit2 = UVec4::sAnd(int_val.LogicalShiftLeft<30>(), UVec4::sReplicate(0x80000000U));
    UVec4 bit3 = UVec4::sAnd(int_val.LogicalShiftLeft<29>(), UVec4::sReplicate(0x80000000U));

    // Select which one of the results is sin and which one is cos
    UVec4 xor_1_2 = UVec4::sXor(bit1, bit2);
    Vec4 s = Vec4::sSelect(y2, y1, xor_1_2);
    Vec4 c = Vec4::sSelect(y1, y2, xor_1_2);

    // Update the signs
    sin_sign = UVec4::sXor(sin_sign, bit3);
    UVec4 cos_sign = UVec4::sXor(bit2, bit3);

    // Correct the signs
    outSin = Vec4::sXor(s, sin_sign.ReinterpretAsFloat());
    outCos = Vec4::sXor(c, cos_sign.ReinterpretAsFloat());
}
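
// Usage sketch, computing sine and cosine of four angles at once (assuming the library's JPH_PI
// constant; any float angles in radians work):
//   Vec4 angles(0.0f, 0.25f * JPH_PI, 0.5f * JPH_PI, JPH_PI);
//   Vec4 s, c;
//   angles.SinCos(s, c); // s = sin(angles), c = cos(angles) per lane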

JPH_NAMESPACE_END