// Vec3.inl (18 KB)
  1. // SPDX-FileCopyrightText: 2021 Jorrit Rouwe
  2. // SPDX-License-Identifier: MIT
  3. #include <Math/Vec4.h>
  4. #include <Math/UVec4.h>
  5. #include <Core/HashCombine.h>
  6. #include <random>
  7. // Create a std::hash for Vec3
  8. JPH_MAKE_HASHABLE(JPH::Vec3, t.GetX(), t.GetY(), t.GetZ())
  9. namespace JPH {
  10. void Vec3::CheckW() const
  11. {
  12. #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
  13. // Avoid asserts when both components are NaN
  14. JPH_ASSERT(reinterpret_cast<const uint32 *>(mF32)[2] == reinterpret_cast<const uint32 *>(mF32)[3]);
  15. #endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
  16. }
  17. JPH_INLINE Vec3::Type Vec3::sFixW(Type inValue)
  18. {
  19. #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
  20. #if defined(JPH_USE_SSE)
  21. return _mm_shuffle_ps(inValue, inValue, _MM_SHUFFLE(2, 2, 1, 0));
  22. #elif defined(JPH_USE_NEON)
  23. return __builtin_shufflevector(inValue, inValue, 0, 1, 2, 2);
  24. #else
  25. #error Unsupported CPU architecture
  26. #endif
  27. #else
  28. return inValue;
  29. #endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
  30. }
  31. Vec3::Vec3(Vec4Arg inRHS) :
  32. mValue(sFixW(inRHS.mValue))
  33. {
  34. }
  35. Vec3::Vec3(const Float3 &inV)
  36. {
  37. #if defined(JPH_USE_SSE)
  38. Type x = _mm_load_ss(&inV.x);
  39. Type y = _mm_load_ss(&inV.y);
  40. Type z = _mm_load_ss(&inV.z);
  41. Type xy = _mm_unpacklo_ps(x, y);
  42. mValue = _mm_shuffle_ps(xy, z, _MM_SHUFFLE(0, 0, 1, 0)); // Assure Z and W are the same
  43. #elif defined(JPH_USE_NEON)
  44. float32x2_t xy = vld1_f32(&inV.x);
  45. float32x2_t zz = vdup_n_f32(inV.z); // Assure Z and W are the same
  46. mValue = vcombine_f32(xy, zz);
  47. #else
  48. #error Undefined CPU architecture
  49. #endif
  50. }
  51. Vec3::Vec3(float inX, float inY, float inZ)
  52. {
  53. #if defined(JPH_USE_SSE)
  54. mValue = _mm_set_ps(inZ, inZ, inY, inX);
  55. #elif defined(JPH_USE_NEON)
  56. uint32x2_t xy = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inX)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inY)) << 32));
  57. uint32x2_t zz = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32* >(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inZ)) << 32));
  58. mValue = vcombine_f32(xy, zz);
  59. #else
  60. #error Undefined CPU architecture
  61. #endif
  62. }
  63. template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ>
  64. Vec3 Vec3::Swizzle() const
  65. {
  66. static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
  67. static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
  68. static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
  69. #if defined(JPH_USE_SSE)
  70. return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleZ, SwizzleZ, SwizzleY, SwizzleX)); // Assure Z and W are the same
  71. #elif defined(JPH_USE_NEON)
  72. return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleZ);
  73. #else
  74. #error Unsupported CPU architecture
  75. #endif
  76. }
  77. Vec3 Vec3::sZero()
  78. {
  79. #if defined(JPH_USE_SSE)
  80. return _mm_setzero_ps();
  81. #elif defined(JPH_USE_NEON)
  82. return vdupq_n_f32(0);
  83. #else
  84. #error Unsupported CPU architecture
  85. #endif
  86. }
  87. Vec3 Vec3::sReplicate(float inV)
  88. {
  89. #if defined(JPH_USE_SSE)
  90. return _mm_set1_ps(inV);
  91. #elif defined(JPH_USE_NEON)
  92. return vdupq_n_f32(inV);
  93. #else
  94. #error Unsupported CPU architecture
  95. #endif
  96. }
  97. Vec3 Vec3::sNaN()
  98. {
  99. return sReplicate(numeric_limits<float>::quiet_NaN());
  100. }
  101. Vec3 Vec3::sLoadFloat3Unsafe(const Float3 &inV)
  102. {
  103. #if defined(JPH_USE_SSE)
  104. Type v = _mm_loadu_ps(&inV.x);
  105. #elif defined(JPH_USE_NEON)
  106. Type v = vld1q_f32(&inV.x);
  107. #else
  108. #error Unsupported CPU architecture
  109. #endif
  110. return sFixW(v);
  111. }
  112. Vec3 Vec3::sMin(Vec3Arg inV1, Vec3Arg inV2)
  113. {
  114. #if defined(JPH_USE_SSE)
  115. return _mm_min_ps(inV1.mValue, inV2.mValue);
  116. #elif defined(JPH_USE_NEON)
  117. return vminq_f32(inV1.mValue, inV2.mValue);
  118. #else
  119. #error Unsupported CPU architecture
  120. #endif
  121. }
  122. Vec3 Vec3::sMax(Vec3Arg inV1, Vec3Arg inV2)
  123. {
  124. #if defined(JPH_USE_SSE)
  125. return _mm_max_ps(inV1.mValue, inV2.mValue);
  126. #elif defined(JPH_USE_NEON)
  127. return vmaxq_f32(inV1.mValue, inV2.mValue);
  128. #else
  129. #error Unsupported CPU architecture
  130. #endif
  131. }
  132. Vec3 Vec3::sClamp(Vec3Arg inV, Vec3Arg inMin, Vec3Arg inMax)
  133. {
  134. return sMax(sMin(inV, inMax), inMin);
  135. }
  136. UVec4 Vec3::sEquals(Vec3Arg inV1, Vec3Arg inV2)
  137. {
  138. #if defined(JPH_USE_SSE)
  139. return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
  140. #elif defined(JPH_USE_NEON)
  141. return vceqq_f32(inV1.mValue, inV2.mValue);
  142. #else
  143. #error Unsupported CPU architecture
  144. #endif
  145. }
  146. UVec4 Vec3::sLess(Vec3Arg inV1, Vec3Arg inV2)
  147. {
  148. #if defined(JPH_USE_SSE)
  149. return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
  150. #elif defined(JPH_USE_NEON)
  151. return vcltq_f32(inV1.mValue, inV2.mValue);
  152. #else
  153. #error Unsupported CPU architecture
  154. #endif
  155. }
  156. UVec4 Vec3::sLessOrEqual(Vec3Arg inV1, Vec3Arg inV2)
  157. {
  158. #if defined(JPH_USE_SSE)
  159. return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
  160. #elif defined(JPH_USE_NEON)
  161. return vcleq_f32(inV1.mValue, inV2.mValue);
  162. #else
  163. #error Unsupported CPU architecture
  164. #endif
  165. }
  166. UVec4 Vec3::sGreater(Vec3Arg inV1, Vec3Arg inV2)
  167. {
  168. #if defined(JPH_USE_SSE)
  169. return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
  170. #elif defined(JPH_USE_NEON)
  171. return vcgtq_f32(inV1.mValue, inV2.mValue);
  172. #else
  173. #error Unsupported CPU architecture
  174. #endif
  175. }
  176. UVec4 Vec3::sGreaterOrEqual(Vec3Arg inV1, Vec3Arg inV2)
  177. {
  178. #if defined(JPH_USE_SSE)
  179. return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
  180. #elif defined(JPH_USE_NEON)
  181. return vcgeq_f32(inV1.mValue, inV2.mValue);
  182. #else
  183. #error Unsupported CPU architecture
  184. #endif
  185. }
  186. Vec3 Vec3::sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd)
  187. {
  188. #if defined(JPH_USE_SSE)
  189. #ifdef JPH_USE_FMADD
  190. return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
  191. #else
  192. return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
  193. #endif
  194. #elif defined(JPH_USE_NEON)
  195. return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
  196. #else
  197. #error Unsupported CPU architecture
  198. #endif
  199. }
  200. Vec3 Vec3::sSelect(Vec3Arg inV1, Vec3Arg inV2, UVec4Arg inControl)
  201. {
  202. #if defined(JPH_USE_SSE)
  203. Type v = _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
  204. #elif defined(JPH_USE_NEON)
  205. Type v = vbslq_f32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
  206. #else
  207. #error Unsupported CPU architecture
  208. #endif
  209. return sFixW(v);
  210. }
  211. Vec3 Vec3::sOr(Vec3Arg inV1, Vec3Arg inV2)
  212. {
  213. #if defined(JPH_USE_SSE)
  214. return _mm_or_ps(inV1.mValue, inV2.mValue);
  215. #elif defined(JPH_USE_NEON)
  216. return vorrq_s32(inV1.mValue, inV2.mValue);
  217. #else
  218. #error Unsupported CPU architecture
  219. #endif
  220. }
  221. Vec3 Vec3::sXor(Vec3Arg inV1, Vec3Arg inV2)
  222. {
  223. #if defined(JPH_USE_SSE)
  224. return _mm_xor_ps(inV1.mValue, inV2.mValue);
  225. #elif defined(JPH_USE_NEON)
  226. return veorq_s32(inV1.mValue, inV2.mValue);
  227. #else
  228. #error Unsupported CPU architecture
  229. #endif
  230. }
  231. Vec3 Vec3::sAnd(Vec3Arg inV1, Vec3Arg inV2)
  232. {
  233. #if defined(JPH_USE_SSE)
  234. return _mm_and_ps(inV1.mValue, inV2.mValue);
  235. #elif defined(JPH_USE_NEON)
  236. return vandq_s32(inV1.mValue, inV2.mValue);
  237. #else
  238. #error Unsupported CPU architecture
  239. #endif
  240. }
  241. Vec3 Vec3::sUnitSpherical(float inTheta, float inPhi)
  242. {
  243. float sint = sin(inTheta);
  244. return Vec3(sint * cos(inPhi), sint * sin(inPhi), cos(inTheta));
  245. }
  246. template <class Random>
  247. Vec3 Vec3::sRandom(Random &inRandom)
  248. {
  249. uniform_real_distribution<float> zero_to_one(0.0f, 1.0f);
  250. float theta = JPH_PI * zero_to_one(inRandom);
  251. float phi = 2.0f * JPH_PI * zero_to_one(inRandom);
  252. return sUnitSpherical(theta, phi);
  253. }
  254. bool Vec3::operator == (Vec3Arg inV2) const
  255. {
  256. return sEquals(*this, inV2).TestAllXYZTrue();
  257. }
  258. bool Vec3::IsClose(Vec3Arg inV2, float inMaxDistSq) const
  259. {
  260. return (inV2 - *this).LengthSq() <= inMaxDistSq;
  261. }
  262. bool Vec3::IsNearZero(float inMaxDistSq) const
  263. {
  264. return LengthSq() <= inMaxDistSq;
  265. }
  266. Vec3 Vec3::operator * (Vec3Arg inV2) const
  267. {
  268. #if defined(JPH_USE_SSE)
  269. return _mm_mul_ps(mValue, inV2.mValue);
  270. #elif defined(JPH_USE_NEON)
  271. return vmulq_f32(mValue, inV2.mValue);
  272. #else
  273. #error Unsupported CPU architecture
  274. #endif
  275. }
  276. Vec3 Vec3::operator * (float inV2) const
  277. {
  278. #if defined(JPH_USE_SSE)
  279. return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
  280. #elif defined(JPH_USE_NEON)
  281. return vmulq_n_f32(mValue, inV2);
  282. #else
  283. #error Unsupported CPU architecture
  284. #endif
  285. }
  286. Vec3 operator * (float inV1, Vec3Arg inV2)
  287. {
  288. #if defined(JPH_USE_SSE)
  289. return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
  290. #elif defined(JPH_USE_NEON)
  291. return vmulq_n_f32(inV2.mValue, inV1);
  292. #else
  293. #error Unsupported CPU architecture
  294. #endif
  295. }
  296. Vec3 Vec3::operator / (float inV2) const
  297. {
  298. #if defined(JPH_USE_SSE)
  299. return _mm_div_ps(mValue, _mm_set1_ps(inV2));
  300. #elif defined(JPH_USE_NEON)
  301. return vdivq_f32(mValue, vdupq_n_f32(inV2));
  302. #else
  303. #error Unsupported CPU architecture
  304. #endif
  305. }
  306. Vec3 &Vec3::operator *= (float inV2)
  307. {
  308. #if defined(JPH_USE_SSE)
  309. mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
  310. #elif defined(JPH_USE_NEON)
  311. mValue = vmulq_n_f32(mValue, inV2);
  312. #else
  313. #error Unsupported CPU architecture
  314. #endif
  315. return *this;
  316. }
  317. Vec3 &Vec3::operator *= (Vec3Arg inV2)
  318. {
  319. #if defined(JPH_USE_SSE)
  320. mValue = _mm_mul_ps(mValue, inV2.mValue);
  321. #elif defined(JPH_USE_NEON)
  322. mValue = vmulq_f32(mValue, inV2.mValue);
  323. #else
  324. #error Unsupported CPU architecture
  325. #endif
  326. return *this;
  327. }
  328. Vec3 &Vec3::operator /= (float inV2)
  329. {
  330. #if defined(JPH_USE_SSE)
  331. mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
  332. #elif defined(JPH_USE_NEON)
  333. mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
  334. #else
  335. #error Unsupported CPU architecture
  336. #endif
  337. return *this;
  338. }
  339. Vec3 Vec3::operator + (Vec3Arg inV2) const
  340. {
  341. #if defined(JPH_USE_SSE)
  342. return _mm_add_ps(mValue, inV2.mValue);
  343. #elif defined(JPH_USE_NEON)
  344. return vaddq_f32(mValue, inV2.mValue);
  345. #else
  346. #error Unsupported CPU architecture
  347. #endif
  348. }
  349. Vec3 &Vec3::operator += (Vec3Arg inV2)
  350. {
  351. #if defined(JPH_USE_SSE)
  352. mValue = _mm_add_ps(mValue, inV2.mValue);
  353. #elif defined(JPH_USE_NEON)
  354. mValue = vaddq_f32(mValue, inV2.mValue);
  355. #else
  356. #error Unsupported CPU architecture
  357. #endif
  358. return *this;
  359. }
  360. Vec3 Vec3::operator - () const
  361. {
  362. #if defined(JPH_USE_SSE)
  363. return _mm_sub_ps(_mm_setzero_ps(), mValue);
  364. #elif defined(JPH_USE_NEON)
  365. return vnegq_f32(mValue);
  366. #else
  367. #error Unsupported CPU architecture
  368. #endif
  369. }
  370. Vec3 Vec3::operator - (Vec3Arg inV2) const
  371. {
  372. #if defined(JPH_USE_SSE)
  373. return _mm_sub_ps(mValue, inV2.mValue);
  374. #elif defined(JPH_USE_NEON)
  375. return vsubq_f32(mValue, inV2.mValue);
  376. #else
  377. #error Unsupported CPU architecture
  378. #endif
  379. }
  380. Vec3 &Vec3::operator -= (Vec3Arg inV2)
  381. {
  382. #if defined(JPH_USE_SSE)
  383. mValue = _mm_sub_ps(mValue, inV2.mValue);
  384. #elif defined(JPH_USE_NEON)
  385. mValue = vsubq_f32(mValue, inV2.mValue);
  386. #else
  387. #error Unsupported CPU architecture
  388. #endif
  389. return *this;
  390. }
  391. Vec3 Vec3::operator / (Vec3Arg inV2) const
  392. {
  393. inV2.CheckW(); // Check W equals Z to avoid div by zero
  394. #if defined(JPH_USE_SSE)
  395. return _mm_div_ps(mValue, inV2.mValue);
  396. #elif defined(JPH_USE_NEON)
  397. return vdivq_f32(mValue, inV2.mValue);
  398. #else
  399. #error Unsupported CPU architecture
  400. #endif
  401. }
  402. Vec4 Vec3::SplatX() const
  403. {
  404. #if defined(JPH_USE_SSE)
  405. return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
  406. #elif defined(JPH_USE_NEON)
  407. return vdupq_laneq_f32(mValue, 0);
  408. #else
  409. #error Unsupported CPU architecture
  410. #endif
  411. }
  412. Vec4 Vec3::SplatY() const
  413. {
  414. #if defined(JPH_USE_SSE)
  415. return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
  416. #elif defined(JPH_USE_NEON)
  417. return vdupq_laneq_f32(mValue, 1);
  418. #else
  419. #error Unsupported CPU architecture
  420. #endif
  421. }
  422. Vec4 Vec3::SplatZ() const
  423. {
  424. #if defined(JPH_USE_SSE)
  425. return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
  426. #elif defined(JPH_USE_NEON)
  427. return vdupq_laneq_f32(mValue, 2);
  428. #else
  429. #error Unsupported CPU architecture
  430. #endif
  431. }
  432. int Vec3::GetLowestComponentIndex() const
  433. {
  434. return GetX() < GetY() ? (GetZ() < GetX() ? 2 : 0) : (GetZ() < GetY() ? 2 : 1);
  435. }
  436. int Vec3::GetHighestComponentIndex() const
  437. {
  438. return GetX() > GetY() ? (GetZ() > GetX() ? 2 : 0) : (GetZ() > GetY() ? 2 : 1);
  439. }
  440. Vec3 Vec3::Abs() const
  441. {
  442. #if defined(JPH_USE_SSE)
  443. return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
  444. #elif defined(JPH_USE_NEON)
  445. return vabsq_f32(mValue);
  446. #else
  447. #error Unsupported CPU architecture
  448. #endif
  449. }
  450. Vec3 Vec3::Reciprocal() const
  451. {
  452. return sReplicate(1.0f) / mValue;
  453. }
  454. Vec3 Vec3::Cross(Vec3Arg inV2) const
  455. {
  456. #if defined(JPH_USE_SSE)
  457. Type t1 = _mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
  458. t1 = _mm_mul_ps(t1, mValue);
  459. Type t2 = _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
  460. t2 = _mm_mul_ps(t2, inV2.mValue);
  461. Type t3 = _mm_sub_ps(t1, t2);
  462. return _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
  463. #elif defined(JPH_USE_NEON)
  464. Type t1 = __builtin_shufflevector(inV2.mValue, inV2.mValue, 1, 2, 0, 0); // Assure Z and W are the same
  465. t1 = vmulq_f32(t1, mValue);
  466. Type t2 = __builtin_shufflevector(mValue, mValue, 1, 2, 0, 0); // Assure Z and W are the same
  467. t2 = vmulq_f32(t2, inV2.mValue);
  468. Type t3 = vsubq_f32(t1, t2);
  469. return __builtin_shufflevector(t3, t3, 1, 2, 0, 0); // Assure Z and W are the same
  470. #else
  471. #error Unsupported CPU architecture
  472. #endif
  473. }
  474. Vec3 Vec3::DotV(Vec3Arg inV2) const
  475. {
  476. #if defined(JPH_USE_SSE)
  477. return _mm_dp_ps(mValue, inV2.mValue, 0x7f);
  478. #elif defined(JPH_USE_NEON)
  479. float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
  480. mul = vsetq_lane_f32(0, mul, 3);
  481. return vdupq_n_f32(vaddvq_f32(mul));
  482. #else
  483. #error Unsupported CPU architecture
  484. #endif
  485. }
  486. Vec4 Vec3::DotV4(Vec3Arg inV2) const
  487. {
  488. #if defined(JPH_USE_SSE)
  489. return _mm_dp_ps(mValue, inV2.mValue, 0x7f);
  490. #elif defined(JPH_USE_NEON)
  491. float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
  492. mul = vsetq_lane_f32(0, mul, 3);
  493. return vdupq_n_f32(vaddvq_f32(mul));
  494. #else
  495. #error Unsupported CPU architecture
  496. #endif
  497. }
  498. float Vec3::Dot(Vec3Arg inV2) const
  499. {
  500. #if defined(JPH_USE_SSE)
  501. return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0x7f));
  502. #elif defined(JPH_USE_NEON)
  503. float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
  504. mul = vsetq_lane_f32(0, mul, 3);
  505. return vaddvq_f32(mul);
  506. #else
  507. #error Unsupported CPU architecture
  508. #endif
  509. }
  510. float Vec3::LengthSq() const
  511. {
  512. #if defined(JPH_USE_SSE)
  513. return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0x7f));
  514. #elif defined(JPH_USE_NEON)
  515. float32x4_t mul = vmulq_f32(mValue, mValue);
  516. mul = vsetq_lane_f32(0, mul, 3);
  517. return vaddvq_f32(mul);
  518. #else
  519. #error Unsupported CPU architecture
  520. #endif
  521. }
  522. float Vec3::Length() const
  523. {
  524. #if defined(JPH_USE_SSE)
  525. return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0x7f)));
  526. #elif defined(JPH_USE_NEON)
  527. float32x4_t mul = vmulq_f32(mValue, mValue);
  528. mul = vsetq_lane_f32(0, mul, 3);
  529. float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
  530. return vget_lane_f32(vsqrt_f32(sum), 0);
  531. #else
  532. #error Unsupported CPU architecture
  533. #endif
  534. }
  535. Vec3 Vec3::Sqrt() const
  536. {
  537. #if defined(JPH_USE_SSE)
  538. return _mm_sqrt_ps(mValue);
  539. #elif defined(JPH_USE_NEON)
  540. return vsqrtq_f32(mValue);
  541. #else
  542. #error Unsupported CPU architecture
  543. #endif
  544. }
  545. Vec3 Vec3::Normalized() const
  546. {
  547. #if defined(JPH_USE_SSE)
  548. return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0x7f)));
  549. #elif defined(JPH_USE_NEON)
  550. float32x4_t mul = vmulq_f32(mValue, mValue);
  551. mul = vsetq_lane_f32(0, mul, 3);
  552. float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
  553. return vdivq_f32(mValue, vsqrtq_f32(sum));
  554. #else
  555. #error Unsupported CPU architecture
  556. #endif
  557. }
  558. Vec3 Vec3::NormalizedOr(Vec3Arg inZeroValue) const
  559. {
  560. #if defined(JPH_USE_SSE)
  561. Type len_sq = _mm_dp_ps(mValue, mValue, 0x7f);
  562. Type is_zero = _mm_cmpeq_ps(len_sq, _mm_setzero_ps());
  563. #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
  564. if (_mm_movemask_ps(is_zero) == 0xf)
  565. return inZeroValue;
  566. else
  567. return _mm_div_ps(mValue, _mm_sqrt_ps(len_sq));
  568. #else
  569. return _mm_blendv_ps(_mm_div_ps(mValue, _mm_sqrt_ps(len_sq)), inZeroValue.mValue, is_zero);
  570. #endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
  571. #elif defined(JPH_USE_NEON)
  572. float32x4_t mul = vmulq_f32(mValue, mValue);
  573. mul = vsetq_lane_f32(0, mul, 3);
  574. float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
  575. float32x4_t len = vsqrtq_f32(sum);
  576. float32x4_t is_zero = vceqq_f32(len, vdupq_n_f32(0));
  577. return vbslq_f32(is_zero, inZeroValue.mValue, vdivq_f32(mValue, len));
  578. #else
  579. #error Unsupported CPU architecture
  580. #endif
  581. }
  582. bool Vec3::IsNormalized(float inTolerance) const
  583. {
  584. return abs(LengthSq() - 1.0f) <= inTolerance;
  585. }
  586. bool Vec3::IsNaN() const
  587. {
  588. #if defined(JPH_USE_SSE)
  589. return (_mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) & 0x7) != 0;
  590. #elif defined(JPH_USE_NEON)
  591. uint32x4_t mask = { 1, 1, 1, 0 };
  592. uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
  593. return vaddvq_u32(vandq_u32(is_equal, mask)) != 3;
  594. #else
  595. #error Unsupported CPU architecture
  596. #endif
  597. }
  598. void Vec3::StoreFloat3(Float3 *outV) const
  599. {
  600. #if defined(JPH_USE_SSE)
  601. _mm_store_ss(&outV->x, mValue);
  602. Vec3 t = Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_UNUSED>();
  603. _mm_store_ss(&outV->y, t.mValue);
  604. t = t.Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_UNUSED>();
  605. _mm_store_ss(&outV->z, t.mValue);
  606. #elif defined(JPH_USE_NEON)
  607. float32x2_t xy = vget_low_f32(mValue);
  608. vst1_f32(&outV->x, xy);
  609. vst1q_lane_f32(&outV->z, mValue, 2);
  610. #else
  611. #error Unsupported CPU architecture
  612. #endif
  613. }
  614. UVec4 Vec3::ToInt() const
  615. {
  616. #if defined(JPH_USE_SSE)
  617. return _mm_cvttps_epi32(mValue);
  618. #elif defined(JPH_USE_NEON)
  619. return vcvtq_u32_f32(mValue);
  620. #else
  621. #error Unsupported CPU architecture
  622. #endif
  623. }
  624. UVec4 Vec3::ReinterpretAsInt() const
  625. {
  626. #if defined(JPH_USE_SSE)
  627. return UVec4(_mm_castps_si128(mValue));
  628. #elif defined(JPH_USE_NEON)
  629. return vreinterpretq_u32_f32(mValue);
  630. #else
  631. #error Unsupported CPU architecture
  632. #endif
  633. }
  634. float Vec3::ReduceMin() const
  635. {
  636. Vec3 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_Z>());
  637. v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
  638. return v.GetX();
  639. }
  640. float Vec3::ReduceMax() const
  641. {
  642. Vec3 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_Z>());
  643. v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
  644. return v.GetX();
  645. }
  646. Vec3 Vec3::GetNormalizedPerpendicular() const
  647. {
  648. if (abs(mF32[0]) > abs(mF32[1]))
  649. {
  650. float len = sqrt(mF32[0] * mF32[0] + mF32[2] * mF32[2]);
  651. return Vec3(mF32[2], 0.0f, -mF32[0]) / len;
  652. }
  653. else
  654. {
  655. float len = sqrt(mF32[1] * mF32[1] + mF32[2] * mF32[2]);
  656. return Vec3(0.0f, mF32[2], -mF32[1]) / len;
  657. }
  658. }
  659. Vec3 Vec3::GetSign() const
  660. {
  661. #if defined(JPH_USE_SSE)
  662. Type minus_one = _mm_set1_ps(-1.0f);
  663. Type one = _mm_set1_ps(1.0f);
  664. return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
  665. #elif defined(JPH_USE_NEON)
  666. Type minus_one = vdupq_n_f32(-1.0f);
  667. Type one = vdupq_n_f32(1.0f);
  668. return vorrq_s32(vandq_s32(mValue, minus_one), one);
  669. #else
  670. #error Unsupported CPU architecture
  671. #endif
  672. }
  673. } // JPH