Vec4.inl

// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#include <Jolt/Math/Trigonometry.h>
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/UVec4.h>

JPH_NAMESPACE_BEGIN

// Constructor
Vec4::Vec4(Vec3Arg inRHS) :
    mValue(inRHS.mValue)
{
}

Vec4::Vec4(Vec3Arg inRHS, float inW)
{
#if defined(JPH_USE_SSE4_1)
    mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
#elif defined(JPH_USE_NEON)
    mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
#else
    for (int i = 0; i < 3; i++)
        mF32[i] = inRHS.mF32[i];
    mF32[3] = inW;
#endif
}

Vec4::Vec4(float inX, float inY, float inZ, float inW)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_set_ps(inW, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
    // vcreate_f32 packs two floats (passed as a 64-bit integer) into a float32x2_t
    float32x2_t xy = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inX)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inY)) << 32));
    float32x2_t zw = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inW)) << 32));
    mValue = vcombine_f32(xy, zw);
#else
    mF32[0] = inX;
    mF32[1] = inY;
    mF32[2] = inZ;
    mF32[3] = inW;
#endif
}

template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
    return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
    return Vec4(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ], mF32[SwizzleW]);
#endif
}
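
// Usage sketch: the template parameters select source components by index via the
// SWIZZLE_X..SWIZZLE_W constants (0..3) from Swizzle.h. E.g. reversing component order:
//
//    Vec4 v(1.0f, 2.0f, 3.0f, 4.0f);
//    Vec4 reversed = v.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(); // (4, 3, 2, 1)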

Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
    return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
    return vdupq_n_f32(0);
#else
    return Vec4(0, 0, 0, 0);
#endif
}

Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
    return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
    return vdupq_n_f32(inV);
#else
    return Vec4(inV, inV, inV, inV);
#endif
}

Vec4 Vec4::sNaN()
{
    return sReplicate(numeric_limits<float>::quiet_NaN());
}

Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
    return vld1q_f32(&inV->x);
#else
    return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}

Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
    return vld1q_f32(&inV->x);
#else
    return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}

template <const int Scale>
Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_AVX2
    return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
#else
    const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
    Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
    Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
    Type xy = _mm_unpacklo_ps(x, y);
    Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
    Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
    Type zw = _mm_unpacklo_ps(z, w);
    return _mm_movelh_ps(xy, zw);
#endif
#else
    const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
    float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
    float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
    float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
    float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
    return Vec4(x, y, z, w);
#endif
}
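
// Usage sketch: Scale is a byte multiplier applied to each offset, so strided loads can
// be expressed directly. Note that when JPH_USE_AVX2 is defined, _mm_i32gather_ps limits
// Scale to 1, 2, 4 or 8. E.g. gathering every other element of a float array:
//
//    float data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
//    Vec4 v = Vec4::sGatherFloat4<4>(data, UVec4(0, 2, 4, 6)); // (0, 2, 4, 6)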

Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vminq_f32(inV1.mValue, inV2.mValue);
#else
    return Vec4(min(inV1.mF32[0], inV2.mF32[0]),
                min(inV1.mF32[1], inV2.mF32[1]),
                min(inV1.mF32[2], inV2.mF32[2]),
                min(inV1.mF32[3], inV2.mF32[3]));
#endif
}

Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
    return Vec4(max(inV1.mF32[0], inV2.mF32[0]),
                max(inV1.mF32[1], inV2.mF32[1]),
                max(inV1.mF32[2], inV2.mF32[2]),
                max(inV1.mF32[3], inV2.mF32[3]));
#endif
}

UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vceqq_f32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
                 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
                 inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0,
                 inV1.mF32[3] == inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcltq_f32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
                 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
                 inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0,
                 inV1.mF32[3] < inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcleq_f32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
                 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
                 inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0,
                 inV1.mF32[3] <= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
                 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
                 inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0,
                 inV1.mF32[3] > inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
                 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
                 inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0,
                 inV1.mF32[3] >= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
    return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
    return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
    return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
    return Vec4(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
                inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
                inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2],
                inMul1.mF32[3] * inMul2.mF32[3] + inAdd.mF32[3]);
#endif
}

Vec4 Vec4::sSelect(Vec4Arg inV1, Vec4Arg inV2, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
#elif defined(JPH_USE_NEON)
    // Arithmetic shift replicates the sign bit across each lane to form a full select mask
    return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inV2.mValue, inV1.mValue);
#else
    Vec4 result;
    for (int i = 0; i < 4; i++)
        result.mF32[i] = (inControl.mU32[i] & 0x80000000u)? inV2.mF32[i] : inV1.mF32[i]; // Test only the sign bit to match the SIMD paths
    return result;
#endif
}
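
// Usage sketch: the control mask typically comes from one of the comparison functions
// above, which set all bits (including the sign bit) of a lane when the test passes.
// E.g. a branchless per-component clamp to zero:
//
//    Vec4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//    Vec4 clamped = Vec4::sSelect(v, Vec4::sZero(), Vec4::sLess(v, Vec4::sZero())); // (0, 2, 0, 4)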

Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    return UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    return UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    return UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
{
    // Pass 1, test 1st vs 3rd, 2nd vs 4th
    Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v1, c1);
    ioIndex = UVec4::sSelect(ioIndex, i1, c1);

    // Pass 2, test 1st vs 2nd, 3rd vs 4th
    Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v2, c2);
    ioIndex = UVec4::sSelect(ioIndex, i2, c2);

    // Pass 3, test 2nd vs 3rd component
    Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v3, c3);
    ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}

void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
{
    // Pass 1, test 1st vs 3rd, 2nd vs 4th
    Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v1, c1);
    ioIndex = UVec4::sSelect(ioIndex, i1, c1);

    // Pass 2, test 1st vs 2nd, 3rd vs 4th
    Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v2, c2);
    ioIndex = UVec4::sSelect(ioIndex, i2, c2);

    // Pass 3, test 2nd vs 3rd component
    Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v3, c3);
    ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
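
// Usage sketch: sSort4 sorts the values ascending in three compare/swap passes while
// applying the same permutation to ioIndex, so the original position of each value
// can be recovered afterwards:
//
//    Vec4 value(3.0f, 1.0f, 4.0f, 2.0f);
//    UVec4 index(0, 1, 2, 3);
//    Vec4::sSort4(value, index); // value = (1, 2, 3, 4), index = (1, 3, 0, 2)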

bool Vec4::operator == (Vec4Arg inV2) const
{
    return sEquals(*this, inV2).TestAllTrue();
}

bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
    return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

bool Vec4::IsNormalized(float inTolerance) const
{
    return abs(LengthSq() - 1.0f) <= inTolerance;
}

bool Vec4::IsNaN() const
{
#if defined(JPH_USE_AVX512)
    return _mm_fpclass_ps_mask(mValue, 0b10000001) != 0;
#elif defined(JPH_USE_SSE)
    return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
    uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
    return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
    return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]) || isnan(mF32[3]);
#endif
}

Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_f32(mValue, inV2.mValue);
#else
    return Vec4(mF32[0] * inV2.mF32[0],
                mF32[1] * inV2.mF32[1],
                mF32[2] * inV2.mF32[2],
                mF32[3] * inV2.mF32[3]);
#endif
}

Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    return vmulq_n_f32(mValue, inV2);
#else
    return Vec4(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2, mF32[3] * inV2);
#endif
}

/// Multiply vector with float
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_n_f32(inV2.mValue, inV1);
#else
    return Vec4(inV1 * inV2.mF32[0],
                inV1 * inV2.mF32[1],
                inV1 * inV2.mF32[2],
                inV1 * inV2.mF32[3]);
#endif
}

Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
    return Vec4(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2, mF32[3] / inV2);
#endif
}

Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    mValue = vmulq_n_f32(mValue, inV2);
#else
    for (int i = 0; i < 4; ++i)
        mF32[i] *= inV2;
#endif
    return *this;
}

Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vmulq_f32(mValue, inV2.mValue);
#else
    for (int i = 0; i < 4; ++i)
        mF32[i] *= inV2.mF32[i];
#endif
    return *this;
}

Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
    for (int i = 0; i < 4; ++i)
        mF32[i] /= inV2;
#endif
    return *this;
}

Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vaddq_f32(mValue, inV2.mValue);
#else
    return Vec4(mF32[0] + inV2.mF32[0],
                mF32[1] + inV2.mF32[1],
                mF32[2] + inV2.mF32[2],
                mF32[3] + inV2.mF32[3]);
#endif
}

Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vaddq_f32(mValue, inV2.mValue);
#else
    for (int i = 0; i < 4; ++i)
        mF32[i] += inV2.mF32[i];
#endif
    return *this;
}

Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
    return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
    return vnegq_f32(mValue);
#else
    return Vec4(-mF32[0], -mF32[1], -mF32[2], -mF32[3]);
#endif
}

Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vsubq_f32(mValue, inV2.mValue);
#else
    return Vec4(mF32[0] - inV2.mF32[0],
                mF32[1] - inV2.mF32[1],
                mF32[2] - inV2.mF32[2],
                mF32[3] - inV2.mF32[3]);
#endif
}

Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vsubq_f32(mValue, inV2.mValue);
#else
    for (int i = 0; i < 4; ++i)
        mF32[i] -= inV2.mF32[i];
#endif
    return *this;
}

Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vdivq_f32(mValue, inV2.mValue);
#else
    return Vec4(mF32[0] / inV2.mF32[0],
                mF32[1] / inV2.mF32[1],
                mF32[2] / inV2.mF32[2],
                mF32[3] / inV2.mF32[3]);
#endif
}

Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 0);
#else
    return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
#endif
}

Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 1);
#else
    return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
#endif
}

Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 2);
#else
    return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
#endif
}

Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 3);
#else
    return Vec4(mF32[3], mF32[3], mF32[3], mF32[3]);
#endif
}

Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_AVX512)
    return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
    return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
    return vabsq_f32(mValue);
#else
    return Vec4(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]), abs(mF32[3]));
#endif
}

Vec4 Vec4::Reciprocal() const
{
    return sReplicate(1.0f) / mValue;
}

Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
    return vdupq_n_f32(vaddvq_f32(mul));
#else
    float dot = 0.0f;
    for (int i = 0; i < 4; i++)
        dot += mF32[i] * inV2.mF32[i];
    return Vec4::sReplicate(dot);
#endif
}

float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
    return vaddvq_f32(mul);
#else
    float dot = 0.0f;
    for (int i = 0; i < 4; i++)
        dot += mF32[i] * inV2.mF32[i];
    return dot;
#endif
}

float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    return vaddvq_f32(mul);
#else
    float len_sq = 0.0f;
    for (int i = 0; i < 4; i++)
        len_sq += mF32[i] * mF32[i];
    return len_sq;
#endif
}

float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
    return vget_lane_f32(vsqrt_f32(sum), 0);
#else
    return sqrt(LengthSq());
#endif
}

Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
    return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
    return vsqrtq_f32(mValue);
#else
    return Vec4(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]), sqrt(mF32[3]));
#endif
}

Vec4 Vec4::GetSign() const
{
#if defined(JPH_USE_AVX512)
    return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
    Type minus_one = _mm_set1_ps(-1.0f);
    Type one = _mm_set1_ps(1.0f);
    return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
    Type minus_one = vdupq_n_f32(-1.0f);
    Type one = vdupq_n_f32(1.0f);
    // The bitwise NEON intrinsics operate on integer vectors, so reinterpret to uint32x4_t and back
    return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
    return Vec4(signbit(mF32[0])? -1.0f : 1.0f,
                signbit(mF32[1])? -1.0f : 1.0f,
                signbit(mF32[2])? -1.0f : 1.0f,
                signbit(mF32[3])? -1.0f : 1.0f);
#endif
}

Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
    return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
    return *this / Length();
#endif
}

void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
    vst1q_f32(&outV->x, mValue);
#else
    for (int i = 0; i < 4; ++i)
        (&outV->x)[i] = mF32[i];
#endif
}

UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
    return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
    return vcvtq_u32_f32(mValue);
#else
    return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
#endif
}

UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
    return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
    return vreinterpretq_u32_f32(mValue);
#else
    return *reinterpret_cast<const UVec4 *>(this);
#endif
}

int Vec4::GetSignBits() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
    int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
    return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#else
    return (signbit(mF32[0])? 1 : 0) | (signbit(mF32[1])? 2 : 0) | (signbit(mF32[2])? 4 : 0) | (signbit(mF32[3])? 8 : 0);
#endif
}

float Vec4::ReduceMin() const
{
    Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
    v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
    return v.GetX();
}

float Vec4::ReduceMax() const
{
    Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
    v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
    return v.GetX();
}

void Vec4::SinCos(Vec4 &outSin, Vec4 &outCos) const
{
    // Implementation based on sinf.c from the cephes library, combines sinf and cosf in a single function, changes octants to quadrants and vectorizes it
    // Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

    // Make argument positive and remember sign for sin only since cos is symmetric around x (highest bit of a float is the sign bit)
    UVec4 sin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
    Vec4 x = Vec4::sXor(*this, sin_sign.ReinterpretAsFloat());

    // x / (PI / 2) rounded to nearest int gives us the quadrant closest to x
    UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();

    // Make x relative to the closest quadrant.
    // This does x = x - quadrant * PI / 2 using a two step Cody-Waite argument reduction.
    // This improves the accuracy of the result by avoiding loss of significant bits in the subtraction.
    // We start with x = x - quadrant * PI / 2, PI / 2 in hexadecimal notation is 0x3fc90fdb, we remove the lowest 16 bits to
    // get 0x3fc90000 (= 1.5703125) this means we can now multiply with a number of up to 2^16 without losing any bits.
    // This leaves us with: x = (x - quadrant * 1.5703125) - quadrant * (PI / 2 - 1.5703125).
    // PI / 2 - 1.5703125 in hexadecimal is 0x39fdaa22, stripping the lowest 12 bits we get 0x39fda000 (= 0.0004837512969970703125)
    // This leaves us with: x = ((x - quadrant * 1.5703125) - quadrant * 0.0004837512969970703125) - quadrant * (PI / 2 - 1.5703125 - 0.0004837512969970703125)
    // See: https://stackoverflow.com/questions/42455143/sine-cosine-modular-extended-precision-arithmetic
    // After this we have x in the range [-PI / 4, PI / 4].
    Vec4 float_quadrant = quadrant.ToFloat();
    x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;

    // Calculate x2 = x^2
    Vec4 x2 = x * x;

    // Taylor expansion:
    // Cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! + ... = (((x2/8! - 1/6!) * x2 + 1/4!) * x2 - 1/2!) * x2 + 1
    Vec4 taylor_cos = ((2.443315711809948e-5f * x2 - Vec4::sReplicate(1.388731625493765e-3f)) * x2 + Vec4::sReplicate(4.166664568298827e-2f)) * x2 * x2 - 0.5f * x2 + Vec4::sReplicate(1.0f);
    // Sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ... = ((-x2/7! + 1/5!) * x2 - 1/3!) * x2 * x + x
    Vec4 taylor_sin = ((-1.9515295891e-4f * x2 + Vec4::sReplicate(8.3321608736e-3f)) * x2 - Vec4::sReplicate(1.6666654611e-1f)) * x2 * x + x;

    // The lowest 2 bits of quadrant indicate the quadrant that we are in.
    // Let x be the original input value and x' our value that has been mapped to the range [-PI / 4, PI / 4].
    // Since cos(x) = sin(x - PI / 2) and since we want to use the Taylor expansion as close as possible to 0,
    // we can alternate between using the Taylor expansion for sin and cos according to the following table:
    //
    // quadrant    sin(x)     cos(x)
    // XXX00b      sin(x')    cos(x')
    // XXX01b      cos(x')   -sin(x')
    // XXX10b     -sin(x')   -cos(x')
    // XXX11b     -cos(x')    sin(x')
    //
    // So: sin_sign = bit2, cos_sign = bit1 ^ bit2, bit1 determines if we use sin or cos Taylor expansion
    UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
    UVec4 bit2 = UVec4::sAnd(quadrant.LogicalShiftLeft<30>(), UVec4::sReplicate(0x80000000U));

    // Select which one of the results is sin and which one is cos
    Vec4 s = Vec4::sSelect(taylor_sin, taylor_cos, bit1);
    Vec4 c = Vec4::sSelect(taylor_cos, taylor_sin, bit1);

    // Update the signs
    sin_sign = UVec4::sXor(sin_sign, bit2);
    UVec4 cos_sign = UVec4::sXor(bit1, bit2);

    // Correct the signs
    outSin = Vec4::sXor(s, sin_sign.ReinterpretAsFloat());
    outCos = Vec4::sXor(c, cos_sign.ReinterpretAsFloat());
}
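
// Usage sketch: computing sine and cosine of four angles in one call:
//
//    Vec4 angle(0.0f, 0.25f * JPH_PI, 0.5f * JPH_PI, JPH_PI);
//    Vec4 s, c;
//    angle.SinCos(s, c); // s ~ (0, 0.707, 1, 0), c ~ (1, 0.707, 0, -1)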

Vec4 Vec4::Tan() const
{
    // Implementation based on tanf.c from the cephes library, see Vec4::SinCos for further details
    // Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

    // Make argument positive
    UVec4 tan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
    Vec4 x = Vec4::sXor(*this, tan_sign.ReinterpretAsFloat());

    // x / (PI / 2) rounded to nearest int gives us the quadrant closest to x
    UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();

    // Remap x to range [-PI / 4, PI / 4], see Vec4::SinCos
    Vec4 float_quadrant = quadrant.ToFloat();
    x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;

    // Calculate x2 = x^2
    Vec4 x2 = x * x;

    // Roughly equivalent to the Taylor expansion:
    // Tan(x) = x + x^3/3 + 2*x^5/15 + 17*x^7/315 + 62*x^9/2835 + ...
    Vec4 tan =
        (((((9.38540185543e-3f * x2 + Vec4::sReplicate(3.11992232697e-3f)) * x2 + Vec4::sReplicate(2.44301354525e-2f)) * x2
        + Vec4::sReplicate(5.34112807005e-2f)) * x2 + Vec4::sReplicate(1.33387994085e-1f)) * x2 + Vec4::sReplicate(3.33331568548e-1f)) * x2 * x + x;

    // For the 2nd and 4th quadrant we need to invert the value
    UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
    tan = Vec4::sSelect(tan, Vec4::sReplicate(-1.0f) / (tan JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))), bit1); // Add small epsilon to prevent div by zero, works because tan is always positive

    // Put the sign back
    return Vec4::sXor(tan, tan_sign.ReinterpretAsFloat());
}

Vec4 Vec4::ASin() const
{
    // Implementation based on asinf.c from the cephes library
    // Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

    // Make argument positive
    UVec4 asin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
    Vec4 a = Vec4::sXor(*this, asin_sign.ReinterpretAsFloat());

    // ASin is not defined outside the range [-1, 1] but it often happens that a value is slightly above 1 so we just clamp here
    a = Vec4::sMin(a, Vec4::sReplicate(1.0f));

    // When |x| <= 0.5 we use the asin approximation as is
    Vec4 z1 = a * a;
    Vec4 x1 = a;

    // When |x| > 0.5 we use the identity asin(x) = PI / 2 - 2 * asin(sqrt((1 - x) / 2))
    Vec4 z2 = 0.5f * (Vec4::sReplicate(1.0f) - a);
    Vec4 x2 = z2.Sqrt();

    // Select which of the two situations we have
    UVec4 greater = Vec4::sGreater(a, Vec4::sReplicate(0.5f));
    Vec4 z = Vec4::sSelect(z1, z2, greater);
    Vec4 x = Vec4::sSelect(x1, x2, greater);

    // Polynomial approximation of asin
    z = ((((4.2163199048e-2f * z + Vec4::sReplicate(2.4181311049e-2f)) * z + Vec4::sReplicate(4.5470025998e-2f)) * z + Vec4::sReplicate(7.4953002686e-2f)) * z + Vec4::sReplicate(1.6666752422e-1f)) * z * x + x;

    // If |x| > 0.5 we need to apply the remainder of the identity above
    z = Vec4::sSelect(z, Vec4::sReplicate(0.5f * JPH_PI) - (z + z), greater);

    // Put the sign back
    return Vec4::sXor(z, asin_sign.ReinterpretAsFloat());
}

Vec4 Vec4::ACos() const
{
    // Not the most accurate, but simple
    return Vec4::sReplicate(0.5f * JPH_PI) - ASin();
}

Vec4 Vec4::ATan() const
{
    // Implementation based on atanf.c from the cephes library
    // Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

    // Make argument positive
    UVec4 atan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
    Vec4 x = Vec4::sXor(*this, atan_sign.ReinterpretAsFloat());
    Vec4 y = Vec4::sZero();

    // If x > Tan(PI / 8)
    UVec4 greater1 = Vec4::sGreater(x, Vec4::sReplicate(0.4142135623730950f));
    Vec4 x1 = (x - Vec4::sReplicate(1.0f)) / (x + Vec4::sReplicate(1.0f));

    // If x > Tan(3 * PI / 8)
    UVec4 greater2 = Vec4::sGreater(x, Vec4::sReplicate(2.414213562373095f));
    Vec4 x2 = Vec4::sReplicate(-1.0f) / (x JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))); // Add small epsilon to prevent div by zero, works because x is always positive

    // Apply first if
    x = Vec4::sSelect(x, x1, greater1);
    y = Vec4::sSelect(y, Vec4::sReplicate(0.25f * JPH_PI), greater1);

    // Apply second if
    x = Vec4::sSelect(x, x2, greater2);
    y = Vec4::sSelect(y, Vec4::sReplicate(0.5f * JPH_PI), greater2);

    // Polynomial approximation
    Vec4 z = x * x;
    y += (((8.05374449538e-2f * z - Vec4::sReplicate(1.38776856032e-1f)) * z + Vec4::sReplicate(1.99777106478e-1f)) * z - Vec4::sReplicate(3.33329491539e-1f)) * z * x + x;

    // Put the sign back
    return Vec4::sXor(y, atan_sign.ReinterpretAsFloat());
}

Vec4 Vec4::sATan2(Vec4Arg inY, Vec4Arg inX)
{
    UVec4 sign_mask = UVec4::sReplicate(0x80000000U);

    // Determine absolute value and sign of y
    UVec4 y_sign = UVec4::sAnd(inY.ReinterpretAsInt(), sign_mask);
    Vec4 y_abs = Vec4::sXor(inY, y_sign.ReinterpretAsFloat());

    // Determine absolute value and sign of x
    UVec4 x_sign = UVec4::sAnd(inX.ReinterpretAsInt(), sign_mask);
    Vec4 x_abs = Vec4::sXor(inX, x_sign.ReinterpretAsFloat());

    // Always divide smallest / largest to avoid dividing by zero
    UVec4 x_is_numerator = Vec4::sLess(x_abs, y_abs);
    Vec4 numerator = Vec4::sSelect(y_abs, x_abs, x_is_numerator);
    Vec4 denominator = Vec4::sSelect(x_abs, y_abs, x_is_numerator);
    Vec4 atan = (numerator / denominator).ATan();

    // If we calculated x / y instead of y / x the result is PI / 2 - result (note that this is true because we know the result is positive because the input was positive)
    atan = Vec4::sSelect(atan, Vec4::sReplicate(0.5f * JPH_PI) - atan, x_is_numerator);

    // Now we need to map to the correct quadrant
    // x_sign    y_sign    result
    // +1        +1        atan
    // -1        +1        -atan + PI
    // -1        -1        atan - PI
    // +1        -1        -atan
    // This can be written as: x_sign * y_sign * (atan - (x_sign < 0? PI : 0))
    atan -= Vec4::sAnd(x_sign.ArithmeticShiftRight<31>().ReinterpretAsFloat(), Vec4::sReplicate(JPH_PI));
    atan = Vec4::sXor(atan, UVec4::sXor(x_sign, y_sign).ReinterpretAsFloat());
    return atan;
}
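
// Usage sketch: sATan2 follows the usual atan2 quadrant convention per lane:
//
//    Vec4 y(1.0f, 1.0f, -1.0f, -1.0f);
//    Vec4 x(1.0f, -1.0f, -1.0f, 1.0f);
//    Vec4 angle = Vec4::sATan2(y, x); // ~ (PI/4, 3*PI/4, -3*PI/4, -PI/4)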

JPH_NAMESPACE_END