// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#include <Jolt/Math/Trigonometry.h>
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/UVec4.h>

JPH_NAMESPACE_BEGIN

// Constructor
Vec4::Vec4(Vec3Arg inRHS) :
	mValue(inRHS.mValue)
{
}

Vec4::Vec4(Vec3Arg inRHS, float inW)
{
#if defined(JPH_USE_SSE4_1)
	mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
#elif defined(JPH_USE_NEON)
	mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
#else
	for (int i = 0; i < 3; i++)
		mF32[i] = inRHS.mF32[i];
	mF32[3] = inW;
#endif
}

Vec4::Vec4(float inX, float inY, float inZ, float inW)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_ps(inW, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
	uint32x2_t xy = vcreate_u32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inX)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inY)) << 32));
	uint32x2_t zw = vcreate_u32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inW)) << 32));
	mValue = vreinterpretq_f32_u32(vcombine_u32(xy, zw));
#else
	mF32[0] = inX;
	mF32[1] = inY;
	mF32[2] = inZ;
	mF32[3] = inW;
#endif
}

template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
	static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
	static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
	static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
	static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
	return Vec4(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ], mF32[SwizzleW]);
#endif
}
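
// Illustrative usage sketch (not part of the original file): Swizzle remaps
// components by template index, so reversing a vector looks like this:
//
//	Vec4 v(1.0f, 2.0f, 3.0f, 4.0f);
//	Vec4 r = v.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(); // (4, 3, 2, 1)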

Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(0);
#else
	return Vec4(0, 0, 0, 0);
#endif
}

Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(inV);
#else
	return Vec4(inV, inV, inV, inV);
#endif
}

Vec4 Vec4::sNaN()
{
	return sReplicate(numeric_limits<float>::quiet_NaN());
}

Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
	return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}

Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
	return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}

template <const int Scale>
Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_AVX2
	return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
#else
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
	Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
	Type xy = _mm_unpacklo_ps(x, y);
	Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
	Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
	Type zw = _mm_unpacklo_ps(z, w);
	return _mm_movelh_ps(xy, zw);
#endif
#else
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
	float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
	float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
	float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
	return Vec4(x, y, z, w);
#endif
}
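
// Illustrative usage sketch (not part of the original file): Scale is a byte
// multiplier applied to each offset, matching the semantics of _mm_i32gather_ps
// (which also restricts it to 1, 2, 4 or 8 on the AVX2 path). Gathering a float
// member out of a hypothetical array of 8-byte structs:
//
//	struct Particle { float mMass; float mRadius; }; // hypothetical type, sizeof == 8
//	Particle particles[4] = { { 1, 0 }, { 2, 0 }, { 3, 0 }, { 4, 0 } };
//	Vec4 masses = Vec4::sGatherFloat4<sizeof(Particle)>(&particles[0].mMass, UVec4(0, 1, 2, 3)); // (1, 2, 3, 4)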

Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec4(min(inV1.mF32[0], inV2.mF32[0]),
				min(inV1.mF32[1], inV2.mF32[1]),
				min(inV1.mF32[2], inV2.mF32[2]),
				min(inV1.mF32[3], inV2.mF32[3]));
#endif
}

Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec4(max(inV1.mF32[0], inV2.mF32[0]),
				max(inV1.mF32[1], inV2.mF32[1]),
				max(inV1.mF32[2], inV2.mF32[2]),
				max(inV1.mF32[3], inV2.mF32[3]));
#endif
}

UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vceqq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] == inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcltq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] < inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcleq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] <= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] > inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] >= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
	return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
	return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
	return Vec4(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
				inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
				inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2],
				inMul1.mF32[3] * inMul2.mF32[3] + inAdd.mF32[3]);
#endif
}

Vec4 Vec4::sSelect(Vec4Arg inV1, Vec4Arg inV2, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1)
	return _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
#elif defined(JPH_USE_NEON)
	// Arithmetic shift replicates the sign bit across each lane so the bit select picks whole components
	return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inV2.mValue, inV1.mValue);
#else
	Vec4 result;
	for (int i = 0; i < 4; i++)
		result.mF32[i] = inControl.mU32[i] ? inV2.mF32[i] : inV1.mF32[i];
	return result;
#endif
}
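
// Illustrative usage sketch (not part of the original file): combined with a
// comparison, sSelect gives a branchless per-component choice, e.g. clamping
// negative components to zero:
//
//	Vec4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//	Vec4 clamped = Vec4::sSelect(v, Vec4::sZero(), Vec4::sLess(v, Vec4::sZero())); // (0, 2, 0, 4)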

Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
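
// Illustrative usage sketch (not part of the original file): sorting four values
// ascending while tracking where each one came from:
//
//	Vec4 dist(3.0f, 1.0f, 4.0f, 2.0f);
//	UVec4 idx(0, 1, 2, 3);
//	Vec4::sSort4(dist, idx); // dist = (1, 2, 3, 4), idx = (1, 3, 0, 2)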

void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}

bool Vec4::operator == (Vec4Arg inV2) const
{
	return sEquals(*this, inV2).TestAllTrue();
}

bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
	return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

bool Vec4::IsNormalized(float inTolerance) const
{
	return abs(LengthSq() - 1.0f) <= inTolerance;
}

bool Vec4::IsNaN() const
{
#if defined(JPH_USE_AVX512)
	return _mm_fpclass_ps_mask(mValue, 0b10000001) != 0;
#elif defined(JPH_USE_SSE)
	return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
	return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
	return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]) || isnan(mF32[3]);
#endif
}

Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] * inV2.mF32[0],
				mF32[1] * inV2.mF32[1],
				mF32[2] * inV2.mF32[2],
				mF32[3] * inV2.mF32[3]);
#endif
}

Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(mValue, inV2);
#else
	return Vec4(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2, mF32[3] * inV2);
#endif
}

/// Multiply vector with float
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(inV2.mValue, inV1);
#else
	return Vec4(inV1 * inV2.mF32[0],
				inV1 * inV2.mF32[1],
				inV1 * inV2.mF32[2],
				inV1 * inV2.mF32[3]);
#endif
}

Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	return Vec4(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2, mF32[3] / inV2);
#endif
}

Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vmulq_n_f32(mValue, inV2);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] *= inV2;
#endif
	return *this;
}

Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vmulq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] *= inV2.mF32[i];
#endif
	return *this;
}

Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] /= inV2;
#endif
	return *this;
}

Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] + inV2.mF32[0],
				mF32[1] + inV2.mF32[1],
				mF32[2] + inV2.mF32[2],
				mF32[3] + inV2.mF32[3]);
#endif
}

Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] += inV2.mF32[i];
#endif
	return *this;
}

Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
	return vnegq_f32(mValue);
#else
	return Vec4(-mF32[0], -mF32[1], -mF32[2], -mF32[3]);
#endif
}

Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] - inV2.mF32[0],
				mF32[1] - inV2.mF32[1],
				mF32[2] - inV2.mF32[2],
				mF32[3] - inV2.mF32[3]);
#endif
}

Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] -= inV2.mF32[i];
#endif
	return *this;
}

Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] / inV2.mF32[0],
				mF32[1] / inV2.mF32[1],
				mF32[2] / inV2.mF32[2],
				mF32[3] / inV2.mF32[3]);
#endif
}

Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
	return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
#endif
}

Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
	return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
#endif
}

Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
	return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
#endif
}

Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 3);
#else
	return Vec4(mF32[3], mF32[3], mF32[3], mF32[3]);
#endif
}

Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_AVX512)
	return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
	return vabsq_f32(mValue);
#else
	return Vec4(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]), abs(mF32[3]));
#endif
}

Vec4 Vec4::Reciprocal() const
{
	return sReplicate(1.0f) / mValue;
}

Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return Vec4::sReplicate((mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]));
#endif
}

float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vaddvq_f32(mul);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]);
#endif
}

float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	return vaddvq_f32(mul);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]);
#endif
}

float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
	return vget_lane_f32(vsqrt_f32(sum), 0);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return sqrt((mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]));
#endif
}

Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#else
	return Vec4(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]), sqrt(mF32[3]));
#endif
}

Vec4 Vec4::GetSign() const
{
#if defined(JPH_USE_AVX512)
	return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
	Type minus_one = _mm_set1_ps(-1.0f);
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
	Type minus_one = vdupq_n_f32(-1.0f);
	Type one = vdupq_n_f32(1.0f);
	return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
	return Vec4(signbit(mF32[0])? -1.0f : 1.0f,
				signbit(mF32[1])? -1.0f : 1.0f,
				signbit(mF32[2])? -1.0f : 1.0f,
				signbit(mF32[3])? -1.0f : 1.0f);
#endif
}
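
// Added note (not in the original file): the SSE path above works because the bit
// pattern of -1.0f is that of 1.0f with the sign bit set, so (mValue & minus_one)
// keeps the sign bit of the input and or-ing with one fills in the bits of 1.0f,
// yielding exactly +1.0f or -1.0f per component.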

Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
	return *this / Length();
#endif
}

void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
	vst1q_f32(&outV->x, mValue);
#else
	for (int i = 0; i < 4; ++i)
		(&outV->x)[i] = mF32[i];
#endif
}

UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#else
	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
#endif
}

UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
	return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
	return vreinterpretq_u32_f32(mValue);
#else
	return *reinterpret_cast<const UVec4 *>(this);
#endif
}

int Vec4::GetSignBits() const
{
#if defined(JPH_USE_SSE)
	return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
	int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
	return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#else
	return (signbit(mF32[0])? 1 : 0) | (signbit(mF32[1])? 2 : 0) | (signbit(mF32[2])? 4 : 0) | (signbit(mF32[3])? 8 : 0);
#endif
}
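
// Illustrative usage sketch (not part of the original file): GetSignBits packs
// the four sign bits into the low nibble of the result; note that -0.0f counts
// as negative:
//
//	Vec4 v(-1.0f, 2.0f, -0.0f, 4.0f);
//	int bits = v.GetSignBits(); // 0b0101: x and z have their sign bit set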

float Vec4::ReduceMin() const
{
	Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
	v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}

float Vec4::ReduceMax() const
{
	Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
	v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}
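
// Illustrative usage sketch (not part of the original file): the reductions above
// fold the four lanes into a single scalar using two swizzled min/max steps:
//
//	Vec4 v(3.0f, 1.0f, 4.0f, 2.0f);
//	float lo = v.ReduceMin(); // 1.0f
//	float hi = v.ReduceMax(); // 4.0f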

void Vec4::SinCos(Vec4 &outSin, Vec4 &outCos) const
{
	// Implementation based on sinf.c from the cephes library, combines sinf and cosf in a single function, changes octants to quadrants and vectorizes it
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive and remember sign for sin only since cos is symmetric around x (highest bit of a float is the sign bit)
	UVec4 sin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, sin_sign.ReinterpretAsFloat());

	// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x
	UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();

	// Make x relative to the closest quadrant.
	// This does x = x - quadrant * PI / 2 using a two step Cody-Waite argument reduction.
	// This improves the accuracy of the result by avoiding loss of significant bits in the subtraction.
	// We start with x = x - quadrant * PI / 2, PI / 2 in hexadecimal notation is 0x3fc90fdb, we remove the lowest 16 bits to
	// get 0x3fc90000 (= 1.5703125) this means we can now multiply with a number of up to 2^16 without losing any bits.
	// This leaves us with: x = (x - quadrant * 1.5703125) - quadrant * (PI / 2 - 1.5703125).
	// PI / 2 - 1.5703125 in hexadecimal is 0x39fdaa22, stripping the lowest 12 bits we get 0x39fda000 (= 0.0004837512969970703125)
	// This leaves us with: x = ((x - quadrant * 1.5703125) - quadrant * 0.0004837512969970703125) - quadrant * (PI / 2 - 1.5703125 - 0.0004837512969970703125)
	// See: https://stackoverflow.com/questions/42455143/sine-cosine-modular-extended-precision-arithmetic
	// After this we have x in the range [-PI / 4, PI / 4].
	Vec4 float_quadrant = quadrant.ToFloat();
	x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;
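
	// Added sanity check (not in the original file): 1.5703125 + 0.0004837512969970703125 + 7.549789948768648e-8
	// = 1.5707963267948966... = PI / 2 to within float precision, and the first two constants have enough
	// trailing zero bits that quadrant * constant is computed exactly, so the three subtractions above
	// together remove quadrant * PI / 2 without losing significant bits.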

	// Calculate x2 = x^2
	Vec4 x2 = x * x;

	// Taylor expansion:
	// Cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! - ... = (((x2/8! - 1/6!) * x2 + 1/4!) * x2 - 1/2!) * x2 + 1
	Vec4 taylor_cos = ((2.443315711809948e-5f * x2 - Vec4::sReplicate(1.388731625493765e-3f)) * x2 + Vec4::sReplicate(4.166664568298827e-2f)) * x2 * x2 - 0.5f * x2 + Vec4::sReplicate(1.0f);
	// Sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ... = ((-x2/7! + 1/5!) * x2 - 1/3!) * x2 * x + x
	Vec4 taylor_sin = ((-1.9515295891e-4f * x2 + Vec4::sReplicate(8.3321608736e-3f)) * x2 - Vec4::sReplicate(1.6666654611e-1f)) * x2 * x + x;

	// The lowest 2 bits of quadrant indicate the quadrant that we are in.
	// Let x be the original input value and x' our value that has been mapped to the range [-PI / 4, PI / 4].
	// Since cos(x) = sin(x + PI / 2) and since we want to use the Taylor expansion as close as possible to 0,
	// we can alternate between using the Taylor expansion for sin and cos according to the following table:
	//
	// quadrant	sin(x)		cos(x)
	// XXX00b	sin(x')		cos(x')
	// XXX01b	cos(x')		-sin(x')
	// XXX10b	-sin(x')	-cos(x')
	// XXX11b	-cos(x')	sin(x')
	//
	// So: sin_sign = bit2, cos_sign = bit1 ^ bit2, bit1 determines if we use sin or cos Taylor expansion
	UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
	UVec4 bit2 = UVec4::sAnd(quadrant.LogicalShiftLeft<30>(), UVec4::sReplicate(0x80000000U));

	// Select which one of the results is sin and which one is cos
	Vec4 s = Vec4::sSelect(taylor_sin, taylor_cos, bit1);
	Vec4 c = Vec4::sSelect(taylor_cos, taylor_sin, bit1);

	// Update the signs
	sin_sign = UVec4::sXor(sin_sign, bit2);
	UVec4 cos_sign = UVec4::sXor(bit1, bit2);

	// Correct the signs
	outSin = Vec4::sXor(s, sin_sign.ReinterpretAsFloat());
	outCos = Vec4::sXor(c, cos_sign.ReinterpretAsFloat());
}
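
// Illustrative usage sketch (not part of the original file): computing sine and
// cosine of four angles in one call:
//
//	Vec4 angles(0.0f, 0.25f * JPH_PI, 0.5f * JPH_PI, JPH_PI);
//	Vec4 s, c;
//	angles.SinCos(s, c); // s ~ (0, 0.707, 1, 0), c ~ (1, 0.707, 0, -1)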

Vec4 Vec4::Tan() const
{
	// Implementation based on tanf.c from the cephes library, see Vec4::SinCos for further details
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive
	UVec4 tan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, tan_sign.ReinterpretAsFloat());

	// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x
	UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();

	// Remap x to range [-PI / 4, PI / 4], see Vec4::SinCos
	Vec4 float_quadrant = quadrant.ToFloat();
	x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;

	// Calculate x2 = x^2
	Vec4 x2 = x * x;

	// Roughly equivalent to the Taylor expansion:
	// Tan(x) = x + x^3/3 + 2*x^5/15 + 17*x^7/315 + 62*x^9/2835 + ...
	Vec4 tan =
		(((((9.38540185543e-3f * x2 + Vec4::sReplicate(3.11992232697e-3f)) * x2 + Vec4::sReplicate(2.44301354525e-2f)) * x2
		+ Vec4::sReplicate(5.34112807005e-2f)) * x2 + Vec4::sReplicate(1.33387994085e-1f)) * x2 + Vec4::sReplicate(3.33331568548e-1f)) * x2 * x + x;

	// For the 2nd and 4th quadrant we need to invert the value
	UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
	tan = Vec4::sSelect(tan, Vec4::sReplicate(-1.0f) / (tan JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))), bit1); // Add small epsilon to prevent div by zero, works because tan is always positive

	// Put the sign back
	return Vec4::sXor(tan, tan_sign.ReinterpretAsFloat());
}

Vec4 Vec4::ASin() const
{
	// Implementation based on asinf.c from the cephes library
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive
	UVec4 asin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 a = Vec4::sXor(*this, asin_sign.ReinterpretAsFloat());

	// ASin is not defined outside the range [-1, 1] but it often happens that a value is slightly above 1 so we just clamp here
	a = Vec4::sMin(a, Vec4::sReplicate(1.0f));

	// When |x| <= 0.5 we use the asin approximation as is
	Vec4 z1 = a * a;
	Vec4 x1 = a;

	// When |x| > 0.5 we use the identity asin(x) = PI / 2 - 2 * asin(sqrt((1 - x) / 2))
	Vec4 z2 = 0.5f * (Vec4::sReplicate(1.0f) - a);
	Vec4 x2 = z2.Sqrt();

	// Select which of the two situations we have
	UVec4 greater = Vec4::sGreater(a, Vec4::sReplicate(0.5f));
	Vec4 z = Vec4::sSelect(z1, z2, greater);
	Vec4 x = Vec4::sSelect(x1, x2, greater);

	// Polynomial approximation of asin
	z = ((((4.2163199048e-2f * z + Vec4::sReplicate(2.4181311049e-2f)) * z + Vec4::sReplicate(4.5470025998e-2f)) * z + Vec4::sReplicate(7.4953002686e-2f)) * z + Vec4::sReplicate(1.6666752422e-1f)) * z * x + x;

	// If |x| > 0.5 we need to apply the remainder of the identity above
	z = Vec4::sSelect(z, Vec4::sReplicate(0.5f * JPH_PI) - (z + z), greater);

	// Put the sign back
	return Vec4::sXor(z, asin_sign.ReinterpretAsFloat());
}

Vec4 Vec4::ACos() const
{
	// Not the most accurate, but simple
	return Vec4::sReplicate(0.5f * JPH_PI) - ASin();
}

Vec4 Vec4::ATan() const
{
	// Implementation based on atanf.c from the cephes library
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive
	UVec4 atan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, atan_sign.ReinterpretAsFloat());
	Vec4 y = Vec4::sZero();

	// If x > Tan(PI / 8)
	UVec4 greater1 = Vec4::sGreater(x, Vec4::sReplicate(0.4142135623730950f));
	Vec4 x1 = (x - Vec4::sReplicate(1.0f)) / (x + Vec4::sReplicate(1.0f));

	// If x > Tan(3 * PI / 8)
	UVec4 greater2 = Vec4::sGreater(x, Vec4::sReplicate(2.414213562373095f));
	Vec4 x2 = Vec4::sReplicate(-1.0f) / (x JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))); // Add small epsilon to prevent div by zero, works because x is always positive

	// Apply first if
	x = Vec4::sSelect(x, x1, greater1);
	y = Vec4::sSelect(y, Vec4::sReplicate(0.25f * JPH_PI), greater1);

	// Apply second if
	x = Vec4::sSelect(x, x2, greater2);
	y = Vec4::sSelect(y, Vec4::sReplicate(0.5f * JPH_PI), greater2);

	// Polynomial approximation
	Vec4 z = x * x;
	y += (((8.05374449538e-2f * z - Vec4::sReplicate(1.38776856032e-1f)) * z + Vec4::sReplicate(1.99777106478e-1f)) * z - Vec4::sReplicate(3.33329491539e-1f)) * z * x + x;

	// Put the sign back
	return Vec4::sXor(y, atan_sign.ReinterpretAsFloat());
}

Vec4 Vec4::sATan2(Vec4Arg inY, Vec4Arg inX)
{
	UVec4 sign_mask = UVec4::sReplicate(0x80000000U);

	// Determine absolute value and sign of y
	UVec4 y_sign = UVec4::sAnd(inY.ReinterpretAsInt(), sign_mask);
	Vec4 y_abs = Vec4::sXor(inY, y_sign.ReinterpretAsFloat());

	// Determine absolute value and sign of x
	UVec4 x_sign = UVec4::sAnd(inX.ReinterpretAsInt(), sign_mask);
	Vec4 x_abs = Vec4::sXor(inX, x_sign.ReinterpretAsFloat());

	// Always divide smallest / largest to avoid dividing by zero
	UVec4 x_is_numerator = Vec4::sLess(x_abs, y_abs);
	Vec4 numerator = Vec4::sSelect(y_abs, x_abs, x_is_numerator);
	Vec4 denominator = Vec4::sSelect(x_abs, y_abs, x_is_numerator);
	Vec4 atan = (numerator / denominator).ATan();

	// If we calculated x / y instead of y / x the result is PI / 2 - result (this holds because the result is positive, since the inputs were positive)
	atan = Vec4::sSelect(atan, Vec4::sReplicate(0.5f * JPH_PI) - atan, x_is_numerator);

	// Now we need to map to the correct quadrant
	// x_sign	y_sign	result
	// +1		+1		atan
	// -1		+1		-atan + PI
	// -1		-1		atan - PI
	// +1		-1		-atan
	// This can be written as: x_sign * y_sign * (atan - (x_sign < 0? PI : 0))
	atan -= Vec4::sAnd(x_sign.ArithmeticShiftRight<31>().ReinterpretAsFloat(), Vec4::sReplicate(JPH_PI));
	atan = Vec4::sXor(atan, UVec4::sXor(x_sign, y_sign).ReinterpretAsFloat());
	return atan;
}
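
// Illustrative usage sketch (not part of the original file): like std::atan2,
// sATan2 resolves the angle in all four quadrants:
//
//	Vec4 y(1.0f, 1.0f, -1.0f, -1.0f);
//	Vec4 x(1.0f, -1.0f, -1.0f, 1.0f);
//	Vec4 a = Vec4::sATan2(y, x); // ~ (PI/4, 3*PI/4, -3*PI/4, -PI/4)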

JPH_NAMESPACE_END