// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#include <Jolt/Math/Trigonometry.h>
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/UVec4.h>

JPH_NAMESPACE_BEGIN

// Constructor
Vec4::Vec4(Vec3Arg inRHS) :
	mValue(inRHS.mValue)
{
}

Vec4::Vec4(Vec3Arg inRHS, float inW)
{
#if defined(JPH_USE_SSE4_1)
	mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
#elif defined(JPH_USE_NEON)
	mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
#else
	for (int i = 0; i < 3; i++)
		mF32[i] = inRHS.mF32[i];
	mF32[3] = inW;
#endif
}

Vec4::Vec4(float inX, float inY, float inZ, float inW)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_ps(inW, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
	uint32x2_t xy = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inX)) | (static_cast<uint64>(BitCast<uint32>(inY)) << 32));
	uint32x2_t zw = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inZ)) | (static_cast<uint64>(BitCast<uint32>(inW)) << 32));
	mValue = vreinterpretq_f32_u32(vcombine_u32(xy, zw));
#else
	mF32[0] = inX;
	mF32[1] = inY;
	mF32[2] = inZ;
	mF32[3] = inW;
#endif
}

template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
	static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
	static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
	static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
	static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
	return Vec4(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ], mF32[SwizzleW]);
#endif
}
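
// A minimal usage sketch (illustrative only, assuming the SWIZZLE_* constants
// from Jolt/Math/Swizzle.h): each template argument picks the source component
// for that output lane, so reversing a vector is:
//
//	Vec4 v(1.0f, 2.0f, 3.0f, 4.0f);
//	Vec4 r = v.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(); // r = (4, 3, 2, 1)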
Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(0);
#else
	return Vec4(0, 0, 0, 0);
#endif
}

Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(inV);
#else
	return Vec4(inV, inV, inV, inV);
#endif
}

Vec4 Vec4::sOne()
{
	return sReplicate(1.0f);
}

Vec4 Vec4::sNaN()
{
	return sReplicate(numeric_limits<float>::quiet_NaN());
}

Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
	return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}

Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
	return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}

template <const int Scale>
Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_AVX2
	return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
#else
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
	Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
	Type xy = _mm_unpacklo_ps(x, y);
	Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
	Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
	Type zw = _mm_unpacklo_ps(z, w);
	return _mm_movelh_ps(xy, zw);
#endif
#else
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
	float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
	float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
	float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
	return Vec4(x, y, z, w);
#endif
}
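
// A minimal usage sketch (illustrative only): Scale is a byte multiplier for
// the offsets, so for a tightly packed float array Scale = 4 turns the
// offsets into element indices:
//
//	float data[] = { 10.0f, 11.0f, 12.0f, 13.0f, 14.0f };
//	Vec4 v = Vec4::sGatherFloat4<4>(data, UVec4(4, 2, 0, 1)); // v = (14, 12, 10, 11)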
Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec4(min(inV1.mF32[0], inV2.mF32[0]),
				min(inV1.mF32[1], inV2.mF32[1]),
				min(inV1.mF32[2], inV2.mF32[2]),
				min(inV1.mF32[3], inV2.mF32[3]));
#endif
}

Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec4(max(inV1.mF32[0], inV2.mF32[0]),
				max(inV1.mF32[1], inV2.mF32[1]),
				max(inV1.mF32[2], inV2.mF32[2]),
				max(inV1.mF32[3], inV2.mF32[3]));
#endif
}

Vec4 Vec4::sClamp(Vec4Arg inV, Vec4Arg inMin, Vec4Arg inMax)
{
	return sMax(sMin(inV, inMax), inMin);
}

UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vceqq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] == inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcltq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] < inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcleq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] <= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] > inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
				 inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0,
				 inV1.mF32[3] >= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}

Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
	return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
	return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
	return Vec4(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
				inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
				inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2],
				inMul1.mF32[3] * inMul2.mF32[3] + inAdd.mF32[3]);
#endif
}

Vec4 Vec4::sSelect(Vec4Arg inNotSet, Vec4Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
	return _mm_blendv_ps(inNotSet.mValue, inSet.mValue, _mm_castsi128_ps(inControl.mValue));
#elif defined(JPH_USE_SSE)
	__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
	return _mm_or_ps(_mm_and_ps(is_set, inSet.mValue), _mm_andnot_ps(is_set, inNotSet.mValue));
#elif defined(JPH_USE_NEON)
	return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
#else
	Vec4 result;
	for (int i = 0; i < 4; i++)
		result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mF32[i] : inNotSet.mF32[i];
	return result;
#endif
}
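
// A minimal usage sketch (illustrative only): only the sign bit of each
// control lane is tested, so the all-ones masks produced by the comparison
// functions above select inSet per component:
//
//	Vec4 a(1.0f, 2.0f, 3.0f, 4.0f), b(5.0f, 6.0f, 7.0f, 8.0f);
//	UVec4 control = Vec4::sLess(a, Vec4::sReplicate(2.5f));
//	Vec4 r = Vec4::sSelect(b, a, control); // r = (1, 2, 7, 8)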
Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}

void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}

void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
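
// A minimal usage sketch (illustrative only): the three passes above form a
// small sorting network; sSort4 orders the values ascending and applies the
// same permutation to the index vector, so the original positions survive:
//
//	Vec4 values(3.0f, 1.0f, 4.0f, 2.0f);
//	UVec4 indices(0, 1, 2, 3);
//	Vec4::sSort4(values, indices); // values = (1, 2, 3, 4), indices = (1, 3, 0, 2)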
bool Vec4::operator == (Vec4Arg inV2) const
{
	return sEquals(*this, inV2).TestAllTrue();
}

bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
	return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

bool Vec4::IsNearZero(float inMaxDistSq) const
{
	return LengthSq() <= inMaxDistSq;
}

bool Vec4::IsNormalized(float inTolerance) const
{
	return abs(LengthSq() - 1.0f) <= inTolerance;
}

bool Vec4::IsNaN() const
{
#if defined(JPH_USE_AVX512)
	return _mm_fpclass_ps_mask(mValue, 0b10000001) != 0;
#elif defined(JPH_USE_SSE)
	return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
	return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
	return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]) || isnan(mF32[3]);
#endif
}

Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] * inV2.mF32[0],
				mF32[1] * inV2.mF32[1],
				mF32[2] * inV2.mF32[2],
				mF32[3] * inV2.mF32[3]);
#endif
}

Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(mValue, inV2);
#else
	return Vec4(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2, mF32[3] * inV2);
#endif
}

/// Multiply vector with float
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(inV2.mValue, inV1);
#else
	return Vec4(inV1 * inV2.mF32[0],
				inV1 * inV2.mF32[1],
				inV1 * inV2.mF32[2],
				inV1 * inV2.mF32[3]);
#endif
}

Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	return Vec4(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2, mF32[3] / inV2);
#endif
}

Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vmulq_n_f32(mValue, inV2);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] *= inV2;
#endif
	return *this;
}

Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vmulq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] *= inV2.mF32[i];
#endif
	return *this;
}

Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] /= inV2;
#endif
	return *this;
}

Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] + inV2.mF32[0],
				mF32[1] + inV2.mF32[1],
				mF32[2] + inV2.mF32[2],
				mF32[3] + inV2.mF32[3]);
#endif
}

Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] += inV2.mF32[i];
#endif
	return *this;
}

Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return vsubq_f32(vdupq_n_f32(0), mValue);
#else
	return vnegq_f32(mValue);
#endif
#else
#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return Vec4(0.0f - mF32[0], 0.0f - mF32[1], 0.0f - mF32[2], 0.0f - mF32[3]);
#else
	return Vec4(-mF32[0], -mF32[1], -mF32[2], -mF32[3]);
#endif
#endif
}

Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] - inV2.mF32[0],
				mF32[1] - inV2.mF32[1],
				mF32[2] - inV2.mF32[2],
				mF32[3] - inV2.mF32[3]);
#endif
}

Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] -= inV2.mF32[i];
#endif
	return *this;
}

Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] / inV2.mF32[0],
				mF32[1] / inV2.mF32[1],
				mF32[2] / inV2.mF32[2],
				mF32[3] / inV2.mF32[3]);
#endif
}

Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
	return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
#endif
}

Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
	return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
#endif
}

Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
	return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
#endif
}

Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 3);
#else
	return Vec4(mF32[3], mF32[3], mF32[3], mF32[3]);
#endif
}

Vec3 Vec4::SplatX3() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
	return Vec3(mF32[0], mF32[0], mF32[0]);
#endif
}

Vec3 Vec4::SplatY3() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
	return Vec3(mF32[1], mF32[1], mF32[1]);
#endif
}

Vec3 Vec4::SplatZ3() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
	return Vec3(mF32[2], mF32[2], mF32[2]);
#endif
}

Vec3 Vec4::SplatW3() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 3);
#else
	return Vec3(mF32[3], mF32[3], mF32[3]);
#endif
}

int Vec4::GetLowestComponentIndex() const
{
	// Get the minimum value in all 4 components
	Vec4 value = Vec4::sMin(*this, Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>());
	value = Vec4::sMin(value, value.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>());

	// Compare with the original vector to find which component is equal to the minimum value
	return CountTrailingZeros(Vec4::sEquals(*this, value).GetTrues());
}

int Vec4::GetHighestComponentIndex() const
{
	// Get the maximum value in all 4 components
	Vec4 value = Vec4::sMax(*this, Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>());
	value = Vec4::sMax(value, value.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>());

	// Compare with the original vector to find which component is equal to the maximum value
	return CountTrailingZeros(Vec4::sEquals(*this, value).GetTrues());
}
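
// A minimal usage sketch (illustrative only): when several components tie for
// the minimum / maximum, the lowest index wins because CountTrailingZeros
// scans the equality mask from bit 0:
//
//	Vec4 v(2.0f, -5.0f, 9.0f, 9.0f);
//	int lo = v.GetLowestComponentIndex();	// 1 (-5 is the smallest)
//	int hi = v.GetHighestComponentIndex();	// 2 (first of the two 9's)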
Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_AVX512)
	return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
	return vabsq_f32(mValue);
#else
	return Vec4(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]), abs(mF32[3]));
#endif
}

Vec4 Vec4::Reciprocal() const
{
	return sOne() / mValue;
}

Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return Vec4::sReplicate((mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]));
#endif
}

float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vaddvq_f32(mul);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]);
#endif
}

float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	return vaddvq_f32(mul);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]);
#endif
}

float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
	return vget_lane_f32(vsqrt_f32(sum), 0);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return sqrt((mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]));
#endif
}

Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#else
	return Vec4(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]), sqrt(mF32[3]));
#endif
}

Vec4 Vec4::GetSign() const
{
#if defined(JPH_USE_AVX512)
	return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
	Type minus_one = _mm_set1_ps(-1.0f);
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
	Type minus_one = vdupq_n_f32(-1.0f);
	Type one = vdupq_n_f32(1.0f);
	return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
	return Vec4(std::signbit(mF32[0])? -1.0f : 1.0f,
				std::signbit(mF32[1])? -1.0f : 1.0f,
				std::signbit(mF32[2])? -1.0f : 1.0f,
				std::signbit(mF32[3])? -1.0f : 1.0f);
#endif
}

template <int X, int Y, int Z, int W>
JPH_INLINE Vec4 Vec4::FlipSign() const
{
	static_assert(X == 1 || X == -1, "X must be 1 or -1");
	static_assert(Y == 1 || Y == -1, "Y must be 1 or -1");
	static_assert(Z == 1 || Z == -1, "Z must be 1 or -1");
	static_assert(W == 1 || W == -1, "W must be 1 or -1");
	return Vec4::sXor(*this, Vec4(X > 0? 0.0f : -0.0f, Y > 0? 0.0f : -0.0f, Z > 0? 0.0f : -0.0f, W > 0? 0.0f : -0.0f));
}
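
// A minimal usage sketch (illustrative only): FlipSign negates the components
// whose template argument is -1 by XOR-ing in their sign bits:
//
//	Vec4 v(1.0f, 2.0f, 3.0f, 4.0f);
//	Vec4 r = v.FlipSign<-1, 1, -1, 1>(); // r = (-1, 2, -3, 4)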
Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
	return *this / Length();
#endif
}

void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
	vst1q_f32(&outV->x, mValue);
#else
	for (int i = 0; i < 4; ++i)
		(&outV->x)[i] = mF32[i];
#endif
}

UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#else
	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
#endif
}

UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
	return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
	return vreinterpretq_u32_f32(mValue);
#else
	return *reinterpret_cast<const UVec4 *>(this);
#endif
}

int Vec4::GetSignBits() const
{
#if defined(JPH_USE_SSE)
	return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
	int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
	return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#else
	return (std::signbit(mF32[0])? 1 : 0) | (std::signbit(mF32[1])? 2 : 0) | (std::signbit(mF32[2])? 4 : 0) | (std::signbit(mF32[3])? 8 : 0);
#endif
}
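
// A minimal usage sketch (illustrative only): one bit per component, X in bit
// 0 through W in bit 3, set when that component's sign bit is set (note that
// -0.0f counts as negative):
//
//	Vec4 v(-1.0f, 2.0f, -0.0f, 4.0f);
//	int bits = v.GetSignBits(); // 0b0101: X and Z have their sign bits set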
float Vec4::ReduceMin() const
{
	Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
	v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}

float Vec4::ReduceMax() const
{
	Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
	v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}

void Vec4::SinCos(Vec4 &outSin, Vec4 &outCos) const
{
	// Implementation based on sinf.c from the cephes library, combines sinf and cosf in a single function, changes octants to quadrants and vectorizes it
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive and remember sign for sin only since cos is symmetric around x (highest bit of a float is the sign bit)
	UVec4 sin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, sin_sign.ReinterpretAsFloat());

	// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x
	UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();

	// Make x relative to the closest quadrant.
	// This does x = x - quadrant * PI / 2 using a two step Cody-Waite argument reduction.
	// This improves the accuracy of the result by avoiding loss of significant bits in the subtraction.
	// We start with x = x - quadrant * PI / 2, PI / 2 in hexadecimal notation is 0x3fc90fdb, we remove the lowest 16 bits to
	// get 0x3fc90000 (= 1.5703125), this means we can now multiply with a number of up to 2^16 without losing any bits.
	// This leaves us with: x = (x - quadrant * 1.5703125) - quadrant * (PI / 2 - 1.5703125).
	// PI / 2 - 1.5703125 in hexadecimal is 0x39fdaa22, stripping the lowest 12 bits we get 0x39fda000 (= 0.0004837512969970703125).
	// This leaves us with: x = ((x - quadrant * 1.5703125) - quadrant * 0.0004837512969970703125) - quadrant * (PI / 2 - 1.5703125 - 0.0004837512969970703125)
	// See: https://stackoverflow.com/questions/42455143/sine-cosine-modular-extended-precision-arithmetic
	// After this we have x in the range [-PI / 4, PI / 4].
	Vec4 float_quadrant = quadrant.ToFloat();
	x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;

	// Calculate x2 = x^2
	Vec4 x2 = x * x;

	// Taylor expansion:
	// Cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! + ... = (((x2/8! - 1/6!) * x2 + 1/4!) * x2 - 1/2!) * x2 + 1
	Vec4 taylor_cos = ((2.443315711809948e-5f * x2 - Vec4::sReplicate(1.388731625493765e-3f)) * x2 + Vec4::sReplicate(4.166664568298827e-2f)) * x2 * x2 - 0.5f * x2 + Vec4::sOne();
	// Sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ... = ((-x2/7! + 1/5!) * x2 - 1/3!) * x2 * x + x
	Vec4 taylor_sin = ((-1.9515295891e-4f * x2 + Vec4::sReplicate(8.3321608736e-3f)) * x2 - Vec4::sReplicate(1.6666654611e-1f)) * x2 * x + x;

	// The lowest 2 bits of quadrant indicate the quadrant that we are in.
	// Let x be the original input value and x' our value that has been mapped to the range [-PI / 4, PI / 4].
	// Since cos(x) = sin(x - PI / 2) and since we want to use the Taylor expansion as close as possible to 0,
	// we can alternate between using the Taylor expansion for sin and cos according to the following table:
	//
	// quadrant	sin(x)		cos(x)
	// XXX00b	sin(x')		cos(x')
	// XXX01b	cos(x')		-sin(x')
	// XXX10b	-sin(x')	-cos(x')
	// XXX11b	-cos(x')	sin(x')
	//
	// So: sin_sign = bit2, cos_sign = bit1 ^ bit2, bit1 determines if we use sin or cos Taylor expansion
	UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
	UVec4 bit2 = UVec4::sAnd(quadrant.LogicalShiftLeft<30>(), UVec4::sReplicate(0x80000000U));

	// Select which one of the results is sin and which one is cos
	Vec4 s = Vec4::sSelect(taylor_sin, taylor_cos, bit1);
	Vec4 c = Vec4::sSelect(taylor_cos, taylor_sin, bit1);

	// Update the signs
	sin_sign = UVec4::sXor(sin_sign, bit2);
	UVec4 cos_sign = UVec4::sXor(bit1, bit2);

	// Correct the signs
	outSin = Vec4::sXor(s, sin_sign.ReinterpretAsFloat());
	outCos = Vec4::sXor(c, cos_sign.ReinterpretAsFloat());
}
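
// A minimal usage sketch (illustrative only): sine and cosine of all four
// lanes in one call:
//
//	Vec4 angles(0.0f, 0.25f * JPH_PI, 0.5f * JPH_PI, JPH_PI);
//	Vec4 s, c;
//	angles.SinCos(s, c); // s ~ (0, 0.707, 1, 0), c ~ (1, 0.707, 0, -1)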
Vec4 Vec4::Tan() const
{
	// Implementation based on tanf.c from the cephes library, see Vec4::SinCos for further details
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive
	UVec4 tan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, tan_sign.ReinterpretAsFloat());

	// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x
	UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();

	// Remap x to range [-PI / 4, PI / 4], see Vec4::SinCos
	Vec4 float_quadrant = quadrant.ToFloat();
	x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;

	// Calculate x2 = x^2
	Vec4 x2 = x * x;

	// Roughly equivalent to the Taylor expansion:
	// Tan(x) = x + x^3/3 + 2*x^5/15 + 17*x^7/315 + 62*x^9/2835 + ...
	Vec4 tan =
		(((((9.38540185543e-3f * x2 + Vec4::sReplicate(3.11992232697e-3f)) * x2 + Vec4::sReplicate(2.44301354525e-2f)) * x2
		+ Vec4::sReplicate(5.34112807005e-2f)) * x2 + Vec4::sReplicate(1.33387994085e-1f)) * x2 + Vec4::sReplicate(3.33331568548e-1f)) * x2 * x + x;

	// For the 2nd and 4th quadrant we need to invert the value
	UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
	tan = Vec4::sSelect(tan, Vec4::sReplicate(-1.0f) / (tan JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))), bit1); // Add small epsilon to prevent div by zero, works because tan is always positive

	// Put the sign back
	return Vec4::sXor(tan, tan_sign.ReinterpretAsFloat());
}

Vec4 Vec4::ASin() const
{
	// Implementation based on asinf.c from the cephes library
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive
	UVec4 asin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 a = Vec4::sXor(*this, asin_sign.ReinterpretAsFloat());

	// ASin is not defined outside the range [-1, 1] but it often happens that a value is slightly above 1 so we just clamp here
	a = Vec4::sMin(a, Vec4::sOne());

	// When |x| <= 0.5 we use the asin approximation as is
	Vec4 z1 = a * a;
	Vec4 x1 = a;

	// When |x| > 0.5 we use the identity asin(x) = PI / 2 - 2 * asin(sqrt((1 - x) / 2))
	Vec4 z2 = 0.5f * (Vec4::sOne() - a);
	Vec4 x2 = z2.Sqrt();

	// Select which of the two situations we have
	UVec4 greater = Vec4::sGreater(a, Vec4::sReplicate(0.5f));
	Vec4 z = Vec4::sSelect(z1, z2, greater);
	Vec4 x = Vec4::sSelect(x1, x2, greater);

	// Polynomial approximation of asin
	z = ((((4.2163199048e-2f * z + Vec4::sReplicate(2.4181311049e-2f)) * z + Vec4::sReplicate(4.5470025998e-2f)) * z + Vec4::sReplicate(7.4953002686e-2f)) * z + Vec4::sReplicate(1.6666752422e-1f)) * z * x + x;

	// If |x| > 0.5 we need to apply the remainder of the identity above
	z = Vec4::sSelect(z, Vec4::sReplicate(0.5f * JPH_PI) - (z + z), greater);

	// Put the sign back
	return Vec4::sXor(z, asin_sign.ReinterpretAsFloat());
}

Vec4 Vec4::ACos() const
{
	// Not the most accurate, but simple
	return Vec4::sReplicate(0.5f * JPH_PI) - ASin();
}

Vec4 Vec4::ATan() const
{
	// Implementation based on atanf.c from the cephes library
	// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)

	// Make argument positive
	UVec4 atan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
	Vec4 x = Vec4::sXor(*this, atan_sign.ReinterpretAsFloat());
	Vec4 y = Vec4::sZero();

	// If x > Tan(PI / 8)
	UVec4 greater1 = Vec4::sGreater(x, Vec4::sReplicate(0.4142135623730950f));
	Vec4 x1 = (x - Vec4::sOne()) / (x + Vec4::sOne());

	// If x > Tan(3 * PI / 8)
	UVec4 greater2 = Vec4::sGreater(x, Vec4::sReplicate(2.414213562373095f));
	Vec4 x2 = Vec4::sReplicate(-1.0f) / (x JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))); // Add small epsilon to prevent div by zero, works because x is always positive

	// Apply first if
	x = Vec4::sSelect(x, x1, greater1);
	y = Vec4::sSelect(y, Vec4::sReplicate(0.25f * JPH_PI), greater1);

	// Apply second if
	x = Vec4::sSelect(x, x2, greater2);
	y = Vec4::sSelect(y, Vec4::sReplicate(0.5f * JPH_PI), greater2);

	// Polynomial approximation
	Vec4 z = x * x;
	y += (((8.05374449538e-2f * z - Vec4::sReplicate(1.38776856032e-1f)) * z + Vec4::sReplicate(1.99777106478e-1f)) * z - Vec4::sReplicate(3.33329491539e-1f)) * z * x + x;

	// Put the sign back
	return Vec4::sXor(y, atan_sign.ReinterpretAsFloat());
}

Vec4 Vec4::sATan2(Vec4Arg inY, Vec4Arg inX)
{
	UVec4 sign_mask = UVec4::sReplicate(0x80000000U);

	// Determine absolute value and sign of y
	UVec4 y_sign = UVec4::sAnd(inY.ReinterpretAsInt(), sign_mask);
	Vec4 y_abs = Vec4::sXor(inY, y_sign.ReinterpretAsFloat());

	// Determine absolute value and sign of x
	UVec4 x_sign = UVec4::sAnd(inX.ReinterpretAsInt(), sign_mask);
	Vec4 x_abs = Vec4::sXor(inX, x_sign.ReinterpretAsFloat());

	// Always divide smallest / largest to avoid dividing by zero
	UVec4 x_is_numerator = Vec4::sLess(x_abs, y_abs);
	Vec4 numerator = Vec4::sSelect(y_abs, x_abs, x_is_numerator);
	Vec4 denominator = Vec4::sSelect(x_abs, y_abs, x_is_numerator);
	Vec4 atan = (numerator / denominator).ATan();

	// If we calculated x / y instead of y / x the result is PI / 2 - result (note that this is true because we know the result is positive because the input was positive)
	atan = Vec4::sSelect(atan, Vec4::sReplicate(0.5f * JPH_PI) - atan, x_is_numerator);

	// Now we need to map to the correct quadrant
	// x_sign	y_sign	result
	// +1		+1		atan
	// -1		+1		-atan + PI
	// -1		-1		atan - PI
	// +1		-1		-atan
	// This can be written as: x_sign * y_sign * (atan - (x_sign < 0? PI : 0))
	atan -= Vec4::sAnd(x_sign.ArithmeticShiftRight<31>().ReinterpretAsFloat(), Vec4::sReplicate(JPH_PI));
	atan = Vec4::sXor(atan, UVec4::sXor(x_sign, y_sign).ReinterpretAsFloat());
	return atan;
}
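
// A minimal usage sketch (illustrative only): per-lane equivalent of
// std::atan2; the quadrant fix-up above yields the full [-PI, PI] range:
//
//	Vec4 y(1.0f, 1.0f, -1.0f, -1.0f);
//	Vec4 x(1.0f, -1.0f, -1.0f, 1.0f);
//	Vec4 a = Vec4::sATan2(y, x); // a ~ (PI/4, 3*PI/4, -3*PI/4, -PI/4)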
uint32 Vec4::CompressUnitVector() const
{
	constexpr float cOneOverSqrt2 = 0.70710678f;
	constexpr uint cNumBits = 9;
	constexpr uint cMask = (1 << cNumBits) - 1;
	constexpr uint cMaxValue = cMask - 1; // Need odd number of buckets to quantize to or else we can't encode 0
	constexpr float cScale = float(cMaxValue) / (2.0f * cOneOverSqrt2);

	// Store sign bit
	Vec4 v = *this;
	uint32 max_element = v.Abs().GetHighestComponentIndex();
	uint32 value = 0;
	if (v[max_element] < 0.0f)
	{
		value = 0x80000000u;
		v = -v;
	}

	// Store highest component
	value |= max_element << 29;

	// Store the other three components in a compressed format
	UVec4 compressed = Vec4::sClamp((v + Vec4::sReplicate(cOneOverSqrt2)) * cScale + Vec4::sReplicate(0.5f), Vec4::sZero(), Vec4::sReplicate(cMaxValue)).ToInt();
	switch (max_element)
	{
	case 0:
		compressed = compressed.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_UNUSED>();
		break;
	case 1:
		compressed = compressed.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_UNUSED>();
		break;
	case 2:
		compressed = compressed.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_UNUSED>();
		break;
	}
	value |= compressed.GetX();
	value |= compressed.GetY() << cNumBits;
	value |= compressed.GetZ() << 2 * cNumBits;
	return value;
}

Vec4 Vec4::sDecompressUnitVector(uint32 inValue)
{
	constexpr float cOneOverSqrt2 = 0.70710678f;
	constexpr uint cNumBits = 9;
	constexpr uint cMask = (1u << cNumBits) - 1;
	constexpr uint cMaxValue = cMask - 1; // Need odd number of buckets to quantize to or else we can't encode 0
	constexpr float cScale = 2.0f * cOneOverSqrt2 / float(cMaxValue);

	// Restore three components
	Vec4 v = Vec4(UVec4(inValue & cMask, (inValue >> cNumBits) & cMask, (inValue >> (2 * cNumBits)) & cMask, 0).ToFloat()) * cScale - Vec4(cOneOverSqrt2, cOneOverSqrt2, cOneOverSqrt2, 0.0f);
	JPH_ASSERT(v.GetW() == 0.0f);

	// Restore the highest component
	v.SetW(sqrt(max(1.0f - v.LengthSq(), 0.0f)));

	// Extract sign
	if ((inValue & 0x80000000u) != 0)
		v = -v;

	// Swizzle the components in place
	switch ((inValue >> 29) & 3)
	{
	case 0:
		v = v.Swizzle<SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z>();
		break;
	case 1:
		v = v.Swizzle<SWIZZLE_X, SWIZZLE_W, SWIZZLE_Y, SWIZZLE_Z>();
		break;
	case 2:
		v = v.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_Z>();
		break;
	}
	return v;
}
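
// A minimal usage sketch (illustrative only): round-tripping a unit vector
// through the 32-bit encoding (sign + highest-component index + 3 quantized
// components) reproduces it up to quantization error:
//
//	Vec4 v = Vec4(1.0f, 2.0f, 3.0f, 4.0f).Normalized();
//	uint32 packed = v.CompressUnitVector();
//	Vec4 restored = Vec4::sDecompressUnitVector(packed); // close to v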

JPH_NAMESPACE_END