UVec4.inl
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

JPH_NAMESPACE_BEGIN

UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_set_epi32(int(inW), int(inZ), int(inY), int(inX));
#elif defined(JPH_USE_NEON)
    uint32x2_t xy = vcreate_u32(static_cast<uint64>(inX) | (static_cast<uint64>(inY) << 32));
    uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
    mValue = vcombine_u32(xy, zw);
#else
    mU32[0] = inX;
    mU32[1] = inY;
    mU32[2] = inZ;
    mU32[3] = inW;
#endif
}

bool UVec4::operator == (UVec4Arg inV2) const
{
    return sEquals(*this, inV2).TestAllTrue();
}

template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
UVec4 UVec4::Swizzle() const
{
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
    return JPH_NEON_SHUFFLE_U32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
    return UVec4(mU32[SwizzleX], mU32[SwizzleY], mU32[SwizzleZ], mU32[SwizzleW]);
#endif
}
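
// Illustrative usage sketch (not part of the original source): the template arguments pick which
// source component feeds each destination lane, so UVec4(1, 2, 3, 4).Swizzle<SWIZZLE_W, SWIZZLE_Z,
// SWIZZLE_Y, SWIZZLE_X>() yields (4, 3, 2, 1), and a component may be repeated, e.g.
// <SWIZZLE_X, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y> yields (1, 1, 2, 2).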

UVec4 UVec4::sZero()
{
#if defined(JPH_USE_SSE)
    return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
    return vdupq_n_u32(0);
#else
    return UVec4(0, 0, 0, 0);
#endif
}

UVec4 UVec4::sReplicate(uint32 inV)
{
#if defined(JPH_USE_SSE)
    return _mm_set1_epi32(int(inV));
#elif defined(JPH_USE_NEON)
    return vdupq_n_u32(inV);
#else
    return UVec4(inV, inV, inV, inV);
#endif
}

UVec4 UVec4::sLoadInt(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float*>(inV)));
#elif defined(JPH_USE_NEON)
    return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
#else
    return UVec4(*inV, 0, 0, 0);
#endif
}

UVec4 UVec4::sLoadInt4(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
    return vld1q_u32(inV);
#else
    return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}

UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_load_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
    return vld1q_u32(inV); // ARM makes no distinction between aligned and unaligned loads
#else
    return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}

template <const int Scale>
UVec4 UVec4::sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
{
#ifdef JPH_USE_AVX2
    return _mm_i32gather_epi32(reinterpret_cast<const int *>(inBase), inOffsets.mValue, Scale);
#else
    const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
    uint32 x = *reinterpret_cast<const uint32 *>(base + inOffsets.GetX() * Scale);
    uint32 y = *reinterpret_cast<const uint32 *>(base + inOffsets.GetY() * Scale);
    uint32 z = *reinterpret_cast<const uint32 *>(base + inOffsets.GetZ() * Scale);
    uint32 w = *reinterpret_cast<const uint32 *>(base + inOffsets.GetW() * Scale);
    return UVec4(x, y, z, w);
#endif
}
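
// Illustrative usage sketch (not part of the original source): Scale is a byte stride, so gathering
// uint32 values from an array by element index reads as sGatherInt4<sizeof(uint32)>(data, offsets);
// offsets of (0, 2, 4, 6) would then load data[0], data[2], data[4] and data[6].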

UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_min_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vminq_u32(inV1.mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
    return result;
#endif
}

UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_max_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmaxq_u32(inV1.mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
    return result;
#endif
}

UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_cmpeq_epi32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vceqq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] == inV2.mU32[0]? 0xffffffffu : 0,
                 inV1.mU32[1] == inV2.mU32[1]? 0xffffffffu : 0,
                 inV1.mU32[2] == inV2.mU32[2]? 0xffffffffu : 0,
                 inV1.mU32[3] == inV2.mU32[3]? 0xffffffffu : 0);
#endif
}

UVec4 UVec4::sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on Firefox
    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inNotSet.mValue), _mm_castsi128_ps(inSet.mValue), _mm_castsi128_ps(inControl.mValue)));
#elif defined(JPH_USE_SSE)
    __m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
    return _mm_castps_si128(_mm_or_ps(_mm_and_ps(is_set, _mm_castsi128_ps(inSet.mValue)), _mm_andnot_ps(is_set, _mm_castsi128_ps(inNotSet.mValue))));
#elif defined(JPH_USE_NEON)
    return vbslq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mU32[i] : inNotSet.mU32[i];
    return result;
#endif
}
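
// Illustrative note (not part of the original source): every code path above tests only the highest
// bit of each control lane, so sSelect(a, b, UVec4(0x80000000u, 0, 0xffffffffu, 1)) returns
// (b.x, a.y, b.z, a.w).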

UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vorrq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] | inV2.mU32[0],
                 inV1.mU32[1] | inV2.mU32[1],
                 inV1.mU32[2] | inV2.mU32[2],
                 inV1.mU32[3] | inV2.mU32[3]);
#endif
}

UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return veorq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] ^ inV2.mU32[0],
                 inV1.mU32[1] ^ inV2.mU32[1],
                 inV1.mU32[2] ^ inV2.mU32[2],
                 inV1.mU32[3] ^ inV2.mU32[3]);
#endif
}

UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vandq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] & inV2.mU32[0],
                 inV1.mU32[1] & inV2.mU32[1],
                 inV1.mU32[2] & inV2.mU32[2],
                 inV1.mU32[3] & inV2.mU32[3]);
#endif
}

UVec4 UVec4::sNot(UVec4Arg inV1)
{
#if defined(JPH_USE_AVX512)
    return _mm_ternarylogic_epi32(inV1.mValue, inV1.mValue, inV1.mValue, 0b01010101);
#elif defined(JPH_USE_SSE)
    return sXor(inV1, sReplicate(0xffffffff));
#elif defined(JPH_USE_NEON)
    return vmvnq_u32(inV1.mValue);
#else
    return UVec4(~inV1.mU32[0], ~inV1.mU32[1], ~inV1.mU32[2], ~inV1.mU32[3]);
#endif
}

UVec4 UVec4::sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
{
    // If inValue.z is false then shift W to Z
    UVec4 v = UVec4::sSelect(inIndex.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>(), inIndex, inValue.SplatZ());

    // If inValue.y is false then shift Z and further to Y and further
    v = UVec4::sSelect(v.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatY());

    // If inValue.x is false then shift Y and further to X and further
    v = UVec4::sSelect(v.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatX());

    return v;
}
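
// Illustrative sketch (not part of the original source): with inValue = (true, false, true, false)
// (where "true" means the high bit is set) and inIndex = (0, 1, 2, 3), the select/swizzle chain above
// compacts the indices of the true lanes to the front, yielding (0, 2, 3, 3); the trailing lanes are
// duplicates of the last index.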

UVec4 UVec4::operator * (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_mullo_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_u32(mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = mU32[i] * inV2.mU32[i];
    return result;
#endif
}

UVec4 UVec4::operator + (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vaddq_u32(mValue, inV2.mValue);
#else
    return UVec4(mU32[0] + inV2.mU32[0],
                 mU32[1] + inV2.mU32[1],
                 mU32[2] + inV2.mU32[2],
                 mU32[3] + inV2.mU32[3]);
#endif
}

UVec4 &UVec4::operator += (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vaddq_u32(mValue, inV2.mValue);
#else
    for (int i = 0; i < 4; ++i)
        mU32[i] += inV2.mU32[i];
#endif
    return *this;
}

UVec4 UVec4::operator - (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_sub_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vsubq_u32(mValue, inV2.mValue);
#else
    return UVec4(mU32[0] - inV2.mU32[0],
                 mU32[1] - inV2.mU32[1],
                 mU32[2] - inV2.mU32[2],
                 mU32[3] - inV2.mU32[3]);
#endif
}

UVec4 &UVec4::operator -= (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_sub_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vsubq_u32(mValue, inV2.mValue);
#else
    for (int i = 0; i < 4; ++i)
        mU32[i] -= inV2.mU32[i];
#endif
    return *this;
}

UVec4 UVec4::SplatX() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 0);
#else
    return UVec4(mU32[0], mU32[0], mU32[0], mU32[0]);
#endif
}

UVec4 UVec4::SplatY() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 1);
#else
    return UVec4(mU32[1], mU32[1], mU32[1], mU32[1]);
#endif
}

UVec4 UVec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 2);
#else
    return UVec4(mU32[2], mU32[2], mU32[2], mU32[2]);
#endif
}

UVec4 UVec4::SplatW() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 3);
#else
    return UVec4(mU32[3], mU32[3], mU32[3], mU32[3]);
#endif
}

Vec4 UVec4::ToFloat() const
{
#if defined(JPH_USE_SSE)
    return _mm_cvtepi32_ps(mValue);
#elif defined(JPH_USE_NEON)
    return vcvtq_f32_u32(mValue);
#else
    return Vec4((float)mU32[0], (float)mU32[1], (float)mU32[2], (float)mU32[3]);
#endif
}

Vec4 UVec4::ReinterpretAsFloat() const
{
#if defined(JPH_USE_SSE)
    return Vec4(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(mValue);
#else
    return *reinterpret_cast<const Vec4 *>(this);
#endif
}

UVec4 UVec4::DotV(UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    __m128i mul = _mm_mullo_epi32(mValue, inV2.mValue);
    __m128i sum = _mm_add_epi32(mul, _mm_shuffle_epi32(mul, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(JPH_USE_NEON)
    uint32x4_t mul = vmulq_u32(mValue, inV2.mValue);
    return vdupq_n_u32(vaddvq_u32(mul));
#else
    return UVec4::sReplicate(mU32[0] * inV2.mU32[0] + mU32[1] * inV2.mU32[1] + mU32[2] * inV2.mU32[2] + mU32[3] * inV2.mU32[3]);
#endif
}

uint32 UVec4::Dot(UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    __m128i mul = _mm_mullo_epi32(mValue, inV2.mValue);
    __m128i sum = _mm_add_epi32(mul, _mm_shuffle_epi32(mul, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2))));
#elif defined(JPH_USE_NEON)
    uint32x4_t mul = vmulq_u32(mValue, inV2.mValue);
    return vaddvq_u32(mul);
#else
    return mU32[0] * inV2.mU32[0] + mU32[1] * inV2.mU32[1] + mU32[2] * inV2.mU32[2] + mU32[3] * inV2.mU32[3];
#endif
}

void UVec4::StoreInt4(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_storeu_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
    vst1q_u32(outV, mValue);
#else
    for (int i = 0; i < 4; ++i)
        outV[i] = mU32[i];
#endif
}

void UVec4::StoreInt4Aligned(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_store_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
    vst1q_u32(outV, mValue); // ARM makes no distinction between aligned and unaligned stores
#else
    for (int i = 0; i < 4; ++i)
        outV[i] = mU32[i];
#endif
}

int UVec4::CountTrues() const
{
#if defined(JPH_USE_SSE)
    return CountBits(_mm_movemask_ps(_mm_castsi128_ps(mValue)));
#elif defined(JPH_USE_NEON)
    return vaddvq_u32(vshrq_n_u32(mValue, 31));
#else
    return (mU32[0] >> 31) + (mU32[1] >> 31) + (mU32[2] >> 31) + (mU32[3] >> 31);
#endif
}

int UVec4::GetTrues() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
    int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
    return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
#else
    return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
#endif
}
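
// Illustrative note (not part of the original source): both functions above read only the sign bit
// of each lane. For a comparison result with X and Z true (0xffffffff) and Y and W false (0),
// GetTrues() returns 0b0101 and CountTrues() returns 2.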

bool UVec4::TestAnyTrue() const
{
    return GetTrues() != 0;
}

bool UVec4::TestAnyXYZTrue() const
{
    return (GetTrues() & 0b111) != 0;
}

bool UVec4::TestAllTrue() const
{
    return GetTrues() == 0b1111;
}

bool UVec4::TestAllXYZTrue() const
{
    return (GetTrues() & 0b111) == 0b111;
}

template <const uint Count>
UVec4 UVec4::LogicalShiftLeft() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_slli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vshlq_n_u32(mValue, Count);
#else
    return UVec4(mU32[0] << Count, mU32[1] << Count, mU32[2] << Count, mU32[3] << Count);
#endif
}

template <const uint Count>
UVec4 UVec4::LogicalShiftRight() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_srli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vshrq_n_u32(mValue, Count);
#else
    return UVec4(mU32[0] >> Count, mU32[1] >> Count, mU32[2] >> Count, mU32[3] >> Count);
#endif
}

template <const uint Count>
UVec4 UVec4::ArithmeticShiftRight() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_srai_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(mValue), Count));
#else
    return UVec4(uint32(int32_t(mU32[0]) >> Count),
                 uint32(int32_t(mU32[1]) >> Count),
                 uint32(int32_t(mU32[2]) >> Count),
                 uint32(int32_t(mU32[3]) >> Count));
#endif
}

UVec4 UVec4::Expand4Uint16Lo() const
{
#if defined(JPH_USE_SSE)
    return _mm_unpacklo_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
    uint16x4_t value = vget_low_u16(vreinterpretq_u16_u32(mValue));
    uint16x4_t zero = vdup_n_u16(0);
    return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#else
    return UVec4(mU32[0] & 0xffff,
                 (mU32[0] >> 16) & 0xffff,
                 mU32[1] & 0xffff,
                 (mU32[1] >> 16) & 0xffff);
#endif
}

UVec4 UVec4::Expand4Uint16Hi() const
{
#if defined(JPH_USE_SSE)
    return _mm_unpackhi_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
    uint16x4_t value = vget_high_u16(vreinterpretq_u16_u32(mValue));
    uint16x4_t zero = vdup_n_u16(0);
    return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#else
    return UVec4(mU32[2] & 0xffff,
                 (mU32[2] >> 16) & 0xffff,
                 mU32[3] & 0xffff,
                 (mU32[3] >> 16) & 0xffff);
#endif
}

UVec4 UVec4::Expand4Byte0() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
#elif defined(JPH_USE_NEON)
    uint8x16_t idx = JPH_NEON_UINT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
    return result;
#endif
}
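
// Illustrative note (not part of the original source): viewing the vector as 16 bytes,
// Expand4Byte0() zero-extends bytes 0..3 into the four 32-bit lanes, so if X = 0x44332211 the
// result is (0x11, 0x22, 0x33, 0x44); Expand4Byte4/8/12 below do the same for bytes 4..7,
// 8..11 and 12..15.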

UVec4 UVec4::Expand4Byte4() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
#elif defined(JPH_USE_NEON)
    uint8x16_t idx = JPH_NEON_UINT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::Expand4Byte8() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
#elif defined(JPH_USE_NEON)
    uint8x16_t idx = JPH_NEON_UINT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::Expand4Byte12() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
#elif defined(JPH_USE_NEON)
    uint8x16_t idx = JPH_NEON_UINT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::ShiftComponents4Minus(int inCount) const
{
#if defined(JPH_USE_SSE4_1) || defined(JPH_USE_NEON)
    alignas(UVec4) static constexpr uint32 sFourMinusXShuffle[5][4] =
    {
        { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
        { 0x0f0e0d0c, 0xffffffff, 0xffffffff, 0xffffffff },
        { 0x0b0a0908, 0x0f0e0d0c, 0xffffffff, 0xffffffff },
        { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0xffffffff },
        { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c }
    };
#endif

#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, *reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
#elif defined(JPH_USE_NEON)
    uint8x16_t idx = vreinterpretq_u8_u32(*reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result = UVec4::sZero();
    for (int i = 0; i < inCount; i++)
        result.mU32[i] = mU32[i + 4 - inCount];
    return result;
#endif
}
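
// Illustrative note (not part of the original source): this moves the last inCount components to the
// front and fills the remaining lanes with zero, so (1, 2, 3, 4).ShiftComponents4Minus(2) yields
// (3, 4, 0, 0); inCount must be in the range [0, 4].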

JPH_NAMESPACE_END