UVec4.inl

// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

JPH_NAMESPACE_BEGIN

UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_set_epi32(int(inW), int(inZ), int(inY), int(inX));
#elif defined(JPH_USE_NEON)
    uint32x2_t xy = vcreate_u32(static_cast<uint64>(inX) | (static_cast<uint64>(inY) << 32));
    uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
    mValue = vcombine_u32(xy, zw);
#else
    mU32[0] = inX;
    mU32[1] = inY;
    mU32[2] = inZ;
    mU32[3] = inW;
#endif
}

bool UVec4::operator == (UVec4Arg inV2) const
{
    return sEquals(*this, inV2).TestAllTrue();
}

template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
UVec4 UVec4::Swizzle() const
{
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
    return JPH_NEON_SHUFFLE_U32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
    return UVec4(mU32[SwizzleX], mU32[SwizzleY], mU32[SwizzleZ], mU32[SwizzleW]);
#endif
}

UVec4 UVec4::sZero()
{
#if defined(JPH_USE_SSE)
    return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
    return vdupq_n_u32(0);
#else
    return UVec4(0, 0, 0, 0);
#endif
}

UVec4 UVec4::sReplicate(uint32 inV)
{
#if defined(JPH_USE_SSE)
    return _mm_set1_epi32(int(inV));
#elif defined(JPH_USE_NEON)
    return vdupq_n_u32(inV);
#else
    return UVec4(inV, inV, inV, inV);
#endif
}
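
// sLoadInt reads a single uint32 into the X component and zeroes Y, Z and W;
// sLoadInt4 / sLoadInt4Aligned below load a full four-component vector.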
UVec4 UVec4::sLoadInt(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float*>(inV)));
#elif defined(JPH_USE_NEON)
    return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
#else
    return UVec4(*inV, 0, 0, 0);
#endif
}

UVec4 UVec4::sLoadInt4(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
    return vld1q_u32(inV);
#else
    return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}

UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_load_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
    return vld1q_u32(inV); // ARM doesn't make distinction between aligned or not
#else
    return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
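
// Note: Scale is the byte multiplier applied to each offset, as in _mm_i32gather_epi32, so gathering
// from a plain uint32 array would use Scale = 4 (e.g. UVec4::sGatherInt4<4>(base, offsets), an
// illustrative call, not code from this file). Without AVX2 the gather goes through
// Vec4::sGatherFloat4 and is reinterpreted back to integers.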
template <const int Scale>
UVec4 UVec4::sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
{
#ifdef JPH_USE_AVX2
    return _mm_i32gather_epi32(reinterpret_cast<const int *>(inBase), inOffsets.mValue, Scale);
#else
    return Vec4::sGatherFloat4<Scale>(reinterpret_cast<const float *>(inBase), inOffsets).ReinterpretAsInt();
#endif
}
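
// Unsigned 32-bit min/max instructions (_mm_min_epu32 / _mm_max_epu32) require SSE4.1,
// so plain SSE builds use the scalar fallback below.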
UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_min_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vminq_u32(inV1.mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
    return result;
#endif
}

UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_max_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmaxq_u32(inV1.mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
    return result;
#endif
}

UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_cmpeq_epi32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vceqq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] == inV2.mU32[0]? 0xffffffffu : 0,
                 inV1.mU32[1] == inV2.mU32[1]? 0xffffffffu : 0,
                 inV1.mU32[2] == inV2.mU32[2]? 0xffffffffu : 0,
                 inV1.mU32[3] == inV2.mU32[3]? 0xffffffffu : 0);
#endif
}
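
// Note: inControl is expected to hold comparison results (all bits set or all bits clear per
// component, e.g. from sEquals); the SSE4.1 and NEON paths only look at the sign bit of each lane.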
UVec4 UVec4::sSelect(UVec4Arg inV1, UVec4Arg inV2, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inV1.mValue), _mm_castsi128_ps(inV2.mValue), _mm_castsi128_ps(inControl.mValue)));
#elif defined(JPH_USE_NEON)
    return vbslq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inV2.mValue, inV1.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = inControl.mU32[i] ? inV2.mU32[i] : inV1.mU32[i];
    return result;
#endif
}

UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vorrq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] | inV2.mU32[0],
                 inV1.mU32[1] | inV2.mU32[1],
                 inV1.mU32[2] | inV2.mU32[2],
                 inV1.mU32[3] | inV2.mU32[3]);
#endif
}

UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return veorq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] ^ inV2.mU32[0],
                 inV1.mU32[1] ^ inV2.mU32[1],
                 inV1.mU32[2] ^ inV2.mU32[2],
                 inV1.mU32[3] ^ inV2.mU32[3]);
#endif
}

UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vandq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] & inV2.mU32[0],
                 inV1.mU32[1] & inV2.mU32[1],
                 inV1.mU32[2] & inV2.mU32[2],
                 inV1.mU32[3] & inV2.mU32[3]);
#endif
}

UVec4 UVec4::sNot(UVec4Arg inV1)
{
#if defined(JPH_USE_AVX512)
    return _mm_ternarylogic_epi32(inV1.mValue, inV1.mValue, inV1.mValue, 0b01010101);
#elif defined(JPH_USE_SSE)
    return sXor(inV1, sReplicate(0xffffffff));
#elif defined(JPH_USE_NEON)
    return vmvnq_u32(inV1.mValue);
#else
    return UVec4(~inV1.mU32[0], ~inV1.mU32[1], ~inV1.mU32[2], ~inV1.mU32[3]);
#endif
}
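
// Given four booleans in inValue (all bits set = true) and four values in inIndex, this packs the
// inIndex components whose boolean is true towards X, preserving their order; the trailing
// components end up holding copies of the original W component.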
UVec4 UVec4::sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
{
    // If inValue.z is false then shift W to Z
    UVec4 v = UVec4::sSelect(inIndex.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>(), inIndex, inValue.SplatZ());

    // If inValue.y is false then shift Z and further to Y and further
    v = UVec4::sSelect(v.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatY());

    // If inValue.x is false then shift Y and further to X and further
    v = UVec4::sSelect(v.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatX());

    return v;
}

UVec4 UVec4::operator * (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_mullo_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_u32(mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = mU32[i] * inV2.mU32[i];
    return result;
#endif
}

UVec4 UVec4::operator + (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vaddq_u32(mValue, inV2.mValue);
#else
    return UVec4(mU32[0] + inV2.mU32[0],
                 mU32[1] + inV2.mU32[1],
                 mU32[2] + inV2.mU32[2],
                 mU32[3] + inV2.mU32[3]);
#endif
}

UVec4 &UVec4::operator += (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vaddq_u32(mValue, inV2.mValue);
#else
    for (int i = 0; i < 4; ++i)
        mU32[i] += inV2.mU32[i];
#endif
    return *this;
}

UVec4 UVec4::SplatX() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 0);
#else
    return UVec4(mU32[0], mU32[0], mU32[0], mU32[0]);
#endif
}

UVec4 UVec4::SplatY() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 1);
#else
    return UVec4(mU32[1], mU32[1], mU32[1], mU32[1]);
#endif
}

UVec4 UVec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 2);
#else
    return UVec4(mU32[2], mU32[2], mU32[2], mU32[2]);
#endif
}

UVec4 UVec4::SplatW() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 3);
#else
    return UVec4(mU32[3], mU32[3], mU32[3], mU32[3]);
#endif
}

Vec4 UVec4::ToFloat() const
{
#if defined(JPH_USE_SSE)
    return _mm_cvtepi32_ps(mValue);
#elif defined(JPH_USE_NEON)
    return vcvtq_f32_u32(mValue);
#else
    return Vec4((float)mU32[0], (float)mU32[1], (float)mU32[2], (float)mU32[3]);
#endif
}

Vec4 UVec4::ReinterpretAsFloat() const
{
#if defined(JPH_USE_SSE)
    return Vec4(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(mValue);
#else
    return *reinterpret_cast<const Vec4 *>(this);
#endif
}

void UVec4::StoreInt4(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_storeu_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
    vst1q_u32(outV, mValue);
#else
    for (int i = 0; i < 4; ++i)
        outV[i] = mU32[i];
#endif
}

void UVec4::StoreInt4Aligned(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_store_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
    vst1q_u32(outV, mValue); // ARM doesn't make distinction between aligned or not
#else
    for (int i = 0; i < 4; ++i)
        outV[i] = mU32[i];
#endif
}
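
// A component counts as "true" when its sign bit (bit 31) is set, i.e. when it holds the result of
// a comparison such as sEquals; CountTrues returns how many of the four lanes are true.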
int UVec4::CountTrues() const
{
#if defined(JPH_USE_SSE)
    return CountBits(_mm_movemask_ps(_mm_castsi128_ps(mValue)));
#elif defined(JPH_USE_NEON)
    return vaddvq_u32(vshrq_n_u32(mValue, 31));
#else
    return (mU32[0] >> 31) + (mU32[1] >> 31) + (mU32[2] >> 31) + (mU32[3] >> 31);
#endif
}
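
// GetTrues packs the four sign bits into a 4-bit mask (bit 0 = X .. bit 3 = W), which is what the
// Test*True() helpers below operate on.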
int UVec4::GetTrues() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
    int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
    return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
#else
    return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
#endif
}

bool UVec4::TestAnyTrue() const
{
    return GetTrues() != 0;
}

bool UVec4::TestAnyXYZTrue() const
{
    return (GetTrues() & 0b111) != 0;
}

bool UVec4::TestAllTrue() const
{
    return GetTrues() == 0b1111;
}

bool UVec4::TestAllXYZTrue() const
{
    return (GetTrues() & 0b111) == 0b111;
}
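
// The shift count is a template parameter so the immediate forms of the SSE/NEON shift instructions
// can be used, e.g. v.LogicalShiftLeft<8>() shifts every component left by 8 bits.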
template <const uint Count>
UVec4 UVec4::LogicalShiftLeft() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_slli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vshlq_n_u32(mValue, Count);
#else
    return UVec4(mU32[0] << Count, mU32[1] << Count, mU32[2] << Count, mU32[3] << Count);
#endif
}

template <const uint Count>
UVec4 UVec4::LogicalShiftRight() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_srli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vshrq_n_u32(mValue, Count);
#else
    return UVec4(mU32[0] >> Count, mU32[1] >> Count, mU32[2] >> Count, mU32[3] >> Count);
#endif
}

template <const uint Count>
UVec4 UVec4::ArithmeticShiftRight() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_srai_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(mValue), Count));
#else
    return UVec4(uint32(int32_t(mU32[0]) >> Count),
                 uint32(int32_t(mU32[1]) >> Count),
                 uint32(int32_t(mU32[2]) >> Count),
                 uint32(int32_t(mU32[3]) >> Count));
#endif
}
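
// Expand4Uint16Lo / Expand4Uint16Hi zero-extend the four 16-bit values stored in the low / high
// 64 bits of the vector into four full 32-bit components.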
UVec4 UVec4::Expand4Uint16Lo() const
{
#if defined(JPH_USE_SSE)
    return _mm_unpacklo_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
    uint16x4_t value = vget_low_u16(vreinterpretq_u16_u32(mValue));
    uint16x4_t zero = vdup_n_u16(0);
    return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#else
    return UVec4(mU32[0] & 0xffff,
                 (mU32[0] >> 16) & 0xffff,
                 mU32[1] & 0xffff,
                 (mU32[1] >> 16) & 0xffff);
#endif
}

UVec4 UVec4::Expand4Uint16Hi() const
{
#if defined(JPH_USE_SSE)
    return _mm_unpackhi_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
    uint16x4_t value = vget_high_u16(vreinterpretq_u16_u32(mValue));
    uint16x4_t zero = vdup_n_u16(0);
    return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#else
    return UVec4(mU32[2] & 0xffff,
                 (mU32[2] >> 16) & 0xffff,
                 mU32[3] & 0xffff,
                 (mU32[3] >> 16) & 0xffff);
#endif
}
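
// The Expand4ByteN functions zero-extend bytes N .. N + 3 of the vector into the four 32-bit
// components; the out-of-range shuffle indices (high bit set for _mm_shuffle_epi8, 0x7f for the
// NEON table lookup) produce the zero bytes.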
UVec4 UVec4::Expand4Byte0() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = JPH_NEON_INT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::Expand4Byte4() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = JPH_NEON_INT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::Expand4Byte8() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = JPH_NEON_INT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::Expand4Byte12() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = JPH_NEON_INT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
    return result;
#endif
}
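
// Row inCount of sFourMinusXShuffle moves the top inCount components down to the front (component
// [4 - inCount] ends up in X) and zeroes the remaining components; the 0xffffffff entries are
// out-of-range shuffle indices, which both _mm_shuffle_epi8 and the NEON table lookup turn into zeros.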
UVec4 UVec4::ShiftComponents4Minus(int inCount) const
{
#if defined(JPH_USE_SSE4_1) || defined(JPH_USE_NEON)
    alignas(UVec4) static constexpr uint32 sFourMinusXShuffle[5][4] =
    {
        { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
        { 0x0f0e0d0c, 0xffffffff, 0xffffffff, 0xffffffff },
        { 0x0b0a0908, 0x0f0e0d0c, 0xffffffff, 0xffffffff },
        { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0xffffffff },
        { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c }
    };
#endif

#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, *reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
#elif defined(JPH_USE_NEON)
    uint8x16_t idx = vreinterpretq_u8_u32(*reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result = UVec4::sZero();
    for (int i = 0; i < inCount; i++)
        result.mU32[i] = mU32[i + 4 - inCount];
    return result;
#endif
}

JPH_NAMESPACE_END