UVec4.inl

// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

JPH_NAMESPACE_BEGIN

UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_set_epi32(int(inW), int(inZ), int(inY), int(inX));
#elif defined(JPH_USE_NEON)
    uint32x2_t xy = vcreate_u32(static_cast<uint64>(inX) | (static_cast<uint64>(inY) << 32));
    uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
    mValue = vcombine_u32(xy, zw);
#else
    mU32[0] = inX;
    mU32[1] = inY;
    mU32[2] = inZ;
    mU32[3] = inW;
#endif
}

bool UVec4::operator == (UVec4Arg inV2) const
{
    return sEquals(*this, inV2).TestAllTrue();
}
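
// Swizzle the components: component i of the result is the component selected by the corresponding
// template parameter (0 = X .. 3 = W).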
  23. template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
  24. UVec4 UVec4::Swizzle() const
  25. {
  26. static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
  27. static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
  28. static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
  29. static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
  30. #if defined(JPH_USE_SSE)
  31. return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
  32. #elif defined(JPH_USE_NEON)
  33. return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
  34. #else
  35. return UVec4(mU32[SwizzleX], mU32[SwizzleY], mU32[SwizzleZ], mU32[SwizzleW]);
  36. #endif
  37. }
  38. UVec4 UVec4::sZero()
  39. {
  40. #if defined(JPH_USE_SSE)
  41. return _mm_setzero_si128();
  42. #elif defined(JPH_USE_NEON)
  43. return vdupq_n_u32(0);
  44. #else
  45. return UVec4(0, 0, 0, 0);
  46. #endif
  47. }
  48. UVec4 UVec4::sReplicate(uint32 inV)
  49. {
  50. #if defined(JPH_USE_SSE)
  51. return _mm_set1_epi32(int(inV));
  52. #elif defined(JPH_USE_NEON)
  53. return vdupq_n_u32(inV);
  54. #else
  55. return UVec4(inV, inV, inV, inV);
  56. #endif
  57. }
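
// Load a single 32-bit value into the X component; Y, Z and W are set to zero.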
UVec4 UVec4::sLoadInt(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float*>(inV)));
#elif defined(JPH_USE_NEON)
    return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
#else
    return UVec4(*inV, 0, 0, 0);
#endif
}

UVec4 UVec4::sLoadInt4(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
    return vld1q_u32(inV);
#else
    return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}

UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_load_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
    return vld1q_u32(inV); // ARM doesn't make a distinction between aligned and unaligned loads
#else
    return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
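
// Gather 4 values: component i is loaded from inBase + inOffsets[i] * Scale bytes.
// Without AVX2 this falls back to the Vec4 float gather and reinterprets the result as integers.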
  88. template <const int Scale>
  89. UVec4 UVec4::sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
  90. {
  91. #ifdef JPH_USE_AVX2
  92. return _mm_i32gather_epi32(reinterpret_cast<const int *>(inBase), inOffsets.mValue, Scale);
  93. #else
  94. return Vec4::sGatherFloat4<Scale>(reinterpret_cast<const float *>(inBase), inOffsets).ReinterpretAsInt();
  95. #endif
  96. }
  97. UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
  98. {
  99. #if defined(JPH_USE_SSE4_1)
  100. return _mm_min_epu32(inV1.mValue, inV2.mValue);
  101. #elif defined(JPH_USE_NEON)
  102. return vminq_u32(inV1.mValue, inV2.mValue);
  103. #else
  104. UVec4 result;
  105. for (int i = 0; i < 4; i++)
  106. result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
  107. return result;
  108. #endif
  109. }
  110. UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
  111. {
  112. #if defined(JPH_USE_SSE4_1)
  113. return _mm_max_epu32(inV1.mValue, inV2.mValue);
  114. #elif defined(JPH_USE_NEON)
  115. return vmaxq_u32(inV1.mValue, inV2.mValue);
  116. #else
  117. UVec4 result;
  118. for (int i = 0; i < 4; i++)
  119. result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
  120. return result;
  121. #endif
  122. }
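
// Component-wise equality test; a result component is 0xffffffff when the inputs are equal, 0 otherwise.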
UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_cmpeq_epi32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vceqq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] == inV2.mU32[0]? 0xffffffffu : 0,
                 inV1.mU32[1] == inV2.mU32[1]? 0xffffffffu : 0,
                 inV1.mU32[2] == inV2.mU32[2]? 0xffffffffu : 0,
                 inV1.mU32[3] == inV2.mU32[3]? 0xffffffffu : 0);
#endif
}
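
// Component-wise select: returns inV2 where inControl is true, inV1 otherwise. The SIMD paths test
// the highest bit of each control component; the scalar fallback treats any non-zero value as true.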
UVec4 UVec4::sSelect(UVec4Arg inV1, UVec4Arg inV2, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inV1.mValue), _mm_castsi128_ps(inV2.mValue), _mm_castsi128_ps(inControl.mValue)));
#elif defined(JPH_USE_NEON)
    return vbslq_u32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = inControl.mU32[i] ? inV2.mU32[i] : inV1.mU32[i];
    return result;
#endif
}

UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vorrq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] | inV2.mU32[0],
                 inV1.mU32[1] | inV2.mU32[1],
                 inV1.mU32[2] | inV2.mU32[2],
                 inV1.mU32[3] | inV2.mU32[3]);
#endif
}

UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return veorq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] ^ inV2.mU32[0],
                 inV1.mU32[1] ^ inV2.mU32[1],
                 inV1.mU32[2] ^ inV2.mU32[2],
                 inV1.mU32[3] ^ inV2.mU32[3]);
#endif
}

UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vandq_u32(inV1.mValue, inV2.mValue);
#else
    return UVec4(inV1.mU32[0] & inV2.mU32[0],
                 inV1.mU32[1] & inV2.mU32[1],
                 inV1.mU32[2] & inV2.mU32[2],
                 inV1.mU32[3] & inV2.mU32[3]);
#endif
}
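
// Bitwise NOT of all components (the AVX-512 path uses a ternary-logic truth table of 0b01010101, i.e. ~a).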
UVec4 UVec4::sNot(UVec4Arg inV1)
{
#if defined(JPH_USE_AVX512)
    return _mm_ternarylogic_epi32(inV1.mValue, inV1.mValue, inV1.mValue, 0b01010101);
#elif defined(JPH_USE_SSE)
    return sXor(inV1, sReplicate(0xffffffff));
#elif defined(JPH_USE_NEON)
    return vmvnq_u32(inV1.mValue);
#else
    return UVec4(~inV1.mU32[0], ~inV1.mU32[1], ~inV1.mU32[2], ~inV1.mU32[3]);
#endif
}
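
// Compacts inIndex: the components of inIndex whose corresponding component in inValue is true are
// shifted towards X while keeping their order; the remaining components are filled with inIndex.w.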
UVec4 UVec4::sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
{
    // If inValue.z is false then shift W to Z
    UVec4 v = UVec4::sSelect(inIndex.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>(), inIndex, inValue.SplatZ());

    // If inValue.y is false then shift Z and further to Y and further
    v = UVec4::sSelect(v.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatY());

    // If inValue.x is false then shift Y and further to X and further
    v = UVec4::sSelect(v.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatX());

    return v;
}
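
// Component-wise multiply, keeping the low 32 bits of each product.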
UVec4 UVec4::operator * (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_mullo_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_u32(mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = mU32[i] * inV2.mU32[i];
    return result;
#endif
}

UVec4 UVec4::operator + (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vaddq_u32(mValue, inV2.mValue);
#else
    return UVec4(mU32[0] + inV2.mU32[0],
                 mU32[1] + inV2.mU32[1],
                 mU32[2] + inV2.mU32[2],
                 mU32[3] + inV2.mU32[3]);
#endif
}

UVec4 &UVec4::operator += (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vaddq_u32(mValue, inV2.mValue);
#else
    for (int i = 0; i < 4; ++i)
        mU32[i] += inV2.mU32[i];
#endif
    return *this;
}

UVec4 UVec4::SplatX() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 0);
#else
    return UVec4(mU32[0], mU32[0], mU32[0], mU32[0]);
#endif
}

UVec4 UVec4::SplatY() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 1);
#else
    return UVec4(mU32[1], mU32[1], mU32[1], mU32[1]);
#endif
}

UVec4 UVec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 2);
#else
    return UVec4(mU32[2], mU32[2], mU32[2], mU32[2]);
#endif
}

UVec4 UVec4::SplatW() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 3);
#else
    return UVec4(mU32[3], mU32[3], mU32[3], mU32[3]);
#endif
}

Vec4 UVec4::ToFloat() const
{
#if defined(JPH_USE_SSE)
    return _mm_cvtepi32_ps(mValue);
#elif defined(JPH_USE_NEON)
    return vcvtq_f32_s32(mValue);
#else
    return Vec4((float)mU32[0], (float)mU32[1], (float)mU32[2], (float)mU32[3]);
#endif
}

Vec4 UVec4::ReinterpretAsFloat() const
{
#if defined(JPH_USE_SSE)
    return Vec4(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_s32(mValue);
#else
    return *reinterpret_cast<const Vec4 *>(this);
#endif
}

void UVec4::StoreInt4(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_storeu_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
    vst1q_u32(outV, mValue);
#else
    for (int i = 0; i < 4; ++i)
        outV[i] = mU32[i];
#endif
}

void UVec4::StoreInt4Aligned(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_store_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
    vst1q_u32(outV, mValue); // ARM doesn't make a distinction between aligned and unaligned stores
#else
    for (int i = 0; i < 4; ++i)
        outV[i] = mU32[i];
#endif
}
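
// Count the number of components that have their highest bit set.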
int UVec4::CountTrues() const
{
#if defined(JPH_USE_SSE)
    return CountBits(_mm_movemask_ps(_mm_castsi128_ps(mValue)));
#elif defined(JPH_USE_NEON)
    return vaddvq_u32(vshrq_n_u32(mValue, 31));
#else
    return (mU32[0] >> 31) + (mU32[1] >> 31) + (mU32[2] >> 31) + (mU32[3] >> 31);
#endif
}
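
// Return a 4-bit mask where bit i is set when component i has its highest bit set (X = bit 0 .. W = bit 3).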
int UVec4::GetTrues() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
    int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
    return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
#else
    return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
#endif
}

bool UVec4::TestAnyTrue() const
{
    return GetTrues() != 0;
}

bool UVec4::TestAnyXYZTrue() const
{
    return (GetTrues() & 0b111) != 0;
}

bool UVec4::TestAllTrue() const
{
    return GetTrues() == 0b1111;
}

bool UVec4::TestAllXYZTrue() const
{
    return (GetTrues() & 0b111) == 0b111;
}
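
// Shift all components left by Count bits; zeros are shifted in from the right.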
  367. template <const uint Count>
  368. UVec4 UVec4::LogicalShiftLeft() const
  369. {
  370. static_assert(Count <= 31, "Invalid shift");
  371. #if defined(JPH_USE_SSE)
  372. return _mm_slli_epi32(mValue, Count);
  373. #elif defined(JPH_USE_NEON)
  374. return vshlq_n_u32(mValue, Count);
  375. #else
  376. return UVec4(mU32[0] << Count, mU32[1] << Count, mU32[2] << Count, mU32[3] << Count);
  377. #endif
  378. }
  379. template <const uint Count>
  380. UVec4 UVec4::LogicalShiftRight() const
  381. {
  382. static_assert(Count <= 31, "Invalid shift");
  383. #if defined(JPH_USE_SSE)
  384. return _mm_srli_epi32(mValue, Count);
  385. #elif defined(JPH_USE_NEON)
  386. return vshrq_n_u32(mValue, Count);
  387. #else
  388. return UVec4(mU32[0] >> Count, mU32[1] >> Count, mU32[2] >> Count, mU32[3] >> Count);
  389. #endif
  390. }
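
// Shift all components right by Count bits with sign extension (components are treated as signed).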
  391. template <const uint Count>
  392. UVec4 UVec4::ArithmeticShiftRight() const
  393. {
  394. static_assert(Count <= 31, "Invalid shift");
  395. #if defined(JPH_USE_SSE)
  396. return _mm_srai_epi32(mValue, Count);
  397. #elif defined(JPH_USE_NEON)
  398. return vshrq_n_s32(mValue, Count);
  399. #else
  400. return UVec4(uint32(int32_t(mU32[0]) >> Count),
  401. uint32(int32_t(mU32[1]) >> Count),
  402. uint32(int32_t(mU32[2]) >> Count),
  403. uint32(int32_t(mU32[3]) >> Count));
  404. #endif
  405. }
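
// Expand4Uint16Lo / Expand4Uint16Hi: zero-extend the four 16-bit values stored in the lower
// (respectively upper) 64 bits of the vector into the four 32-bit components.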
UVec4 UVec4::Expand4Uint16Lo() const
{
#if defined(JPH_USE_SSE)
    return _mm_unpacklo_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
    int16x4_t value = vget_low_s16(mValue);
    int16x4_t zero = vdup_n_s16(0);
    return vcombine_s16(vzip1_s16(value, zero), vzip2_s16(value, zero));
#else
    return UVec4(mU32[0] & 0xffff,
                 (mU32[0] >> 16) & 0xffff,
                 mU32[1] & 0xffff,
                 (mU32[1] >> 16) & 0xffff);
#endif
}

UVec4 UVec4::Expand4Uint16Hi() const
{
#if defined(JPH_USE_SSE)
    return _mm_unpackhi_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
    int16x4_t value = vget_high_s16(mValue);
    int16x4_t zero = vdup_n_s16(0);
    return vcombine_s16(vzip1_s16(value, zero), vzip2_s16(value, zero));
#else
    return UVec4(mU32[2] & 0xffff,
                 (mU32[2] >> 16) & 0xffff,
                 mU32[3] & 0xffff,
                 (mU32[3] >> 16) & 0xffff);
#endif
}
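
// Expand4ByteN: zero-extend bytes N .. N + 3 of the vector into the four 32-bit components
// (the 0xff / 0x7f shuffle indices select zero on the SSE and NEON paths respectively).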
UVec4 UVec4::Expand4Byte0() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = JPH_NEON_INT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::Expand4Byte4() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = JPH_NEON_INT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::Expand4Byte8() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = JPH_NEON_INT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
    return result;
#endif
}

UVec4 UVec4::Expand4Byte12() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = JPH_NEON_INT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
        result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
    return result;
#endif
}
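
// Shift the components so that the last inCount components move to the front (starting at X) and the
// remaining components become zero; the SIMD paths use the sFourMinusXShuffle lookup table.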
UVec4 UVec4::ShiftComponents4Minus(int inCount) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, sFourMinusXShuffle[inCount].mValue);
#elif defined(JPH_USE_NEON)
    uint8x16_t idx = vreinterpretq_u8_u32(sFourMinusXShuffle[inCount].mValue);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result = UVec4::sZero();
    for (int i = 0; i < inCount; i++)
        result.mU32[i] = mU32[i + 4 - inCount];
    return result;
#endif
}

JPH_NAMESPACE_END