UVec4.inl

// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

namespace JPH {

UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_set_epi32(int(inW), int(inZ), int(inY), int(inX));
#elif defined(JPH_USE_NEON)
    uint32x2_t xy = vcreate_u32(static_cast<uint64>(inX) | (static_cast<uint64>(inY) << 32));
    uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
    mValue = vcombine_u32(xy, zw);
#else
#error Unsupported CPU architecture
#endif
}

bool UVec4::operator == (UVec4Arg inV2) const
{
    return sEquals(*this, inV2).TestAllTrue();
}

template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
UVec4 UVec4::Swizzle() const
{
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
    return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
#error Unsupported CPU architecture
#endif
}
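// Usage sketch for Swizzle (illustrative values, not part of the original source):
//   UVec4 v(1, 2, 3, 4);
//   UVec4 r = v.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(); // r = (4, 3, 2, 1)
// Each template argument selects the source component that ends up in that output lane.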
UVec4 UVec4::sZero()
{
#if defined(JPH_USE_SSE)
    return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
    return vdupq_n_u32(0);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::sReplicate(uint32 inV)
{
#if defined(JPH_USE_SSE)
    return _mm_set1_epi32(int(inV));
#elif defined(JPH_USE_NEON)
    return vdupq_n_u32(inV);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::sLoadInt(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float*>(inV)));
#elif defined(JPH_USE_NEON)
    return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
#else
#error Unsupported CPU architecture
#endif
}
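// Note on sLoadInt (illustrative, not part of the original source): only the X lane is read
// from memory; the remaining lanes are set to zero on both the SSE and NEON paths.
//   uint32 value = 7;
//   UVec4 v = UVec4::sLoadInt(&value); // v = (7, 0, 0, 0)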
UVec4 UVec4::sLoadInt4(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
    return vld1q_u32(inV);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_load_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
    return vld1q_u32(inV); // ARM makes no distinction between aligned and unaligned loads
#else
#error Unsupported CPU architecture
#endif
}

template <const int Scale>
UVec4 UVec4::sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
{
#ifdef JPH_USE_AVX2
    return _mm_i32gather_epi32(reinterpret_cast<const int *>(inBase), inOffsets.mValue, Scale);
#else
    return Vec4::sGatherFloat4<Scale>(reinterpret_cast<const float *>(inBase), inOffsets).ReinterpretAsInt();
#endif
}
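// Usage sketch for sGatherInt4 (illustrative values, not part of the original source).
// Scale is the byte multiplier applied to each offset, matching _mm_i32gather_epi32:
//   uint32 table[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
//   UVec4 offsets(0, 2, 4, 6);
//   UVec4 v = UVec4::sGatherInt4<sizeof(uint32)>(table, offsets); // v = (10, 12, 14, 16)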
UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_min_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vminq_u32(inV1.mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
    {
        result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
    }
    return result;
#endif
}

UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_max_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmaxq_u32(inV1.mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
    {
        result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
    }
    return result;
#endif
}

UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_cmpeq_epi32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vceqq_u32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::sSelect(UVec4Arg inV1, UVec4Arg inV2, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inV1.mValue), _mm_castsi128_ps(inV2.mValue), _mm_castsi128_ps(inControl.mValue)));
#elif defined(JPH_USE_NEON)
    return vbslq_u32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
    {
        result.mU32[i] = inControl.mU32[i] ? inV2.mU32[i] : inV1.mU32[i];
    }
    return result;
#endif
}
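// Note on sSelect (illustrative, not part of the original source): the SSE4.1 and NEON paths
// only test the top bit of each control lane, so inControl is expected to be a per-lane mask
// (all bits set or all bits clear) such as the result of sEquals.
//   UVec4 mask = UVec4::sEquals(UVec4(1, 2, 3, 4), UVec4(1, 0, 3, 0));
//   UVec4 r = UVec4::sSelect(UVec4::sZero(), UVec4::sReplicate(9), mask); // r = (9, 0, 9, 0)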
UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vorrq_u32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return veorq_u32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vandq_u32(inV1.mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::sNot(UVec4Arg inV1)
{
#if defined(JPH_USE_SSE)
    return sXor(inV1, sReplicate(0xffffffff));
#elif defined(JPH_USE_NEON)
    return vmvnq_u32(inV1.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
{
    // If inValue.z is false then shift W to Z
    UVec4 v = UVec4::sSelect(inIndex.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>(), inIndex, inValue.SplatZ());

    // If inValue.y is false then shift Z and further to Y and further
    v = UVec4::sSelect(v.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatY());

    // If inValue.x is false then shift Y and further to X and further
    v = UVec4::sSelect(v.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatX());

    return v;
}
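// Worked example for sSort4True (illustrative values, not part of the original source):
//   inValue = (false, true, false, true), inIndex = (0, 1, 2, 3)
//   result  = (1, 3, 3, 3)
// The indices of the lanes where inValue is true are compacted to the front; the number of
// valid leading components equals inValue.CountTrues(), the remaining lanes hold inIndex.w.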
UVec4 UVec4::operator * (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_mullo_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_u32(mValue, inV2.mValue);
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
    {
        result.mU32[i] = mU32[i] * inV2.mU32[i];
    }
    return result;
#endif
}

UVec4 UVec4::operator + (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vaddq_u32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 &UVec4::operator += (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vaddq_u32(mValue, inV2.mValue);
#else
#error Unsupported CPU architecture
#endif
    return *this;
}

UVec4 UVec4::SplatX() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 0);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::SplatY() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 1);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 2);
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::SplatW() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_u32(mValue, 3);
#else
#error Unsupported CPU architecture
#endif
}
Vec4 UVec4::ToFloat() const
{
#if defined(JPH_USE_SSE)
    return _mm_cvtepi32_ps(mValue);
#elif defined(JPH_USE_NEON)
    return vcvtq_f32_s32(mValue);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 UVec4::ReinterpretAsFloat() const
{
#if defined(JPH_USE_SSE)
    return Vec4(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_s32(mValue);
#else
#error Unsupported CPU architecture
#endif
}

void UVec4::StoreInt4(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_storeu_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
    vst1q_u32(outV, mValue);
#else
#error Unsupported CPU architecture
#endif
}

void UVec4::StoreInt4Aligned(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_store_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
    vst1q_u32(outV, mValue); // ARM makes no distinction between aligned and unaligned stores
#else
#error Unsupported CPU architecture
#endif
}
int UVec4::CountTrues() const
{
#if defined(JPH_USE_SSE)
    return CountBits(_mm_movemask_ps(_mm_castsi128_ps(mValue)));
#elif defined(JPH_USE_NEON)
    return vaddvq_u32(vshrq_n_u32(mValue, 31));
#else
#error Unsupported CPU architecture
#endif
}

int UVec4::GetTrues() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
    int32x4_t shift = { 0, 1, 2, 3 };
    return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
#else
#error Unsupported CPU architecture
#endif
}
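// Note on CountTrues / GetTrues (illustrative, not part of the original source): a lane counts
// as 'true' when its high bit is set. GetTrues packs the four lane bits into an int with X in
// bit 0 and W in bit 3, which is what the Test*True helpers below mask against.
//   UVec4 mask = UVec4::sEquals(UVec4(1, 2, 3, 4), UVec4(1, 2, 0, 4));
//   mask.GetTrues();   // 0b1011
//   mask.CountTrues(); // 3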
bool UVec4::TestAnyTrue() const
{
    return GetTrues() != 0;
}

bool UVec4::TestAnyXYZTrue() const
{
    return (GetTrues() & 0b111) != 0;
}

bool UVec4::TestAllTrue() const
{
    return GetTrues() == 0b1111;
}

bool UVec4::TestAllXYZTrue() const
{
    return (GetTrues() & 0b111) == 0b111;
}
template <const uint Count>
UVec4 UVec4::LogicalShiftLeft() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_slli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vshlq_n_u32(mValue, Count);
#else
#error Unsupported CPU architecture
#endif
}

template <const uint Count>
UVec4 UVec4::LogicalShiftRight() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_srli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vshrq_n_u32(mValue, Count);
#else
#error Unsupported CPU architecture
#endif
}

template <const uint Count>
UVec4 UVec4::ArithmeticShiftRight() const
{
    static_assert(Count <= 31, "Invalid shift");

#if defined(JPH_USE_SSE)
    return _mm_srai_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
    return vshrq_n_s32(mValue, Count);
#else
#error Unsupported CPU architecture
#endif
}
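// Note on the shift helpers (illustrative values, not part of the original source): the logical
// right shift inserts zero bits, while the arithmetic right shift replicates the sign bit, so it
// keeps a comparison mask lane (0xffffffff) at 0xffffffff.
//   UVec4 v = UVec4::sReplicate(0x80000010);
//   v.LogicalShiftRight<4>();    // each lane = 0x08000001
//   v.ArithmeticShiftRight<4>(); // each lane = 0xf8000001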
UVec4 UVec4::Expand4Uint16Lo() const
{
#if defined(JPH_USE_SSE)
    return _mm_unpacklo_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
    int16x4_t value = vget_low_s16(mValue);
    int16x4_t zero = vdup_n_s16(0);
    return vcombine_s16(vzip1_s16(value, zero), vzip2_s16(value, zero));
#else
#error Unsupported CPU architecture
#endif
}

UVec4 UVec4::Expand4Uint16Hi() const
{
#if defined(JPH_USE_SSE)
    return _mm_unpackhi_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
    int16x4_t value = vget_high_s16(mValue);
    int16x4_t zero = vdup_n_s16(0);
    return vcombine_s16(vzip1_s16(value, zero), vzip2_s16(value, zero));
#else
#error Unsupported CPU architecture
#endif
}
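// Note on Expand4Uint16Lo/Hi (illustrative values, not part of the original source): the vector
// is treated as 8 packed uint16 values; Lo zero-extends the lower 4 of them to uint32, Hi the
// upper 4. For example, with the two low lanes holding 0x00020001 and 0x00040003:
//   Expand4Uint16Lo(); // (1, 2, 3, 4)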
UVec4 UVec4::Expand4Byte0() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = { 0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f };
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
    {
        result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
    }
    return result;
#endif
}

UVec4 UVec4::Expand4Byte4() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = { 0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f };
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
    {
        result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
    }
    return result;
#endif
}

UVec4 UVec4::Expand4Byte8() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = { 0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f };
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
    {
        result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
    }
    return result;
#endif
}

UVec4 UVec4::Expand4Byte12() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
#elif defined(JPH_USE_NEON)
    int8x16_t idx = { 0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f };
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result;
    for (int i = 0; i < 4; i++)
    {
        result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
    }
    return result;
#endif
}
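// Note on Expand4Byte0/4/8/12 (illustrative values, not part of the original source): each
// function takes 4 consecutive bytes of the 16-byte vector, starting at the byte offset in its
// name, and zero-extends them into the 4 uint32 lanes of the result.
//   UVec4 v(0x04030201, 0x08070605, 0, 0);
//   v.Expand4Byte0(); // (1, 2, 3, 4)
//   v.Expand4Byte4(); // (5, 6, 7, 8)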
UVec4 UVec4::ShiftComponents4Minus(int inCount) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_shuffle_epi8(mValue, sFourMinusXShuffle[inCount].mValue);
#elif defined(JPH_USE_NEON)
    uint8x16_t idx = vreinterpretq_u8_u32(sFourMinusXShuffle[inCount].mValue);
    return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
    UVec4 result(0, 0, 0, 0);
    for (int i = 0; i < inCount; i++)
    {
        result.mU32[i] = mU32[i + 4 - inCount];
    }
    return result;
#endif
}
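// Worked example for ShiftComponents4Minus (illustrative values, not part of the original
// source), following the scalar fallback: the last inCount components are moved to the front
// and the remaining lanes are zeroed, i.e. 4 - inCount components are shifted out.
//   UVec4 v(1, 2, 3, 4);
//   v.ShiftComponents4Minus(2); // (3, 4, 0, 0)
//   v.ShiftComponents4Minus(4); // (1, 2, 3, 4)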
  492. } // JPH