gtc_bitfield.cpp 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997
  1. #include <glm/gtc/bitfield.hpp>
  2. #include <glm/gtc/type_precision.hpp>
  3. #include <glm/vector_relational.hpp>
  4. #include <glm/integer.hpp>
  5. #include <ctime>
  6. #include <cstdio>
  7. #include <vector>
  8. namespace mask
  9. {
// Test fixture: pairs an input bit count with the mask value a correct
// mask implementation is expected to produce for it.
template<typename genType>
struct type
{
	genType Value;  // number of least-significant bits to set (test input)
	genType Return; // expected mask with 'Value' low bits set (test oracle)
};
  16. #if GLM_COMPILER & GLM_COMPILER_CLANG
  17. # pragma clang diagnostic push
  18. # pragma clang diagnostic ignored "-Wsign-conversion"
  19. #endif
  20. static inline int mask_zero(int Bits)
  21. {
  22. return ~((~0) << Bits);
  23. }
  24. #if GLM_COMPILER & GLM_COMPILER_CLANG
  25. # pragma clang diagnostic push
  26. # pragma clang diagnostic ignored "-Wsign-compare"
  27. #endif
  28. static inline int mask_mix(int Bits)
  29. {
  30. return Bits >= sizeof(int) * 8 ? 0xffffffff : (static_cast<int>(1) << Bits) - static_cast<int>(1);
  31. }
  32. #if GLM_COMPILER & GLM_COMPILER_CLANG
  33. # pragma clang diagnostic pop
  34. #endif
  35. static inline int mask_half(int Bits)
  36. {
  37. // We do the shift in two steps because 1 << 32 on an int is undefined.
  38. int const Half = Bits >> 1;
  39. int const Fill = ~0;
  40. int const ShiftHaft = (Fill << Half);
  41. int const Rest = Bits - Half;
  42. int const Reversed = ShiftHaft << Rest;
  43. return ~Reversed;
  44. }
  45. static inline int mask_loop(int Bits)
  46. {
  47. int Mask = 0;
  48. for(int Bit = 0; Bit < Bits; ++Bit)
  49. Mask |= (static_cast<int>(1) << Bit);
  50. return Mask;
  51. }
  52. static int perf()
  53. {
  54. int const Count = 100000000;
  55. std::clock_t Timestamp1 = std::clock();
  56. {
  57. std::vector<int> Mask;
  58. Mask.resize(Count);
  59. for(int i = 0; i < Count; ++i)
  60. Mask[i] = mask_mix(i % 32);
  61. }
  62. std::clock_t Timestamp2 = std::clock();
  63. {
  64. std::vector<int> Mask;
  65. Mask.resize(Count);
  66. for(int i = 0; i < Count; ++i)
  67. Mask[i] = mask_loop(i % 32);
  68. }
  69. std::clock_t Timestamp3 = std::clock();
  70. {
  71. std::vector<int> Mask;
  72. Mask.resize(Count);
  73. for(int i = 0; i < Count; ++i)
  74. Mask[i] = glm::mask(i % 32);
  75. }
  76. std::clock_t Timestamp4 = std::clock();
  77. {
  78. std::vector<int> Mask;
  79. Mask.resize(Count);
  80. for(int i = 0; i < Count; ++i)
  81. Mask[i] = mask_zero(i % 32);
  82. }
  83. std::clock_t Timestamp5 = std::clock();
  84. {
  85. std::vector<int> Mask;
  86. Mask.resize(Count);
  87. for(int i = 0; i < Count; ++i)
  88. Mask[i] = mask_half(i % 32);
  89. }
  90. std::clock_t Timestamp6 = std::clock();
  91. std::clock_t TimeMix = Timestamp2 - Timestamp1;
  92. std::clock_t TimeLoop = Timestamp3 - Timestamp2;
  93. std::clock_t TimeDefault = Timestamp4 - Timestamp3;
  94. std::clock_t TimeZero = Timestamp5 - Timestamp4;
  95. std::clock_t TimeHalf = Timestamp6 - Timestamp5;
  96. std::printf("mask[mix]: %d\n", static_cast<unsigned int>(TimeMix));
  97. std::printf("mask[loop]: %d\n", static_cast<unsigned int>(TimeLoop));
  98. std::printf("mask[default]: %d\n", static_cast<unsigned int>(TimeDefault));
  99. std::printf("mask[zero]: %d\n", static_cast<unsigned int>(TimeZero));
  100. std::printf("mask[half]: %d\n", static_cast<unsigned int>(TimeHalf));
  101. return TimeDefault < TimeLoop ? 0 : 1;
  102. }
  103. #if GLM_COMPILER & GLM_COMPILER_CLANG
  104. # pragma clang diagnostic pop
  105. #endif
  106. static int test_int()
  107. {
  108. type<int> const Data[] =
  109. {
  110. { 0, 0x00000000},
  111. { 1, 0x00000001},
  112. { 2, 0x00000003},
  113. { 3, 0x00000007},
  114. {31, 0x7fffffff}
  115. };
  116. int Error = 0;
  117. /* mask_zero is sadly not a correct code
  118. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  119. {
  120. int Result = mask_zero(Data[i].Value);
  121. Error += Data[i].Return == Result ? 0 : 1;
  122. }
  123. */
  124. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  125. {
  126. int Result = mask_mix(Data[i].Value);
  127. Error += Data[i].Return == Result ? 0 : 1;
  128. }
  129. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  130. {
  131. int Result = mask_half(Data[i].Value);
  132. Error += Data[i].Return == Result ? 0 : 1;
  133. }
  134. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  135. {
  136. int Result = mask_loop(Data[i].Value);
  137. Error += Data[i].Return == Result ? 0 : 1;
  138. }
  139. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  140. {
  141. int Result = glm::mask(Data[i].Value);
  142. Error += Data[i].Return == Result ? 0 : 1;
  143. }
  144. return Error;
  145. }
  146. static int test_ivec4()
  147. {
  148. type<glm::ivec4> const Data[] =
  149. {
  150. {glm::ivec4( 0), glm::ivec4(0x00000000)},
  151. {glm::ivec4( 1), glm::ivec4(0x00000001)},
  152. {glm::ivec4( 2), glm::ivec4(0x00000003)},
  153. {glm::ivec4( 3), glm::ivec4(0x00000007)}
  154. };
  155. int Error(0);
  156. for(std::size_t i = 0, n = sizeof(Data) / sizeof(type<glm::ivec4>); i < n; ++i)
  157. {
  158. glm::ivec4 Result = glm::mask(Data[i].Value);
  159. Error += glm::all(glm::equal(Data[i].Return, Result)) ? 0 : 1;
  160. }
  161. return Error;
  162. }
  163. static int test()
  164. {
  165. int Error(0);
  166. Error += test_int();
  167. Error += test_ivec4();
  168. return Error;
  169. }
  170. }//namespace mask
  171. namespace bitfieldInterleave3
  172. {
  173. template<typename PARAM, typename RET>
  174. static inline RET refBitfieldInterleave(PARAM x, PARAM y, PARAM z)
  175. {
  176. RET Result = 0;
  177. for(RET i = 0; i < sizeof(PARAM) * 8; ++i)
  178. {
  179. Result |= ((RET(x) & (RET(1U) << i)) << ((i << 1) + 0));
  180. Result |= ((RET(y) & (RET(1U) << i)) << ((i << 1) + 1));
  181. Result |= ((RET(z) & (RET(1U) << i)) << ((i << 1) + 2));
  182. }
  183. return Result;
  184. }
  185. static int test()
  186. {
  187. int Error(0);
  188. glm::uint16 x_max = 1 << 11;
  189. glm::uint16 y_max = 1 << 11;
  190. glm::uint16 z_max = 1 << 11;
  191. for(glm::uint16 z = 0; z < z_max; z += 27)
  192. for(glm::uint16 y = 0; y < y_max; y += 27)
  193. for(glm::uint16 x = 0; x < x_max; x += 27)
  194. {
  195. glm::uint64 ResultA = refBitfieldInterleave<glm::uint16, glm::uint64>(x, y, z);
  196. glm::uint64 ResultB = glm::bitfieldInterleave(x, y, z);
  197. Error += ResultA == ResultB ? 0 : 1;
  198. }
  199. return Error;
  200. }
  201. }
  202. namespace bitfieldInterleave4
  203. {
  204. template<typename PARAM, typename RET>
  205. static inline RET loopBitfieldInterleave(PARAM x, PARAM y, PARAM z, PARAM w)
  206. {
  207. RET const v[4] = {x, y, z, w};
  208. RET Result = 0;
  209. for(RET i = 0; i < sizeof(PARAM) * 8; i++)
  210. {
  211. Result |= ((((v[0] >> i) & 1U)) << ((i << 2) + 0));
  212. Result |= ((((v[1] >> i) & 1U)) << ((i << 2) + 1));
  213. Result |= ((((v[2] >> i) & 1U)) << ((i << 2) + 2));
  214. Result |= ((((v[3] >> i) & 1U)) << ((i << 2) + 3));
  215. }
  216. return Result;
  217. }
  218. static int test()
  219. {
  220. int Error(0);
  221. glm::uint16 x_max = 1 << 11;
  222. glm::uint16 y_max = 1 << 11;
  223. glm::uint16 z_max = 1 << 11;
  224. glm::uint16 w_max = 1 << 11;
  225. for(glm::uint16 w = 0; w < w_max; w += 27)
  226. for(glm::uint16 z = 0; z < z_max; z += 27)
  227. for(glm::uint16 y = 0; y < y_max; y += 27)
  228. for(glm::uint16 x = 0; x < x_max; x += 27)
  229. {
  230. glm::uint64 ResultA = loopBitfieldInterleave<glm::uint16, glm::uint64>(x, y, z, w);
  231. glm::uint64 ResultB = glm::bitfieldInterleave(x, y, z, w);
  232. Error += ResultA == ResultB ? 0 : 1;
  233. }
  234. return Error;
  235. }
  236. }
  237. namespace bitfieldInterleave
  238. {
// Benchmark variant: interleaves the bits of x (even positions) and y (odd
// positions) into a 64-bit value using the classic mask-and-shift spread,
// processing the two inputs one after the other. Kept exactly in this shape
// because perf() measures this specific instruction sequence.
static inline glm::uint64 fastBitfieldInterleave(glm::uint32 x, glm::uint32 y)
{
	glm::uint64 REG1;
	glm::uint64 REG2;

	// Spread x over the even bit positions: each step halves the group size
	// (16 -> 8 -> 4 -> 2 -> 1 bits) and doubles the gap between groups.
	REG1 = x;
	REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
	REG1 = ((REG1 <<  8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
	REG1 = ((REG1 <<  4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	REG1 = ((REG1 <<  2) | REG1) & glm::uint64(0x3333333333333333);
	REG1 = ((REG1 <<  1) | REG1) & glm::uint64(0x5555555555555555);

	// Same spread for y.
	REG2 = y;
	REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
	REG2 = ((REG2 <<  8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
	REG2 = ((REG2 <<  4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	REG2 = ((REG2 <<  2) | REG2) & glm::uint64(0x3333333333333333);
	REG2 = ((REG2 <<  1) | REG2) & glm::uint64(0x5555555555555555);

	// y's spread bits are shifted up by one so they land on the odd positions.
	return REG1 | (REG2 << 1);
}
// Benchmark variant: same mask-and-shift spread as fastBitfieldInterleave,
// but with the x and y steps interleaved statement-by-statement to expose
// instruction-level parallelism. Kept exactly in this shape because perf()
// measures this specific statement ordering.
static inline glm::uint64 interleaveBitfieldInterleave(glm::uint32 x, glm::uint32 y)
{
	glm::uint64 REG1;
	glm::uint64 REG2;

	REG1 = x;
	REG2 = y;

	// Each pair of statements performs the same spread step on both inputs.
	REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
	REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);

	REG1 = ((REG1 <<  8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
	REG2 = ((REG2 <<  8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);

	REG1 = ((REG1 <<  4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	REG2 = ((REG2 <<  4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);

	REG1 = ((REG1 <<  2) | REG1) & glm::uint64(0x3333333333333333);
	REG2 = ((REG2 <<  2) | REG2) & glm::uint64(0x3333333333333333);

	REG1 = ((REG1 <<  1) | REG1) & glm::uint64(0x5555555555555555);
	REG2 = ((REG2 <<  1) | REG2) & glm::uint64(0x5555555555555555);

	// x occupies the even output bits, y the odd ones.
	return REG1 | (REG2 << 1);
}
  275. /*
  276. static inline glm::uint64 loopBitfieldInterleave(glm::uint32 x, glm::uint32 y)
  277. {
  278. static glm::uint64 const Mask[5] =
  279. {
  280. 0x5555555555555555,
  281. 0x3333333333333333,
  282. 0x0F0F0F0F0F0F0F0F,
  283. 0x00FF00FF00FF00FF,
  284. 0x0000FFFF0000FFFF
  285. };
  286. glm::uint64 REG1 = x;
  287. glm::uint64 REG2 = y;
  288. for(int i = 4; i >= 0; --i)
  289. {
  290. REG1 = ((REG1 << (1 << i)) | REG1) & Mask[i];
  291. REG2 = ((REG2 << (1 << i)) | REG2) & Mask[i];
  292. }
  293. return REG1 | (REG2 << 1);
  294. }
  295. */
  296. #if GLM_ARCH & GLM_ARCH_SSE2_BIT
// SSE2 benchmark variant of the 2-way bit interleave: performs the scalar
// mask-and-shift spread on both inputs at once, with x and y packed into
// the two 64-bit lanes of a single XMM register. The commented lines show
// the scalar equivalent of each intrinsic step.
static inline glm::uint64 sseBitfieldInterleave(glm::uint32 x, glm::uint32 y)
{
	// x in the low 64-bit lane, y in the high lane.
	__m128i const Array = _mm_set_epi32(0, static_cast<int>(y), 0, static_cast<int>(x));

	__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF);
	__m128i const Mask3 = _mm_set1_epi32(0x00FF00FF);
	__m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F);
	__m128i const Mask1 = _mm_set1_epi32(0x33333333);
	__m128i const Mask0 = _mm_set1_epi32(0x55555555);

	__m128i Reg1;
	__m128i Reg2;

	// REG1 = x;
	// REG2 = y;
	// Aligned load: Array is a 16-byte-aligned stack __m128i.
	Reg1 = _mm_load_si128(&Array);

	//REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
	//REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
	// _mm_slli_si128 shifts whole bytes, so "<< 16" is a 2-byte shift here.
	Reg2 = _mm_slli_si128(Reg1, 2);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask4);

	//REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
	//REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
	Reg2 = _mm_slli_si128(Reg1, 1);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask3);

	//REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	//REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	Reg2 = _mm_slli_epi32(Reg1, 4);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask2);

	//REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
	//REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
	Reg2 = _mm_slli_epi32(Reg1, 2);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask1);

	//REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
	//REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
	Reg2 = _mm_slli_epi32(Reg1, 1);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask0);

	//return REG1 | (REG2 << 1);
	// Merge the high lane (spread y, shifted to odd bits) into the low lane.
	Reg2 = _mm_slli_epi32(Reg1, 1);
	Reg2 = _mm_srli_si128(Reg2, 8);
	Reg1 = _mm_or_si128(Reg1, Reg2);

	__m128i Result;
	_mm_store_si128(&Result, Reg1);

	// Read back the low 64 bits; the pragma silences clang's strict-aliasing
	// style warning about the reinterpret_cast.
#if GLM_COMPILER & GLM_COMPILER_CLANG
#	pragma clang diagnostic push
#	pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif
	return *reinterpret_cast<glm::uint64*>(&Result);
#if GLM_COMPILER & GLM_COMPILER_CLANG
#	pragma clang diagnostic pop
#endif
}
// Identical algorithm to sseBitfieldInterleave, except the initial read
// uses _mm_loadu_si128 (unaligned load) so perf() can compare the cost of
// aligned vs unaligned loads on this code path.
static inline glm::uint64 sseUnalignedBitfieldInterleave(glm::uint32 x, glm::uint32 y)
{
	// x in the low 64-bit lane, y in the high lane.
	__m128i const Array = _mm_set_epi32(0, static_cast<int>(y), 0, static_cast<int>(x));

	__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF);
	__m128i const Mask3 = _mm_set1_epi32(0x00FF00FF);
	__m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F);
	__m128i const Mask1 = _mm_set1_epi32(0x33333333);
	__m128i const Mask0 = _mm_set1_epi32(0x55555555);

	__m128i Reg1;
	__m128i Reg2;

	// REG1 = x;
	// REG2 = y;
	// The only difference from the aligned variant: an unaligned load.
	Reg1 = _mm_loadu_si128(&Array);

	//REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
	//REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
	Reg2 = _mm_slli_si128(Reg1, 2);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask4);

	//REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
	//REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
	Reg2 = _mm_slli_si128(Reg1, 1);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask3);

	//REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	//REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	Reg2 = _mm_slli_epi32(Reg1, 4);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask2);

	//REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
	//REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
	Reg2 = _mm_slli_epi32(Reg1, 2);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask1);

	//REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
	//REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
	Reg2 = _mm_slli_epi32(Reg1, 1);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask0);

	//return REG1 | (REG2 << 1);
	// Merge the high lane (spread y, shifted to odd bits) into the low lane.
	Reg2 = _mm_slli_epi32(Reg1, 1);
	Reg2 = _mm_srli_si128(Reg2, 8);
	Reg1 = _mm_or_si128(Reg1, Reg2);

	__m128i Result;
	_mm_store_si128(&Result, Reg1);

	// Read back the low 64 bits; pragma silences clang's warning on the cast.
#if GLM_COMPILER & GLM_COMPILER_CLANG
#	pragma clang diagnostic push
#	pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif
	return *reinterpret_cast<glm::uint64*>(&Result);
#if GLM_COMPILER & GLM_COMPILER_CLANG
#	pragma clang diagnostic pop
#endif
}
  403. #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
  404. static int test()
  405. {
  406. int Error = 0;
  407. /*
  408. {
  409. for(glm::uint32 y = 0; y < (1 << 10); ++y)
  410. for(glm::uint32 x = 0; x < (1 << 10); ++x)
  411. {
  412. glm::uint64 A = glm::bitfieldInterleave(x, y);
  413. glm::uint64 B = fastBitfieldInterleave(x, y);
  414. //glm::uint64 C = loopBitfieldInterleave(x, y);
  415. glm::uint64 D = interleaveBitfieldInterleave(x, y);
  416. assert(A == B);
  417. //assert(A == C);
  418. assert(A == D);
  419. # if GLM_ARCH & GLM_ARCH_SSE2_BIT
  420. glm::uint64 E = sseBitfieldInterleave(x, y);
  421. glm::uint64 F = sseUnalignedBitfieldInterleave(x, y);
  422. assert(A == E);
  423. assert(A == F);
  424. __m128i G = glm_i128_interleave(_mm_set_epi32(0, y, 0, x));
  425. glm::uint64 Result[2];
  426. _mm_storeu_si128((__m128i*)Result, G);
  427. assert(A == Result[0]);
  428. # endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
  429. }
  430. }
  431. */
  432. {
  433. for(glm::uint8 y = 0; y < 127; ++y)
  434. for(glm::uint8 x = 0; x < 127; ++x)
  435. {
  436. glm::uint64 A(glm::bitfieldInterleave(glm::u8vec2(x, y)));
  437. glm::uint64 B(glm::bitfieldInterleave(glm::u16vec2(x, y)));
  438. glm::uint64 C(glm::bitfieldInterleave(glm::u32vec2(x, y)));
  439. Error += A == B ? 0 : 1;
  440. Error += A == C ? 0 : 1;
  441. glm::u32vec2 const& D = glm::bitfieldDeinterleave(C);
  442. Error += D.x == x ? 0 : 1;
  443. Error += D.y == y ? 0 : 1;
  444. }
  445. }
  446. {
  447. for(glm::uint8 y = 0; y < 127; ++y)
  448. for(glm::uint8 x = 0; x < 127; ++x)
  449. {
  450. glm::int64 A(glm::bitfieldInterleave(glm::int8(x), glm::int8(y)));
  451. glm::int64 B(glm::bitfieldInterleave(glm::int16(x), glm::int16(y)));
  452. glm::int64 C(glm::bitfieldInterleave(glm::int32(x), glm::int32(y)));
  453. Error += A == B ? 0 : 1;
  454. Error += A == C ? 0 : 1;
  455. }
  456. }
  457. return Error;
  458. }
  459. static int perf()
  460. {
  461. glm::uint32 x_max = 1 << 11;
  462. glm::uint32 y_max = 1 << 10;
  463. // ALU
  464. std::vector<glm::uint64> Data(x_max * y_max);
  465. std::vector<glm::u32vec2> Param(x_max * y_max);
  466. for(glm::uint32 i = 0; i < Param.size(); ++i)
  467. Param[i] = glm::u32vec2(i % x_max, i / y_max);
  468. {
  469. std::clock_t LastTime = std::clock();
  470. for(std::size_t i = 0; i < Data.size(); ++i)
  471. Data[i] = glm::bitfieldInterleave(Param[i].x, Param[i].y);
  472. std::clock_t Time = std::clock() - LastTime;
  473. std::printf("glm::bitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
  474. }
  475. {
  476. std::clock_t LastTime = std::clock();
  477. for(std::size_t i = 0; i < Data.size(); ++i)
  478. Data[i] = fastBitfieldInterleave(Param[i].x, Param[i].y);
  479. std::clock_t Time = std::clock() - LastTime;
  480. std::printf("fastBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
  481. }
  482. /*
  483. {
  484. std::clock_t LastTime = std::clock();
  485. for(std::size_t i = 0; i < Data.size(); ++i)
  486. Data[i] = loopBitfieldInterleave(Param[i].x, Param[i].y);
  487. std::clock_t Time = std::clock() - LastTime;
  488. std::printf("loopBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
  489. }
  490. */
  491. {
  492. std::clock_t LastTime = std::clock();
  493. for(std::size_t i = 0; i < Data.size(); ++i)
  494. Data[i] = interleaveBitfieldInterleave(Param[i].x, Param[i].y);
  495. std::clock_t Time = std::clock() - LastTime;
  496. std::printf("interleaveBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
  497. }
  498. # if GLM_ARCH & GLM_ARCH_SSE2_BIT
  499. {
  500. std::clock_t LastTime = std::clock();
  501. for(std::size_t i = 0; i < Data.size(); ++i)
  502. Data[i] = sseBitfieldInterleave(Param[i].x, Param[i].y);
  503. std::clock_t Time = std::clock() - LastTime;
  504. std::printf("sseBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
  505. }
  506. {
  507. std::clock_t LastTime = std::clock();
  508. for(std::size_t i = 0; i < Data.size(); ++i)
  509. Data[i] = sseUnalignedBitfieldInterleave(Param[i].x, Param[i].y);
  510. std::clock_t Time = std::clock() - LastTime;
  511. std::printf("sseUnalignedBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
  512. }
  513. # endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
  514. {
  515. std::clock_t LastTime = std::clock();
  516. for(std::size_t i = 0; i < Data.size(); ++i)
  517. Data[i] = glm::bitfieldInterleave(Param[i].x, Param[i].y, Param[i].x);
  518. std::clock_t Time = std::clock() - LastTime;
  519. std::printf("glm::detail::bitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
  520. }
  521. # if(GLM_ARCH & GLM_ARCH_SSE2_BIT && !(GLM_COMPILER & GLM_COMPILER_GCC))
  522. {
  523. // SIMD
  524. std::vector<__m128i> SimdData;
  525. SimdData.resize(static_cast<std::size_t>(x_max * y_max));
  526. std::vector<__m128i> SimdParam;
  527. SimdParam.resize(static_cast<std::size_t>(x_max * y_max));
  528. for(std::size_t i = 0; i < SimdParam.size(); ++i)
  529. SimdParam[i] = _mm_set_epi32(static_cast<int>(i % static_cast<std::size_t>(x_max)), 0, static_cast<int>(i / static_cast<std::size_t>(y_max)), 0);
  530. std::clock_t LastTime = std::clock();
  531. for(std::size_t i = 0; i < SimdData.size(); ++i)
  532. SimdData[i] = glm_i128_interleave(SimdParam[i]);
  533. std::clock_t Time = std::clock() - LastTime;
  534. std::printf("_mm_bit_interleave_si128 Time %d clocks\n", static_cast<int>(Time));
  535. }
  536. # endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
  537. return 0;
  538. }
  539. }//namespace bitfieldInterleave
  540. namespace bitfieldInterleave5
  541. {
// Candidate optimized 2x8-bit interleave, benchmarked against the current
// glm::bitfieldInterleave in perf_new_u8vec2(): packs x into bits [0,7] and
// y into bits [16,23] of one 32-bit word, spreads both at once, then folds
// y's spread bits down onto the odd positions of the low 16 bits.
GLM_FUNC_QUALIFIER
static glm::uint16 bitfieldInterleave_u8vec2(glm::uint8 x, glm::uint8 y)
{
	glm::uint32 Result = (glm::uint32(y) << 16) | glm::uint32(x);
	// Spread: 4-bit, 2-bit, then 1-bit groups separated by equal-size gaps.
	Result = ((Result <<  4) | Result) & 0x0F0F0F0F;
	Result = ((Result <<  2) | Result) & 0x33333333;
	Result = ((Result <<  1) | Result) & 0x55555555;
	// Low half holds x on even bits; >> 15 drops y's spread bits (even bits
	// of the high half) onto the odd bits before truncating to 16 bits.
	return static_cast<glm::uint16>((Result & 0x0000FFFF) | (Result >> 15));
}
// Inverse of bitfieldInterleave_u8vec2: lifts the odd (y) bits of the 16-bit
// input into the high half via << 15, then compacts both halves in parallel
// so x ends up in bits [0,7] and y in bits [16,23].
GLM_FUNC_QUALIFIER
static glm::u8vec2 bitfieldDeinterleave_u8vec2(glm::uint16 InterleavedBitfield)
{
	glm::uint32 Result(InterleavedBitfield);
	// Separate: after this step the even bits of each half carry one source.
	Result = ((Result << 15) | Result) & 0x55555555;
	// Compact: merge 1-bit, 2-bit, then 4-bit groups back together.
	Result = ((Result >>  1) | Result) & 0x33333333;
	Result = ((Result >>  2) | Result) & 0x0F0F0F0F;
	Result = ((Result >>  4) | Result) & 0x00FF00FF;
	// x from the low half, y from the high half (truncated to 8 bits each).
	return glm::u8vec2(Result & 0x0000FFFF, Result >> 16);
}
  561. /*
  562. GLM_FUNC_QUALIFIER
  563. static glm::uint32 bitfieldInterleave_u8vec4(glm::uint8 x, glm::uint8 y, glm::uint8 z, glm::uint8 w)
  564. {
  565. glm::uint64 Result = (glm::uint64(w) << 48) | (glm::uint64(z) << 32) | (glm::uint64(y) << 16) | glm::uint64(x);
  566. Result = ((Result << 12) | Result) & 0x000F000F000F000Full;
  567. Result = ((Result << 6) | Result) & 0x0303030303030303ull;
  568. Result = ((Result << 3) | Result) & 0x1111111111111111ull;
  569. const glm::uint32 a = static_cast<glm::uint32>((Result & 0x000000000000FFFF) >> ( 0 - 0));
  570. const glm::uint32 b = static_cast<glm::uint32>((Result & 0x00000000FFFF0000) >> (16 - 3));
  571. const glm::uint32 c = static_cast<glm::uint32>((Result & 0x0000FFFF00000000) >> (32 - 6));
  572. const glm::uint32 d = static_cast<glm::uint32>((Result & 0xFFFF000000000000) >> (48 - 12));
  573. return a | b | c | d;
  574. }
  575. GLM_FUNC_QUALIFIER
  576. static glm::u8vec4 bitfieldDeinterleave_u8vec4(glm::uint32 InterleavedBitfield)
  577. {
  578. glm::uint64 Result(InterleavedBitfield);
  579. Result = ((Result << 15) | Result) & 0x9249249249249249ull;
  580. Result = ((Result >> 1) | Result) & 0x30C30C30C30C30C3ull;
  581. Result = ((Result >> 2) | Result) & 0xF00F00F00F00F00Full;
  582. Result = ((Result >> 4) | Result) & 0x00FF0000FF0000FFull;
  583. return glm::u8vec4(
  584. (Result >> 0) & 0x000000000000FFFFull,
  585. (Result >> 16) & 0x00000000FFFF0000ull,
  586. (Result >> 32) & 0x0000FFFF00000000ull,
  587. (Result >> 48) & 0xFFFF000000000000ull);
  588. }
  589. */
  590. #if GLM_COMPILER & GLM_COMPILER_VC
  591. # pragma warning(disable : 4309)
  592. #endif
  593. /*
  594. GLM_FUNC_QUALIFIER
  595. static glm::uint32 bitfieldInterleave_u16vec2(glm::uint16 x, glm::uint16 y)
  596. {
  597. glm::uint64 Result = (glm::uint64(y) << 32) | glm::uint64(x);
  598. Result = ((Result << 8) | Result) & static_cast<glm::uint32>(0x00FF00FF00FF00FFull);
  599. Result = ((Result << 4) | Result) & static_cast<glm::uint32>(0x0F0F0F0F0F0F0F0Full);
  600. Result = ((Result << 2) | Result) & static_cast<glm::uint32>(0x3333333333333333ull);
  601. Result = ((Result << 1) | Result) & static_cast<glm::uint32>(0x5555555555555555ull);
  602. return static_cast<glm::uint32>((Result & 0x00000000FFFFFFFFull) | (Result >> 31));
  603. }
  604. GLM_FUNC_QUALIFIER
  605. static glm::u16vec2 bitfieldDeinterleave_u16vec2(glm::uint32 InterleavedBitfield)
  606. {
  607. glm::uint64 Result(InterleavedBitfield);
  608. Result = ((Result << 31) | Result) & 0x5555555555555555ull;
  609. Result = ((Result >> 1) | Result) & 0x3333333333333333ull;
  610. Result = ((Result >> 2) | Result) & 0x0F0F0F0F0F0F0F0Full;
  611. Result = ((Result >> 4) | Result) & 0x00FF00FF00FF00FFull;
  612. Result = ((Result >> 8) | Result) & 0x0000FFFF0000FFFFull;
  613. return glm::u16vec2(Result & 0x00000000FFFFFFFFull, Result >> 32);
  614. }
  615. */
  616. static int test(glm::size_t divider)
  617. {
  618. int Error = 0;
  619. glm::size_t count = 256 / divider;
  620. for(glm::size_t j = 0; j < count; ++j)
  621. for(glm::size_t i = 0; i < count; ++i)
  622. {
  623. glm::uint16 A = bitfieldInterleave_u8vec2(glm::uint8(i), glm::uint8(j));
  624. glm::uint16 B = glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j));
  625. Error += A == B ? 0 : 1;
  626. glm::u8vec2 C = bitfieldDeinterleave_u8vec2(A);
  627. Error += C.x == glm::uint8(i) ? 0 : 1;
  628. Error += C.y == glm::uint8(j) ? 0 : 1;
  629. }
  630. /*
  631. for(glm::size_t j = 0; j < count; ++j)
  632. for(glm::size_t i = 0; i < count; ++i)
  633. {
  634. glm::uint32 A = bitfieldInterleave_u8vec4(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j));
  635. glm::uint32 B = glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j));
  636. Error += A == B ? 0 : 1;
  637. glm::u8vec4 C = bitfieldDeinterleave_u8vec4(A);
  638. Error += C.x == glm::uint8(i) ? 0 : 1;
  639. Error += C.y == glm::uint8(j) ? 0 : 1;
  640. Error += C.z == glm::uint8(i) ? 0 : 1;
  641. Error += C.w == glm::uint8(j) ? 0 : 1;
  642. }
  643. */
  644. /*
  645. for(glm::size_t j = 0; j < count; ++j)
  646. for(glm::size_t i = 0; i < count; ++i)
  647. {
  648. glm::uint32 A = bitfieldInterleave_u16vec2(glm::uint16(i), glm::uint16(j));
  649. glm::uint32 B = glm::bitfieldInterleave(glm::uint16(i), glm::uint16(j));
  650. Error += A == B ? 0 : 1;
  651. }
  652. */
  653. return Error;
  654. }
  655. static int perf_old_u8vec2(std::vector<glm::uint16>& Result, glm::size_t divider)
  656. {
  657. int Error = 0;
  658. glm::size_t count = 256 / divider;
  659. const std::clock_t BeginTime = std::clock();
  660. for(glm::size_t k = 0; k < 10000; ++k)
  661. for(glm::size_t j = 0; j < count; ++j)
  662. for(glm::size_t i = 0; i < count; ++i)
  663. Error += Result[j * count + i] == glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j)) ? 0 : 1;
  664. const std::clock_t EndTime = std::clock();
  665. std::printf("glm::bitfieldInterleave<u8vec2> Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  666. return Error;
  667. }
  668. static int perf_new_u8vec2(std::vector<glm::uint16>& Result, glm::size_t divider)
  669. {
  670. int Error = 0;
  671. glm::size_t count = 256 / divider;
  672. const std::clock_t BeginTime = std::clock();
  673. for(glm::size_t k = 0; k < 10000; ++k)
  674. for(glm::size_t j = 0; j < count; ++j)
  675. for(glm::size_t i = 0; i < count; ++i)
  676. Error += Result[j * count + i] == bitfieldInterleave_u8vec2(glm::uint8(i), glm::uint8(j)) ? 0 : 1;
  677. const std::clock_t EndTime = std::clock();
  678. std::printf("bitfieldInterleave_u8vec2 Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  679. return Error;
  680. }
  681. static int perf_old_u8vec4(std::vector<glm::uint32>& Result, glm::size_t divider)
  682. {
  683. int Error = 0;
  684. glm::size_t count = 256 / divider;
  685. const std::clock_t BeginTime = std::clock();
  686. for(glm::size_t k = 0; k < 10000; ++k)
  687. for(glm::size_t j = 0; j < count; ++j)
  688. for(glm::size_t i = 0; i < count; ++i)
  689. Error += Result[j * count + i] == glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j)) ? 0 : 1;
  690. const std::clock_t EndTime = std::clock();
  691. std::printf("glm::bitfieldInterleave<u8vec4> Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  692. return Error;
  693. }
  694. /*
  695. static int perf_new_u8vec4(std::vector<glm::uint32>& Result, glm::size_t divider)
  696. {
  697. int Error = 0;
  698. glm::size_t count = 256 / divider;
  699. const std::clock_t BeginTime = std::clock();
  700. for(glm::size_t k = 0; k < 10000; ++k)
  701. for(glm::size_t j = 0; j < count; ++j)
  702. for(glm::size_t i = 0; i < count; ++i)
  703. Error += Result[j * count + i] == bitfieldInterleave_u8vec4(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j)) ? 0 : 1;
  704. const std::clock_t EndTime = std::clock();
  705. std::printf("bitfieldInterleave_u8vec4 Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  706. return Error;
  707. }
  708. */
  709. static int perf_old_u16vec2(std::vector<glm::uint32>& Result, glm::size_t divider)
  710. {
  711. int Error = 0;
  712. glm::size_t count = 256 / divider;
  713. const std::clock_t BeginTime = std::clock();
  714. for(glm::size_t k = 0; k < 10000; ++k)
  715. for(glm::size_t j = 0; j < count; ++j)
  716. for(glm::size_t i = 0; i < count; ++i)
  717. Error += Result[j * count + i] == glm::bitfieldInterleave(glm::uint16(i), glm::uint16(j)) ? 0 : 1;
  718. const std::clock_t EndTime = std::clock();
  719. std::printf("glm::bitfieldInterleave<u16vec2> Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  720. return Error;
  721. }
  722. /*
  723. static int perf_new_u16vec2(std::vector<glm::uint32>& Result, glm::size_t divider)
  724. {
  725. int Error = 0;
  726. glm::size_t count = 256 / divider;
  727. const std::clock_t BeginTime = std::clock();
  728. for(glm::size_t k = 0; k < 10000; ++k)
  729. for(glm::size_t j = 0; j < count; ++j)
  730. for(glm::size_t i = 0; i < count; ++i)
  731. Error += Result[j * count + i] == bitfieldInterleave_u16vec2(glm::uint16(i), glm::uint16(j)) ? 0 : 1;
  732. const std::clock_t EndTime = std::clock();
  733. std::printf("bitfieldInterleave_u16vec2 Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  734. return Error;
  735. }
  736. */
  737. static int perf(glm::size_t divider)
  738. {
  739. int Error = 0;
  740. glm::size_t count = 256 / divider;
  741. std::printf("bitfieldInterleave perf: init\r");
  742. std::vector<glm::uint16> Result_u8vec2(count * count, 0);
  743. for(glm::size_t j = 0; j < count; ++j)
  744. for(glm::size_t i = 0; i < count; ++i)
  745. Result_u8vec2[j * count + i] = glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j));
  746. Error += perf_old_u8vec2(Result_u8vec2, divider);
  747. Error += perf_new_u8vec2(Result_u8vec2, divider);
  748. std::vector<glm::uint32> Result_u8vec4(count * count, 0);
  749. for(glm::size_t j = 0; j < count; ++j)
  750. for(glm::size_t i = 0; i < count; ++i)
  751. Result_u8vec4[j * count + i] = glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j));
  752. Error += perf_old_u8vec4(Result_u8vec4, divider);
  753. //Error += perf_new_u8vec4(Result_u8vec4, divider);
  754. std::vector<glm::uint32> Result_u16vec2(count * count, 0);
  755. for(glm::size_t j = 0; j < count; ++j)
  756. for(glm::size_t i = 0; i < count; ++i)
  757. Result_u16vec2[j * count + i] = glm::bitfieldInterleave(glm::uint16(i), glm::uint16(j));
  758. Error += perf_old_u16vec2(Result_u16vec2, divider);
  759. //Error += perf_new_u16vec2(Result_u16vec2, divider);
  760. std::printf("bitfieldInterleave perf: %d Errors\n", Error);
  761. return Error;
  762. }
  763. }//namespace bitfieldInterleave5
  764. static int test_bitfieldRotateRight()
  765. {
  766. glm::ivec4 const A = glm::bitfieldRotateRight(glm::ivec4(2), 1);
  767. glm::ivec4 const B = glm::ivec4(2) >> 1;
  768. return A == B;
  769. }
  770. static int test_bitfieldRotateLeft()
  771. {
  772. glm::ivec4 const A = glm::bitfieldRotateLeft(glm::ivec4(2), 1);
  773. glm::ivec4 const B = glm::ivec4(2) << 1;
  774. return A == B;
  775. }
// Test entry point: runs the correctness suites first, then the benchmark
// harnesses, accumulating non-zero returns as the process exit code
// (0 == all tests passed).
int main()
{
	int Error = 0;

	// Tests for a faster and to reserve bitfieldInterleave
	Error += ::bitfieldInterleave5::test(64);
	Error += ::bitfieldInterleave5::perf(64);

	// Correctness suites.
	Error += ::mask::test();
	Error += ::bitfieldInterleave3::test();
	Error += ::bitfieldInterleave4::test();
	Error += ::bitfieldInterleave::test();
	Error += test_bitfieldRotateRight();
	Error += test_bitfieldRotateLeft();

	// Benchmarks (run last; their return values also feed the error count).
	Error += ::mask::perf();
	Error += ::bitfieldInterleave::perf();

	return Error;
}