gtc_bitfield.cpp 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005
  1. #include <glm/gtc/bitfield.hpp>
  2. #include <glm/gtc/type_precision.hpp>
  3. #include <glm/vector_relational.hpp>
  4. #include <glm/integer.hpp>
  5. #include <ctime>
  6. #include <cstdio>
  7. #include <vector>
  8. namespace mask
  9. {
// Table-driven test case: pairs an input with the result expected from a
// mask implementation applied to it.
template<typename genType>
struct type
{
	genType Value;	// input bit count (or vector of bit counts)
	genType Return;	// expected mask produced from Value
};
  16. #if GLM_COMPILER & GLM_COMPILER_CLANG
  17. # pragma clang diagnostic push
  18. # pragma clang diagnostic ignored "-Wsign-conversion"
  19. #endif
  20. static inline int mask_zero(int Bits)
  21. {
  22. return ~((~0) << Bits);
  23. }
  24. #if GLM_COMPILER & GLM_COMPILER_CLANG
  25. # pragma clang diagnostic push
  26. # pragma clang diagnostic ignored "-Wsign-compare"
  27. #endif
  28. static inline int mask_mix(int Bits)
  29. {
  30. return Bits >= sizeof(int) * 8 ? 0xffffffff : (static_cast<int>(1) << Bits) - static_cast<int>(1);
  31. }
  32. #if GLM_COMPILER & GLM_COMPILER_CLANG
  33. # pragma clang diagnostic pop
  34. #endif
  35. static inline int mask_half(int Bits)
  36. {
  37. // We do the shift in two steps because 1 << 32 on an int is undefined.
  38. int const Half = Bits >> 1;
  39. int const Fill = ~0;
  40. int const ShiftHaft = (Fill << Half);
  41. int const Rest = Bits - Half;
  42. int const Reversed = ShiftHaft << Rest;
  43. return ~Reversed;
  44. }
  45. static inline int mask_loop(int Bits)
  46. {
  47. int Mask = 0;
  48. for(int Bit = 0; Bit < Bits; ++Bit)
  49. Mask |= (static_cast<int>(1) << Bit);
  50. return Mask;
  51. }
  52. static int perf()
  53. {
  54. int const Count = 1000;
  55. std::clock_t Timestamp1 = std::clock();
  56. {
  57. std::vector<int> Mask;
  58. Mask.resize(Count);
  59. for(int i = 0; i < Count; ++i)
  60. Mask[i] = mask_mix(i % 32);
  61. }
  62. std::clock_t Timestamp2 = std::clock();
  63. {
  64. std::vector<int> Mask;
  65. Mask.resize(Count);
  66. for(int i = 0; i < Count; ++i)
  67. Mask[i] = mask_loop(i % 32);
  68. }
  69. std::clock_t Timestamp3 = std::clock();
  70. {
  71. std::vector<int> Mask;
  72. Mask.resize(Count);
  73. for(int i = 0; i < Count; ++i)
  74. Mask[i] = glm::mask(i % 32);
  75. }
  76. std::clock_t Timestamp4 = std::clock();
  77. {
  78. std::vector<int> Mask;
  79. Mask.resize(Count);
  80. for(int i = 0; i < Count; ++i)
  81. Mask[i] = mask_zero(i % 32);
  82. }
  83. std::clock_t Timestamp5 = std::clock();
  84. {
  85. std::vector<int> Mask;
  86. Mask.resize(Count);
  87. for(int i = 0; i < Count; ++i)
  88. Mask[i] = mask_half(i % 32);
  89. }
  90. std::clock_t Timestamp6 = std::clock();
  91. std::clock_t TimeMix = Timestamp2 - Timestamp1;
  92. std::clock_t TimeLoop = Timestamp3 - Timestamp2;
  93. std::clock_t TimeDefault = Timestamp4 - Timestamp3;
  94. std::clock_t TimeZero = Timestamp5 - Timestamp4;
  95. std::clock_t TimeHalf = Timestamp6 - Timestamp5;
  96. std::printf("mask[mix]: %d\n", static_cast<unsigned int>(TimeMix));
  97. std::printf("mask[loop]: %d\n", static_cast<unsigned int>(TimeLoop));
  98. std::printf("mask[default]: %d\n", static_cast<unsigned int>(TimeDefault));
  99. std::printf("mask[zero]: %d\n", static_cast<unsigned int>(TimeZero));
  100. std::printf("mask[half]: %d\n", static_cast<unsigned int>(TimeHalf));
  101. return TimeDefault <= TimeLoop ? 0 : 1;
  102. }
  103. #if GLM_COMPILER & GLM_COMPILER_CLANG
  104. # pragma clang diagnostic pop
  105. #endif
  106. static int test_int()
  107. {
  108. type<int> const Data[] =
  109. {
  110. { 0, 0x00000000},
  111. { 1, 0x00000001},
  112. { 2, 0x00000003},
  113. { 3, 0x00000007},
  114. {31, 0x7fffffff}
  115. };
  116. int Error = 0;
  117. /* mask_zero is sadly not a correct code
  118. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  119. {
  120. int Result = mask_zero(Data[i].Value);
  121. Error += Data[i].Return == Result ? 0 : 1;
  122. }
  123. */
  124. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  125. {
  126. int Result = mask_mix(Data[i].Value);
  127. Error += Data[i].Return == Result ? 0 : 1;
  128. }
  129. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  130. {
  131. int Result = mask_half(Data[i].Value);
  132. Error += Data[i].Return == Result ? 0 : 1;
  133. }
  134. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  135. {
  136. int Result = mask_loop(Data[i].Value);
  137. Error += Data[i].Return == Result ? 0 : 1;
  138. }
  139. for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
  140. {
  141. int Result = glm::mask(Data[i].Value);
  142. Error += Data[i].Return == Result ? 0 : 1;
  143. }
  144. return Error;
  145. }
  146. static int test_ivec4()
  147. {
  148. type<glm::ivec4> const Data[] =
  149. {
  150. {glm::ivec4( 0), glm::ivec4(0x00000000)},
  151. {glm::ivec4( 1), glm::ivec4(0x00000001)},
  152. {glm::ivec4( 2), glm::ivec4(0x00000003)},
  153. {glm::ivec4( 3), glm::ivec4(0x00000007)}
  154. };
  155. int Error(0);
  156. for(std::size_t i = 0, n = sizeof(Data) / sizeof(type<glm::ivec4>); i < n; ++i)
  157. {
  158. glm::ivec4 Result = glm::mask(Data[i].Value);
  159. Error += glm::all(glm::equal(Data[i].Return, Result)) ? 0 : 1;
  160. }
  161. return Error;
  162. }
  163. static int test()
  164. {
  165. int Error(0);
  166. Error += test_int();
  167. Error += test_ivec4();
  168. return Error;
  169. }
  170. }//namespace mask
  171. namespace bitfieldInterleave3
  172. {
  173. template<typename PARAM, typename RET>
  174. static inline RET refBitfieldInterleave(PARAM x, PARAM y, PARAM z)
  175. {
  176. RET Result = 0;
  177. for(RET i = 0; i < sizeof(PARAM) * 8; ++i)
  178. {
  179. Result |= ((RET(x) & (RET(1U) << i)) << ((i << 1) + 0));
  180. Result |= ((RET(y) & (RET(1U) << i)) << ((i << 1) + 1));
  181. Result |= ((RET(z) & (RET(1U) << i)) << ((i << 1) + 2));
  182. }
  183. return Result;
  184. }
  185. static int test()
  186. {
  187. int Error(0);
  188. glm::uint16 const test_max = 5; // previously 11
  189. glm::uint16 x_max = 1 << test_max;
  190. glm::uint16 y_max = 1 << test_max;
  191. glm::uint16 z_max = 1 << test_max;
  192. for(glm::uint16 z = 0; z < z_max; z += 27)
  193. for(glm::uint16 y = 0; y < y_max; y += 27)
  194. for(glm::uint16 x = 0; x < x_max; x += 27)
  195. {
  196. glm::uint64 ResultA = refBitfieldInterleave<glm::uint16, glm::uint64>(x, y, z);
  197. glm::uint64 ResultB = glm::bitfieldInterleave(x, y, z);
  198. Error += ResultA == ResultB ? 0 : 1;
  199. }
  200. return Error;
  201. }
  202. }
  203. namespace bitfieldInterleave4
  204. {
  205. template<typename PARAM, typename RET>
  206. static inline RET loopBitfieldInterleave(PARAM x, PARAM y, PARAM z, PARAM w)
  207. {
  208. RET const v[4] = {x, y, z, w};
  209. RET Result = 0;
  210. for(RET i = 0; i < sizeof(PARAM) * 8; i++)
  211. {
  212. Result |= ((((v[0] >> i) & 1U)) << ((i << 2) + 0));
  213. Result |= ((((v[1] >> i) & 1U)) << ((i << 2) + 1));
  214. Result |= ((((v[2] >> i) & 1U)) << ((i << 2) + 2));
  215. Result |= ((((v[3] >> i) & 1U)) << ((i << 2) + 3));
  216. }
  217. return Result;
  218. }
  219. static int test()
  220. {
  221. int Error(0);
  222. glm::uint16 const test_max = 5; // previously 11
  223. glm::uint16 x_max = 1 << test_max;
  224. glm::uint16 y_max = 1 << test_max;
  225. glm::uint16 z_max = 1 << test_max;
  226. glm::uint16 w_max = 1 << test_max;
  227. for(glm::uint16 w = 0; w < w_max; w += 27)
  228. for(glm::uint16 z = 0; z < z_max; z += 27)
  229. for(glm::uint16 y = 0; y < y_max; y += 27)
  230. for(glm::uint16 x = 0; x < x_max; x += 27)
  231. {
  232. glm::uint64 ResultA = loopBitfieldInterleave<glm::uint16, glm::uint64>(x, y, z, w);
  233. glm::uint64 ResultB = glm::bitfieldInterleave(x, y, z, w);
  234. Error += ResultA == ResultB ? 0 : 1;
  235. }
  236. return Error;
  237. }
  238. }
  239. namespace bitfieldInterleave
  240. {
  241. static inline glm::uint64 fastBitfieldInterleave(glm::uint32 x, glm::uint32 y)
  242. {
  243. glm::uint64 REG1;
  244. glm::uint64 REG2;
  245. REG1 = x;
  246. REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
  247. REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
  248. REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
  249. REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
  250. REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
  251. REG2 = y;
  252. REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
  253. REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
  254. REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
  255. REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
  256. REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
  257. return REG1 | (REG2 << 1);
  258. }
  259. static inline glm::uint64 interleaveBitfieldInterleave(glm::uint32 x, glm::uint32 y)
  260. {
  261. glm::uint64 REG1;
  262. glm::uint64 REG2;
  263. REG1 = x;
  264. REG2 = y;
  265. REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
  266. REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
  267. REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
  268. REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
  269. REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
  270. REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
  271. REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
  272. REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
  273. REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
  274. REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
  275. return REG1 | (REG2 << 1);
  276. }
  277. /*
  278. static inline glm::uint64 loopBitfieldInterleave(glm::uint32 x, glm::uint32 y)
  279. {
  280. static glm::uint64 const Mask[5] =
  281. {
  282. 0x5555555555555555,
  283. 0x3333333333333333,
  284. 0x0F0F0F0F0F0F0F0F,
  285. 0x00FF00FF00FF00FF,
  286. 0x0000FFFF0000FFFF
  287. };
  288. glm::uint64 REG1 = x;
  289. glm::uint64 REG2 = y;
  290. for(int i = 4; i >= 0; --i)
  291. {
  292. REG1 = ((REG1 << (1 << i)) | REG1) & Mask[i];
  293. REG2 = ((REG2 << (1 << i)) | REG2) & Mask[i];
  294. }
  295. return REG1 | (REG2 << 1);
  296. }
  297. */
  298. #if GLM_ARCH & GLM_ARCH_SSE2_BIT
// SSE2 implementation: interleave x and y into a 64-bit Morton code.
// Both values travel in one XMM register (x in lane 0, y in lane 2, zero
// lanes between them) so each spreading step processes both inputs at once.
// The commented scalar lines document the 64-bit operation each intrinsic
// pair emulates.
static inline glm::uint64 sseBitfieldInterleave(glm::uint32 x, glm::uint32 y)
{
	__m128i const Array = _mm_set_epi32(0, static_cast<int>(y), 0, static_cast<int>(x));

	__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF);
	__m128i const Mask3 = _mm_set1_epi32(0x00FF00FF);
	__m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F);
	__m128i const Mask1 = _mm_set1_epi32(0x33333333);
	__m128i const Mask0 = _mm_set1_epi32(0x55555555);

	__m128i Reg1;
	__m128i Reg2;

	// REG1 = x;
	// REG2 = y;
	Reg1 = _mm_load_si128(&Array);

	//REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
	//REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
	// _mm_slli_si128 shifts the whole register by 2 bytes (16 bits); the
	// lanes above x and y are zero, so nothing cross-contaminates.
	Reg2 = _mm_slli_si128(Reg1, 2);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask4);

	//REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
	//REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
	Reg2 = _mm_slli_si128(Reg1, 1);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask3);

	//REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	//REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	// From here shifts stay within 32-bit lanes, so _mm_slli_epi32 is enough.
	Reg2 = _mm_slli_epi32(Reg1, 4);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask2);

	//REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
	//REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
	Reg2 = _mm_slli_epi32(Reg1, 2);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask1);

	//REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
	//REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
	Reg2 = _mm_slli_epi32(Reg1, 1);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask0);

	//return REG1 | (REG2 << 1);
	// Merge y's spread bits (upper 64) down onto the odd positions of x's.
	Reg2 = _mm_slli_epi32(Reg1, 1);
	Reg2 = _mm_srli_si128(Reg2, 8);
	Reg1 = _mm_or_si128(Reg1, Reg2);

	__m128i Result;
	_mm_store_si128(&Result, Reg1);
#if GLM_COMPILER & GLM_COMPILER_CLANG
#	pragma clang diagnostic push
#	pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif
	// Read back the low 64 bits; the pragma acknowledges the type-punning
	// reinterpret_cast.
	return *reinterpret_cast<glm::uint64*>(&Result);
#if GLM_COMPILER & GLM_COMPILER_CLANG
#	pragma clang diagnostic pop
#endif
}
// Variant of sseBitfieldInterleave for timing comparison: identical except
// the initial load uses the unaligned _mm_loadu_si128. NOTE(review): the
// trailing _mm_store_si128 is still the aligned store; only the load differs.
static inline glm::uint64 sseUnalignedBitfieldInterleave(glm::uint32 x, glm::uint32 y)
{
	__m128i const Array = _mm_set_epi32(0, static_cast<int>(y), 0, static_cast<int>(x));

	__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF);
	__m128i const Mask3 = _mm_set1_epi32(0x00FF00FF);
	__m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F);
	__m128i const Mask1 = _mm_set1_epi32(0x33333333);
	__m128i const Mask0 = _mm_set1_epi32(0x55555555);

	__m128i Reg1;
	__m128i Reg2;

	// REG1 = x;
	// REG2 = y;
	Reg1 = _mm_loadu_si128(&Array);

	//REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
	//REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
	Reg2 = _mm_slli_si128(Reg1, 2);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask4);

	//REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
	//REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
	Reg2 = _mm_slli_si128(Reg1, 1);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask3);

	//REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	//REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
	Reg2 = _mm_slli_epi32(Reg1, 4);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask2);

	//REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
	//REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
	Reg2 = _mm_slli_epi32(Reg1, 2);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask1);

	//REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
	//REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
	Reg2 = _mm_slli_epi32(Reg1, 1);
	Reg1 = _mm_or_si128(Reg2, Reg1);
	Reg1 = _mm_and_si128(Reg1, Mask0);

	//return REG1 | (REG2 << 1);
	Reg2 = _mm_slli_epi32(Reg1, 1);
	Reg2 = _mm_srli_si128(Reg2, 8);
	Reg1 = _mm_or_si128(Reg1, Reg2);

	__m128i Result;
	_mm_store_si128(&Result, Reg1);
#if GLM_COMPILER & GLM_COMPILER_CLANG
#	pragma clang diagnostic push
#	pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif
	// Read back the low 64 bits; the pragma acknowledges the type-punning
	// reinterpret_cast.
	return *reinterpret_cast<glm::uint64*>(&Result);
#if GLM_COMPILER & GLM_COMPILER_CLANG
#	pragma clang diagnostic pop
#endif
}
  405. #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
  406. static int test()
  407. {
  408. int Error = 0;
  409. glm::uint8 const test_loop = 15; // 127 ideally
  410. /*
  411. {
  412. for(glm::uint32 y = 0; y < (1 << 10); ++y)
  413. for(glm::uint32 x = 0; x < (1 << 10); ++x)
  414. {
  415. glm::uint64 A = glm::bitfieldInterleave(x, y);
  416. glm::uint64 B = fastBitfieldInterleave(x, y);
  417. //glm::uint64 C = loopBitfieldInterleave(x, y);
  418. glm::uint64 D = interleaveBitfieldInterleave(x, y);
  419. assert(A == B);
  420. //assert(A == C);
  421. assert(A == D);
  422. # if GLM_ARCH & GLM_ARCH_SSE2_BIT
  423. glm::uint64 E = sseBitfieldInterleave(x, y);
  424. glm::uint64 F = sseUnalignedBitfieldInterleave(x, y);
  425. assert(A == E);
  426. assert(A == F);
  427. __m128i G = glm_i128_interleave(_mm_set_epi32(0, y, 0, x));
  428. glm::uint64 Result[2];
  429. _mm_storeu_si128((__m128i*)Result, G);
  430. assert(A == Result[0]);
  431. # endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
  432. }
  433. }
  434. */
  435. {
  436. for(glm::uint8 y = 0; y < test_loop; ++y)
  437. for(glm::uint8 x = 0; x < test_loop; ++x)
  438. {
  439. glm::uint64 A(glm::bitfieldInterleave(glm::u8vec2(x, y)));
  440. glm::uint64 B(glm::bitfieldInterleave(glm::u16vec2(x, y)));
  441. glm::uint64 C(glm::bitfieldInterleave(glm::u32vec2(x, y)));
  442. Error += A == B ? 0 : 1;
  443. Error += A == C ? 0 : 1;
  444. glm::u32vec2 const& D = glm::bitfieldDeinterleave(C);
  445. Error += D.x == x ? 0 : 1;
  446. Error += D.y == y ? 0 : 1;
  447. }
  448. }
  449. {
  450. for(glm::uint8 y = 0; y < test_loop; ++y)
  451. for(glm::uint8 x = 0; x < test_loop; ++x)
  452. {
  453. glm::int64 A(glm::bitfieldInterleave(glm::int8(x), glm::int8(y)));
  454. glm::int64 B(glm::bitfieldInterleave(glm::int16(x), glm::int16(y)));
  455. glm::int64 C(glm::bitfieldInterleave(glm::int32(x), glm::int32(y)));
  456. Error += A == B ? 0 : 1;
  457. Error += A == C ? 0 : 1;
  458. }
  459. }
  460. return Error;
  461. }
// Benchmark the scalar and (when available) SSE interleave implementations.
// Results are written into Data/SimdData so the work is observable; timings
// are printed with std::printf. Always returns 0 (timing only).
static int perf()
{
	glm::uint32 x_max = 1 << 4;
	glm::uint32 y_max = 1 << 3;

	// ALU
	std::vector<glm::uint64> Data(x_max * y_max);
	std::vector<glm::u32vec2> Param(x_max * y_max);
	// NOTE(review): rows are x_max wide, so the row index is presumably
	// i / x_max; with i / y_max the y coordinate can exceed y_max. Harmless
	// for pure timing, but verify the intent.
	for(glm::uint32 i = 0; i < Param.size(); ++i) {
		Param[i] = glm::u32vec2(i % x_max, i / y_max);
	}

	{
		std::clock_t LastTime = std::clock();
		for(std::size_t i = 0; i < Data.size(); ++i)
			Data[i] = glm::bitfieldInterleave(Param[i].x, Param[i].y);
		std::clock_t Time = std::clock() - LastTime;
		std::printf("glm::bitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
	}

	{
		std::clock_t LastTime = std::clock();
		for(std::size_t i = 0; i < Data.size(); ++i)
			Data[i] = fastBitfieldInterleave(Param[i].x, Param[i].y);
		std::clock_t Time = std::clock() - LastTime;
		std::printf("fastBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
	}

	/*
	{
		std::clock_t LastTime = std::clock();
		for(std::size_t i = 0; i < Data.size(); ++i)
			Data[i] = loopBitfieldInterleave(Param[i].x, Param[i].y);
		std::clock_t Time = std::clock() - LastTime;
		std::printf("loopBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
	}
	*/

	{
		std::clock_t LastTime = std::clock();
		for(std::size_t i = 0; i < Data.size(); ++i)
			Data[i] = interleaveBitfieldInterleave(Param[i].x, Param[i].y);
		std::clock_t Time = std::clock() - LastTime;
		std::printf("interleaveBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
	}

#	if GLM_ARCH & GLM_ARCH_SSE2_BIT
	{
		std::clock_t LastTime = std::clock();
		for(std::size_t i = 0; i < Data.size(); ++i)
			Data[i] = sseBitfieldInterleave(Param[i].x, Param[i].y);
		std::clock_t Time = std::clock() - LastTime;
		std::printf("sseBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
	}

	{
		std::clock_t LastTime = std::clock();
		for(std::size_t i = 0; i < Data.size(); ++i)
			Data[i] = sseUnalignedBitfieldInterleave(Param[i].x, Param[i].y);
		std::clock_t Time = std::clock() - LastTime;
		std::printf("sseUnalignedBitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
	}
#	endif//GLM_ARCH & GLM_ARCH_SSE2_BIT

	// NOTE(review): this block calls the 3-component glm::bitfieldInterleave
	// but prints a "glm::detail::bitfieldInterleave" label — confirm which
	// one is meant.
	{
		std::clock_t LastTime = std::clock();
		for(std::size_t i = 0; i < Data.size(); ++i)
			Data[i] = glm::bitfieldInterleave(Param[i].x, Param[i].y, Param[i].x);
		std::clock_t Time = std::clock() - LastTime;
		std::printf("glm::detail::bitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
	}

#	if(GLM_ARCH & GLM_ARCH_SSE2_BIT && !(GLM_COMPILER & GLM_COMPILER_GCC))
	{
		// SIMD
		std::vector<__m128i> SimdData;
		SimdData.resize(static_cast<std::size_t>(x_max * y_max));
		std::vector<__m128i> SimdParam;
		SimdParam.resize(static_cast<std::size_t>(x_max * y_max));
		for(std::size_t i = 0; i < SimdParam.size(); ++i)
			SimdParam[i] = _mm_set_epi32(static_cast<int>(i % static_cast<std::size_t>(x_max)), 0, static_cast<int>(i / static_cast<std::size_t>(y_max)), 0);

		std::clock_t LastTime = std::clock();
		for(std::size_t i = 0; i < SimdData.size(); ++i)
			SimdData[i] = glm_i128_interleave(SimdParam[i]);
		std::clock_t Time = std::clock() - LastTime;
		std::printf("_mm_bit_interleave_si128 Time %d clocks\n", static_cast<int>(Time));
	}
#	endif//GLM_ARCH & GLM_ARCH_SSE2_BIT

	return 0;
}
  543. }//namespace bitfieldInterleave
  544. namespace bitfieldInterleave5
  545. {
// Interleave the bits of two bytes into a 16-bit Morton code.
// Both bytes are spread inside one 32-bit register at once: x in the low
// half, y in the high half; the final (Result >> 15) folds y's interleaved
// bits down onto the odd bit positions.
GLM_FUNC_QUALIFIER
static glm::uint16 bitfieldInterleave_u8vec2(glm::uint8 x, glm::uint8 y)
{
	glm::uint32 Result = (glm::uint32(y) << 16) | glm::uint32(x);
	Result = ((Result << 4) | Result) & 0x0F0F0F0F;	// spread nibbles apart
	Result = ((Result << 2) | Result) & 0x33333333;	// spread bit pairs
	Result = ((Result << 1) | Result) & 0x55555555;	// spread single bits
	return static_cast<glm::uint16>((Result & 0x0000FFFF) | (Result >> 15));
}
// Inverse of bitfieldInterleave_u8vec2: split a 16-bit Morton code back
// into its two byte components.
GLM_FUNC_QUALIFIER
static glm::u8vec2 bitfieldDeinterleave_u8vec2(glm::uint16 InterleavedBitfield)
{
	glm::uint32 Result(InterleavedBitfield);
	// Duplicate the odd (y) bits into the high half, keeping even positions.
	Result = ((Result << 15) | Result) & 0x55555555;
	// Compact every other bit back together, halving the spacing each step.
	Result = ((Result >> 1) | Result) & 0x33333333;
	Result = ((Result >> 2) | Result) & 0x0F0F0F0F;
	Result = ((Result >> 4) | Result) & 0x00FF00FF;
	// Low half is x, high half is y.
	return glm::u8vec2(Result & 0x0000FFFF, Result >> 16);
}
  565. /*
  566. GLM_FUNC_QUALIFIER
  567. static glm::uint32 bitfieldInterleave_u8vec4(glm::uint8 x, glm::uint8 y, glm::uint8 z, glm::uint8 w)
  568. {
  569. glm::uint64 Result = (glm::uint64(w) << 48) | (glm::uint64(z) << 32) | (glm::uint64(y) << 16) | glm::uint64(x);
  570. Result = ((Result << 12) | Result) & 0x000F000F000F000Full;
  571. Result = ((Result << 6) | Result) & 0x0303030303030303ull;
  572. Result = ((Result << 3) | Result) & 0x1111111111111111ull;
  573. const glm::uint32 a = static_cast<glm::uint32>((Result & 0x000000000000FFFF) >> ( 0 - 0));
  574. const glm::uint32 b = static_cast<glm::uint32>((Result & 0x00000000FFFF0000) >> (16 - 3));
  575. const glm::uint32 c = static_cast<glm::uint32>((Result & 0x0000FFFF00000000) >> (32 - 6));
  576. const glm::uint32 d = static_cast<glm::uint32>((Result & 0xFFFF000000000000) >> (48 - 12));
  577. return a | b | c | d;
  578. }
  579. GLM_FUNC_QUALIFIER
  580. static glm::u8vec4 bitfieldDeinterleave_u8vec4(glm::uint32 InterleavedBitfield)
  581. {
  582. glm::uint64 Result(InterleavedBitfield);
  583. Result = ((Result << 15) | Result) & 0x9249249249249249ull;
  584. Result = ((Result >> 1) | Result) & 0x30C30C30C30C30C3ull;
  585. Result = ((Result >> 2) | Result) & 0xF00F00F00F00F00Full;
  586. Result = ((Result >> 4) | Result) & 0x00FF0000FF0000FFull;
  587. return glm::u8vec4(
  588. (Result >> 0) & 0x000000000000FFFFull,
  589. (Result >> 16) & 0x00000000FFFF0000ull,
  590. (Result >> 32) & 0x0000FFFF00000000ull,
  591. (Result >> 48) & 0xFFFF000000000000ull);
  592. }
  593. */
  594. #if GLM_COMPILER & GLM_COMPILER_VC
  595. # pragma warning(disable : 4309)
  596. #endif
  597. /*
  598. GLM_FUNC_QUALIFIER
  599. static glm::uint32 bitfieldInterleave_u16vec2(glm::uint16 x, glm::uint16 y)
  600. {
  601. glm::uint64 Result = (glm::uint64(y) << 32) | glm::uint64(x);
  602. Result = ((Result << 8) | Result) & static_cast<glm::uint32>(0x00FF00FF00FF00FFull);
  603. Result = ((Result << 4) | Result) & static_cast<glm::uint32>(0x0F0F0F0F0F0F0F0Full);
  604. Result = ((Result << 2) | Result) & static_cast<glm::uint32>(0x3333333333333333ull);
  605. Result = ((Result << 1) | Result) & static_cast<glm::uint32>(0x5555555555555555ull);
  606. return static_cast<glm::uint32>((Result & 0x00000000FFFFFFFFull) | (Result >> 31));
  607. }
  608. GLM_FUNC_QUALIFIER
  609. static glm::u16vec2 bitfieldDeinterleave_u16vec2(glm::uint32 InterleavedBitfield)
  610. {
  611. glm::uint64 Result(InterleavedBitfield);
  612. Result = ((Result << 31) | Result) & 0x5555555555555555ull;
  613. Result = ((Result >> 1) | Result) & 0x3333333333333333ull;
  614. Result = ((Result >> 2) | Result) & 0x0F0F0F0F0F0F0F0Full;
  615. Result = ((Result >> 4) | Result) & 0x00FF00FF00FF00FFull;
  616. Result = ((Result >> 8) | Result) & 0x0000FFFF0000FFFFull;
  617. return glm::u16vec2(Result & 0x00000000FFFFFFFFull, Result >> 32);
  618. }
  619. */
  620. static int test(glm::size_t divider)
  621. {
  622. int Error = 0;
  623. glm::size_t count = 256 / divider;
  624. for(glm::size_t j = 0; j < count; ++j)
  625. for(glm::size_t i = 0; i < count; ++i)
  626. {
  627. glm::uint16 A = bitfieldInterleave_u8vec2(glm::uint8(i), glm::uint8(j));
  628. glm::uint16 B = glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j));
  629. Error += A == B ? 0 : 1;
  630. glm::u8vec2 C = bitfieldDeinterleave_u8vec2(A);
  631. Error += C.x == glm::uint8(i) ? 0 : 1;
  632. Error += C.y == glm::uint8(j) ? 0 : 1;
  633. }
  634. /*
  635. for(glm::size_t j = 0; j < count; ++j)
  636. for(glm::size_t i = 0; i < count; ++i)
  637. {
  638. glm::uint32 A = bitfieldInterleave_u8vec4(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j));
  639. glm::uint32 B = glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j));
  640. Error += A == B ? 0 : 1;
  641. glm::u8vec4 C = bitfieldDeinterleave_u8vec4(A);
  642. Error += C.x == glm::uint8(i) ? 0 : 1;
  643. Error += C.y == glm::uint8(j) ? 0 : 1;
  644. Error += C.z == glm::uint8(i) ? 0 : 1;
  645. Error += C.w == glm::uint8(j) ? 0 : 1;
  646. }
  647. */
  648. /*
  649. for(glm::size_t j = 0; j < count; ++j)
  650. for(glm::size_t i = 0; i < count; ++i)
  651. {
  652. glm::uint32 A = bitfieldInterleave_u16vec2(glm::uint16(i), glm::uint16(j));
  653. glm::uint32 B = glm::bitfieldInterleave(glm::uint16(i), glm::uint16(j));
  654. Error += A == B ? 0 : 1;
  655. }
  656. */
  657. return Error;
  658. }
  659. static int perf_old_u8vec2(std::vector<glm::uint16>& Result, glm::size_t divider)
  660. {
  661. int Error = 0;
  662. glm::size_t count = 256 / divider;
  663. const std::clock_t BeginTime = std::clock();
  664. for(glm::size_t k = 0; k < 100; ++k)
  665. for(glm::size_t j = 0; j < count; ++j)
  666. for(glm::size_t i = 0; i < count; ++i)
  667. Error += Result[j * count + i] == glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j)) ? 0 : 1;
  668. const std::clock_t EndTime = std::clock();
  669. std::printf("glm::bitfieldInterleave<u8vec2> Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  670. return Error;
  671. }
  672. static int perf_new_u8vec2(std::vector<glm::uint16>& Result, glm::size_t divider)
  673. {
  674. int Error = 0;
  675. glm::size_t count = 256 / divider;
  676. const std::clock_t BeginTime = std::clock();
  677. for(glm::size_t k = 0; k < 100; ++k)
  678. for(glm::size_t j = 0; j < count; ++j)
  679. for(glm::size_t i = 0; i < count; ++i)
  680. Error += Result[j * count + i] == bitfieldInterleave_u8vec2(glm::uint8(i), glm::uint8(j)) ? 0 : 1;
  681. const std::clock_t EndTime = std::clock();
  682. std::printf("bitfieldInterleave_u8vec2 Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  683. return Error;
  684. }
  685. static int perf_old_u8vec4(std::vector<glm::uint32>& Result, glm::size_t divider)
  686. {
  687. int Error = 0;
  688. glm::size_t count = 256 / divider;
  689. const std::clock_t BeginTime = std::clock();
  690. for(glm::size_t k = 0; k < 100; ++k)
  691. for(glm::size_t j = 0; j < count; ++j)
  692. for(glm::size_t i = 0; i < count; ++i)
  693. Error += Result[j * count + i] == glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j)) ? 0 : 1;
  694. const std::clock_t EndTime = std::clock();
  695. std::printf("glm::bitfieldInterleave<u8vec4> Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  696. return Error;
  697. }
  698. /*
  699. static int perf_new_u8vec4(std::vector<glm::uint32>& Result, glm::size_t divider)
  700. {
  701. int Error = 0;
  702. glm::size_t count = 256 / divider;
  703. const std::clock_t BeginTime = std::clock();
  704. for(glm::size_t k = 0; k < 10000; ++k)
  705. for(glm::size_t j = 0; j < count; ++j)
  706. for(glm::size_t i = 0; i < count; ++i)
  707. Error += Result[j * count + i] == bitfieldInterleave_u8vec4(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j)) ? 0 : 1;
  708. const std::clock_t EndTime = std::clock();
  709. std::printf("bitfieldInterleave_u8vec4 Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  710. return Error;
  711. }
  712. */
  713. static int perf_old_u16vec2(std::vector<glm::uint32>& Result, glm::size_t divider)
  714. {
  715. int Error = 0;
  716. glm::size_t count = 256 / divider;
  717. const std::clock_t BeginTime = std::clock();
  718. for(glm::size_t k = 0; k < 100; ++k)
  719. for(glm::size_t j = 0; j < count; ++j)
  720. for(glm::size_t i = 0; i < count; ++i)
  721. Error += Result[j * count + i] == glm::bitfieldInterleave(glm::uint16(i), glm::uint16(j)) ? 0 : 1;
  722. const std::clock_t EndTime = std::clock();
  723. std::printf("glm::bitfieldInterleave<u16vec2> Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  724. return Error;
  725. }
  726. /*
  727. static int perf_new_u16vec2(std::vector<glm::uint32>& Result, glm::size_t divider)
  728. {
  729. int Error = 0;
  730. glm::size_t count = 256 / divider;
  731. const std::clock_t BeginTime = std::clock();
  732. for(glm::size_t k = 0; k < 10000; ++k)
  733. for(glm::size_t j = 0; j < count; ++j)
  734. for(glm::size_t i = 0; i < count; ++i)
  735. Error += Result[j * count + i] == bitfieldInterleave_u16vec2(glm::uint16(i), glm::uint16(j)) ? 0 : 1;
  736. const std::clock_t EndTime = std::clock();
  737. std::printf("bitfieldInterleave_u16vec2 Time %d clocks\n", static_cast<int>(EndTime - BeginTime));
  738. return Error;
  739. }
  740. */
  741. static int perf(glm::size_t divider)
  742. {
  743. int Error = 0;
  744. glm::size_t count = 256 / divider;
  745. std::printf("bitfieldInterleave perf: init\r");
  746. std::vector<glm::uint16> Result_u8vec2(count * count, 0);
  747. for(glm::size_t j = 0; j < count; ++j)
  748. for(glm::size_t i = 0; i < count; ++i)
  749. Result_u8vec2[j * count + i] = glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j));
  750. Error += perf_old_u8vec2(Result_u8vec2, divider);
  751. Error += perf_new_u8vec2(Result_u8vec2, divider);
  752. std::vector<glm::uint32> Result_u8vec4(count * count, 0);
  753. for(glm::size_t j = 0; j < count; ++j)
  754. for(glm::size_t i = 0; i < count; ++i)
  755. Result_u8vec4[j * count + i] = glm::bitfieldInterleave(glm::uint8(i), glm::uint8(j), glm::uint8(i), glm::uint8(j));
  756. Error += perf_old_u8vec4(Result_u8vec4, divider);
  757. //Error += perf_new_u8vec4(Result_u8vec4, divider);
  758. std::vector<glm::uint32> Result_u16vec2(count * count, 0);
  759. for(glm::size_t j = 0; j < count; ++j)
  760. for(glm::size_t i = 0; i < count; ++i)
  761. Result_u16vec2[j * count + i] = glm::bitfieldInterleave(glm::uint16(i), glm::uint16(j));
  762. Error += perf_old_u16vec2(Result_u16vec2, divider);
  763. //Error += perf_new_u16vec2(Result_u16vec2, divider);
  764. std::printf("bitfieldInterleave perf: %d Errors\n", Error);
  765. return Error;
  766. }
  767. }//namespace bitfieldInterleave5
  768. static int test_bitfieldRotateRight()
  769. {
  770. glm::ivec4 const A = glm::bitfieldRotateRight(glm::ivec4(2), 1);
  771. glm::ivec4 const B = glm::ivec4(2) >> 1;
  772. return A == B;
  773. }
  774. static int test_bitfieldRotateLeft()
  775. {
  776. glm::ivec4 const A = glm::bitfieldRotateLeft(glm::ivec4(2), 1);
  777. glm::ivec4 const B = glm::ivec4(2) << 1;
  778. return A == B;
  779. }
  780. int main()
  781. {
  782. int Error = 0;
  783. Error += ::bitfieldInterleave::test();
  784. Error += ::bitfieldInterleave3::test();
  785. Error += ::bitfieldInterleave4::test();
  786. // Tests for a faster and to reserve bitfieldInterleave
  787. Error += ::bitfieldInterleave5::test(64);
  788. Error += ::bitfieldInterleave5::perf(64);
  789. Error += ::bitfieldInterleave::perf();
  790. Error += ::mask::test();
  791. Error += ::mask::perf();
  792. Error += test_bitfieldRotateRight();
  793. Error += test_bitfieldRotateLeft();
  794. return Error;
  795. }