simd_bench.cpp 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  /*
   * Copyright 2010-2025 Branimir Karadzic. All rights reserved.
   * License: https://github.com/bkaradzic/bx/blob/master/LICENSE
   */
  5. #include <bx/allocator.h>
  6. #include <bx/rng.h>
  7. #include <bx/simd_t.h>
  8. #include <bx/timer.h>
  9. #include <stdio.h>
  10. static void flushCache()
  11. {
  12. static uint32_t length = 1 << 26;
  13. static uint8_t* input = new uint8_t[length];
  14. static uint8_t* output = new uint8_t[length];
  15. bx::memCopy(output, input, length);
  16. }
  17. typedef bx::simd128_t (*SimdRsqrtFn)(bx::simd128_t _a);
  18. template<SimdRsqrtFn simdRsqrtFn>
  19. void simd_rsqrt_bench(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVertices)
  20. {
  21. for (uint32_t ii = 0, num = _numVertices/4; ii < num; ++ii)
  22. {
  23. bx::simd128_t* ptr = &_src[ii*4];
  24. bx::simd128_t tmp0 = bx::simd_ld(ptr + 0);
  25. bx::simd128_t tmp1 = bx::simd_ld(ptr + 1);
  26. bx::simd128_t tmp2 = bx::simd_ld(ptr + 2);
  27. bx::simd128_t tmp3 = bx::simd_ld(ptr + 3);
  28. bx::simd128_t rsqrt0 = simdRsqrtFn(tmp0);
  29. bx::simd128_t rsqrt1 = simdRsqrtFn(tmp1);
  30. bx::simd128_t rsqrt2 = simdRsqrtFn(tmp2);
  31. bx::simd128_t rsqrt3 = simdRsqrtFn(tmp3);
  32. ptr = &_dst[ii*4];
  33. bx::simd_st(ptr + 0, rsqrt0);
  34. bx::simd_st(ptr + 1, rsqrt1);
  35. bx::simd_st(ptr + 2, rsqrt2);
  36. bx::simd_st(ptr + 3, rsqrt3);
  37. }
  38. }
  39. void simd_bench_pass(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVertices)
  40. {
  41. const uint32_t numIterations = 10;
  42. {
  43. int64_t elapsed = 0;
  44. for (uint32_t test = 0; test < numIterations; ++test)
  45. {
  46. flushCache();
  47. elapsed += -bx::getHPCounter();
  48. simd_rsqrt_bench<bx::simd_rsqrt_est>(_dst, _src, _numVertices);
  49. elapsed += bx::getHPCounter();
  50. }
  51. printf(" simd_rsqrt_est: %15f\n", double(elapsed) );
  52. }
  53. {
  54. int64_t elapsed = 0;
  55. for (uint32_t test = 0; test < numIterations; ++test)
  56. {
  57. flushCache();
  58. elapsed += -bx::getHPCounter();
  59. simd_rsqrt_bench<bx::simd_rsqrt_nr>(_dst, _src, _numVertices);
  60. elapsed += bx::getHPCounter();
  61. }
  62. printf(" simd_rsqrt_nr: %15f\n", double(elapsed) );
  63. }
  64. {
  65. int64_t elapsed = 0;
  66. for (uint32_t test = 0; test < numIterations; ++test)
  67. {
  68. flushCache();
  69. elapsed += -bx::getHPCounter();
  70. simd_rsqrt_bench<bx::simd_rsqrt_carmack>(_dst, _src, _numVertices);
  71. elapsed += bx::getHPCounter();
  72. }
  73. printf("simd_rsqrt_carmack: %15f\n", double(elapsed) );
  74. }
  75. {
  76. int64_t elapsed = 0;
  77. for (uint32_t test = 0; test < numIterations; ++test)
  78. {
  79. flushCache();
  80. elapsed += -bx::getHPCounter();
  81. simd_rsqrt_bench<bx::simd_rsqrt>(_dst, _src, _numVertices);
  82. elapsed += bx::getHPCounter();
  83. }
  84. printf(" simd_rsqrt: %15f\n", double(elapsed) );
  85. }
  86. }
  87. void simd_bench()
  88. {
  89. bx::DefaultAllocator allocator;
  90. bx::RngMwc rng;
  91. const uint32_t numVertices = 1024*1024;
  92. uint8_t* data = (uint8_t*)bx::alloc(&allocator, 2*numVertices*sizeof(bx::simd128_t), 16);
  93. bx::simd128_t* src = (bx::simd128_t*)data;
  94. bx::simd128_t* dst = &src[numVertices];
  95. printf("\n -- positive & negative --\n");
  96. for (uint32_t ii = 0; ii < numVertices; ++ii)
  97. {
  98. float* ptr = (float*)&src[ii];
  99. bx::store(ptr, bx::randUnitSphere(&rng) );
  100. ptr[3] = 1.0f;
  101. }
  102. simd_bench_pass(dst, src, numVertices);
  103. printf("\n -- positive only --\n");
  104. for (uint32_t ii = 0; ii < numVertices; ++ii)
  105. {
  106. float* ptr = (float*)&src[ii];
  107. ptr[0] = bx::abs(ptr[0]);
  108. ptr[1] = bx::abs(ptr[1]);
  109. ptr[2] = bx::abs(ptr[2]);
  110. ptr[3] = bx::abs(ptr[3]);
  111. }
  112. simd_bench_pass(dst, src, numVertices);
  113. bx::free(&allocator, data, 16);
  114. }