| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- /*
- * Copyright 2010-2025 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bx/blob/master/LICENSE
- */
- #include <bx/allocator.h>
- #include <bx/rng.h>
- #include <bx/simd_t.h>
- #include <bx/timer.h>
- #include <stdio.h>
- static void flushCache()
- {
- static uint32_t length = 1 << 26;
- static uint8_t* input = new uint8_t[length];
- static uint8_t* output = new uint8_t[length];
- bx::memCopy(output, input, length);
- }
- typedef bx::simd128_t (*SimdRsqrtFn)(bx::simd128_t _a);
- template<SimdRsqrtFn simdRsqrtFn>
- void simd_rsqrt_bench(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVertices)
- {
- for (uint32_t ii = 0, num = _numVertices/4; ii < num; ++ii)
- {
- bx::simd128_t* ptr = &_src[ii*4];
- bx::simd128_t tmp0 = bx::simd_ld(ptr + 0);
- bx::simd128_t tmp1 = bx::simd_ld(ptr + 1);
- bx::simd128_t tmp2 = bx::simd_ld(ptr + 2);
- bx::simd128_t tmp3 = bx::simd_ld(ptr + 3);
- bx::simd128_t rsqrt0 = simdRsqrtFn(tmp0);
- bx::simd128_t rsqrt1 = simdRsqrtFn(tmp1);
- bx::simd128_t rsqrt2 = simdRsqrtFn(tmp2);
- bx::simd128_t rsqrt3 = simdRsqrtFn(tmp3);
- ptr = &_dst[ii*4];
- bx::simd_st(ptr + 0, rsqrt0);
- bx::simd_st(ptr + 1, rsqrt1);
- bx::simd_st(ptr + 2, rsqrt2);
- bx::simd_st(ptr + 3, rsqrt3);
- }
- }
- void simd_bench_pass(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVertices)
- {
- const uint32_t numIterations = 10;
- {
- int64_t elapsed = 0;
- for (uint32_t test = 0; test < numIterations; ++test)
- {
- flushCache();
- elapsed += -bx::getHPCounter();
- simd_rsqrt_bench<bx::simd_rsqrt_est>(_dst, _src, _numVertices);
- elapsed += bx::getHPCounter();
- }
- printf(" simd_rsqrt_est: %15f\n", double(elapsed) );
- }
- {
- int64_t elapsed = 0;
- for (uint32_t test = 0; test < numIterations; ++test)
- {
- flushCache();
- elapsed += -bx::getHPCounter();
- simd_rsqrt_bench<bx::simd_rsqrt_nr>(_dst, _src, _numVertices);
- elapsed += bx::getHPCounter();
- }
- printf(" simd_rsqrt_nr: %15f\n", double(elapsed) );
- }
- {
- int64_t elapsed = 0;
- for (uint32_t test = 0; test < numIterations; ++test)
- {
- flushCache();
- elapsed += -bx::getHPCounter();
- simd_rsqrt_bench<bx::simd_rsqrt_carmack>(_dst, _src, _numVertices);
- elapsed += bx::getHPCounter();
- }
- printf("simd_rsqrt_carmack: %15f\n", double(elapsed) );
- }
- {
- int64_t elapsed = 0;
- for (uint32_t test = 0; test < numIterations; ++test)
- {
- flushCache();
- elapsed += -bx::getHPCounter();
- simd_rsqrt_bench<bx::simd_rsqrt>(_dst, _src, _numVertices);
- elapsed += bx::getHPCounter();
- }
- printf(" simd_rsqrt: %15f\n", double(elapsed) );
- }
- }
- void simd_bench()
- {
- bx::DefaultAllocator allocator;
- bx::RngMwc rng;
- const uint32_t numVertices = 1024*1024;
- uint8_t* data = (uint8_t*)bx::alloc(&allocator, 2*numVertices*sizeof(bx::simd128_t), 16);
- bx::simd128_t* src = (bx::simd128_t*)data;
- bx::simd128_t* dst = &src[numVertices];
- printf("\n -- positive & negative --\n");
- for (uint32_t ii = 0; ii < numVertices; ++ii)
- {
- float* ptr = (float*)&src[ii];
- bx::store(ptr, bx::randUnitSphere(&rng) );
- ptr[3] = 1.0f;
- }
- simd_bench_pass(dst, src, numVertices);
- printf("\n -- positive only --\n");
- for (uint32_t ii = 0; ii < numVertices; ++ii)
- {
- float* ptr = (float*)&src[ii];
- ptr[0] = bx::abs(ptr[0]);
- ptr[1] = bx::abs(ptr[1]);
- ptr[2] = bx::abs(ptr[2]);
- ptr[3] = bx::abs(ptr[3]);
- }
- simd_bench_pass(dst, src, numVertices);
- bx::free(&allocator, data, 16);
- }
|