// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

#include <benchmark/benchmark.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

#if defined(__i386__) || defined(__x86_64__)
#  if defined(__clang__) || defined(__INTEL_COMPILER)
#    include <emmintrin.h>  // for _mm_pause
#  endif
#endif

#include "opentelemetry/common/macros.h"
#include "opentelemetry/common/spin_lock_mutex.h"

namespace
{
using opentelemetry::common::SpinLockMutex;

constexpr int TightLoopLocks = 10000;

// Runs a thrash-test where we spin up N threads, each of which will
// attempt to lock-mutate-unlock a total of `TightLoopLocks` times.
//
// lock: A lambda denoting how to lock.  Accepts a reference to `SpinLockType`.
// unlock: A lambda denoting how to unlock.  Accepts a reference to `SpinLockType`.
template <typename SpinLockType, typename LockF, typename UnlockF>
inline void SpinThrash(benchmark::State &s, SpinLockType &spinlock, LockF lock, UnlockF unlock)
{
  auto num_threads = s.range(0);
  // Value we will increment, fighting over a spinlock.
  // The contention is meant to be brief, as close to our expected
  // use cases of "updating pointers" or "pushing an event onto a buffer".
  std::int64_t value OPENTELEMETRY_MAYBE_UNUSED = 0;

  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  // Timing loop
  for (auto _ : s)
  {
    for (auto i = 0; i < num_threads; i++)
    {
      threads.emplace_back([&] {
        // Increment value once each time the lock is acquired.  Spin a few times
        // to ensure maximum thread contention.
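        // Keeping the critical section to a single increment means the
        // benchmark measures acquisition/release overhead under contention
        // rather than time spent holding the lock.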
        for (int i = 0; i < TightLoopLocks; i++)
        {
          lock(spinlock);
          value++;
          unlock(spinlock);
        }
      });
    }
    // Join threads
    for (auto &thread : threads)
      thread.join();
    threads.clear();
  }
}

// Benchmark of full spin-lock implementation.
static void BM_SpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash(s, spinlock, [](SpinLockMutex &m) { m.lock(); }, [](SpinLockMutex &m) { m.unlock(); });
}

// Naive `while (!try_lock()) {}` implementation of lock.
static void BM_NaiveSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash(
      s, spinlock,
      [](SpinLockMutex &m) {
        while (!m.try_lock())
        {
          // Left this comment to keep the same format on old and new versions of clang-format
        }
      },
      [](SpinLockMutex &m) { m.unlock(); });
}

// Simple `while (!try_lock()) { yield-processor }` implementation of lock.
static void BM_ProcYieldSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash<SpinLockMutex>(
      s, spinlock,
      [](SpinLockMutex &m) {
        while (!m.try_lock())
        {
#if defined(_MSC_VER)
          YieldProcessor();
#elif defined(__i386__) || defined(__x86_64__)
#  if defined(__clang__) || defined(__INTEL_COMPILER)
          _mm_pause();
#  else
          __builtin_ia32_pause();
#  endif
#elif defined(__armel__) || defined(__ARMEL__)
          asm volatile("nop" ::: "memory");
#elif defined(__arm__) || defined(__aarch64__)  // arm big endian / arm64
          __asm__ __volatile__("yield" ::: "memory");
#endif
        }
      },
      [](SpinLockMutex &m) { m.unlock(); });
}

// SpinLock thrashing with thread::yield() every N spins.
static void BM_ThreadYieldSpinLockThrashing(benchmark::State &s)
{
#if defined(__cpp_lib_atomic_value_initialization) && \
    __cpp_lib_atomic_value_initialization >= 201911L
  std::atomic_flag mutex{};
#else
  std::atomic_flag mutex = ATOMIC_FLAG_INIT;
#endif
  SpinThrash<std::atomic_flag>(
      s, mutex,
      [](std::atomic_flag &l) {
        uint32_t try_count = 0;
        while (l.test_and_set(std::memory_order_acq_rel))
        {
          ++try_count;
          // Yield to the OS scheduler on every 32nd failed attempt; spin otherwise.
          if (try_count % 32 == 0)
          {
            std::this_thread::yield();
          }
        }
      },
      [](std::atomic_flag &l) { l.clear(std::memory_order_release); });
}

// Run each benchmark with thread counts from 1 up to the hardware concurrency,
// doubling the count at each step, and measure both process CPU time and
// wall-clock time spent thrashing.
BENCHMARK(BM_SpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ProcYieldSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_NaiveSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ThreadYieldSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);

}  // namespace

BENCHMARK_MAIN();
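// A minimal sketch of running this benchmark from the command line. The binary
// name below is an assumption (it depends on the build target); the flags are
// standard Google Benchmark options:
//
//   ./spinlock_benchmark \
//       --benchmark_filter='BM_.*SpinLockThrashing.*' \
//       --benchmark_repetitions=5 \
//       --benchmark_report_aggregates_only=true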