// spinlock_benchmark.cc
  1. // Copyright The OpenTelemetry Authors
  2. // SPDX-License-Identifier: Apache-2.0
  3. #include <benchmark/benchmark.h>
  4. #include <algorithm>
  5. #include <atomic>
  6. #include <cstdint>
  7. #include <thread>
  8. #include <vector>
  9. #if defined(__i386__) || defined(__x86_64__)
  10. # if defined(__clang__) || defined(__INTEL_COMPILER)
  11. # include <emmintrin.h> // for _mm_pause
  12. # endif
  13. #endif
  14. #include "opentelemetry/common/macros.h"
  15. #include "opentelemetry/common/spin_lock_mutex.h"
  16. namespace
  17. {
using opentelemetry::common::SpinLockMutex;
// Number of lock/increment/unlock cycles each worker thread performs per
// benchmark iteration (see SpinThrash below).
constexpr int TightLoopLocks = 10000;
  20. // Runs a thrash-test where we spin up N threads, each of which will
  21. // attempt to lock-mutate-unlock a total of `TightLoopLocks` times.
  22. //
  23. // lock: A lambda denoting how to lock. Accepts a reference to `SpinLockType`.
  24. // unlock: A lambda denoting how to unlock. Accepts a reference to `SpinLockType`.
  25. template <typename SpinLockType, typename LockF, typename UnlockF>
  26. inline void SpinThrash(benchmark::State &s, SpinLockType &spinlock, LockF lock, UnlockF unlock)
  27. {
  28. auto num_threads = s.range(0);
  29. // Value we will increment, fighting over a spinlock.
  30. // The contention is meant to be brief, as close to our expected
  31. // use cases of "updating pointers" or "pushing an event onto a buffer".
  32. std::int64_t value OPENTELEMETRY_MAYBE_UNUSED = 0;
  33. std::vector<std::thread> threads;
  34. threads.reserve(num_threads);
  35. // Timing loop
  36. for (auto _ : s)
  37. {
  38. for (auto i = 0; i < num_threads; i++)
  39. {
  40. threads.emplace_back([&] {
  41. // Increment value once each time the lock is acquired. Spin a few times
  42. // to ensure maximum thread contention.
  43. for (int i = 0; i < TightLoopLocks; i++)
  44. {
  45. lock(spinlock);
  46. value++;
  47. unlock(spinlock);
  48. }
  49. });
  50. }
  51. // Join threads
  52. for (auto &thread : threads)
  53. thread.join();
  54. threads.clear();
  55. }
  56. }
  57. // Benchmark of full spin-lock implementation.
  58. static void BM_SpinLockThrashing(benchmark::State &s)
  59. {
  60. SpinLockMutex spinlock;
  61. SpinThrash(s, spinlock, [](SpinLockMutex &m) { m.lock(); }, [](SpinLockMutex &m) { m.unlock(); });
  62. }
  63. // Naive `while(try_lock()) {}` implementation of lock.
  64. static void BM_NaiveSpinLockThrashing(benchmark::State &s)
  65. {
  66. SpinLockMutex spinlock;
  67. SpinThrash(
  68. s, spinlock,
  69. [](SpinLockMutex &m) {
  70. while (!m.try_lock())
  71. {
  72. // Left this comment to keep the same format on old and new versions of clang-format
  73. }
  74. },
  75. [](SpinLockMutex &m) { m.unlock(); });
  76. }
// Simple `while(try_lock()) { yield-processor }`
// Acquisition busy-waits on try_lock, but issues a processor-level spin-wait
// hint on each failed attempt, dispatched per compiler/architecture below.
static void BM_ProcYieldSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash<SpinLockMutex>(
      s, spinlock,
      [](SpinLockMutex &m) {
        while (!m.try_lock())
        {
          // One pause/yield hint per failed try_lock.
#if defined(_MSC_VER)
          // MSVC/Windows macro form of the pause hint.
          YieldProcessor();
#elif defined(__i386__) || defined(__x86_64__)
# if defined(__clang__) || defined(__INTEL_COMPILER)
          // x86 PAUSE via the SSE2 intrinsic (see <emmintrin.h> include above).
          _mm_pause();
# else
          // GCC spells the same x86 PAUSE as a builtin.
          __builtin_ia32_pause();
# endif
#elif defined(__armel__) || defined(__ARMEL__)
          // Little-endian ARM: plain nop with a compiler barrier — presumably
          // because `yield` is unavailable on these targets; confirm.
          asm volatile("nop" ::: "memory");
#elif defined(__arm__) || defined(__aarch64__) // arm big endian / arm64
          // ARM/AArch64 YIELD hint with a compiler barrier.
          __asm__ __volatile__("yield" ::: "memory");
#endif
          // Other architectures: no hint — falls back to a tight spin.
        }
      },
      [](SpinLockMutex &m) { m.unlock(); });
}
  103. // SpinLock thrashing with thread::yield().
  104. static void BM_ThreadYieldSpinLockThrashing(benchmark::State &s)
  105. {
  106. #if defined(__cpp_lib_atomic_value_initialization) && \
  107. __cpp_lib_atomic_value_initialization >= 201911L
  108. std::atomic_flag mutex{};
  109. #else
  110. std::atomic_flag mutex = ATOMIC_FLAG_INIT;
  111. #endif
  112. SpinThrash<std::atomic_flag>(
  113. s, mutex,
  114. [](std::atomic_flag &l) {
  115. uint32_t try_count = 0;
  116. while (l.test_and_set(std::memory_order_acq_rel))
  117. {
  118. ++try_count;
  119. if (try_count % 32)
  120. {
  121. std::this_thread::yield();
  122. }
  123. }
  124. std::this_thread::yield();
  125. },
  126. [](std::atomic_flag &l) { l.clear(std::memory_order_release); });
  127. }
  128. // Run the benchmarks at 2x thread/core and measure the amount of time to thrash around.
  129. BENCHMARK(BM_SpinLockThrashing)
  130. ->RangeMultiplier(2)
  131. ->Range(1, std::thread::hardware_concurrency())
  132. ->MeasureProcessCPUTime()
  133. ->UseRealTime()
  134. ->Unit(benchmark::kMillisecond);
  135. BENCHMARK(BM_ProcYieldSpinLockThrashing)
  136. ->RangeMultiplier(2)
  137. ->Range(1, std::thread::hardware_concurrency())
  138. ->MeasureProcessCPUTime()
  139. ->UseRealTime()
  140. ->Unit(benchmark::kMillisecond);
  141. BENCHMARK(BM_NaiveSpinLockThrashing)
  142. ->RangeMultiplier(2)
  143. ->Range(1, std::thread::hardware_concurrency())
  144. ->MeasureProcessCPUTime()
  145. ->UseRealTime()
  146. ->Unit(benchmark::kMillisecond);
  147. BENCHMARK(BM_ThreadYieldSpinLockThrashing)
  148. ->RangeMultiplier(2)
  149. ->Range(1, std::thread::hardware_concurrency())
  150. ->MeasureProcessCPUTime()
  151. ->UseRealTime()
  152. ->Unit(benchmark::kMillisecond);
  153. } // namespace
// Expands to the google-benchmark runner's main(), which runs every
// benchmark registered above.
BENCHMARK_MAIN();