/*-------------------------------------------------------------------------
 *
 * arch-x86.h
 *    Atomic operations considerations specific to Intel x86
 *
 * Note that we actually require a 486 or newer because the 386 doesn't have
 * support for xadd and cmpxchg. Given that the 386 isn't supported anywhere
 * anymore, that's luckily not much of a restriction.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES:
 *
 * src/include/port/atomics/arch-x86.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
 * or stores to be reordered with other stores, but a load may be reordered
 * ahead of an earlier store to a different location.
 *
 * Technically, some x86-ish chips support uncached memory access and/or
 * special instructions that are weakly ordered. In those cases we'd need
 * the read and write barriers to be lfence and sfence. But since we don't
 * do those things, a compiler barrier should be enough.
 *
 * "lock; addl" has worked for longer than "mfence". It's also rumored to be
 * faster in many scenarios.
 */

#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#if defined(__i386__) || defined(__i386)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#elif defined(__x86_64__)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#endif
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#define pg_read_barrier_impl()  pg_compiler_barrier_impl()
#define pg_write_barrier_impl() pg_compiler_barrier_impl()
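
/*
 * Usage sketch for the barriers above: the classic message-passing pairing,
 * assuming the generic pg_write_barrier()/pg_read_barrier() wrappers expand
 * to the *_impl() macros defined here; "shared" and use() are hypothetical.
 *
 *    // writer
 *    shared->data = 42;
 *    pg_write_barrier();     // order the data store before the flag store
 *    shared->ready = true;
 *
 *    // reader
 *    if (shared->ready)
 *    {
 *        pg_read_barrier();  // order the flag load before the data load
 *        use(shared->data);  // guaranteed to observe 42
 *    }
 *
 * On x86 both barriers are pure compiler barriers; the hardware already
 * preserves load/load and store/store order.
 */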

/*
 * Provide implementation for atomics using inline assembly on x86 gcc. It's
 * nice to support older gcc versions and the compare/exchange implementation
 * here is actually more efficient than the __sync variant.
 */

#if defined(HAVE_ATOMICS)

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
    volatile char value;
} pg_atomic_flag;

#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
    volatile uint32 value;
} pg_atomic_uint32;

/*
 * It's too complicated to write inline asm for 64bit types on 32bit and the
 * 486 can't do it anyway.
 */
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
    /* alignment guaranteed due to being on a 64bit platform */
    volatile uint64 value;
} pg_atomic_uint64;
#endif /* __x86_64__ */

#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#endif /* defined(HAVE_ATOMICS) */
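
/*
 * Usage sketch for the types above: callers are not expected to touch ->value
 * directly but to go through the generic wrappers declared in port/atomics.h
 * (pg_atomic_init_u32() and friends), which dispatch to the *_impl functions
 * further down.  Variable names and values here are hypothetical:
 *
 *    static pg_atomic_uint32 nrequests;   // typically lives in shared memory
 *
 *    pg_atomic_init_u32(&nrequests, 0);
 *    (void) pg_atomic_fetch_add_u32(&nrequests, 1);
 *    elog(LOG, "requests so far: %u", pg_atomic_read_u32(&nrequests));
 */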

#if !defined(PG_HAVE_SPIN_DELAY)
/*
 * This sequence is equivalent to the PAUSE instruction ("rep" is
 * ignored by old IA32 processors if the following instruction is
 * not a string operation); the IA-32 Architecture Software
 * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
 * PAUSE in the inner loop of a spin lock is necessary for good
 * performance:
 *
 *     The PAUSE instruction improves the performance of IA-32
 *     processors supporting Hyper-Threading Technology when
 *     executing spin-wait loops and other routines where one
 *     thread is accessing a shared lock or semaphore in a tight
 *     polling loop. When executing a spin-wait loop, the
 *     processor can suffer a severe performance penalty when
 *     exiting the loop because it detects a possible memory order
 *     violation and flushes the core processor's pipeline. The
 *     PAUSE instruction provides a hint to the processor that the
 *     code sequence is a spin-wait loop. The processor uses this
 *     hint to avoid the memory order violation and prevent the
 *     pipeline flush. In addition, the PAUSE instruction
 *     de-pipelines the spin-wait loop to prevent it from
 *     consuming execution resources excessively.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_SPIN_DELAY
static __inline__ void
pg_spin_delay_impl(void)
{
    __asm__ __volatile__(" rep; nop \n");
}
#elif defined(_MSC_VER) && defined(__x86_64__)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    _mm_pause();
}
#elif defined(_MSC_VER)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    /* See comment for gcc code. Same code, MASM syntax */
    __asm rep nop;
}
#endif
#endif /* !defined(PG_HAVE_SPIN_DELAY) */
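
/*
 * Usage sketch for the spin-delay primitive above: it belongs in the body of
 * a tight polling loop, e.g. while waiting for a flag another backend sets
 * (see also the test-and-set flag operations below).  This assumes a generic
 * pg_spin_delay() wrapper that maps onto pg_spin_delay_impl(); "shared->ready"
 * is hypothetical:
 *
 *    while (!shared->ready)
 *        pg_spin_delay();  // PAUSE: cheaper spinning, no pipeline flush on exit
 */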

#if defined(HAVE_ATOMICS)

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
    register char _res = 1;

    __asm__ __volatile__(
        "   lock            \n"
        "   xchgb   %0,%1   \n"
        : "+q"(_res), "+m"(ptr->value)
        :
        : "memory");
    return _res == 0;
}

#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
    /*
     * On a TSO architecture like x86 it's sufficient to use a compiler
     * barrier to achieve release semantics.
     */
    __asm__ __volatile__("" ::: "memory");
    ptr->value = 0;
}
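
/*
 * Together the two flag operations above form a TAS-style spinlock.  A
 * minimal sketch, assuming the generic wrappers from port/atomics.h
 * (pg_atomic_init_flag(), pg_atomic_test_set_flag(), pg_atomic_clear_flag())
 * plus pg_spin_delay(), and a hypothetical flag in shared memory:
 *
 *    pg_atomic_init_flag(&shared->mutex);        // once, at setup time
 *
 *    while (!pg_atomic_test_set_flag(&shared->mutex))
 *        pg_spin_delay();                        // xchgb saw a 1: keep spinning
 *    ... critical section ...
 *    pg_atomic_clear_flag(&shared->mutex);       // compiler barrier + store of 0
 */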

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                    uint32 *expected, uint32 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag, which it implicitly sets when
     * the values are equal, to detect success.
     */
    __asm__ __volatile__(
        "   lock                \n"
        "   cmpxchgl    %4,%5   \n"
        "   setz        %2      \n"
        : "=a" (*expected), "=m"(ptr->value), "=q" (ret)
        : "a" (*expected), "r" (newval), "m"(ptr->value)
        : "memory", "cc");
    return (bool) ret;
}
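
/*
 * Typical consumer pattern for compare-exchange: a loop that derives the new
 * value from the old one.  A minimal sketch using the generic
 * pg_atomic_read_u32()/pg_atomic_compare_exchange_u32() wrappers; "v" and
 * FLAG_BIT are hypothetical:
 *
 *    uint32 old = pg_atomic_read_u32(&v);
 *    while (!pg_atomic_compare_exchange_u32(&v, &old, old | FLAG_BIT))
 *        ;  // on failure "old" now holds the current value, so just retry
 */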

#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
    uint32 res;

    __asm__ __volatile__(
        "   lock            \n"
        "   xaddl   %0,%1   \n"
        : "=q"(res), "=m"(ptr->value)
        : "0" (add_), "m"(ptr->value)
        : "memory", "cc");
    return res;
}
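
/*
 * Usage sketch: xadd hands back the value *before* the addition, which is
 * what makes fetch-add useful for handing out unique slots; subtraction is
 * just a negative add_.  A hypothetical example via the generic
 * pg_atomic_fetch_add_u32() wrapper, with "nassigned" made up:
 *
 *    uint32 myslot = pg_atomic_fetch_add_u32(&nassigned, 1);  // old value
 *    ...
 *    (void) pg_atomic_fetch_add_u32(&nassigned, -1);          // give it back
 */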

#ifdef __x86_64__

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
                                    uint64 *expected, uint64 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag, which it implicitly sets when
     * the values are equal, to detect success.
     */
    __asm__ __volatile__(
        "   lock                \n"
        "   cmpxchgq    %4,%5   \n"
        "   setz        %2      \n"
        : "=a" (*expected), "=m"(ptr->value), "=q" (ret)
        : "a" (*expected), "r" (newval), "m"(ptr->value)
        : "memory", "cc");
    return (bool) ret;
}

#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
    uint64 res;

    __asm__ __volatile__(
        "   lock            \n"
        "   xaddq   %0,%1   \n"
        : "=q"(res), "=m"(ptr->value)
        : "0" (add_), "m"(ptr->value)
        : "memory", "cc");
    return res;
}

#endif /* __x86_64__ */

#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

/*
 * 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
 * since at least the 586, as well as on all x86-64 cpus.
 */
#if defined(__i586__) || defined(__i686__) || /* gcc i586+ */ \
    (defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
    defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
#endif /* 8 byte single-copy atomicity */

#endif /* HAVE_ATOMICS */
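
/*
 * What PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY buys: with single-copy atomicity a
 * 64 bit value can be read with a plain load; without it the generic layer is
 * assumed to fall back to a compare-exchange (exchanging with 0 merely to
 * obtain the old value).  A rough, hypothetical sketch of that difference:
 *
 *    #ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
 *        val = ptr->value;                            // one movq, already atomic
 *    #else
 *        uint64 old = 0;
 *        pg_atomic_compare_exchange_u64_impl(ptr, &old, 0);
 *        val = old;                                   // CAS reports the current value
 *    #endif
 */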