- /*-------------------------------------------------------------------------
- *
- * arch-x86.h
- * Atomic operations considerations specific to intel x86
- *
- * Note that we actually require a 486 upwards because the 386 doesn't have
- * support for xadd and cmpxchg. Given that the 386 isn't supported anywhere
- * anymore, that's not much of a restriction, luckily.
- *
- * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- * NOTES:
- *
- * src/include/port/atomics/arch-x86.h
- *
- *-------------------------------------------------------------------------
- */
- /*
- * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
- * or stores to be reordered with other stores, but a load can be performed
- * before an earlier store has become visible, i.e. store/load reordering is
- * allowed.
- *
- * Technically, some x86-ish chips support uncached memory access and/or
- * special instructions that are weakly ordered. In those cases we'd need
- * the read and write barriers to be lfence and sfence. But since we don't
- * do those things, a compiler barrier should be enough.
- *
- * "lock; addl" has worked for longer than "mfence". It's also rumored to be
- * faster in many scenarios.
- */
- #if defined(__GNUC__) || defined(__INTEL_COMPILER)
- #if defined(__i386__) || defined(__i386)
- #define pg_memory_barrier_impl() \
- __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
- #elif defined(__x86_64__)
- #define pg_memory_barrier_impl() \
- __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
- #endif
- #endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
- #define pg_read_barrier_impl() pg_compiler_barrier_impl()
- #define pg_write_barrier_impl() pg_compiler_barrier_impl()
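- /*
- * Illustrative sketch (not part of this header): because the only reordering
- * x86 performs is delaying a store past a later load, only the full barrier
- * needs a real instruction; the read/write barriers reduce to compiler
- * barriers. Assuming the public wrappers from atomics.h and the hypothetical
- * variables "data"/"flag", a producer/consumer handoff only needs the cheap
- * barriers:
- *
- *    data = compute();        // producer
- *    pg_write_barrier();      // compiler barrier on x86
- *    flag = true;
- *
- *    if (flag)                // consumer
- *    {
- *        pg_read_barrier();   // compiler barrier on x86
- *        use(data);
- *    }
- *
- * A store followed by a load of a different location (e.g. a Dekker-style
- * handshake) is the case that needs pg_memory_barrier() and hence the
- * "lock; addl" above.
- */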
- /*
- * Provide implementation for atomics using inline assembly on x86 gcc. It's
- * nice to support older gcc's and the compare/exchange implementation here is
- * actually more efficient than the __sync variant.
- */
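- /*
- * For comparison, a rough sketch of the __sync-based compare/exchange that
- * the comment above alludes to (not used here): __sync_val_compare_and_swap()
- * only returns the old value, so success has to be re-derived with an extra
- * compare, whereas the inline asm below reads the zero flag that cmpxchg
- * already set:
- *
- *    uint32 current = __sync_val_compare_and_swap(&ptr->value,
- *                                                 *expected, newval);
- *    bool   ret = (current == *expected);
- *    *expected = current;
- *    return ret;
- */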
- #if defined(HAVE_ATOMICS)
- #if defined(__GNUC__) || defined(__INTEL_COMPILER)
- #define PG_HAVE_ATOMIC_FLAG_SUPPORT
- typedef struct pg_atomic_flag
- {
- volatile char value;
- } pg_atomic_flag;
- #define PG_HAVE_ATOMIC_U32_SUPPORT
- typedef struct pg_atomic_uint32
- {
- volatile uint32 value;
- } pg_atomic_uint32;
- /*
- * It's too complicated to write inline asm for 64bit types on 32bit and the
- * 486 can't do it anyway.
- */
- #ifdef __x86_64__
- #define PG_HAVE_ATOMIC_U64_SUPPORT
- typedef struct pg_atomic_uint64
- {
- /* alignment guaranteed due to being on a 64bit platform */
- volatile uint64 value;
- } pg_atomic_uint64;
- #endif /* __x86_64__ */
- #endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
- #endif /* defined(HAVE_ATOMICS) */
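- /*
- * Usage sketch (not part of this header): these structs are not meant to be
- * accessed directly; callers embed them in shared memory and go through the
- * wrappers declared in atomics.h. With a hypothetical shared struct:
- *
- *    typedef struct SharedState { pg_atomic_uint32 nworkers; } SharedState;
- *
- *    pg_atomic_init_u32(&state->nworkers, 0);
- *    pg_atomic_fetch_add_u32(&state->nworkers, 1);
- */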
- #if !defined(PG_HAVE_SPIN_DELAY)
- /*
- * This sequence is equivalent to the PAUSE instruction ("rep" is
- * ignored by old IA32 processors if the following instruction is
- * not a string operation); the IA-32 Architecture Software
- * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
- * PAUSE in the inner loop of a spin lock is necessary for good
- * performance:
- *
- * The PAUSE instruction improves the performance of IA-32
- * processors supporting Hyper-Threading Technology when
- * executing spin-wait loops and other routines where one
- * thread is accessing a shared lock or semaphore in a tight
- * polling loop. When executing a spin-wait loop, the
- * processor can suffer a severe performance penalty when
- * exiting the loop because it detects a possible memory order
- * violation and flushes the core processor's pipeline. The
- * PAUSE instruction provides a hint to the processor that the
- * code sequence is a spin-wait loop. The processor uses this
- * hint to avoid the memory order violation and prevent the
- * pipeline flush. In addition, the PAUSE instruction
- * de-pipelines the spin-wait loop to prevent it from
- * consuming execution resources excessively.
- */
- #if defined(__GNUC__) || defined(__INTEL_COMPILER)
- #define PG_HAVE_SPIN_DELAY
- static __inline__ void
- pg_spin_delay_impl(void)
- {
- __asm__ __volatile__(" rep; nop \n");
- }
- #elif defined(_MSC_VER) && defined(__x86_64__)
- #define PG_HAVE_SPIN_DELAY
- static __forceinline void
- pg_spin_delay_impl(void)
- {
- _mm_pause();
- }
- #elif defined(_MSC_VER)
- #define PG_HAVE_SPIN_DELAY
- static __forceinline void
- pg_spin_delay_impl(void)
- {
- /* See comment for gcc code. Same code, MASM syntax */
- __asm rep nop;
- }
- #endif
- #endif /* !defined(PG_HAVE_SPIN_DELAY) */
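- /*
- * Illustrative sketch (not part of this header): pg_spin_delay_impl() is
- * meant for the retry path of a spin loop, roughly like the following
- * (hypothetical lock_is_taken(); PostgreSQL's real spin-wait logic lives in
- * the spinlock code, s_lock.h/s_lock.c):
- *
- *    while (lock_is_taken(lock))
- *        pg_spin_delay_impl();    // PAUSE: yield pipeline resources
- */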
- #if defined(HAVE_ATOMICS)
- #if defined(__GNUC__) || defined(__INTEL_COMPILER)
- #define PG_HAVE_ATOMIC_TEST_SET_FLAG
- static inline bool
- pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
- {
- register char _res = 1;
- __asm__ __volatile__(
- " lock \n"
- " xchgb %0,%1 \n"
- : "+q"(_res), "+m"(ptr->value)
- :
- : "memory");
- return _res == 0;
- }
- #define PG_HAVE_ATOMIC_CLEAR_FLAG
- static inline void
- pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
- {
- /*
- * On a TSO architecture like x86 it's sufficient to use a compiler
- * barrier to achieve release semantics.
- */
- __asm__ __volatile__("" ::: "memory");
- ptr->value = 0;
- }
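- /*
- * Usage sketch (not part of this header): together these two primitives form
- * a minimal spinlock. xchgb with a memory operand is implicitly locked and
- * therefore a full barrier, so the acquire side needs no extra fence; the
- * release side only needs the compiler barrier above because x86 never
- * reorders a store with earlier loads or stores. Hypothetical "lk"; real
- * callers use the pg_atomic_test_set_flag()/pg_atomic_clear_flag() wrappers
- * from atomics.h.
- *
- *    while (!pg_atomic_test_set_flag_impl(&lk))   // acquire
- *        pg_spin_delay_impl();
- *    ... critical section ...
- *    pg_atomic_clear_flag_impl(&lk);              // release
- */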
- #define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
- static inline bool
- pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
- uint32 *expected, uint32 newval)
- {
- char ret;
- /*
- * Perform cmpxchg and use the zero flag, which it sets when the values are
- * equal, to report success.
- */
- __asm__ __volatile__(
- " lock \n"
- " cmpxchgl %4,%5 \n"
- " setz %2 \n"
- : "=a" (*expected), "=m"(ptr->value), "=q" (ret)
- : "a" (*expected), "r" (newval), "m"(ptr->value)
- : "memory", "cc");
- return (bool) ret;
- }
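- /*
- * Illustrative sketch (not part of this header): on failure the asm above
- * stores the value it found into *expected, so compare_exchange composes
- * into the usual retry loop. This is roughly how the generic fallback layer
- * (atomics/generic.h) builds operations with no native implementation here,
- * e.g. a fetch-or ("mask" is a hypothetical argument):
- *
- *    uint32 old = ptr->value;
- *    while (!pg_atomic_compare_exchange_u32_impl(ptr, &old, old | mask))
- *        ;                       // old now holds the current value; retry
- *    return old;
- */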
- #define PG_HAVE_ATOMIC_FETCH_ADD_U32
- static inline uint32
- pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
- {
- uint32 res;
- __asm__ __volatile__(
- " lock \n"
- " xaddl %0,%1 \n"
- : "=q"(res), "=m"(ptr->value)
- : "0" (add_), "m"(ptr->value)
- : "memory", "cc");
- return res;
- }
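- /*
- * Sketch (not part of this header) of how the remaining arithmetic ops fall
- * out of xadd without extra asm; the generic fallback layer derives them
- * roughly as:
- *
- *    fetch_sub(ptr, n)  =>  pg_atomic_fetch_add_u32_impl(ptr, -n)
- *    add_fetch(ptr, n)  =>  pg_atomic_fetch_add_u32_impl(ptr, n) + n
- */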
- #ifdef __x86_64__
- #define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
- static inline bool
- pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
- uint64 *expected, uint64 newval)
- {
- char ret;
- /*
- * Perform cmpxchg and use the zero flag, which it sets when the values are
- * equal, to report success.
- */
- __asm__ __volatile__(
- " lock \n"
- " cmpxchgq %4,%5 \n"
- " setz %2 \n"
- : "=a" (*expected), "=m"(ptr->value), "=q" (ret)
- : "a" (*expected), "r" (newval), "m"(ptr->value)
- : "memory", "cc");
- return (bool) ret;
- }
- #define PG_HAVE_ATOMIC_FETCH_ADD_U64
- static inline uint64
- pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
- {
- uint64 res;
- __asm__ __volatile__(
- " lock \n"
- " xaddq %0,%1 \n"
- : "=q"(res), "=m"(ptr->value)
- : "0" (add_), "m"(ptr->value)
- : "memory", "cc");
- return res;
- }
- #endif /* __x86_64__ */
- #endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
- /*
- * 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
- * since at least the 586, as well as on all x86-64 CPUs.
- */
- #if defined(__i586__) || defined(__i686__) || /* gcc i586+ */ \
- (defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
- defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
- #define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
- #endif /* 8 byte single-copy atomicity */
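- /*
- * Sketch of what this buys us (not part of this header): with single-copy
- * atomicity an aligned 8-byte access cannot tear, so the generic fallback
- * layer can implement the 64bit read/write as a plain load/store instead of
- * a locked cmpxchg loop, roughly:
- *
- *    uint64 v = ptr->value;      // one 8-byte load, never two halves
- *    ptr->value = v;             // likewise a single 8-byte store
- */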
- #endif /* HAVE_ATOMICS */