/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/relcache.h"
/*
 * Buffer state is a single 32-bit variable where the following data is
 * combined:
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows some operations to be performed without
 * locking the buffer header, by modifying them together in a CAS loop
 * (see the illustrative sketch after the flag definitions below).
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
/*
 * Flags for buffer descriptors
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED				(1U << 22)	/* buffer header is locked */
#define BM_DIRTY				(1U << 23)	/* data needs writing */
#define BM_VALID				(1U << 24)	/* data is valid */
#define BM_TAG_VALID			(1U << 25)	/* tag is assigned */
#define BM_IO_IN_PROGRESS		(1U << 26)	/* read or write in progress */
#define BM_IO_ERROR				(1U << 27)	/* previous I/O failed */
#define BM_JUST_DIRTIED			(1U << 28)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER		(1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED	(1U << 30)	/* must write for checkpoint */
#define BM_PERMANENT			(1U << 31)	/* permanent buffer (not unlogged,
											 * or init fork) */
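
/*
 * Illustrative sketch, not part of the original header: how the combined
 * state word lets a pin be taken with a CAS loop instead of acquiring the
 * buffer header spinlock. The function name is hypothetical; the real
 * logic lives in PinBuffer() in bufmgr.c and also adjusts the usage count.
 */
static inline void
example_pin_via_cas(pg_atomic_uint32 *state)
{
	uint32		old_state = pg_atomic_read_u32(state);

	for (;;)
	{
		/* wait out any concurrent header-lock holder before trying */
		while (old_state & BM_LOCKED)
			old_state = pg_atomic_read_u32(state);

		/* on success the refcount is incremented atomically */
		if (pg_atomic_compare_exchange_u32(state, &old_state,
										   old_state + BUF_REFCOUNT_ONE))
			break;
		/* on failure old_state was refreshed by the CAS; retry */
	}
}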
/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm. A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT	5
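
/*
 * Illustrative sketch, not part of the original header: one clock-sweep
 * visit over a buffer's (already locked) state word, showing how the usage
 * count throttles eviction. The function name is hypothetical; the real
 * loop is in StrategyGetBuffer() in freelist.c.
 */
static inline bool
example_clock_sweep_visit(uint32 *buf_state)
{
	if (BUF_STATE_GET_REFCOUNT(*buf_state) != 0)
		return false;			/* pinned, so not evictable */

	if (BUF_STATE_GET_USAGECOUNT(*buf_state) != 0)
	{
		*buf_state -= BUF_USAGECOUNT_ONE;	/* spare it for another lap */
		return false;
	}

	return true;				/* unpinned and unused: eviction candidate */
}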
/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries. It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel). The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	RelFileNode rnode;			/* physical relation identifier */
	ForkNumber	forkNum;
	BlockNumber blockNum;		/* blknum relative to begin of reln */
} BufferTag;

#define CLEAR_BUFFERTAG(a) \
( \
	(a).rnode.spcNode = InvalidOid, \
	(a).rnode.dbNode = InvalidOid, \
	(a).rnode.relNode = InvalidOid, \
	(a).forkNum = InvalidForkNumber, \
	(a).blockNum = InvalidBlockNumber \
)

#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
( \
	(a).rnode = (xx_rnode), \
	(a).forkNum = (xx_forkNum), \
	(a).blockNum = (xx_blockNum) \
)

#define BUFFERTAGS_EQUAL(a,b) \
( \
	RelFileNodeEquals((a).rnode, (b).rnode) && \
	(a).blockNum == (b).blockNum && \
	(a).forkNum == (b).forkNum \
)
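
/*
 * Illustrative sketch, not part of the original header: probing whether a
 * descriptor's tag matches a given block. The function name is
 * hypothetical; real callers build a tag with INIT_BUFFERTAG and compare
 * with BUFFERTAGS_EQUAL exactly like this.
 */
static inline bool
example_tag_matches(const BufferTag *tag, RelFileNode rnode,
					ForkNumber forkNum, BlockNumber blockNum)
{
	BufferTag	probe;

	INIT_BUFFERTAG(probe, rnode, forkNum, blockNum);
	return BUFFERTAGS_EQUAL(probe, *tag);
}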
/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 * (An illustrative lookup sketch follows the buf_table.c declarations near
 * the end of this file.)
 */
#define BufTableHashPartition(hashcode) \
	((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
		BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
/*
 * BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * the tag, state or wait_backend_pgprocno fields. In general, the buffer
 * header lock is a spinlock which is combined with the flags, refcount and
 * usagecount into a single atomic variable. This layout allows us to do some
 * operations in a single atomic operation, without actually acquiring and
 * releasing the spinlock; for instance, increasing or decreasing the
 * refcount. buf_id never changes after initialization, so it does not need
 * locking. freeNext is protected by the buffer_strategy_lock, not the buffer
 * header lock. The LWLock can take care of itself. The buffer header lock
 * is *not* used to control access to the data in the buffer!
 *
 * It's assumed that nobody changes the state field while the buffer header
 * lock is held. Thus the buffer header lock holder can do complex updates of
 * the state variable in a single write, simultaneously with lock release
 * (cleaning the BM_LOCKED flag). On the other hand, updating the state
 * without holding the buffer header lock is restricted to CAS, which ensures
 * that the BM_LOCKED flag is not set. Atomic increment/decrement, OR/AND
 * etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned. Hence, a backend may need to wait for all other pins
 * to go away. This is signaled by storing its own pgprocno into
 * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either. To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()); see the sketch after the struct
 * definition below.
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members. Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 *
 * Per-buffer I/O condition variables are currently kept outside this struct
 * in a separate array. They could be moved in here and still fit within
 * that limit on common systems, but for now that is not done.
 */
typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
	int			buf_id;			/* buffer's index number (from 0) */

	/* state of the tag, containing flags, refcount and usagecount */
	pg_atomic_uint32 state;

	int			wait_backend_pgprocno;	/* backend of pin-count waiter */
	int			freeNext;		/* link in freelist chain */
	LWLock		content_lock;	/* to lock access to buffer contents */
} BufferDesc;
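
/*
 * Illustrative sketch, not part of the original header: bumping the usage
 * count of a *local* buffer. As noted above, local buffers are
 * backend-private, so the plain (non-atomic) read/write accessors suffice.
 * The function name is hypothetical; the real code is in localbuf.c.
 */
static inline void
example_bump_local_usagecount(BufferDesc *local_desc)
{
	uint32		buf_state = pg_atomic_read_u32(&local_desc->state);

	if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
	{
		buf_state += BUF_USAGECOUNT_ONE;
		pg_atomic_unlocked_write_u32(&local_desc->state, buf_state);
	}
}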
/*
 * Concurrent access to buffer headers has proven to be more efficient if
 * they're cache line aligned. So we force the start of the BufferDescriptors
 * array to be on a cache line boundary and force the elements to be cache
 * line sized.
 *
 * XXX: As this primarily matters in highly concurrent workloads, which these
 * days are probably all 64-bit, and the space wastage would be a bit more
 * noticeable on 32-bit systems, we don't force the stride to be cache line
 * sized on those. If somebody does actual performance testing, we can
 * reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned - as there's
 * no concurrent access to those it's unlikely to be beneficial.
 *
 * We use a 64-byte cache line size here, because that's the most common
 * size. Making it bigger would be a waste of memory. Even if running on a
 * platform with either 32 or 128 byte line sizes, it's good to align to
 * boundaries and avoid false sharing.
 */
#define BUFFERDESC_PAD_TO_SIZE	(SIZEOF_VOID_P == 8 ? 64 : 1)

typedef union BufferDescPadded
{
	BufferDesc	bufferdesc;
	char		pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;

#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

#define BufferDescriptorGetIOCV(bdesc) \
	(&(BufferIOCVArray[(bdesc)->buf_id]).cv)
#define BufferDescriptorGetContentLock(bdesc) \
	((LWLock*) (&(bdesc)->content_lock))

extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST	(-1)
#define FREENEXT_NOT_IN_LIST	(-2)

/*
 * Functions for acquiring/releasing a shared buffer header's spinlock. Do
 * not apply these to local buffers!
 */
extern uint32 LockBufHdr(BufferDesc *desc);
#define UnlockBufHdr(desc, s) \
	do { \
		pg_write_barrier(); \
		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
	} while (0)
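
/*
 * Illustrative sketch, not part of the original header: the
 * lock/modify/unlock pattern for the header spinlock, here setting the
 * dirty flags. The function name is hypothetical; compare MarkBufferDirty()
 * in bufmgr.c, which does more bookkeeping than this.
 */
static inline void
example_set_dirty(BufferDesc *desc)
{
	uint32		buf_state;

	buf_state = LockBufHdr(desc);	/* returns the state, with BM_LOCKED set */
	buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
	UnlockBufHdr(desc, buf_state);	/* one write stores the flags and clears
									 * BM_LOCKED */
}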
/*
 * The PendingWriteback and WritebackContext structures are used to keep
 * information about pending flush requests to be issued to the OS.
 */
typedef struct PendingWriteback
{
	/* could store different types of pending flushes here */
	BufferTag	tag;
} PendingWriteback;

/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
	/* pointer to the max number of writeback requests to coalesce */
	int		   *max_pending;

	/* current number of pending writeback requests */
	int			nr_pending;

	/* pending requests */
	PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;
/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;

/* in localbuf.c */
extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;

/* in bufmgr.c */

/*
 * Structure to sort buffers per file during checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
typedef struct CkptSortItem
{
	Oid			tsId;
	Oid			relNode;
	ForkNumber	forkNum;
	BlockNumber blockNum;
	int			buf_id;
} CkptSortItem;

extern PGDLLIMPORT CkptSortItem *CkptBufferIds;

/*
 * Internal buffer management routines
 */

/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
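
/*
 * Illustrative sketch, not part of the original header: how a writer queues
 * flush requests through a WritebackContext. The function and variable
 * names are hypothetical; in the server, max_pending typically points at a
 * GUC such as backend_flush_after.
 */
static inline void
example_writeback_usage(WritebackContext *wb, int *max_pending,
						BufferTag *just_written_tag)
{
	/* bind the context to its coalescing limit (done once, up front) */
	WritebackContextInit(wb, max_pending);

	/*
	 * Queue the block; once *max_pending requests have accumulated, the
	 * context coalesces neighboring requests and issues them to the OS.
	 */
	ScheduleBufferTagForWriteback(wb, just_written_tag);
}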
/* freelist.c */
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
									 uint32 *buf_state);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
								 BufferDesc *buf);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);

extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
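
/*
 * Illustrative sketch, not part of the original header: the partitioned
 * lookup protocol described with the BufMappingPartitionLock() macro above.
 * The function name is hypothetical; compare BufferAlloc() in bufmgr.c.
 */
static inline int
example_lookup_buffer(BufferTag *tag)
{
	uint32		hashcode = BufTableHashCode(tag);
	LWLock	   *partitionLock = BufMappingPartitionLock(hashcode);
	int			buf_id;

	/* a shared lock suffices for a read-only probe of the mapping table */
	LWLockAcquire(partitionLock, LW_SHARED);
	buf_id = BufTableLookup(tag, hashcode);
	LWLockRelease(partitionLock);

	return buf_id;				/* -1 if the block is not in any buffer */
}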
/* localbuf.c */
extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
												ForkNumber forkNum,
												BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
									BlockNumber blockNum, bool *foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
										BlockNumber firstDelBlock);
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif							/* BUFMGR_INTERNALS_H */