/*-------------------------------------------------------------------------
 *
 * nbtxlog.h
 *	  header file for postgres btree xlog routines
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/nbtxlog.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NBTXLOG_H
#define NBTXLOG_H

#include "access/transam.h"
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
  19. /*
  20. * XLOG records for btree operations
  21. *
  22. * XLOG allows to store some information in high 4 bits of log
  23. * record xl_info field
  24. */
  25. #define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */
  26. #define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
  27. #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
  28. #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
  29. #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
  30. #define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */
  31. #define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */
  32. #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
  33. #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
  34. #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
  35. #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
  36. #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */
  37. #define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during
  38. * vacuum */
  39. #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from
  40. * FSM */
  41. #define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the
  42. * metapage */
  43. /*
  44. * All that we need to regenerate the meta-data page
  45. */
  46. typedef struct xl_btree_metadata
  47. {
  48. uint32 version;
  49. BlockNumber root;
  50. uint32 level;
  51. BlockNumber fastroot;
  52. uint32 fastlevel;
  53. uint32 last_cleanup_num_delpages;
  54. bool allequalimage;
  55. } xl_btree_metadata;
  56. /*
  57. * This is what we need to know about simple (without split) insert.
  58. *
  59. * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
  60. * INSERT_POST. Note that INSERT_META and INSERT_UPPER implies it's not a
  61. * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
  62. * page.
  63. *
  64. * Backup Blk 0: original page
  65. * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
  66. * Backup Blk 2: xl_btree_metadata, if INSERT_META
  67. *
  68. * Note: The new tuple is actually the "original" new item in the posting
  69. * list split insert case (i.e. the INSERT_POST case). A split offset for
  70. * the posting list is logged before the original new item. Recovery needs
  71. * both, since it must do an in-place update of the existing posting list
  72. * that was split as an extra step. Also, recovery generates a "final"
  73. * newitem. See _bt_swap_posting() for details on posting list splits.
  74. */
  75. typedef struct xl_btree_insert
  76. {
  77. OffsetNumber offnum;
  78. /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
  79. /* NEW TUPLE ALWAYS FOLLOWS AT THE END */
  80. } xl_btree_insert;
  81. #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
  82. /*
  83. * On insert with split, we save all the items going into the right sibling
  84. * so that we can restore it completely from the log record. This way takes
  85. * less xlog space than the normal approach, because if we did it standardly,
  86. * XLogInsert would almost always think the right page is new and store its
  87. * whole page image. The left page, however, is handled in the normal
  88. * incremental-update fashion.
  89. *
  90. * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
  91. * There are two variants to indicate whether the inserted tuple went into the
  92. * left or right split page (and thus, whether the new item is stored or not).
  93. * We always log the left page high key because suffix truncation can generate
  94. * a new leaf high key using user-defined code. This is also necessary on
  95. * internal pages, since the firstright item that the left page's high key was
  96. * based on will have been truncated to zero attributes in the right page (the
  97. * separator key is unavailable from the right page).
  98. *
  99. * Backup Blk 0: original page / new left page
  100. *
  101. * The left page's data portion contains the new item, if it's the _L variant.
  102. * _R variant split records generally do not have a newitem (_R variant leaf
  103. * page split records that must deal with a posting list split will include an
  104. * explicit newitem, though it is never used on the right page -- it is
  105. * actually an orignewitem needed to update existing posting list). The new
  106. * high key of the left/original page appears last of all (and must always be
  107. * present).
  108. *
  109. * Page split records that need the REDO routine to deal with a posting list
  110. * split directly will have an explicit newitem, which is actually an
  111. * orignewitem (the newitem as it was before the posting list split, not
  112. * after). A posting list split always has a newitem that comes immediately
  113. * after the posting list being split (which would have overlapped with
  114. * orignewitem prior to split). Usually REDO must deal with posting list
  115. * splits with an _L variant page split record, and usually both the new
  116. * posting list and the final newitem go on the left page (the existing
  117. * posting list will be inserted instead of the old, and the final newitem
  118. * will be inserted next to that). However, _R variant split records will
  119. * include an orignewitem when the split point for the page happens to have a
  120. * lastleft tuple that is also the posting list being split (leaving newitem
  121. * as the page split's firstright tuple). The existence of this corner case
  122. * does not change the basic fact about newitem/orignewitem for the REDO
  123. * routine: it is always state used for the left page alone. (This is why the
  124. * record's postingoff field isn't a reliable indicator of whether or not a
  125. * posting list split occurred during the page split; a non-zero value merely
  126. * indicates that the REDO routine must reconstruct a new posting list tuple
  127. * that is needed for the left page.)
  128. *
  129. * This posting list split handling is equivalent to the xl_btree_insert REDO
  130. * routine's INSERT_POST handling. While the details are more complicated
  131. * here, the concept and goals are exactly the same. See _bt_swap_posting()
  132. * for details on posting list splits.
  133. *
  134. * Backup Blk 1: new right page
  135. *
  136. * The right page's data portion contains the right page's tuples in the form
  137. * used by _bt_restore_page. This includes the new item, if it's the _R
  138. * variant. The right page's tuples also include the right page's high key
  139. * with either variant (moved from the left/original page during the split),
  140. * unless the split happened to be of the rightmost page on its level, where
  141. * there is no high key for new right page.
  142. *
  143. * Backup Blk 2: next block (orig page's rightlink), if any
  144. * Backup Blk 3: child's left sibling, if non-leaf split
  145. */
  146. typedef struct xl_btree_split
  147. {
  148. uint32 level; /* tree level of page being split */
  149. OffsetNumber firstrightoff; /* first origpage item on rightpage */
  150. OffsetNumber newitemoff; /* new item's offset */
  151. uint16 postingoff; /* offset inside orig posting tuple */
  152. } xl_btree_split;
  153. #define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
  154. /*
  155. * When page is deduplicated, consecutive groups of tuples with equal keys are
  156. * merged together into posting list tuples.
  157. *
  158. * The WAL record represents a deduplication pass for a leaf page. An array
  159. * of BTDedupInterval structs follows.
  160. */
  161. typedef struct xl_btree_dedup
  162. {
  163. uint16 nintervals;
  164. /* DEDUPLICATION INTERVALS FOLLOW */
  165. } xl_btree_dedup;
  166. #define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
  167. /*
  168. * This is what we need to know about page reuse within btree. This record
  169. * only exists to generate a conflict point for Hot Standby.
  170. *
  171. * Note that we must include a RelFileNode in the record because we don't
  172. * actually register the buffer with the record.
  173. */
  174. typedef struct xl_btree_reuse_page
  175. {
  176. RelFileNode node;
  177. BlockNumber block;
  178. FullTransactionId latestRemovedFullXid;
  179. } xl_btree_reuse_page;
  180. #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
  181. /*
  182. * xl_btree_vacuum and xl_btree_delete records describe deletion of index
  183. * tuples on a leaf page. The former variant is used by VACUUM, while the
  184. * latter variant is used by the ad-hoc deletions that sometimes take place
  185. * when btinsert() is called.
  186. *
  187. * The records are very similar. The only difference is that xl_btree_delete
  188. * has to include a latestRemovedXid field to generate recovery conflicts.
  189. * (VACUUM operations can just rely on earlier conflicts generated during
  190. * pruning of the table whose TIDs the to-be-deleted index tuples point to.
  191. * There are also small differences between each REDO routine that we don't go
  192. * into here.)
  193. *
  194. * xl_btree_vacuum and xl_btree_delete both represent deletion of any number
  195. * of index tuples on a single leaf page using page offset numbers. Both also
  196. * support "updates" of index tuples, which is how deletes of a subset of TIDs
  197. * contained in an existing posting list tuple are implemented.
  198. *
  199. * Updated posting list tuples are represented using xl_btree_update metadata.
  200. * The REDO routines each use the xl_btree_update entries (plus each
  201. * corresponding original index tuple from the target leaf page) to generate
  202. * the final updated tuple.
  203. *
  204. * Updates are only used when there will be some remaining TIDs left by the
  205. * REDO routine. Otherwise the posting list tuple just gets deleted outright.
  206. */
  207. typedef struct xl_btree_vacuum
  208. {
  209. uint16 ndeleted;
  210. uint16 nupdated;
  211. /* DELETED TARGET OFFSET NUMBERS FOLLOW */
  212. /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
  213. /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
  214. } xl_btree_vacuum;
  215. #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
  216. typedef struct xl_btree_delete
  217. {
  218. TransactionId latestRemovedXid;
  219. uint16 ndeleted;
  220. uint16 nupdated;
  221. /* DELETED TARGET OFFSET NUMBERS FOLLOW */
  222. /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
  223. /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
  224. } xl_btree_delete;
  225. #define SizeOfBtreeDelete (offsetof(xl_btree_delete, nupdated) + sizeof(uint16))
  226. /*
  227. * The offsets that appear in xl_btree_update metadata are offsets into the
  228. * original posting list from tuple, not page offset numbers. These are
  229. * 0-based. The page offset number for the original posting list tuple comes
  230. * from the main xl_btree_vacuum/xl_btree_delete record.
  231. */
  232. typedef struct xl_btree_update
  233. {
  234. uint16 ndeletedtids;
  235. /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
  236. } xl_btree_update;
  237. #define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
  238. /*
  239. * This is what we need to know about marking an empty subtree for deletion.
  240. * The target identifies the tuple removed from the parent page (note that we
  241. * remove this tuple's downlink and the *following* tuple's key). Note that
  242. * the leaf page is empty, so we don't need to store its content --- it is
  243. * just reinitialized during recovery using the rest of the fields.
  244. *
  245. * Backup Blk 0: leaf block
  246. * Backup Blk 1: top parent
  247. */
  248. typedef struct xl_btree_mark_page_halfdead
  249. {
  250. OffsetNumber poffset; /* deleted tuple id in parent page */
  251. /* information needed to recreate the leaf page: */
  252. BlockNumber leafblk; /* leaf block ultimately being deleted */
  253. BlockNumber leftblk; /* leaf block's left sibling, if any */
  254. BlockNumber rightblk; /* leaf block's right sibling */
  255. BlockNumber topparent; /* topmost internal page in the subtree */
  256. } xl_btree_mark_page_halfdead;
  257. #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
  258. /*
  259. * This is what we need to know about deletion of a btree page. Note that we
  260. * only leave behind a small amount of bookkeeping information in deleted
  261. * pages (deleted pages must be kept around as tombstones for a while). It is
  262. * convenient for the REDO routine to regenerate its target page from scratch.
  263. * This is why WAL record describes certain details that are actually directly
  264. * available from the target page.
  265. *
  266. * Backup Blk 0: target block being deleted
  267. * Backup Blk 1: target block's left sibling, if any
  268. * Backup Blk 2: target block's right sibling
  269. * Backup Blk 3: leaf block (if different from target)
  270. * Backup Blk 4: metapage (if rightsib becomes new fast root)
  271. */
  272. typedef struct xl_btree_unlink_page
  273. {
  274. BlockNumber leftsib; /* target block's left sibling, if any */
  275. BlockNumber rightsib; /* target block's right sibling */
  276. uint32 level; /* target block's level */
  277. FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */
  278. /*
  279. * Information needed to recreate a half-dead leaf page with correct
  280. * topparent link. The fields are only used when deletion operation's
  281. * target page is an internal page. REDO routine creates half-dead page
  282. * from scratch to keep things simple (this is the same convenient
  283. * approach used for the target page itself).
  284. */
  285. BlockNumber leafleftsib;
  286. BlockNumber leafrightsib;
  287. BlockNumber leaftopparent; /* next child down in the subtree */
  288. /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
  289. } xl_btree_unlink_page;
  290. #define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
  291. /*
  292. * New root log record. There are zero tuples if this is to establish an
  293. * empty root, or two if it is the result of splitting an old root.
  294. *
  295. * Note that although this implies rewriting the metadata page, we don't need
  296. * an xl_btree_metadata record --- the rootblk and level are sufficient.
  297. *
  298. * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
  299. * Backup Blk 1: left child (if splitting an old root)
  300. * Backup Blk 2: metapage
  301. */
  302. typedef struct xl_btree_newroot
  303. {
  304. BlockNumber rootblk; /* location of new root (redundant with blk 0) */
  305. uint32 level; /* its tree level */
  306. } xl_btree_newroot;
  307. #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
  308. /*
  309. * prototypes for functions in nbtxlog.c
  310. */
  311. extern void btree_redo(XLogReaderState *record);
  312. extern void btree_desc(StringInfo buf, XLogReaderState *record);
  313. extern const char *btree_identify(uint8 info);
  314. extern void btree_xlog_startup(void);
  315. extern void btree_xlog_cleanup(void);
  316. extern void btree_mask(char *pagedata, BlockNumber blkno);
  317. #endif /* NBTXLOG_H */