2
0

predicate_internals.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. /*-------------------------------------------------------------------------
  2. *
  3. * predicate_internals.h
  4. * POSTGRES internal predicate locking definitions.
  5. *
  6. *
  7. * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
  8. * Portions Copyright (c) 1994, Regents of the University of California
  9. *
  10. * src/include/storage/predicate_internals.h
  11. *
  12. *-------------------------------------------------------------------------
  13. */
  14. #ifndef PREDICATE_INTERNALS_H
  15. #define PREDICATE_INTERNALS_H
  16. #include "storage/lock.h"
  17. #include "storage/lwlock.h"
  18. /*
  19. * Commit number.
  20. */
  21. typedef uint64 SerCommitSeqNo;
  22. /*
  23. * Reserved commit sequence numbers:
  24. * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
  25. * used as a SerCommitSeqNo, even an invalid one
  26. * - InvalidSerCommitSeqNo is used to indicate a transaction that
  27. * hasn't committed yet, so use a number greater than all valid
  28. * ones to make comparison do the expected thing
  29. * - RecoverySerCommitSeqNo is used to refer to transactions that
  30. * happened before a crash/recovery, since we restart the sequence
  31. * at that point. It's earlier than all normal sequence numbers,
  32. * and is only used by recovered prepared transactions
  33. */
  34. #define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX)
  35. #define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1)
  36. #define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2)
  37. /*
  38. * The SERIALIZABLEXACT struct contains information needed for each
  39. * serializable database transaction to support SSI techniques.
  40. *
  41. * A home-grown list is maintained in shared memory to manage these.
  42. * An entry is used when the serializable transaction acquires a snapshot.
  43. * Unless the transaction is rolled back, this entry must generally remain
  44. * until all concurrent transactions have completed. (There are special
  45. * optimizations for READ ONLY transactions which often allow them to be
  46. * cleaned up earlier.) A transaction which is rolled back is cleaned up
  47. * as soon as possible.
  48. *
  49. * Eligibility for cleanup of committed transactions is generally determined
  50. * by comparing the transaction's finishedBefore field to
  51. * SxactGlobalXmin.
  52. */
  53. typedef struct SERIALIZABLEXACT
  54. {
  55. VirtualTransactionId vxid; /* The executing process always has one of
  56. * these. */
  57. /*
  58. * We use two numbers to track the order that transactions commit. Before
  59. * commit, a transaction is marked as prepared, and prepareSeqNo is set.
  60. * Shortly after commit, it's marked as committed, and commitSeqNo is set.
  61. * This doesn't give a strict commit order, but these two values together
  62. * are good enough for us, as we can always err on the safe side and
  63. * assume that there's a conflict, if we can't be sure of the exact
  64. * ordering of two commits.
  65. *
  66. * Note that a transaction is marked as prepared for a short period during
  67. * commit processing, even if two-phase commit is not used. But with
  68. * two-phase commit, a transaction can stay in prepared state for some
  69. * time.
  70. */
  71. SerCommitSeqNo prepareSeqNo;
  72. SerCommitSeqNo commitSeqNo;
  73. /* these values are not both interesting at the same time */
  74. union
  75. {
  76. SerCommitSeqNo earliestOutConflictCommit; /* when committed with
  77. * conflict out */
  78. SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or
  79. * no conflict out */
  80. } SeqNo;
  81. SHM_QUEUE outConflicts; /* list of write transactions whose data we
  82. * couldn't read. */
  83. SHM_QUEUE inConflicts; /* list of read transactions which couldn't
  84. * see our write. */
  85. SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */
  86. SHM_QUEUE finishedLink; /* list link in
  87. * FinishedSerializableTransactions */
  88. /*
  89. * perXactPredicateListLock is only used in parallel queries: it protects
  90. * this SERIALIZABLEXACT's predicate lock list against other workers of
  91. * the same session.
  92. */
  93. LWLock perXactPredicateListLock;
  94. /*
  95. * for r/o transactions: list of concurrent r/w transactions that we could
  96. * potentially have conflicts with, and vice versa for r/w transactions
  97. */
  98. SHM_QUEUE possibleUnsafeConflicts;
  99. TransactionId topXid; /* top level xid for the transaction, if one
  100. * exists; else invalid */
  101. TransactionId finishedBefore; /* invalid means still running; else the
  102. * struct expires when no serializable
  103. * xids are before this. */
  104. TransactionId xmin; /* the transaction's snapshot xmin */
  105. uint32 flags; /* OR'd combination of values defined below */
  106. int pid; /* pid of associated process */
  107. int pgprocno; /* pgprocno of associated process */
  108. } SERIALIZABLEXACT;
  109. #define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */
  110. #define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */
  111. #define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */
  112. #define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */
  113. /*
  114. * The following flag actually means that the flagged transaction has a
  115. * conflict out *to a transaction which committed ahead of it*. It's hard
  116. * to get that into a name of a reasonable length.
  117. */
  118. #define SXACT_FLAG_CONFLICT_OUT 0x00000010
  119. #define SXACT_FLAG_READ_ONLY 0x00000020
  120. #define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040
  121. #define SXACT_FLAG_RO_SAFE 0x00000080
  122. #define SXACT_FLAG_RO_UNSAFE 0x00000100
  123. #define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200
  124. #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
  125. /*
  126. * The following flag means the transaction has been partially released
  127. * already, but is being preserved because parallel workers might have a
  128. * reference to it. It'll be recycled by the leader at end-of-transaction.
  129. */
  130. #define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800
  131. /*
  132. * The following types are used to provide an ad hoc list for holding
  133. * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to
  134. * access these by key -- there are direct pointers to these objects where
  135. * needed. If a shared memory list is created, these types can probably be
  136. * eliminated in favor of using the general solution.
  137. */
  138. typedef struct PredXactListElementData
  139. {
  140. SHM_QUEUE link;
  141. SERIALIZABLEXACT sxact;
  142. } PredXactListElementData;
  143. typedef struct PredXactListElementData *PredXactListElement;
  144. #define PredXactListElementDataSize \
  145. ((Size)MAXALIGN(sizeof(PredXactListElementData)))
  146. typedef struct PredXactListData
  147. {
  148. SHM_QUEUE availableList;
  149. SHM_QUEUE activeList;
  150. /*
  151. * These global variables are maintained when registering and cleaning up
  152. * serializable transactions. They must be global across all backends,
  153. * but are not needed outside the predicate.c source file. Protected by
  154. * SerializableXactHashLock.
  155. */
  156. TransactionId SxactGlobalXmin; /* global xmin for active serializable
  157. * transactions */
  158. int SxactGlobalXminCount; /* how many active serializable
  159. * transactions have this xmin */
  160. int WritableSxactCount; /* how many non-read-only serializable
  161. * transactions are active */
  162. SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically
  163. * increasing number for commits
  164. * of serializable transactions */
  165. /* Protected by SerializableXactHashLock. */
  166. SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and
  167. * inConflicts for committed
  168. * transactions through this seq
  169. * no */
  170. /* Protected by SerializableFinishedListLock. */
  171. SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this
  172. * seq no */
  173. SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */
  174. PredXactListElement element;
  175. } PredXactListData;
  176. typedef struct PredXactListData *PredXactList;
  177. #define PredXactListDataSize \
  178. ((Size)MAXALIGN(sizeof(PredXactListData)))
  179. /*
  180. * The following types are used to provide lists of rw-conflicts between
  181. * pairs of transactions. Since exactly the same information is needed,
  182. * they are also used to record possible unsafe transaction relationships
  183. * for purposes of identifying safe snapshots for read-only transactions.
  184. *
  185. * When a RWConflictData is not in use to record either type of relationship
  186. * between a pair of transactions, it is kept on an "available" list. The
  187. * outLink field is used for maintaining that list.
  188. */
  189. typedef struct RWConflictData
  190. {
  191. SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */
  192. SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */
  193. SERIALIZABLEXACT *sxactOut;
  194. SERIALIZABLEXACT *sxactIn;
  195. } RWConflictData;
  196. typedef struct RWConflictData *RWConflict;
  197. #define RWConflictDataSize \
  198. ((Size)MAXALIGN(sizeof(RWConflictData)))
  199. typedef struct RWConflictPoolHeaderData
  200. {
  201. SHM_QUEUE availableList;
  202. RWConflict element;
  203. } RWConflictPoolHeaderData;
  204. typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
  205. #define RWConflictPoolHeaderDataSize \
  206. ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
  207. /*
  208. * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
  209. * transaction or any of its subtransactions.
  210. */
  211. typedef struct SERIALIZABLEXIDTAG
  212. {
  213. TransactionId xid;
  214. } SERIALIZABLEXIDTAG;
  215. /*
  216. * The SERIALIZABLEXID struct provides a link from a TransactionId for a
  217. * serializable transaction to the related SERIALIZABLEXACT record, even if
  218. * the transaction has completed and its connection has been closed.
  219. *
  220. * These are created as new top level transaction IDs are first assigned to
  221. * transactions which are participating in predicate locking. This may
  222. * never happen for a particular transaction if it doesn't write anything.
  223. * They are removed with their related serializable transaction objects.
  224. *
  225. * The SubTransGetTopmostTransaction method is used where necessary to get
  226. * from an XID which might be from a subtransaction to the top level XID.
  227. */
  228. typedef struct SERIALIZABLEXID
  229. {
  230. /* hash key */
  231. SERIALIZABLEXIDTAG tag;
  232. /* data */
  233. SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */
  234. } SERIALIZABLEXID;
  235. /*
  236. * The PREDICATELOCKTARGETTAG struct identifies a database object which can
  237. * be the target of predicate locks.
  238. *
  239. * Note that the hash function being used doesn't properly respect tag
  240. * length -- if the length of the structure isn't a multiple of four bytes it
  241. * will go to a four byte boundary past the end of the tag. If you change
  242. * this struct, make sure any slack space is initialized, so that any random
  243. * bytes in the middle or at the end are not included in the hash.
  244. *
  245. * TODO SSI: If we always use the same fields for the same type of value, we
  246. * should rename these. Holding off until it's clear there are no exceptions.
  247. * Since indexes are relations with blocks and tuples, it's looking likely that
  248. * the rename will be possible. If not, we may need to divide the last field
  249. * and use part of it for a target type, so that we know how to interpret the
  250. * data..
  251. */
  252. typedef struct PREDICATELOCKTARGETTAG
  253. {
  254. uint32 locktag_field1; /* a 32-bit ID field */
  255. uint32 locktag_field2; /* a 32-bit ID field */
  256. uint32 locktag_field3; /* a 32-bit ID field */
  257. uint32 locktag_field4; /* a 32-bit ID field */
  258. } PREDICATELOCKTARGETTAG;
  259. /*
  260. * The PREDICATELOCKTARGET struct represents a database object on which there
  261. * are predicate locks.
  262. *
  263. * A hash list of these objects is maintained in shared memory. An entry is
  264. * added when a predicate lock is requested on an object which doesn't
  265. * already have one. An entry is removed when the last lock is removed from
  266. * its list.
  267. */
  268. typedef struct PREDICATELOCKTARGET
  269. {
  270. /* hash key */
  271. PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
  272. /* data */
  273. SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with
  274. * predicate lock target */
  275. } PREDICATELOCKTARGET;
  276. /*
  277. * The PREDICATELOCKTAG struct identifies an individual predicate lock.
  278. *
  279. * It is the combination of predicate lock target (which is a lockable
  280. * object) and a serializable transaction which has acquired a lock on that
  281. * target.
  282. */
  283. typedef struct PREDICATELOCKTAG
  284. {
  285. PREDICATELOCKTARGET *myTarget;
  286. SERIALIZABLEXACT *myXact;
  287. } PREDICATELOCKTAG;
  288. /*
  289. * The PREDICATELOCK struct represents an individual lock.
  290. *
  291. * An entry can be created here when the related database object is read, or
  292. * by promotion of multiple finer-grained targets. All entries related to a
  293. * serializable transaction are removed when that serializable transaction is
  294. * cleaned up. Entries can also be removed when they are combined into a
  295. * single coarser-grained lock entry.
  296. */
  297. typedef struct PREDICATELOCK
  298. {
  299. /* hash key */
  300. PREDICATELOCKTAG tag; /* unique identifier of lock */
  301. /* data */
  302. SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of
  303. * predicate locks */
  304. SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
  305. * predicate locks */
  306. SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
  307. } PREDICATELOCK;
  308. /*
  309. * The LOCALPREDICATELOCK struct represents a local copy of data which is
  310. * also present in the PREDICATELOCK table, organized for fast access without
  311. * needing to acquire a LWLock. It is strictly for optimization.
  312. *
  313. * Each serializable transaction creates its own local hash table to hold a
  314. * collection of these. This information is used to determine when a number
  315. * of fine-grained locks should be promoted to a single coarser-grained lock.
  316. * The information is maintained more-or-less in parallel to the
  317. * PREDICATELOCK data, but because this data is not protected by locks and is
  318. * only used in an optimization heuristic, it is allowed to drift in a few
  319. * corner cases where maintaining exact data would be expensive.
  320. *
  321. * The hash table is created when the serializable transaction acquires its
  322. * snapshot, and its memory is released upon completion of the transaction.
  323. */
  324. typedef struct LOCALPREDICATELOCK
  325. {
  326. /* hash key */
  327. PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
  328. /* data */
  329. bool held; /* is lock held, or just its children? */
  330. int childLocks; /* number of child locks currently held */
  331. } LOCALPREDICATELOCK;
  332. /*
  333. * The types of predicate locks which can be acquired.
  334. */
  335. typedef enum PredicateLockTargetType
  336. {
  337. PREDLOCKTAG_RELATION,
  338. PREDLOCKTAG_PAGE,
  339. PREDLOCKTAG_TUPLE
  340. /* TODO SSI: Other types may be needed for index locking */
  341. } PredicateLockTargetType;
  342. /*
  343. * This structure is used to quickly capture a copy of all predicate
  344. * locks. This is currently used only by the pg_lock_status function,
  345. * which in turn is used by the pg_locks view.
  346. */
  347. typedef struct PredicateLockData
  348. {
  349. int nelements;
  350. PREDICATELOCKTARGETTAG *locktags;
  351. SERIALIZABLEXACT *xacts;
  352. } PredicateLockData;
  353. /*
  354. * These macros define how we map logical IDs of lockable objects into the
  355. * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
  356. * rather than accessing the fields directly. Note multiple eval of target!
  357. */
  358. #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
  359. ((locktag).locktag_field1 = (dboid), \
  360. (locktag).locktag_field2 = (reloid), \
  361. (locktag).locktag_field3 = InvalidBlockNumber, \
  362. (locktag).locktag_field4 = InvalidOffsetNumber)
  363. #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
  364. ((locktag).locktag_field1 = (dboid), \
  365. (locktag).locktag_field2 = (reloid), \
  366. (locktag).locktag_field3 = (blocknum), \
  367. (locktag).locktag_field4 = InvalidOffsetNumber)
  368. #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
  369. ((locktag).locktag_field1 = (dboid), \
  370. (locktag).locktag_field2 = (reloid), \
  371. (locktag).locktag_field3 = (blocknum), \
  372. (locktag).locktag_field4 = (offnum))
  373. #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
  374. ((Oid) (locktag).locktag_field1)
  375. #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
  376. ((Oid) (locktag).locktag_field2)
  377. #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
  378. ((BlockNumber) (locktag).locktag_field3)
  379. #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
  380. ((OffsetNumber) (locktag).locktag_field4)
  381. #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
  382. (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
  383. (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
  384. PREDLOCKTAG_RELATION))
  385. /*
  386. * Two-phase commit statefile records. There are two types: for each
  387. * transaction, we generate one per-transaction record and a variable
  388. * number of per-predicate-lock records.
  389. */
  390. typedef enum TwoPhasePredicateRecordType
  391. {
  392. TWOPHASEPREDICATERECORD_XACT,
  393. TWOPHASEPREDICATERECORD_LOCK
  394. } TwoPhasePredicateRecordType;
  395. /*
  396. * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
  397. * much is needed because most of it not meaningful for a recovered
  398. * prepared transaction.
  399. *
  400. * In particular, we do not record the in and out conflict lists for a
  401. * prepared transaction because the associated SERIALIZABLEXACTs will
  402. * not be available after recovery. Instead, we simply record the
  403. * existence of each type of conflict by setting the transaction's
  404. * summary conflict in/out flag.
  405. */
  406. typedef struct TwoPhasePredicateXactRecord
  407. {
  408. TransactionId xmin;
  409. uint32 flags;
  410. } TwoPhasePredicateXactRecord;
  411. /* Per-lock state */
  412. typedef struct TwoPhasePredicateLockRecord
  413. {
  414. PREDICATELOCKTARGETTAG target;
  415. uint32 filler; /* to avoid length change in back-patched fix */
  416. } TwoPhasePredicateLockRecord;
  417. typedef struct TwoPhasePredicateRecord
  418. {
  419. TwoPhasePredicateRecordType type;
  420. union
  421. {
  422. TwoPhasePredicateXactRecord xactRecord;
  423. TwoPhasePredicateLockRecord lockRecord;
  424. } data;
  425. } TwoPhasePredicateRecord;
  426. /*
  427. * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
  428. */
  429. #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
  430. /*
  431. * Function definitions for functions needing awareness of predicate
  432. * locking internals.
  433. */
  434. extern PredicateLockData *GetPredicateLockStatusData(void);
  435. extern int GetSafeSnapshotBlockingPids(int blocked_pid,
  436. int *output, int output_size);
  437. #endif /* PREDICATE_INTERNALS_H */