2
0

checksum_impl.h 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. /*-------------------------------------------------------------------------
  2. *
  3. * checksum_impl.h
  4. * Checksum implementation for data pages.
  5. *
  6. * This file exists for the benefit of external programs that may wish to
  7. * check Postgres page checksums. They can #include this to get the code
  8. * referenced by storage/checksum.h. (Note: you may need to redefine
  9. * Assert() as empty to compile this successfully externally.)
  10. *
  11. * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
  12. * Portions Copyright (c) 1994, Regents of the University of California
  13. *
  14. * src/include/storage/checksum_impl.h
  15. *
  16. *-------------------------------------------------------------------------
  17. */
  18. /*
  19. * The algorithm used to checksum pages is chosen for very fast calculation.
  20. * Workloads where the database working set fits into OS file cache but not
  21. * into shared buffers can read in pages at a very fast pace and the checksum
  22. * algorithm itself can become the largest bottleneck.
  23. *
  24. * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand
  25. * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1
  26. * byte at a time according to the formula:
  27. *
  28. * hash = (hash ^ value) * FNV_PRIME
  29. *
  30. * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/
  31. *
  32. * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of
  33. * high bits - high order bits in input data only affect high order bits in
  34. * output data. To resolve this we xor in the value prior to multiplication
  35. * shifted right by 17 bits. The number 17 was chosen because it doesn't
  36. * have a common denominator with the set bit positions in FNV_PRIME and
  37. * empirically provides the fastest mixing, so that high order bits of the
  38. * final iterations quickly avalanche into lower positions. For performance reasons we choose to combine
  39. * 4 bytes at a time. The actual hash formula used as the basis is:
  40. *
  41. * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17)
  42. *
  43. * The main bottleneck in this calculation is the multiplication latency. To
  44. * hide the latency and to make use of SIMD parallelism multiple hash values
  45. * are calculated in parallel. The page is treated as a 32 column two
  46. * dimensional array of 32 bit values. Each column is aggregated separately
  47. * into a partial checksum. Each partial checksum uses a different initial
  48. * value (offset basis in FNV terminology). The initial values actually used
  49. * were chosen randomly, as the values themselves don't matter as much as that
  50. * they are different and don't match anything in real data. After initializing
  51. * partial checksums each value in the column is aggregated according to the
  52. * above formula. Finally two more iterations of the formula are performed with
  53. * value 0 to mix the bits of the last value added.
  54. *
  55. * The partial checksums are then folded together using xor to form a single
  56. * 32-bit checksum. The caller can safely reduce the value to 16 bits
  57. * using modulo 2^16-1. That will cause a very slight bias towards lower
  58. * values but this is not significant for the performance of the
  59. * checksum.
  60. *
  61. * The algorithm choice was based on what instructions are available in SIMD
  62. * instruction sets. This meant that a fast and good algorithm needed to use
  63. * multiplication as the main mixing operator. The simplest multiplication
  64. * based checksum primitive is the one used by FNV. The prime used is chosen
  65. * for good dispersion of values. It has no known simple patterns that result
  66. * in collisions. Test of 5-bit differentials of the primitive over 64bit keys
  67. * reveals no differentials with 3 or more values out of 100000 random keys
  68. * colliding. Avalanche test shows that only high order bits of the last word
  69. * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes,
  70. * overwriting page from random position to end with 0 bytes, and overwriting
  71. * random segments of page with 0x00, 0xFF and random data all show optimal
  72. * 2^-16 false positive rate within margin of error.
  73. *
  74. * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
  75. * multiplication instruction. As of 2013 the corresponding instruction is
  76. * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
  77. * Vectorization requires a compiler to do the vectorization for us. For recent
  78. * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
  79. * to achieve vectorization.
  80. *
  81. * The optimal amount of parallelism to use depends on CPU specific instruction
  82. * latency, SIMD instruction width, throughput and the amount of registers
  83. * available to hold intermediate state. Generally, more parallelism is better
  84. * up to the point that state doesn't fit in registers and extra load-store
  85. * instructions are needed to swap values in/out. The number chosen is a fixed
  86. * part of the algorithm because changing the parallelism changes the checksum
  87. * result.
  88. *
  89. * The parallelism number 32 was chosen based on the fact that it is the
  90. * largest state that fits into architecturally visible x86 SSE registers while
  91. * leaving some free registers for intermediate values. For future processors
  92. * with 256bit vector registers this will leave some performance on the table.
  93. * When vectorization is not available it might be beneficial to restructure
  94. * the computation to calculate a subset of the columns at a time and perform
  95. * multiple passes to avoid register spilling. This optimization opportunity
  96. * is not used. Current coding also assumes that the compiler has the ability
  97. * to unroll the inner loop to avoid loop overhead and minimize register
  98. * spilling. For less sophisticated compilers it might be beneficial to
  99. * manually unroll the inner loop.
  100. */
  101. #include "storage/bufpage.h"
  /*
   * Number of partial checksums that are computed side by side.  This is
   * also the width, in 32-bit words, of one row of the page when it is
   * viewed as a two dimensional array.
   */
  #define N_SUMS 32

  /*
   * Prime multiplier of the FNV-1a hash (16777619 in decimal), spelled in
   * hex so that its set bit positions are visible.
   */
  #define FNV_PRIME 0x01000193
  106. /* Use a union so that this code is valid under strict aliasing */
  107. typedef union
  108. {
  109. PageHeaderData phdr;
  110. uint32 data[BLCKSZ / (sizeof(uint32) * N_SUMS)][N_SUMS];
  111. } PGChecksummablePage;
  112. /*
  113. * Base offsets to initialize each of the parallel FNV hashes into a
  114. * different initial state.
  115. */
  116. static const uint32 checksumBaseOffsets[N_SUMS] = {
  117. 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A,
  118. 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C,
  119. 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA,
  120. 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB,
  121. 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE,
  122. 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4,
  123. 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E,
  124. 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756
  125. };
  /*
   * Calculate one round of the checksum, updating "checksum" in place:
   *
   *     t = checksum ^ value
   *     checksum = (t * FNV_PRIME) ^ (t >> 17)
   *
   * "checksum" must be a modifiable lvalue.  It is expanded twice (once to
   * read, once to store), so it should not have side effects.  "value" is
   * expanded exactly once.
   *
   * The temporary deliberately avoids a double-underscore name: such
   * identifiers are reserved for the C implementation.
   */
  #define CHECKSUM_COMP(checksum, value) \
  do { \
      uint32 checksum_comp_tmp_ = (checksum) ^ (value); \
      (checksum) = (checksum_comp_tmp_ * FNV_PRIME) ^ (checksum_comp_tmp_ >> 17); \
  } while (0)
  134. /*
  135. * Block checksum algorithm. The page must be adequately aligned
  136. * (at least on 4-byte boundary).
  137. */
  138. static uint32
  139. pg_checksum_block(const PGChecksummablePage *page)
  140. {
  141. uint32 sums[N_SUMS];
  142. uint32 result = 0;
  143. uint32 i,
  144. j;
  145. /* ensure that the size is compatible with the algorithm */
  146. Assert(sizeof(PGChecksummablePage) == BLCKSZ);
  147. /* initialize partial checksums to their corresponding offsets */
  148. memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
  149. /* main checksum calculation */
  150. for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
  151. for (j = 0; j < N_SUMS; j++)
  152. CHECKSUM_COMP(sums[j], page->data[i][j]);
  153. /* finally add in two rounds of zeroes for additional mixing */
  154. for (i = 0; i < 2; i++)
  155. for (j = 0; j < N_SUMS; j++)
  156. CHECKSUM_COMP(sums[j], 0);
  157. /* xor fold partial checksums together */
  158. for (i = 0; i < N_SUMS; i++)
  159. result ^= sums[i];
  160. return result;
  161. }
  162. /*
  163. * Compute the checksum for a Postgres page.
  164. *
  165. * The page must be adequately aligned (at least on a 4-byte boundary).
  166. * Beware also that the checksum field of the page is transiently zeroed.
  167. *
  168. * The checksum includes the block number (to detect the case where a page is
  169. * somehow moved to a different location), the page header (excluding the
  170. * checksum itself), and the page data.
  171. */
  172. uint16
  173. pg_checksum_page(char *page, BlockNumber blkno)
  174. {
  175. PGChecksummablePage *cpage = (PGChecksummablePage *) page;
  176. uint16 save_checksum;
  177. uint32 checksum;
  178. /* We only calculate the checksum for properly-initialized pages */
  179. Assert(!PageIsNew(&cpage->phdr));
  180. /*
  181. * Save pd_checksum and temporarily set it to zero, so that the checksum
  182. * calculation isn't affected by the old checksum stored on the page.
  183. * Restore it after, because actually updating the checksum is NOT part of
  184. * the API of this function.
  185. */
  186. save_checksum = cpage->phdr.pd_checksum;
  187. cpage->phdr.pd_checksum = 0;
  188. checksum = pg_checksum_block(cpage);
  189. cpage->phdr.pd_checksum = save_checksum;
  190. /* Mix in the block number to detect transposed pages */
  191. checksum ^= blkno;
  192. /*
  193. * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of
  194. * one. That avoids checksums of zero, which seems like a good idea.
  195. */
  196. return (uint16) ((checksum % 65535) + 1);
  197. }