  1. /*
  2. * xxHash - Extremely Fast Hash algorithm
  3. * Development source file for `xxh3`
  4. * Copyright (C) 2019-2020 Yann Collet
  5. *
  6. * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions are
  10. * met:
  11. *
  12. * * Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. * * Redistributions in binary form must reproduce the above
  15. * copyright notice, this list of conditions and the following disclaimer
  16. * in the documentation and/or other materials provided with the
  17. * distribution.
  18. *
  19. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30. *
  31. * You can contact the author at:
  32. * - xxHash homepage: https://www.xxhash.com
  33. * - xxHash source repository: https://github.com/Cyan4973/xxHash
  34. */
  35. /*
  36. * Note: This file is separated for development purposes.
  37. * It will be integrated into `xxhash.h` when development stage is completed.
  38. *
  39. * Credit: most of the work on vectorial and asm variants comes from @easyaspi314
  40. */
  41. #ifndef XXH3_H_1397135465
  42. #define XXH3_H_1397135465
  43. /* === Dependencies === */
  44. #ifndef XXHASH_H_5627135585666179
  45. /* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */
  46. # undef XXH_INLINE_ALL /* avoid redefinition */
  47. # define XXH_INLINE_ALL
  48. #endif
  49. #include "xxhash.h"
  50. /* === Compiler specifics === */
  51. #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
  52. # define XXH_RESTRICT restrict
  53. #else
  54. /* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
  55. # define XXH_RESTRICT /* disable */
  56. #endif
  57. #if (defined(__GNUC__) && (__GNUC__ >= 3)) \
  58. || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
  59. || defined(__clang__)
  60. # define XXH_likely(x) __builtin_expect(x, 1)
  61. # define XXH_unlikely(x) __builtin_expect(x, 0)
  62. #else
  63. # define XXH_likely(x) (x)
  64. # define XXH_unlikely(x) (x)
  65. #endif
  66. #if defined(__GNUC__)
  67. # if defined(__AVX2__)
  68. # include <immintrin.h>
  69. # elif defined(__SSE2__)
  70. # include <emmintrin.h>
  71. # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
  72. # define inline __inline__ /* clang bug */
  73. # include <arm_neon.h>
  74. # undef inline
  75. # endif
  76. #elif defined(_MSC_VER)
  77. # include <intrin.h>
  78. #endif
  79. /*
  80. * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
  81. * remaining a true 64-bit/128-bit hash function.
  82. *
  83. * This is done by prioritizing a subset of 64-bit operations that can be
  84. * emulated without too many steps on the average 32-bit machine.
  85. *
  86. * For example, these two lines seem similar, and run equally fast on 64-bit:
  87. *
  88. * xxh_u64 x;
  89. * x ^= (x >> 47); // good
  90. * x ^= (x >> 13); // bad
  91. *
  92. * However, to a 32-bit machine, there is a major difference.
  93. *
  94. * x ^= (x >> 47) looks like this:
  95. *
  96. * x.lo ^= (x.hi >> (47 - 32));
  97. *
  98. * while x ^= (x >> 13) looks like this:
  99. *
  100. * // note: funnel shifts are not usually cheap.
  101. * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
  102. * x.hi ^= (x.hi >> 13);
  103. *
  104. * The first one is significantly faster than the second, simply because the
  105. * shift is larger than 32. This means:
  106. * - All the bits we need are in the upper 32 bits, so we can ignore the lower
  107. * 32 bits in the shift.
  108. * - The shift result will always fit in the lower 32 bits, and therefore,
  109. * we can ignore the upper 32 bits in the xor.
  110. *
  111. * Thanks to this optimization, XXH3 only requires these features to be efficient:
  112. *
  113. * - Usable unaligned access
  114. * - A 32-bit or 64-bit ALU
  115. * - If 32-bit, a decent ADC instruction
  116. * - A 32 or 64-bit multiply with a 64-bit result
  117. * - For the 128-bit variant, a decent byteswap helps short inputs.
  118. *
  119. * The first two are already required by XXH32, and almost all 32-bit and 64-bit
  120. * platforms which can run XXH32 can run XXH3 efficiently.
  121. *
  122. * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
  123. * notable exception.
  124. *
  125. * First of all, Thumb-1 lacks support for the UMULL instruction which
  126. * performs the important long multiply. This means numerous __aeabi_lmul
  127. * calls.
  128. *
  129. * Second of all, the 8 functional registers are just not enough.
  130. * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
  131. * Lo registers, and this shuffling results in thousands more MOVs than A32.
  132. *
  133. * A32 and T32 don't have this limitation. They can access all 14 registers,
  134. * do a 32->64 multiply with UMULL, and the flexible operand allowing free
  135. * shifts is helpful, too.
  136. *
  137. * Therefore, we do a quick sanity check.
  138. *
  139. * If compiling Thumb-1 for a target which supports ARM instructions, we will
  140. * emit a warning, as it is not a "sane" platform to compile for.
  141. *
  142. * Usually, if this happens, it is because of an accident and you probably need
  143. * to specify -march, as you likely meant to compile for a newer architecture.
  144. */
  145. #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
  146. # warning "XXH3 is highly inefficient without ARM or Thumb-2."
  147. #endif
  148. /* ==========================================
  149. * Vectorization detection
  150. * ========================================== */
  151. #define XXH_SCALAR 0 /* Portable scalar version */
  152. #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */
  153. #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */
  154. #define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */
  155. #define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */
  156. #ifndef XXH_VECTOR /* can be defined on command line */
  157. # if defined(__AVX2__)
  158. # define XXH_VECTOR XXH_AVX2
  159. # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
  160. # define XXH_VECTOR XXH_SSE2
  161. # elif defined(__GNUC__) /* msvc support maybe later */ \
  162. && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
  163. && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
  164. || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
  165. # define XXH_VECTOR XXH_NEON
  166. # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
  167. || (defined(__s390x__) && defined(__VEC__)) \
  168. && defined(__GNUC__) /* TODO: IBM XL */
  169. # define XXH_VECTOR XXH_VSX
  170. # else
  171. # define XXH_VECTOR XXH_SCALAR
  172. # endif
  173. #endif
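/*
 * Note: XXH_VECTOR can be set on the command line, e.g.
 *   -DXXH_VECTOR=XXH_SCALAR   (force the portable path)
 *   -DXXH_VECTOR=XXH_SSE2     (request a specific SIMD path)
 */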
  174. /*
  175. * Controls the alignment of the accumulator.
  176. * This is for compatibility with aligned vector loads, which are usually faster.
  177. */
  178. #ifndef XXH_ACC_ALIGN
  179. # if XXH_VECTOR == XXH_SCALAR /* scalar */
  180. # define XXH_ACC_ALIGN 8
  181. # elif XXH_VECTOR == XXH_SSE2 /* sse2 */
  182. # define XXH_ACC_ALIGN 16
  183. # elif XXH_VECTOR == XXH_AVX2 /* avx2 */
  184. # define XXH_ACC_ALIGN 32
  185. # elif XXH_VECTOR == XXH_NEON /* neon */
  186. # define XXH_ACC_ALIGN 16
  187. # elif XXH_VECTOR == XXH_VSX /* vsx */
  188. # define XXH_ACC_ALIGN 16
  189. # endif
  190. #endif
  191. /*
  192. * UGLY HACK:
  193. * GCC usually generates the best code with -O3 for xxHash.
  194. *
  195. * However, when targeting AVX2, it is overzealous in its unrolling resulting
  196. * in code roughly 3/4 the speed of Clang.
  197. *
  198. * There are other issues, such as GCC splitting _mm256_loadu_si256 into
  199. * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
  200. * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
  201. *
  202. * That is why when compiling the AVX2 version, it is recommended to use either
  203. * -O2 -mavx2 -march=haswell
  204. * or
  205. * -O2 -mavx2 -mno-avx256-split-unaligned-load
  206. * for decent performance, or to use Clang instead.
  207. *
  208. * Fortunately, we can control the first one with a pragma that forces GCC into
  209. * -O2, but the other one we can't control without "failed to inline always
  210. * inline function due to target mismatch" warnings.
  211. */
  212. #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
  213. && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  214. && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
  215. # pragma GCC push_options
  216. # pragma GCC optimize("-O2")
  217. #endif
  218. #if XXH_VECTOR == XXH_NEON
  219. /*
  220. * NEON's setup for vmlal_u32 is a little more complicated than it is on
  221. * SSE2, AVX2, and VSX.
  222. *
  223. * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
  224. *
  225. * To do the same operation, the 128-bit 'Q' register needs to be split into
  226. * two 64-bit 'D' registers, performing this operation:
  227. *
  228. * [ a | b ]
  229. * | '---------. .--------' |
  230. * | x |
  231. * | .---------' '--------. |
  232. * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
  233. *
  234. * Due to significant changes in aarch64, the fastest method for aarch64 is
  235. * completely different than the fastest method for ARMv7-A.
  236. *
  237. * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
  238. * D11 will modify the high half of Q5. This is similar to how modifying AH
  239. * will only affect bits 8-15 of AX on x86.
  240. *
  241. * VZIP takes two registers, and puts even lanes in one register and odd lanes
  242. * in the other.
  243. *
  244. * On ARMv7-A, this strangely modifies both parameters in place instead of
  245. * taking the usual 3-operand form.
  246. *
  247. * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
  248. * lower and upper halves of the Q register to end up with the high and low
  249. * halves where we want - all in one instruction.
  250. *
  251. * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
  252. *
  253. * Unfortunately, we need inline assembly for this: instructions that modify
  254. * two registers at once are not expressible in GCC or Clang's IR, so the
  255. * compilers would have to create a copy.
  256. *
  257. * aarch64 requires a different approach.
  258. *
  259. * In order to make it easier to write a decent compiler for aarch64, many
  260. * quirks were removed, such as conditional execution.
  261. *
  262. * NEON was also affected by this.
  263. *
  264. * aarch64 cannot access the high bits of a Q-form register, and writes to a
  265. * D-form register zero the high bits, similar to how writes to W-form scalar
  266. * registers (or DWORD registers on x86_64) work.
  267. *
  268. * The formerly free vget_high intrinsics now require a vext (with a few
  269. * exceptions).
  270. *
  271. * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
  272. * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
  273. * operand.
  274. *
  275. * The equivalent of the VZIP.32 on the lower and upper halves would be this
  276. * mess:
  277. *
  278. * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
  279. * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
  280. * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
  281. *
  282. * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
  283. *
  284. * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
  285. * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
  286. *
  287. * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
  288. */
  289. /*
  290. * Function-like macro:
  291. * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
  292. * {
  293. * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
  294. * outHi = (uint32x2_t)(in >> 32);
  295. * in = UNDEFINED;
  296. * }
  297. */
  298. # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
  299. && defined(__GNUC__) \
  300. && !defined(__aarch64__) && !defined(__arm64__)
  301. # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
  302. do { \
  303. /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
  304. /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
  305. /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
  306. __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
  307. (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
  308. (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
  309. } while (0)
  310. # else
  311. # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
  312. do { \
  313. (outLo) = vmovn_u64 (in); \
  314. (outHi) = vshrn_n_u64 ((in), 32); \
  315. } while (0)
  316. # endif
  317. #endif /* XXH_VECTOR == XXH_NEON */
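/*
 * Usage sketch (mirrors XXH3_accumulate_512's NEON path further below):
 *
 *   uint64x2_t data_key = ...;              // 64-bit lanes to split
 *   uint32x2_t lo, hi;
 *   XXH_SPLIT_IN_PLACE(data_key, lo, hi);   // data_key is clobbered
 *   acc = vmlal_u32(acc, lo, hi);           // acc += (xxh_u64)lo * (xxh_u64)hi
 */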
  318. /*
  319. * VSX and Z Vector helpers.
  320. *
  321. * This is very messy, and any pull requests to clean this up are welcome.
  322. *
  323. * There are a lot of problems with supporting VSX and s390x, due to
  324. * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
  325. */
  326. #if XXH_VECTOR == XXH_VSX
  327. # if defined(__s390x__)
  328. # include <s390intrin.h>
  329. # else
  330. # include <altivec.h>
  331. # endif
  332. # undef vector /* Undo the pollution */
  333. typedef __vector unsigned long long xxh_u64x2;
  334. typedef __vector unsigned char xxh_u8x16;
  335. typedef __vector unsigned xxh_u32x4;
  336. # ifndef XXH_VSX_BE
  337. # if defined(__BIG_ENDIAN__) \
  338. || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  339. # define XXH_VSX_BE 1
  340. # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
  341. # warning "-maltivec=be is not recommended. Please use native endianness."
  342. # define XXH_VSX_BE 1
  343. # else
  344. # define XXH_VSX_BE 0
  345. # endif
  346. # endif /* !defined(XXH_VSX_BE) */
  347. # if XXH_VSX_BE
  348. /* A wrapper for POWER9's vec_revb. */
  349. # if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
  350. # define XXH_vec_revb vec_revb
  351. # else
  352. XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
  353. {
  354. xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
  355. 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
  356. return vec_perm(val, val, vByteSwap);
  357. }
  358. # endif
  359. # endif /* XXH_VSX_BE */
  360. /*
  361. * Performs an unaligned load and byte swaps it on big endian.
  362. */
  363. XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
  364. {
  365. xxh_u64x2 ret;
  366. memcpy(&ret, ptr, sizeof(xxh_u64x2));
  367. # if XXH_VSX_BE
  368. ret = XXH_vec_revb(ret);
  369. # endif
  370. return ret;
  371. }
  372. /*
  373. * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
  374. *
  375. * These intrinsics weren't added until GCC 8, despite existing for a while,
  376. * and they are endian dependent. Also, their meanings swap depending on the version.
  377. */
  378. # if defined(__s390x__)
  379. /* s390x is always big endian, no issue on this platform */
  380. # define XXH_vec_mulo vec_mulo
  381. # define XXH_vec_mule vec_mule
  382. # elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw)
  383. /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
  384. # define XXH_vec_mulo __builtin_altivec_vmulouw
  385. # define XXH_vec_mule __builtin_altivec_vmuleuw
  386. # else
  387. /* gcc needs inline assembly */
  388. /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
  389. XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
  390. {
  391. xxh_u64x2 result;
  392. __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
  393. return result;
  394. }
  395. XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
  396. {
  397. xxh_u64x2 result;
  398. __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
  399. return result;
  400. }
  401. # endif /* XXH_vec_mulo, XXH_vec_mule */
  402. #endif /* XXH_VECTOR == XXH_VSX */
  403. /* prefetch
  404. * can be disabled, by declaring XXH_NO_PREFETCH build macro */
  405. #if defined(XXH_NO_PREFETCH)
  406. # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
  407. #else
  408. # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
  409. # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
  410. # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
  411. # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
  412. # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
  413. # else
  414. # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
  415. # endif
  416. #endif /* XXH_NO_PREFETCH */
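/*
 * Note: prefetching can be disabled entirely by building with
 * -DXXH_NO_PREFETCH, which turns XXH_PREFETCH() into a no-op.
 */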
  417. /* ==========================================
  418. * XXH3 default settings
  419. * ========================================== */
  420. #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
  421. #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
  422. # error "default keyset is not large enough"
  423. #endif
  424. /* Pseudorandom secret taken directly from FARSH */
  425. XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
  426. 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
  427. 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
  428. 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
  429. 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
  430. 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
  431. 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
  432. 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
  433. 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
  434. 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
  435. 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
  436. 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
  437. 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
  438. };
  439. /*
  440. * Calculates a 32-bit to 64-bit long multiply.
  441. *
  442. * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
  443. * need to (but it shouldn't need to anyways, it is about 7 instructions to do
  444. * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
  445. * use that instead of the normal method.
  446. *
  447. * If you are compiling for platforms like Thumb-1 and don't have a better option,
  448. * you may also want to write your own long multiply routine here.
  449. *
  450. * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
  451. * {
  452. * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
  453. * }
  454. */
  455. #if defined(_MSC_VER) && defined(_M_IX86)
  456. # include <intrin.h>
  457. # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
  458. #else
  459. /*
  460. * Downcast + upcast is usually better than masking on older compilers like
  461. * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
  462. *
  463. * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
  464. * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
  465. */
  466. # define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
  467. #endif
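/*
 * Worked example: XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001,
 * i.e. (2^32 - 1)^2 = 2^64 - 2^33 + 1: the full 64-bit product, no truncation.
 */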
  468. /*
  469. * Calculates a 64->128-bit long multiply.
  470. *
  471. * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
  472. */
  473. static XXH128_hash_t
  474. XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
  475. {
  476. /*
  477. * GCC/Clang __uint128_t method.
  478. *
  479. * On most 64-bit targets, GCC and Clang define a __uint128_t type.
  480. * This is usually the best way as it usually uses a native long 64-bit
  481. * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
  482. *
  483. * Usually.
  484. *
  485. * On a 32-bit platform, Clang (and emscripten) still define this type even
  486. * though they lack the native arithmetic for it. This results in a laggy
  487. * compiler builtin call which calculates a full 128-bit multiply.
  488. * In that case, it is best to use the portable one.
  489. * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
  490. */
  491. #if defined(__GNUC__) && !defined(__wasm__) \
  492. && defined(__SIZEOF_INT128__) \
  493. || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  494. __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
  495. XXH128_hash_t r128;
  496. r128.low64 = (xxh_u64)(product);
  497. r128.high64 = (xxh_u64)(product >> 64);
  498. return r128;
  499. /*
  500. * MSVC for x64's _umul128 method.
  501. *
  502. * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
  503. *
  504. * This compiles to single operand MUL on x64.
  505. */
  506. #elif defined(_M_X64) || defined(_M_IA64)
  507. #ifndef _MSC_VER
  508. # pragma intrinsic(_umul128)
  509. #endif
  510. xxh_u64 product_high;
  511. xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
  512. XXH128_hash_t r128;
  513. r128.low64 = product_low;
  514. r128.high64 = product_high;
  515. return r128;
  516. #else
  517. /*
  518. * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
  519. *
  520. * This is a fast and simple grade school multiply, which is shown below
  521. * with base 10 arithmetic instead of base 0x100000000.
  522. *
  523. * 9 3 // D2 lhs = 93
  524. * x 7 5 // D2 rhs = 75
  525. * ----------
  526. * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
  527. * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
  528. * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
  529. * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
  530. * ---------
  531. * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
  532. * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
  533. * ---------
  534. * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
  535. *
  536. * The reasons for adding the products like this are:
  537. * 1. It avoids manual carry tracking. Just like how
  538. * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
  539. * This avoids a lot of complexity.
  540. *
  541. * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
  542. * instruction available in ARM's Digital Signal Processing extension
  543. * in 32-bit ARMv6 and later, which is shown below:
  544. *
  545. * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
  546. * {
  547. * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
  548. * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
  549. * *RdHi = (xxh_u32)(product >> 32);
  550. * }
  551. *
  552. * This instruction was designed for efficient long multiplication, and
  553. * allows this to be calculated in only 4 instructions at speeds
  554. * comparable to some 64-bit ALUs.
  555. *
  556. * 3. It isn't terrible on other platforms. Usually this will be a couple
  557. * of 32-bit ADD/ADCs.
  558. */
  559. /* First calculate all of the cross products. */
  560. xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
  561. xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
  562. xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
  563. xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
  564. /* Now add the products together. These will never overflow. */
  565. xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
  566. xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
  567. xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
  568. XXH128_hash_t r128;
  569. r128.low64 = lower;
  570. r128.high64 = upper;
  571. return r128;
  572. #endif
  573. }
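/*
 * Worked example: XXH_mult64to128(0xFFFFFFFFFFFFFFFFULL, 2) yields
 *   low64  = 0xFFFFFFFFFFFFFFFE
 *   high64 = 0x0000000000000001
 * since (2^64 - 1) * 2 = 2^65 - 2.
 */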
  574. /*
  575. * Does a 64-bit to 128-bit multiply, then XOR folds it.
  576. *
  577. * The reason for the separate function is to prevent passing too many structs
  578. * around by value. This will hopefully inline the multiply, but we don't force it.
  579. */
  580. static xxh_u64
  581. XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
  582. {
  583. XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
  584. return product.low64 ^ product.high64;
  585. }
  586. /* Seems to produce slightly better code on GCC for some reason. */
  587. XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
  588. {
  589. XXH_ASSERT(0 <= shift && shift < 64);
  590. return v64 ^ (v64 >> shift);
  591. }
  592. /*
  593. * We don't need to (or want to) mix as much as XXH64.
  594. *
  595. * Short hashes are more evenly distributed, so it isn't necessary.
  596. */
  597. static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
  598. {
  599. h64 = XXH_xorshift64(h64, 37);
  600. h64 *= 0x165667919E3779F9ULL;
  601. h64 = XXH_xorshift64(h64, 32);
  602. return h64;
  603. }
  604. /* ==========================================
  605. * Short keys
  606. * ==========================================
  607. * One of the shortcomings of XXH32 and XXH64 was that their performance was
  608. * sub-optimal on short lengths. They used an iterative algorithm which strongly
  609. * favored lengths that were a multiple of 4 or 8.
  610. *
  611. * Instead of iterating over individual inputs, we use a set of single shot
  612. * functions which piece together a range of lengths and operate in constant time.
  613. *
  614. * Additionally, the number of multiplies has been significantly reduced. This
  615. * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
  616. *
  617. * Depending on the platform, this may or may not be faster than XXH32, but it
  618. * is almost guaranteed to be faster than XXH64.
  619. */
  620. /*
  621. * At very short lengths, there isn't enough input to fully hide secrets, or use
  622. * the entire secret.
  623. *
  624. * There is also only a limited amount of mixing we can do before significantly
  625. * impacting performance.
  626. *
  627. * Therefore, we use different sections of the secret and always mix two secret
  628. * samples with an XOR. This should have no effect on performance on the
  629. * seedless or withSeed variants because everything _should_ be constant folded
  630. * by modern compilers.
  631. *
  632. * The XOR mixing hides individual parts of the secret and increases entropy.
  633. *
  634. * This adds an extra layer of strength for custom secrets.
  635. */
  636. XXH_FORCE_INLINE XXH64_hash_t
  637. XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
  638. {
  639. XXH_ASSERT(input != NULL);
  640. XXH_ASSERT(1 <= len && len <= 3);
  641. XXH_ASSERT(secret != NULL);
  642. /*
  643. * len = 1: combined = { input[0], 0x01, input[0], input[0] }
  644. * len = 2: combined = { input[1], 0x02, input[0], input[1] }
  645. * len = 3: combined = { input[2], 0x03, input[0], input[1] }
  646. */
  647. { xxh_u8 const c1 = input[0];
  648. xxh_u8 const c2 = input[len >> 1];
  649. xxh_u8 const c3 = input[len - 1];
  650. xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)
  651. | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
  652. xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
  653. xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
  654. xxh_u64 const mixed = keyed * PRIME64_1;
  655. return XXH3_avalanche(mixed);
  656. }
  657. }
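/*
 * Worked example (hypothetical input): for input = { 0x41, 0x42 } ("AB"), len = 2:
 *   c1 = 0x41, c2 = input[1] = 0x42, c3 = input[1] = 0x42
 *   combined = 0x42410242   (bytes, LSB first: { 0x42, 0x02, 0x41, 0x42 })
 * which is then xored with the secret-derived bitflip, multiplied by PRIME64_1,
 * and avalanched.
 */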
  658. XXH_FORCE_INLINE XXH64_hash_t
  659. XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
  660. {
  661. XXH_ASSERT(input != NULL);
  662. XXH_ASSERT(secret != NULL);
  663. XXH_ASSERT(4 <= len && len <= 8);
  664. seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
  665. { xxh_u32 const input1 = XXH_readLE32(input);
  666. xxh_u32 const input2 = XXH_readLE32(input + len - 4);
  667. xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
  668. xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
  669. xxh_u64 x = input64 ^ bitflip;
  670. /* this mix is inspired by Pelle Evensen's rrmxmx */
  671. x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24);
  672. x *= 0x9FB21C651E98DF25ULL;
  673. x ^= (x >> 35) + len ;
  674. x *= 0x9FB21C651E98DF25ULL;
  675. return XXH_xorshift64(x, 28);
  676. }
  677. }
  678. XXH_FORCE_INLINE XXH64_hash_t
  679. XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
  680. {
  681. XXH_ASSERT(input != NULL);
  682. XXH_ASSERT(secret != NULL);
  683. XXH_ASSERT(8 <= len && len <= 16);
  684. { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
  685. xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
  686. xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
  687. xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
  688. xxh_u64 const acc = len
  689. + XXH_swap64(input_lo) + input_hi
  690. + XXH3_mul128_fold64(input_lo, input_hi);
  691. return XXH3_avalanche(acc);
  692. }
  693. }
  694. XXH_FORCE_INLINE XXH64_hash_t
  695. XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
  696. {
  697. XXH_ASSERT(len <= 16);
  698. { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);
  699. if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
  700. if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
  701. return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
  702. }
  703. }
  704. /*
  705. * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
  706. * multiplication by zero, affecting hashes of lengths 17 to 240.
  707. *
  708. * However, they are very unlikely.
  709. *
  710. * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
  711. * unseeded non-cryptographic hashes, it does not attempt to defend itself
  712. * against specially crafted inputs, only random inputs.
  713. *
  714. * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
  715. * cancelling out the secret is taken an arbitrary number of times (addressed
  716. * in XXH3_accumulate_512), this collision is very unlikely with random inputs
  717. * and/or proper seeding:
  718. *
  719. * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
  720. * function that is only called up to 16 times per hash with up to 240 bytes of
  721. * input.
  722. *
  723. * This is not too bad for a non-cryptographic hash function, especially with
  724. * only 64 bit outputs.
  725. *
  726. * The 128-bit variant (which trades some speed for strength) is NOT affected
  727. * by this, although it is always a good idea to use a proper seed if you care
  728. * about strength.
  729. */
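/*
 * Usage sketch: when inputs may be chosen by an attacker, prefer a seeded call,
 * e.g.
 *   XXH64_hash_t const h = XXH3_64bits_withSeed(data, len, mySecretSeed);
 * where mySecretSeed is a placeholder for a value the attacker cannot predict
 * (XXH3_64bits_withSeed() is defined later in this file).
 */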
  730. XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
  731. const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
  732. {
  733. #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  734. && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
  735. && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
  736. /*
  737. * UGLY HACK:
  738. * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
  739. * slower code.
  740. *
  741. * By forcing seed64 into a register, we disrupt the cost model and
  742. * cause it to scalarize. See `XXH32_round()`
  743. *
  744. * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
  745. * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
  746. * GCC 9.2, despite both emitting scalar code.
  747. *
  748. * GCC generates much better scalar code than Clang for the rest of XXH3,
  749. * which is why finding a more optimal codepath is an interest.
  750. */
  751. __asm__ ("" : "+r" (seed64));
  752. #endif
  753. { xxh_u64 const input_lo = XXH_readLE64(input);
  754. xxh_u64 const input_hi = XXH_readLE64(input+8);
  755. return XXH3_mul128_fold64(
  756. input_lo ^ (XXH_readLE64(secret) + seed64),
  757. input_hi ^ (XXH_readLE64(secret+8) - seed64)
  758. );
  759. }
  760. }
  761. /* For mid range keys, XXH3 uses a Mum-hash variant. */
  762. XXH_FORCE_INLINE XXH64_hash_t
  763. XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
  764. const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
  765. XXH64_hash_t seed)
  766. {
  767. XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
  768. XXH_ASSERT(16 < len && len <= 128);
  769. { xxh_u64 acc = len * PRIME64_1;
  770. if (len > 32) {
  771. if (len > 64) {
  772. if (len > 96) {
  773. acc += XXH3_mix16B(input+48, secret+96, seed);
  774. acc += XXH3_mix16B(input+len-64, secret+112, seed);
  775. }
  776. acc += XXH3_mix16B(input+32, secret+64, seed);
  777. acc += XXH3_mix16B(input+len-48, secret+80, seed);
  778. }
  779. acc += XXH3_mix16B(input+16, secret+32, seed);
  780. acc += XXH3_mix16B(input+len-32, secret+48, seed);
  781. }
  782. acc += XXH3_mix16B(input+0, secret+0, seed);
  783. acc += XXH3_mix16B(input+len-16, secret+16, seed);
  784. return XXH3_avalanche(acc);
  785. }
  786. }
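/*
 * Sketch of the pairing above for a 100-byte input (len > 96, so all branches run):
 *   input offsets       0,  16,  32,  48  mix with secret offsets   0, 32, 64, 96
 *   input offsets len-64, -48, -32, -16   mix with secret offsets 112, 80, 48, 16
 * i.e. eight XXH3_mix16B() calls covering both ends of the input.
 */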
  787. #define XXH3_MIDSIZE_MAX 240
  788. XXH_NO_INLINE XXH64_hash_t
  789. XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
  790. const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
  791. XXH64_hash_t seed)
  792. {
  793. XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
  794. XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
  795. #define XXH3_MIDSIZE_STARTOFFSET 3
  796. #define XXH3_MIDSIZE_LASTOFFSET 17
  797. { xxh_u64 acc = len * PRIME64_1;
  798. int const nbRounds = (int)len / 16;
  799. int i;
  800. for (i=0; i<8; i++) {
  801. acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
  802. }
  803. acc = XXH3_avalanche(acc);
  804. XXH_ASSERT(nbRounds >= 8);
  805. #if defined(__clang__) /* Clang */ \
  806. && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
  807. && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
  808. /*
  809. * UGLY HACK:
  810. * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
  811. * Everywhere else, it uses scalar code.
  812. *
  813. * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
  814. * would still be slower than UMAAL (see XXH_mult64to128).
  815. *
  816. * Unfortunately, Clang doesn't handle the long multiplies properly and
  817. * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
  818. * scalarized into an ugly mess of VMOV.32 instructions.
  819. *
  820. * This mess is difficult to avoid without turning autovectorization
  821. * off completely, but these cases are usually relatively minor and/or not
  822. * worth fixing.
  823. *
  824. * This loop is the easiest to fix, as unlike XXH32, this pragma
  825. * _actually works_ because it is a loop vectorization instead of an
  826. * SLP vectorization.
  827. */
  828. #pragma clang loop vectorize(disable)
  829. #endif
  830. for (i=8 ; i < nbRounds; i++) {
  831. acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
  832. }
  833. /* last bytes */
  834. acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
  835. return XXH3_avalanche(acc);
  836. }
  837. }
  838. /* === Long Keys === */
  839. #define STRIPE_LEN 64
  840. #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
  841. #define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
  842. typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
  843. /*
  844. * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
  845. *
  846. * It is a hardened version of UMAC, based off of FARSH's implementation.
  847. *
  848. * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
  849. * implementations, and it is ridiculously fast.
  850. *
  851. * We harden it by mixing the original input to the accumulators as well as the product.
  852. *
  853. * This means that in the (relatively likely) case of a multiply by zero, the
  854. * original input is preserved.
  855. *
  856. * For the 128-bit variant, we swap 64-bit pairs when we add the input to improve
  857. * cross-pollination, as otherwise the upper and lower halves would be
  858. * essentially independent.
  859. *
  860. * This doesn't matter on 64-bit hashes since they all get merged together in
  861. * the end, so we skip the extra step.
  862. *
  863. * Both XXH3_64bits and XXH3_128bits use this subroutine.
  864. */
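/*
 * Per-lane sketch (64-bit width), matching the scalar (#else) variant inside
 * XXH3_accumulate_512 below:
 *   data_key = input64 ^ secret64;
 *   acc[i]  += input64;                                     // keep the raw input
 *   acc[i]  += (data_key & 0xFFFFFFFF) * (data_key >> 32);  // 32x32->64 multiply
 * In 128-bit mode, the raw input is added to the swapped lane (acc[i ^ 1]) instead.
 */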
  865. XXH_FORCE_INLINE void
  866. XXH3_accumulate_512( void* XXH_RESTRICT acc,
  867. const void* XXH_RESTRICT input,
  868. const void* XXH_RESTRICT secret,
  869. XXH3_accWidth_e accWidth)
  870. {
  871. #if (XXH_VECTOR == XXH_AVX2)
  872. XXH_ASSERT((((size_t)acc) & 31) == 0);
  873. { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
  874. /* Unaligned. This is mainly for pointer arithmetic, and because
  875. * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
  876. const __m256i* const xinput = (const __m256i *) input;
  877. /* Unaligned. This is mainly for pointer arithmetic, and because
  878. * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
  879. const __m256i* const xsecret = (const __m256i *) secret;
  880. size_t i;
  881. for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
  882. /* data_vec = xinput[i]; */
  883. __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
  884. /* key_vec = xsecret[i]; */
  885. __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
  886. /* data_key = data_vec ^ key_vec; */
  887. __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
  888. /* data_key_lo = data_key >> 32; */
  889. __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
  890. /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
  891. __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);
  892. if (accWidth == XXH3_acc_128bits) {
  893. /* xacc[i] += swap(data_vec); */
  894. __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
  895. __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
  896. /* xacc[i] += product; */
  897. xacc[i] = _mm256_add_epi64(product, sum);
  898. } else { /* XXH3_acc_64bits */
  899. /* xacc[i] += data_vec; */
  900. __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
  901. /* xacc[i] += product; */
  902. xacc[i] = _mm256_add_epi64(product, sum);
  903. }
  904. } }
  905. #elif (XXH_VECTOR == XXH_SSE2)
  906. /* SSE2 is just a half-scale version of the AVX2 version. */
  907. XXH_ASSERT((((size_t)acc) & 15) == 0);
  908. { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
  909. /* Unaligned. This is mainly for pointer arithmetic, and because
  910. * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
  911. const __m128i* const xinput = (const __m128i *) input;
  912. /* Unaligned. This is mainly for pointer arithmetic, and because
  913. * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
  914. const __m128i* const xsecret = (const __m128i *) secret;
  915. size_t i;
  916. for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
  917. /* data_vec = xinput[i]; */
  918. __m128i const data_vec = _mm_loadu_si128 (xinput+i);
  919. /* key_vec = xsecret[i]; */
  920. __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
  921. /* data_key = data_vec ^ key_vec; */
  922. __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
  923. /* data_key_lo = data_key >> 32; */
  924. __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
  925. /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
  926. __m128i const product = _mm_mul_epu32 (data_key, data_key_lo);
  927. if (accWidth == XXH3_acc_128bits) {
  928. /* xacc[i] += swap(data_vec); */
  929. __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
  930. __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
  931. /* xacc[i] += product; */
  932. xacc[i] = _mm_add_epi64(product, sum);
  933. } else { /* XXH3_acc_64bits */
  934. /* xacc[i] += data_vec; */
  935. __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
  936. /* xacc[i] += product; */
  937. xacc[i] = _mm_add_epi64(product, sum);
  938. }
  939. } }
  940. #elif (XXH_VECTOR == XXH_NEON)
  941. XXH_ASSERT((((size_t)acc) & 15) == 0);
  942. {
  943. XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
  944. /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
  945. uint8_t const* const xinput = (const uint8_t *) input;
  946. uint8_t const* const xsecret = (const uint8_t *) secret;
  947. size_t i;
  948. for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
  949. /* data_vec = xinput[i]; */
  950. uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
  951. /* key_vec = xsecret[i]; */
  952. uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
  953. uint64x2_t data_key;
  954. uint32x2_t data_key_lo, data_key_hi;
  955. if (accWidth == XXH3_acc_64bits) {
  956. /* xacc[i] += data_vec; */
  957. xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
  958. } else { /* XXH3_acc_128bits */
  959. /* xacc[i] += swap(data_vec); */
  960. uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
  961. uint64x2_t const swapped = vextq_u64(data64, data64, 1);
  962. xacc[i] = vaddq_u64 (xacc[i], swapped);
  963. }
  964. /* data_key = data_vec ^ key_vec; */
  965. data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
  966. /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
  967. * data_key_hi = (uint32x2_t) (data_key >> 32);
  968. * data_key = UNDEFINED; */
  969. XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
  970. /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
  971. xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
  972. }
  973. }
  974. #elif (XXH_VECTOR == XXH_VSX)
  975. xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */
  976. xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
  977. xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
  978. xxh_u64x2 const v32 = { 32, 32 };
  979. size_t i;
  980. for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) {
  981. /* data_vec = xinput[i]; */
  982. xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
  983. /* key_vec = xsecret[i]; */
  984. xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
  985. xxh_u64x2 const data_key = data_vec ^ key_vec;
  986. /* shuffled = (data_key << 32) | (data_key >> 32); */
  987. xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
  988. /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
  989. xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
  990. xacc[i] += product;
  991. if (accWidth == XXH3_acc_64bits) {
  992. xacc[i] += data_vec;
  993. } else { /* XXH3_acc_128bits */
  994. /* swap high and low halves */
  995. #ifdef __s390x__
  996. xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2);
  997. #else
  998. xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
  999. #endif
  1000. xacc[i] += data_swapped;
  1001. }
  1002. }
  1003. #else /* scalar variant of Accumulator - universal */
  1004. XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
  1005. const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
  1006. const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
  1007. size_t i;
  1008. XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
  1009. for (i=0; i < ACC_NB; i++) {
  1010. xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
  1011. xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
  1012. if (accWidth == XXH3_acc_64bits) {
  1013. xacc[i] += data_val;
  1014. } else {
  1015. xacc[i ^ 1] += data_val; /* swap adjacent lanes */
  1016. }
  1017. xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
  1018. }
  1019. #endif
  1020. }
  1021. /*
  1022. * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
  1023. *
  1024. * Multiplication isn't perfect, as explained by Google in HighwayHash:
  1025. *
  1026. * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
  1027. * // varying degrees. In descending order of goodness, bytes
  1028. * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
  1029. * // As expected, the upper and lower bytes are much worse.
  1030. *
  1031. * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
  1032. *
  1033. * Since our algorithm uses a pseudorandom secret to add some variance into the
  1034. * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
  1035. *
  1036. * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
  1037. * extraction.
  1038. *
  1039. * Both XXH3_64bits and XXH3_128bits use this subroutine.
  1040. */
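/*
 * Per-lane sketch of the scramble (see the vector comments below):
 *   acc[i] ^= acc[i] >> 47;
 *   acc[i] ^= secret64;
 *   acc[i] *= PRIME32_1;
 * PRIME32_1 is a 32-bit constant, which the SIMD paths exploit by splitting the
 * multiply into two 32x32->64 products.
 */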
  1041. XXH_FORCE_INLINE void
  1042. XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
  1043. {
  1044. #if (XXH_VECTOR == XXH_AVX2)
  1045. XXH_ASSERT((((size_t)acc) & 31) == 0);
  1046. { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
  1047. /* Unaligned. This is mainly for pointer arithmetic, and because
  1048. * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
  1049. const __m256i* const xsecret = (const __m256i *) secret;
  1050. const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
  1051. size_t i;
  1052. for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
  1053. /* xacc[i] ^= (xacc[i] >> 47) */
  1054. __m256i const acc_vec = xacc[i];
  1055. __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
  1056. __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
  1057. /* xacc[i] ^= xsecret; */
  1058. __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
  1059. __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
  1060. /* xacc[i] *= PRIME32_1; */
  1061. __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
  1062. __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
  1063. __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
  1064. xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
  1065. }
  1066. }
  1067. #elif (XXH_VECTOR == XXH_SSE2)
  1068. XXH_ASSERT((((size_t)acc) & 15) == 0);
  1069. { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
  1070. /* Unaligned. This is mainly for pointer arithmetic, and because
  1071. * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
  1072. const __m128i* const xsecret = (const __m128i *) secret;
  1073. const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
  1074. size_t i;
  1075. for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
  1076. /* xacc[i] ^= (xacc[i] >> 47) */
  1077. __m128i const acc_vec = xacc[i];
  1078. __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
  1079. __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
  1080. /* xacc[i] ^= xsecret[i]; */
  1081. __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
  1082. __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
  1083. /* xacc[i] *= PRIME32_1; */
  1084. __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
  1085. __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
  1086. __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
  1087. xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
  1088. }
  1089. }
  1090. #elif (XXH_VECTOR == XXH_NEON)
  1091. XXH_ASSERT((((size_t)acc) & 15) == 0);
  1092. { uint64x2_t* xacc = (uint64x2_t*) acc;
  1093. uint8_t const* xsecret = (uint8_t const*) secret;
  1094. uint32x2_t prime = vdup_n_u32 (PRIME32_1);
  1095. size_t i;
  1096. for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
  1097. /* xacc[i] ^= (xacc[i] >> 47); */
  1098. uint64x2_t acc_vec = xacc[i];
  1099. uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
  1100. uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
  1101. /* xacc[i] ^= xsecret[i]; */
  1102. uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
  1103. uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
  1104. /* xacc[i] *= PRIME32_1 */
  1105. uint32x2_t data_key_lo, data_key_hi;
1106. /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
1107. * data_key_hi = (uint32x2_t) (data_key >> 32);
1108. * data_key = UNDEFINED; */
  1109. XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
  1110. { /*
  1111. * prod_hi = (data_key >> 32) * PRIME32_1;
  1112. *
  1113. * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
  1114. * incorrectly "optimize" this:
  1115. * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
  1116. * shifted = vshll_n_u32(tmp, 32);
  1117. * to this:
  1118. * tmp = "vmulq_u64"(a, b); // no such thing!
  1119. * shifted = vshlq_n_u64(tmp, 32);
  1120. *
  1121. * However, unlike SSE, Clang lacks a 64-bit multiply routine
  1122. * for NEON, and it scalarizes two 64-bit multiplies instead.
  1123. *
  1124. * vmull_u32 has the same timing as vmul_u32, and it avoids
  1125. * this bug completely.
  1126. * See https://bugs.llvm.org/show_bug.cgi?id=39967
  1127. */
  1128. uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
  1129. /* xacc[i] = prod_hi << 32; */
  1130. xacc[i] = vshlq_n_u64(prod_hi, 32);
  1131. /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
  1132. xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
  1133. }
  1134. } }
  1135. #elif (XXH_VECTOR == XXH_VSX)
  1136. XXH_ASSERT((((size_t)acc) & 15) == 0);
  1137. { xxh_u64x2* const xacc = (xxh_u64x2*) acc;
  1138. const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
  1139. /* constants */
  1140. xxh_u64x2 const v32 = { 32, 32 };
  1141. xxh_u64x2 const v47 = { 47, 47 };
  1142. xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
  1143. size_t i;
  1144. for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) {
  1145. /* xacc[i] ^= (xacc[i] >> 47); */
  1146. xxh_u64x2 const acc_vec = xacc[i];
  1147. xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
  1148. /* xacc[i] ^= xsecret[i]; */
  1149. xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
  1150. xxh_u64x2 const data_key = data_vec ^ key_vec;
  1151. /* xacc[i] *= PRIME32_1 */
  1152. /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */
  1153. xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
  1154. /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */
  1155. xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
  1156. xacc[i] = prod_odd + (prod_even << v32);
  1157. } }
  1158. #else /* scalar variant of Scrambler - universal */
  1159. XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
  1160. const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
  1161. size_t i;
  1162. XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
  1163. for (i=0; i < ACC_NB; i++) {
  1164. xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
  1165. xxh_u64 acc64 = xacc[i];
  1166. acc64 = XXH_xorshift64(acc64, 47);
  1167. acc64 ^= key64;
  1168. acc64 *= PRIME32_1;
  1169. xacc[i] = acc64;
  1170. }
  1171. #endif
  1172. }
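/*
 * Prefetch distance for the accumulation loop below: 384 bytes corresponds to
 * six 64-byte stripes ahead of the current read position.
 */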
  1173. #define XXH_PREFETCH_DIST 384
  1174. /*
  1175. * XXH3_accumulate()
  1176. * Loops over XXH3_accumulate_512().
  1177. * Assumption: nbStripes will not overflow the secret size
  1178. */
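/*
 * Each successive stripe reads the secret at an offset advanced by
 * XXH_SECRET_CONSUME_RATE bytes, so consecutive stripes mix with overlapping
 * but shifted sections of the secret.
 */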
  1179. XXH_FORCE_INLINE void
  1180. XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,
  1181. const xxh_u8* XXH_RESTRICT input,
  1182. const xxh_u8* XXH_RESTRICT secret,
  1183. size_t nbStripes,
  1184. XXH3_accWidth_e accWidth)
  1185. {
  1186. size_t n;
  1187. for (n = 0; n < nbStripes; n++ ) {
  1188. const xxh_u8* const in = input + n*STRIPE_LEN;
  1189. XXH_PREFETCH(in + XXH_PREFETCH_DIST);
  1190. XXH3_accumulate_512(acc,
  1191. in,
  1192. secret + n*XXH_SECRET_CONSUME_RATE,
  1193. accWidth);
  1194. }
  1195. }
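/*
 * Long-input layout: the input is processed in blocks of nb_rounds stripes.
 * After each full block, the accumulators are scrambled with the last
 * STRIPE_LEN bytes of the secret. The trailing partial block is accumulated
 * the same way, and if the input does not end on a stripe boundary, one extra
 * stripe covering the final STRIPE_LEN bytes (which may overlap data already
 * read) is accumulated with a slightly offset secret.
 */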
  1196. XXH_FORCE_INLINE void
  1197. XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
  1198. const xxh_u8* XXH_RESTRICT input, size_t len,
  1199. const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
  1200. XXH3_accWidth_e accWidth)
  1201. {
  1202. size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
  1203. size_t const block_len = STRIPE_LEN * nb_rounds;
  1204. size_t const nb_blocks = len / block_len;
  1205. size_t n;
  1206. XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
  1207. for (n = 0; n < nb_blocks; n++) {
  1208. XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
  1209. XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
  1210. }
  1211. /* last partial block */
  1212. XXH_ASSERT(len > STRIPE_LEN);
  1213. { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
  1214. XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
  1215. XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);
  1216. /* last stripe */
  1217. if (len & (STRIPE_LEN - 1)) {
  1218. const xxh_u8* const p = input + len - STRIPE_LEN;
  1219. /* Do not align on 8, so that the secret is different from the scrambler */
  1220. #define XXH_SECRET_LASTACC_START 7
  1221. XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
  1222. } }
  1223. }
  1224. XXH_FORCE_INLINE xxh_u64
  1225. XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
  1226. {
  1227. return XXH3_mul128_fold64(
  1228. acc[0] ^ XXH_readLE64(secret),
  1229. acc[1] ^ XXH_readLE64(secret+8) );
  1230. }
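/*
 * XXH3_mergeAccs() folds the 8 accumulators down to 64 bits: each pair of
 * accumulators is xored with 16 bytes of secret and folded through a 128-bit
 * multiply (XXH3_mix2Accs), the four results are summed onto `start`, and the
 * total is avalanched.
 */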
  1231. static XXH64_hash_t
  1232. XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
  1233. {
  1234. xxh_u64 result64 = start;
  1235. size_t i = 0;
  1236. for (i = 0; i < 4; i++) {
  1237. result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
  1238. #if defined(__clang__) /* Clang */ \
  1239. && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \
  1240. && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
  1241. && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
  1242. /*
  1243. * UGLY HACK:
  1244. * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
  1245. * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
  1246. * XXH3_64bits, len == 256, Snapdragon 835:
  1247. * without hack: 2063.7 MB/s
  1248. * with hack: 2560.7 MB/s
  1249. */
  1250. __asm__("" : "+r" (result64));
  1251. #endif
  1252. }
  1253. return XXH3_avalanche(result64);
  1254. }
  1255. #define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
  1256. PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }
  1257. XXH_FORCE_INLINE XXH64_hash_t
  1258. XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
  1259. const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
  1260. {
  1261. XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC;
  1262. XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits);
  1263. /* converge into final hash */
  1264. XXH_STATIC_ASSERT(sizeof(acc) == 64);
  1265. /* do not align on 8, so that the secret is different from the accumulator */
  1266. #define XXH_SECRET_MERGEACCS_START 11
  1267. XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
  1268. return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
  1269. }
  1270. XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
  1271. {
  1272. if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
  1273. memcpy(dst, &v64, sizeof(v64));
  1274. }
  1275. /* XXH3_initCustomSecret() :
  1276. * destination `customSecret` is presumed allocated and same size as `kSecret`.
  1277. */
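/*
 * Each 16-byte segment of kSecret becomes { lo + seed, hi - seed }, read and
 * written in little-endian order, so a seed of 0 reproduces kSecret exactly.
 */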
  1278. XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64)
  1279. {
  1280. int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
  1281. int i;
  1282. /*
  1283. * We need a separate pointer for the hack below.
  1284. * Any decent compiler will optimize this out otherwise.
  1285. */
  1286. const xxh_u8 *kSecretPtr = kSecret;
  1287. XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
  1288. #if defined(__clang__) && defined(__aarch64__)
  1289. /*
  1290. * UGLY HACK:
  1291. * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
  1292. * placed sequentially, in order, at the top of the unrolled loop.
  1293. *
  1294. * While MOVK is great for generating constants (2 cycles for a 64-bit
  1295. * constant compared to 4 cycles for LDR), long MOVK chains stall the
  1296. * integer pipelines:
  1297. * I L S
  1298. * MOVK
  1299. * MOVK
  1300. * MOVK
  1301. * MOVK
  1302. * ADD
  1303. * SUB STR
  1304. * STR
  1305. * By forcing loads from memory (as the asm line causes Clang to assume
  1306. * that kSecretPtr has been changed), the pipelines are used more efficiently:
  1307. * I L S
  1308. * LDR
  1309. * ADD LDR
  1310. * SUB STR
  1311. * STR
  1312. * XXH3_64bits_withSeed, len == 256, Snapdragon 835
  1313. * without hack: 2654.4 MB/s
  1314. * with hack: 3202.9 MB/s
  1315. */
  1316. __asm__("" : "+r" (kSecretPtr));
  1317. #endif
  1318. /*
  1319. * Note: in debug mode, this overrides the asm optimization
  1320. * and Clang will emit MOVK chains again.
  1321. */
  1322. XXH_ASSERT(kSecretPtr == kSecret);
  1323. for (i=0; i < nbRounds; i++) {
  1324. /*
  1325. * The asm hack causes Clang to assume that kSecretPtr aliases with
  1326. * customSecret, and on aarch64, this prevented LDP from merging two
  1327. * loads together for free. Putting the loads together before the stores
  1328. * properly generates LDP.
  1329. */
  1330. xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64;
  1331. xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
  1332. XXH_writeLE64(customSecret + 16*i, lo);
  1333. XXH_writeLE64(customSecret + 16*i + 8, hi);
  1334. }
  1335. }
  1336. /*
  1337. * It's important for performance that XXH3_hashLong is not inlined. Not sure
  1338. * why (uop cache maybe?), but the difference is large and easily measurable.
  1339. */
  1340. XXH_NO_INLINE XXH64_hash_t
  1341. XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
  1342. {
  1343. return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret));
  1344. }
  1345. /*
  1346. * It's important for performance that XXH3_hashLong is not inlined. Not sure
  1347. * why (uop cache maybe?), but the difference is large and easily measurable.
  1348. */
  1349. XXH_NO_INLINE XXH64_hash_t
  1350. XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
  1351. const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
  1352. {
  1353. return XXH3_hashLong_64b_internal(input, len, secret, secretSize);
  1354. }
  1355. /*
  1356. * XXH3_hashLong_64b_withSeed():
  1357. * Generate a custom key based on alteration of default kSecret with the seed,
  1358. * and then use this key for long mode hashing.
  1359. *
  1360. * This operation is decently fast but nonetheless costs a little bit of time.
  1361. * Try to avoid it whenever possible (typically when seed==0).
  1362. *
  1363. * It's important for performance that XXH3_hashLong is not inlined. Not sure
  1364. * why (uop cache maybe?), but the difference is large and easily measurable.
  1365. */
  1366. XXH_NO_INLINE XXH64_hash_t
  1367. XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
  1368. {
  1369. XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
  1370. if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len);
  1371. XXH3_initCustomSecret(secret, seed);
  1372. return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret));
  1373. }
  1374. /* === Public entry point === */
  1375. XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
  1376. {
  1377. if (len <= 16)
  1378. return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
  1379. if (len <= 128)
  1380. return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
  1381. if (len <= XXH3_MIDSIZE_MAX)
  1382. return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
  1383. return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
  1384. }
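/*
 * Usage sketch for the one-shot 64-bit API (buffer, size, seed, secret and
 * secretSize stand for the caller's own data):
 *     XXH64_hash_t h1 = XXH3_64bits(buffer, size);
 *     XXH64_hash_t h2 = XXH3_64bits_withSeed(buffer, size, seed);
 *     XXH64_hash_t h3 = XXH3_64bits_withSecret(buffer, size, secret, secretSize);
 * `secretSize` must be at least XXH3_SECRET_SIZE_MIN.
 */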
  1385. XXH_PUBLIC_API XXH64_hash_t
  1386. XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
  1387. {
  1388. XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
  1389. /*
  1390. * If an action is to be taken if `secret` conditions are not respected,
  1391. * it should be done here.
  1392. * For now, it's a contract pre-condition.
  1393. * Adding a check and a branch here would cost performance at every hash.
  1394. */
  1395. if (len <= 16)
  1396. return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
  1397. if (len <= 128)
  1398. return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
  1399. if (len <= XXH3_MIDSIZE_MAX)
  1400. return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
  1401. return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
  1402. }
  1403. XXH_PUBLIC_API XXH64_hash_t
  1404. XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
  1405. {
  1406. if (len <= 16)
  1407. return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
  1408. if (len <= 128)
  1409. return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
  1410. if (len <= XXH3_MIDSIZE_MAX)
  1411. return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
  1412. return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
  1413. }
  1414. /* === XXH3 streaming === */
  1415. /*
  1416. * Malloc's a pointer that is always aligned to align.
  1417. *
  1418. * This must be freed with `XXH_alignedFree()`.
  1419. *
1420. * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
1421. * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2,
1422. * nor, on 32-bit, for the 16 byte aligned loads in SSE2 and NEON.
  1423. *
  1424. * This underalignment previously caused a rather obvious crash which went
  1425. * completely unnoticed due to XXH3_createState() not actually being tested.
  1426. * Credit to RedSpah for noticing this bug.
  1427. *
1428. * The alignment is done manually: functions like posix_memalign or _mm_malloc
1429. * are avoided, because to maintain portability we would have to write a fallback
1430. * like this anyway, and besides, testing for the existence of library
1431. * functions without relying on external build tools is impossible.
  1432. *
  1433. * The method is simple: Overallocate, manually align, and store the offset
  1434. * to the original behind the returned pointer.
  1435. *
  1436. * Align must be a power of 2 and 8 <= align <= 128.
  1437. */
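/*
 * Worked example: with align == 64 and a base address ending in 0x28 (40),
 * offset = 64 - 40 = 24, so ptr = base + 24 is 64-byte aligned and ptr[-1]
 * stores 24 for XXH_alignedFree() to rewind.
 */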
  1438. static void* XXH_alignedMalloc(size_t s, size_t align)
  1439. {
  1440. XXH_ASSERT(align <= 128 && align >= 8); /* range check */
  1441. XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */
  1442. XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */
  1443. { /* Overallocate to make room for manual realignment and an offset byte */
  1444. xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
  1445. if (base != NULL) {
  1446. /*
  1447. * Get the offset needed to align this pointer.
  1448. *
  1449. * Even if the returned pointer is aligned, there will always be
  1450. * at least one byte to store the offset to the original pointer.
  1451. */
  1452. size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
  1453. /* Add the offset for the now-aligned pointer */
  1454. xxh_u8* ptr = base + offset;
  1455. XXH_ASSERT((size_t)ptr % align == 0);
  1456. /* Store the offset immediately before the returned pointer. */
  1457. ptr[-1] = (xxh_u8)offset;
  1458. return ptr;
  1459. }
  1460. return NULL;
  1461. }
  1462. }
  1463. /*
  1464. * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
  1465. * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
  1466. */
  1467. static void XXH_alignedFree(void* p)
  1468. {
  1469. if (p != NULL) {
  1470. xxh_u8* ptr = (xxh_u8*)p;
1471. /* Get the offset byte we added in XXH_alignedMalloc. */
  1472. xxh_u8 offset = ptr[-1];
  1473. /* Free the original malloc'd pointer */
  1474. xxh_u8* base = ptr - offset;
  1475. XXH_free(base);
  1476. }
  1477. }
  1478. XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
  1479. {
  1480. return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
  1481. }
  1482. XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
  1483. {
  1484. XXH_alignedFree(statePtr);
  1485. return XXH_OK;
  1486. }
  1487. XXH_PUBLIC_API void
  1488. XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
  1489. {
  1490. memcpy(dst_state, src_state, sizeof(*dst_state));
  1491. }
  1492. static void
  1493. XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
  1494. XXH64_hash_t seed,
  1495. const xxh_u8* secret, size_t secretSize)
  1496. {
  1497. XXH_ASSERT(statePtr != NULL);
  1498. memset(statePtr, 0, sizeof(*statePtr));
  1499. statePtr->acc[0] = PRIME32_3;
  1500. statePtr->acc[1] = PRIME64_1;
  1501. statePtr->acc[2] = PRIME64_2;
  1502. statePtr->acc[3] = PRIME64_3;
  1503. statePtr->acc[4] = PRIME64_4;
  1504. statePtr->acc[5] = PRIME32_2;
  1505. statePtr->acc[6] = PRIME64_5;
  1506. statePtr->acc[7] = PRIME32_1;
  1507. statePtr->seed = seed;
  1508. XXH_ASSERT(secret != NULL);
  1509. statePtr->secret = secret;
  1510. XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
  1511. statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
  1512. statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
  1513. }
  1514. XXH_PUBLIC_API XXH_errorcode
  1515. XXH3_64bits_reset(XXH3_state_t* statePtr)
  1516. {
  1517. if (statePtr == NULL) return XXH_ERROR;
  1518. XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
  1519. return XXH_OK;
  1520. }
  1521. XXH_PUBLIC_API XXH_errorcode
  1522. XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
  1523. {
  1524. if (statePtr == NULL) return XXH_ERROR;
  1525. XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
  1526. if (secret == NULL) return XXH_ERROR;
  1527. if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
  1528. return XXH_OK;
  1529. }
  1530. XXH_PUBLIC_API XXH_errorcode
  1531. XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
  1532. {
  1533. if (statePtr == NULL) return XXH_ERROR;
  1534. XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
  1535. XXH3_initCustomSecret(statePtr->customSecret, seed);
  1536. statePtr->secret = statePtr->customSecret;
  1537. return XXH_OK;
  1538. }
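/*
 * XXH3_consumeStripes() feeds `totalStripes` stripes to the accumulators while
 * tracking the position inside the current block via *nbStripesSoFarPtr.
 * If the block boundary falls within this call, it accumulates up to the
 * boundary, scrambles, then restarts from the beginning of the secret.
 */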
  1539. XXH_FORCE_INLINE void
  1540. XXH3_consumeStripes( xxh_u64* acc,
  1541. XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
  1542. const xxh_u8* input, size_t totalStripes,
  1543. const xxh_u8* secret, size_t secretLimit,
  1544. XXH3_accWidth_e accWidth)
  1545. {
  1546. XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
  1547. if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
  1548. /* need a scrambling operation */
  1549. size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr;
  1550. XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth);
  1551. XXH3_scrambleAcc(acc, secret + secretLimit);
  1552. XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth);
  1553. *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes);
  1554. } else {
  1555. XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth);
  1556. *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes;
  1557. }
  1558. }
  1559. /*
  1560. * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
  1561. */
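/*
 * Buffering strategy: input that fits in the remaining internal buffer space
 * is simply copied and kept for later. Otherwise the buffer is topped up and
 * consumed, the rest of the input is consumed in whole-buffer quantities, and
 * any tail is re-buffered for the next update or for digest().
 */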
  1562. XXH_FORCE_INLINE XXH_errorcode
  1563. XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth)
  1564. {
  1565. if (input==NULL)
  1566. #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
  1567. return XXH_OK;
  1568. #else
  1569. return XXH_ERROR;
  1570. #endif
  1571. { const xxh_u8* const bEnd = input + len;
  1572. state->totalLen += len;
  1573. if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
  1574. XXH_memcpy(state->buffer + state->bufferedSize, input, len);
  1575. state->bufferedSize += (XXH32_hash_t)len;
  1576. return XXH_OK;
  1577. }
  1578. /* input is now > XXH3_INTERNALBUFFER_SIZE */
  1579. #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN)
  1580. XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */
  1581. /*
  1582. * There is some input left inside the internal buffer.
  1583. * Fill it, then consume it.
  1584. */
  1585. if (state->bufferedSize) {
  1586. size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
  1587. XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
  1588. input += loadSize;
  1589. XXH3_consumeStripes(state->acc,
  1590. &state->nbStripesSoFar, state->nbStripesPerBlock,
  1591. state->buffer, XXH3_INTERNALBUFFER_STRIPES,
  1592. state->secret, state->secretLimit,
  1593. accWidth);
  1594. state->bufferedSize = 0;
  1595. }
  1596. /* Consume input by full buffer quantities */
  1597. if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) {
  1598. const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
  1599. do {
  1600. XXH3_consumeStripes(state->acc,
  1601. &state->nbStripesSoFar, state->nbStripesPerBlock,
  1602. input, XXH3_INTERNALBUFFER_STRIPES,
  1603. state->secret, state->secretLimit,
  1604. accWidth);
  1605. input += XXH3_INTERNALBUFFER_SIZE;
  1606. } while (input<=limit);
  1607. }
  1608. if (input < bEnd) { /* Some remaining input: buffer it */
  1609. XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
  1610. state->bufferedSize = (XXH32_hash_t)(bEnd-input);
  1611. }
  1612. }
  1613. return XXH_OK;
  1614. }
  1615. XXH_PUBLIC_API XXH_errorcode
  1616. XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
  1617. {
  1618. return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits);
  1619. }
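/*
 * When fewer than STRIPE_LEN bytes are buffered at digest time, the final
 * stripe is reassembled from the last `catchupSize` bytes at the end of
 * state->buffer followed by the currently buffered bytes, so that a full
 * STRIPE_LEN block is always fed to XXH3_accumulate_512().
 */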
  1620. XXH_FORCE_INLINE void
  1621. XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth)
  1622. {
  1623. /*
  1624. * Digest on a local copy. This way, the state remains unaltered, and it can
  1625. * continue ingesting more input afterwards.
  1626. */
  1627. memcpy(acc, state->acc, sizeof(state->acc));
  1628. if (state->bufferedSize >= STRIPE_LEN) {
  1629. size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
  1630. XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar;
  1631. XXH3_consumeStripes(acc,
  1632. &nbStripesSoFar, state->nbStripesPerBlock,
  1633. state->buffer, totalNbStripes,
  1634. state->secret, state->secretLimit,
  1635. accWidth);
  1636. if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */
  1637. XXH3_accumulate_512(acc,
  1638. state->buffer + state->bufferedSize - STRIPE_LEN,
  1639. state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
  1640. accWidth);
  1641. }
  1642. } else { /* bufferedSize < STRIPE_LEN */
  1643. if (state->bufferedSize) { /* one last stripe */
  1644. xxh_u8 lastStripe[STRIPE_LEN];
  1645. size_t const catchupSize = STRIPE_LEN - state->bufferedSize;
  1646. memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
  1647. memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
  1648. XXH3_accumulate_512(acc,
  1649. lastStripe,
  1650. state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
  1651. accWidth);
  1652. } }
  1653. }
  1654. XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
  1655. {
  1656. if (state->totalLen > XXH3_MIDSIZE_MAX) {
  1657. XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
  1658. XXH3_digest_long(acc, state, XXH3_acc_64bits);
  1659. return XXH3_mergeAccs(acc,
  1660. state->secret + XXH_SECRET_MERGEACCS_START,
  1661. (xxh_u64)state->totalLen * PRIME64_1);
  1662. }
  1663. /* len <= XXH3_MIDSIZE_MAX: short code */
  1664. if (state->seed)
  1665. return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
  1666. return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
  1667. state->secret, state->secretLimit + STRIPE_LEN);
  1668. }
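/*
 * Streaming usage sketch (more_data, chunk and chunkSize are placeholders for
 * the caller's own I/O loop):
 *     XXH3_state_t* st = XXH3_createState();
 *     if (st == NULL) { / * handle allocation failure * / }
 *     XXH3_64bits_reset(st);   /- or _reset_withSeed() / _reset_withSecret() -/
 *     while (more_data) XXH3_64bits_update(st, chunk, chunkSize);
 *     { XXH64_hash_t h = XXH3_64bits_digest(st); (void)h; }
 *     XXH3_freeState(st);
 */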
  1669. /* ==========================================
  1670. * XXH3 128 bits (a.k.a XXH128)
  1671. * ==========================================
  1672. * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
  1673. * even without counting the significantly larger output size.
  1674. *
  1675. * For example, extra steps are taken to avoid the seed-dependent collisions
  1676. * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
  1677. *
  1678. * This strength naturally comes at the cost of some speed, especially on short
1679. * lengths. Note that longer hashes are about as fast as the 64-bit version,
1680. * since the long mode is only a slight modification of the 64-bit loop.
  1681. *
  1682. * XXH128 is also more oriented towards 64-bit machines. It is still extremely
1683. * fast for a _128-bit_ hash on 32-bit (it usually outperforms XXH64).
  1684. */
  1685. XXH_FORCE_INLINE XXH128_hash_t
  1686. XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
  1687. {
  1688. /* A doubled version of 1to3_64b with different constants. */
  1689. XXH_ASSERT(input != NULL);
  1690. XXH_ASSERT(1 <= len && len <= 3);
  1691. XXH_ASSERT(secret != NULL);
  1692. /*
  1693. * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
  1694. * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
  1695. * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
  1696. */
  1697. { xxh_u8 const c1 = input[0];
  1698. xxh_u8 const c2 = input[len >> 1];
  1699. xxh_u8 const c3 = input[len - 1];
  1700. xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
  1701. | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
  1702. xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
  1703. xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
  1704. xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
  1705. xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
  1706. xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
  1707. xxh_u64 const mixedl = keyed_lo * PRIME64_1;
  1708. xxh_u64 const mixedh = keyed_hi * PRIME64_5;
  1709. XXH128_hash_t h128;
  1710. h128.low64 = XXH3_avalanche(mixedl);
  1711. h128.high64 = XXH3_avalanche(mixedh);
  1712. return h128;
  1713. }
  1714. }
  1715. XXH_FORCE_INLINE XXH128_hash_t
  1716. XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
  1717. {
  1718. XXH_ASSERT(input != NULL);
  1719. XXH_ASSERT(secret != NULL);
  1720. XXH_ASSERT(4 <= len && len <= 8);
  1721. seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
  1722. { xxh_u32 const input_lo = XXH_readLE32(input);
  1723. xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
  1724. xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
  1725. xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
  1726. xxh_u64 const keyed = input_64 ^ bitflip;
1727. /* Shift len left so the added term is even; PRIME64_1 is odd, so the multiplier stays odd, avoiding even multiplies. */
  1728. XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2));
  1729. m128.high64 += (m128.low64 << 1);
  1730. m128.low64 ^= (m128.high64 >> 3);
  1731. m128.low64 = XXH_xorshift64(m128.low64, 35);
  1732. m128.low64 *= 0x9FB21C651E98DF25ULL;
  1733. m128.low64 = XXH_xorshift64(m128.low64, 28);
  1734. m128.high64 = XXH3_avalanche(m128.high64);
  1735. return m128;
  1736. }
  1737. }
  1738. XXH_FORCE_INLINE XXH128_hash_t
  1739. XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
  1740. {
  1741. XXH_ASSERT(input != NULL);
  1742. XXH_ASSERT(secret != NULL);
  1743. XXH_ASSERT(9 <= len && len <= 16);
  1744. { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
  1745. xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
  1746. xxh_u64 const input_lo = XXH_readLE64(input);
  1747. xxh_u64 input_hi = XXH_readLE64(input + len - 8);
  1748. XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1);
  1749. /*
  1750. * Put len in the middle of m128 to ensure that the length gets mixed to
  1751. * both the low and high bits in the 128x64 multiply below.
  1752. */
  1753. m128.low64 += (xxh_u64)(len - 1) << 54;
  1754. input_hi ^= bitfliph;
  1755. /*
  1756. * Add the high 32 bits of input_hi to the high 32 bits of m128, then
  1757. * add the long product of the low 32 bits of input_hi and PRIME32_2 to
  1758. * the high 64 bits of m128.
  1759. *
  1760. * The best approach to this operation is different on 32-bit and 64-bit.
  1761. */
  1762. if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
  1763. /*
  1764. * 32-bit optimized version, which is more readable.
  1765. *
  1766. * On 32-bit, it removes an ADC and delays a dependency between the two
  1767. * halves of m128.high64, but it generates an extra mask on 64-bit.
  1768. */
  1769. m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2);
  1770. } else {
  1771. /*
  1772. * 64-bit optimized (albeit more confusing) version.
  1773. *
  1774. * Uses some properties of addition and multiplication to remove the mask:
  1775. *
  1776. * Let:
  1777. * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
  1778. * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
  1779. * c = PRIME32_2
  1780. *
  1781. * a + (b * c)
  1782. * Inverse Property: x + y - x == y
  1783. * a + (b * (1 + c - 1))
  1784. * Distributive Property: x * (y + z) == (x * y) + (x * z)
  1785. * a + (b * 1) + (b * (c - 1))
  1786. * Identity Property: x * 1 == x
  1787. * a + b + (b * (c - 1))
  1788. *
  1789. * Substitute a, b, and c:
  1790. * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1))
  1791. *
  1792. * Since input_hi.hi + input_hi.lo == input_hi, we get this:
  1793. * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1))
  1794. */
  1795. m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1);
  1796. }
  1797. /* m128 ^= XXH_swap64(m128 >> 64); */
  1798. m128.low64 ^= XXH_swap64(m128.high64);
  1799. { /* 128x64 multiply: h128 = m128 * PRIME64_2; */
  1800. XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
  1801. h128.high64 += m128.high64 * PRIME64_2;
  1802. h128.low64 = XXH3_avalanche(h128.low64);
  1803. h128.high64 = XXH3_avalanche(h128.high64);
  1804. return h128;
  1805. } }
  1806. }
  1807. /*
  1808. * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
  1809. */
  1810. XXH_FORCE_INLINE XXH128_hash_t
  1811. XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
  1812. {
  1813. XXH_ASSERT(len <= 16);
  1814. { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
  1815. if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
  1816. if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
  1817. { XXH128_hash_t h128;
  1818. xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
  1819. xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
  1820. h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl);
  1821. h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph);
  1822. return h128;
  1823. } }
  1824. }
  1825. /*
  1826. * A bit slower than XXH3_mix16B, but handles multiply by zero better.
  1827. */
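/*
 * The raw input words are folded back into the opposite accumulator half, so
 * even if one of the XXH3_mix16B multiplies degenerates (e.g. an operand of
 * zero), the input bytes still influence the result; presumably this is what
 * "handles multiply by zero better" refers to.
 */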
  1828. XXH_FORCE_INLINE XXH128_hash_t
  1829. XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
  1830. const xxh_u8* secret, XXH64_hash_t seed)
  1831. {
  1832. acc.low64 += XXH3_mix16B (input_1, secret+0, seed);
  1833. acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
  1834. acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
  1835. acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
  1836. return acc;
  1837. }
  1838. XXH_FORCE_INLINE XXH128_hash_t
  1839. XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
  1840. const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
  1841. XXH64_hash_t seed)
  1842. {
  1843. XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
  1844. XXH_ASSERT(16 < len && len <= 128);
  1845. { XXH128_hash_t acc;
  1846. acc.low64 = len * PRIME64_1;
  1847. acc.high64 = 0;
  1848. if (len > 32) {
  1849. if (len > 64) {
  1850. if (len > 96) {
  1851. acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
  1852. }
  1853. acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
  1854. }
  1855. acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
  1856. }
  1857. acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
  1858. { XXH128_hash_t h128;
  1859. h128.low64 = acc.low64 + acc.high64;
  1860. h128.high64 = (acc.low64 * PRIME64_1)
  1861. + (acc.high64 * PRIME64_4)
  1862. + ((len - seed) * PRIME64_2);
  1863. h128.low64 = XXH3_avalanche(h128.low64);
  1864. h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
  1865. return h128;
  1866. }
  1867. }
  1868. }
  1869. XXH_NO_INLINE XXH128_hash_t
  1870. XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
  1871. const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
  1872. XXH64_hash_t seed)
  1873. {
  1874. XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
  1875. XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
  1876. { XXH128_hash_t acc;
  1877. int const nbRounds = (int)len / 32;
  1878. int i;
  1879. acc.low64 = len * PRIME64_1;
  1880. acc.high64 = 0;
  1881. for (i=0; i<4; i++) {
  1882. acc = XXH128_mix32B(acc,
  1883. input + (32 * i),
  1884. input + (32 * i) + 16,
  1885. secret + (32 * i),
  1886. seed);
  1887. }
  1888. acc.low64 = XXH3_avalanche(acc.low64);
  1889. acc.high64 = XXH3_avalanche(acc.high64);
  1890. XXH_ASSERT(nbRounds >= 4);
  1891. for (i=4 ; i < nbRounds; i++) {
  1892. acc = XXH128_mix32B(acc,
  1893. input + (32 * i),
  1894. input + (32 * i) + 16,
  1895. secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
  1896. seed);
  1897. }
  1898. /* last bytes */
  1899. acc = XXH128_mix32B(acc,
  1900. input + len - 16,
  1901. input + len - 32,
  1902. secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
  1903. 0ULL - seed);
  1904. { XXH128_hash_t h128;
  1905. h128.low64 = acc.low64 + acc.high64;
  1906. h128.high64 = (acc.low64 * PRIME64_1)
  1907. + (acc.high64 * PRIME64_4)
  1908. + ((len - seed) * PRIME64_2);
  1909. h128.low64 = XXH3_avalanche(h128.low64);
  1910. h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
  1911. return h128;
  1912. }
  1913. }
  1914. }
  1915. XXH_FORCE_INLINE XXH128_hash_t
  1916. XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
  1917. const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
  1918. {
  1919. XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC;
  1920. XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits);
  1921. /* converge into final hash */
  1922. XXH_STATIC_ASSERT(sizeof(acc) == 64);
  1923. XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
  1924. { XXH128_hash_t h128;
  1925. h128.low64 = XXH3_mergeAccs(acc,
  1926. secret + XXH_SECRET_MERGEACCS_START,
  1927. (xxh_u64)len * PRIME64_1);
  1928. h128.high64 = XXH3_mergeAccs(acc,
  1929. secret + secretSize
  1930. - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
  1931. ~((xxh_u64)len * PRIME64_2));
  1932. return h128;
  1933. }
  1934. }
  1935. /*
  1936. * It's important for performance that XXH3_hashLong is not inlined. Not sure
  1937. * why (uop cache maybe?), but the difference is large and easily measurable.
  1938. */
  1939. XXH_NO_INLINE XXH128_hash_t
  1940. XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
  1941. {
  1942. return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
  1943. }
  1944. /*
  1945. * It's important for performance that XXH3_hashLong is not inlined. Not sure
  1946. * why (uop cache maybe?), but the difference is large and easily measurable.
  1947. */
  1948. XXH_NO_INLINE XXH128_hash_t
  1949. XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
  1950. const xxh_u8* secret, size_t secretSize)
  1951. {
  1952. return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
  1953. }
  1954. /*
  1955. * It's important for performance that XXH3_hashLong is not inlined. Not sure
  1956. * why (uop cache maybe?), but the difference is large and easily measurable.
  1957. */
  1958. XXH_NO_INLINE XXH128_hash_t
  1959. XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
  1960. {
  1961. XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
  1962. if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);
  1963. XXH3_initCustomSecret(secret, seed);
  1964. return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
  1965. }
  1966. XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
  1967. {
  1968. if (len <= 16)
  1969. return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
  1970. if (len <= 128)
  1971. return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
  1972. if (len <= XXH3_MIDSIZE_MAX)
  1973. return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
  1974. return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
  1975. }
  1976. XXH_PUBLIC_API XXH128_hash_t
  1977. XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
  1978. {
  1979. XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
  1980. /*
  1981. * If an action is to be taken if `secret` conditions are not respected,
  1982. * it should be done here.
  1983. * For now, it's a contract pre-condition.
  1984. * Adding a check and a branch here would cost performance at every hash.
  1985. */
  1986. if (len <= 16)
  1987. return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
  1988. if (len <= 128)
  1989. return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
  1990. if (len <= XXH3_MIDSIZE_MAX)
  1991. return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
  1992. return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
  1993. }
  1994. XXH_PUBLIC_API XXH128_hash_t
  1995. XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
  1996. {
  1997. if (len <= 16)
  1998. return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
  1999. if (len <= 128)
  2000. return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
  2001. if (len <= XXH3_MIDSIZE_MAX)
  2002. return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
  2003. return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
  2004. }
  2005. XXH_PUBLIC_API XXH128_hash_t
  2006. XXH128(const void* input, size_t len, XXH64_hash_t seed)
  2007. {
  2008. return XXH3_128bits_withSeed(input, len, seed);
  2009. }
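/*
 * Usage sketch (bufA/sizeA and bufB/sizeB are the caller's own data):
 *     XXH128_hash_t ha = XXH128(bufA, sizeA, 0);
 *     XXH128_hash_t hb = XXH3_128bits(bufB, sizeB);   /- seedless variant -/
 *     if (XXH128_isEqual(ha, hb)) { / * see utility functions below * / }
 */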
  2010. /* === XXH3 128-bit streaming === */
  2011. /*
2012. * All the functions are actually the same as for the 64-bit streaming variant.
2013. * The only difference is the finalization routine.
  2014. */
  2015. static void
  2016. XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
  2017. XXH64_hash_t seed,
  2018. const xxh_u8* secret, size_t secretSize)
  2019. {
  2020. XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
  2021. }
  2022. XXH_PUBLIC_API XXH_errorcode
  2023. XXH3_128bits_reset(XXH3_state_t* statePtr)
  2024. {
  2025. if (statePtr == NULL) return XXH_ERROR;
  2026. XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
  2027. return XXH_OK;
  2028. }
  2029. XXH_PUBLIC_API XXH_errorcode
  2030. XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
  2031. {
  2032. if (statePtr == NULL) return XXH_ERROR;
  2033. XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
  2034. if (secret == NULL) return XXH_ERROR;
  2035. if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
  2036. return XXH_OK;
  2037. }
  2038. XXH_PUBLIC_API XXH_errorcode
  2039. XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
  2040. {
  2041. if (statePtr == NULL) return XXH_ERROR;
  2042. XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
  2043. XXH3_initCustomSecret(statePtr->customSecret, seed);
  2044. statePtr->secret = statePtr->customSecret;
  2045. return XXH_OK;
  2046. }
  2047. XXH_PUBLIC_API XXH_errorcode
  2048. XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
  2049. {
  2050. return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
  2051. }
  2052. XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
  2053. {
  2054. if (state->totalLen > XXH3_MIDSIZE_MAX) {
  2055. XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
  2056. XXH3_digest_long(acc, state, XXH3_acc_128bits);
  2057. XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
  2058. { XXH128_hash_t h128;
  2059. h128.low64 = XXH3_mergeAccs(acc,
  2060. state->secret + XXH_SECRET_MERGEACCS_START,
  2061. (xxh_u64)state->totalLen * PRIME64_1);
  2062. h128.high64 = XXH3_mergeAccs(acc,
  2063. state->secret + state->secretLimit + STRIPE_LEN
  2064. - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
  2065. ~((xxh_u64)state->totalLen * PRIME64_2));
  2066. return h128;
  2067. }
  2068. }
  2069. /* len <= XXH3_MIDSIZE_MAX : short code */
  2070. if (state->seed)
  2071. return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
  2072. return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
  2073. state->secret, state->secretLimit + STRIPE_LEN);
  2074. }
  2075. /* 128-bit utility functions */
  2076. #include <string.h> /* memcmp, memcpy */
2077. /* return : 1 if equal, 0 if different */
  2078. XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
  2079. {
  2080. /* note : XXH128_hash_t is compact, it has no padding byte */
  2081. return !(memcmp(&h1, &h2, sizeof(h1)));
  2082. }
  2083. /* This prototype is compatible with stdlib's qsort().
  2084. * return : >0 if *h128_1 > *h128_2
  2085. * <0 if *h128_1 < *h128_2
  2086. * =0 if *h128_1 == *h128_2 */
  2087. XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
  2088. {
  2089. XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
  2090. XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
  2091. int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
2092. /* note : this comparison bets that, in most cases, hash values are different */
  2093. if (hcmp) return hcmp;
  2094. return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
  2095. }
  2096. /*====== Canonical representation ======*/
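/*
 * The canonical form stores the hash in big-endian byte order, high64 first,
 * so the serialized bytes are identical on little- and big-endian hosts.
 */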
  2097. XXH_PUBLIC_API void
  2098. XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
  2099. {
  2100. XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
  2101. if (XXH_CPU_LITTLE_ENDIAN) {
  2102. hash.high64 = XXH_swap64(hash.high64);
  2103. hash.low64 = XXH_swap64(hash.low64);
  2104. }
  2105. memcpy(dst, &hash.high64, sizeof(hash.high64));
  2106. memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
  2107. }
  2108. XXH_PUBLIC_API XXH128_hash_t
  2109. XXH128_hashFromCanonical(const XXH128_canonical_t* src)
  2110. {
  2111. XXH128_hash_t h;
  2112. h.high64 = XXH_readBE64(src);
  2113. h.low64 = XXH_readBE64(src->digest + 8);
  2114. return h;
  2115. }
  2116. /* Pop our optimization override from above */
  2117. #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
  2118. && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  2119. && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
  2120. # pragma GCC pop_options
  2121. #endif
  2122. #endif /* XXH3_H_1397135465 */