Quellcode durchsuchen

ADD: XXH3-128 hash algorithm (#issue #1779)

Alexander Koblov vor 1 Jahr
Ursprung
Commit
ea8c59fed2

+ 117 - 0
components/kascrypt/Hashes/dcpxxh3.pas

@@ -0,0 +1,117 @@
+{******************************************************************************}
+{* DCPcrypt v2.0 written by David Barton ([email protected]) **********}
+{******************************************************************************}
+{* A binary compatible implementation of XXH3-128                             *}
+{******************************************************************************}
+{* Copyright (C) 2024 Alexander Koblov ([email protected])                    *}
+{* Permission is hereby granted, free of charge, to any person obtaining a    *}
+{* copy of this software and associated documentation files (the "Software"), *}
+{* to deal in the Software without restriction, including without limitation  *}
+{* the rights to use, copy, modify, merge, publish, distribute, sublicense,   *}
+{* and/or sell copies of the Software, and to permit persons to whom the      *}
+{* Software is furnished to do so, subject to the following conditions:       *}
+{*                                                                            *}
+{* The above copyright notice and this permission notice shall be included in *}
+{* all copies or substantial portions of the Software.                        *}
+{*                                                                            *}
+{* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *}
+{* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *}
+{* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *}
+{* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER *}
+{* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING    *}
+{* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER        *}
+{* DEALINGS IN THE SOFTWARE.                                                  *}
+{******************************************************************************}
+unit DCPxxh3;
+
+{$mode objfpc}{$H+}
+
+interface
+
+uses
+  Classes, Sysutils, DCPcrypt2, DCxxhash;
+
+type
+
+  { TDCP_xxh3_128 }
+
+  TDCP_xxh3_128 = class(TDCP_hash)
+  protected
+    S: PXXH3_state_t;
+  public
+    class function GetAlgorithm: string; override;
+    class function GetHashSize: integer; override;
+    class function SelfTest: boolean; override;
+    constructor Create(AOwner: TComponent); override;
+    destructor Destroy; override;
+    procedure Init; override;
+    procedure Burn; override;
+    procedure Update(const Buffer; Size: longword); override;
+    procedure Final(var Digest); override;
+  end;
+
+implementation
+{$R-}{$Q-}
+
+{ TDCP_xxh3_128 }
+
+class function TDCP_xxh3_128.GetHashSize: integer;
+begin
+  Result:= 128;
+end;
+
+class function TDCP_xxh3_128.GetAlgorithm: string;
+begin
+  Result:= 'XXH3-128';
+end;
+
+class function TDCP_xxh3_128.SelfTest: boolean;
+begin
+  Result:= False; // TODO: SelfTest XXH3_128
+end;
+
+constructor TDCP_xxh3_128.Create(AOwner: TComponent);
+begin
+  inherited Create(AOwner);
+  S:= XXH3_createState();
+end;
+
+destructor TDCP_xxh3_128.Destroy;
+begin
+  XXH3_freeState(S);
+  inherited Destroy;
+end;
+
+procedure TDCP_xxh3_128.Init;
+begin
+  Burn;
+  fInitialized:= true;
+end;
+
+procedure TDCP_xxh3_128.Burn;
+begin
+  XXH3_128bits_reset(S);
+  fInitialized:= false;
+end;
+
+procedure TDCP_xxh3_128.Update(const Buffer; Size: longword);
+begin
+  XXH3_128bits_update(S, @Buffer, Size);
+end;
+
+procedure TDCP_xxh3_128.Final(var Digest);
+var
+  Temp: UInt64;
+  Hash: XXH128_hash_t;
+begin
+  if not fInitialized then
+    raise EDCP_hash.Create('Hash not initialized');
+  Hash:= XXH3_128bits_digest(S);
+  Temp:= SwapEndian(Hash.low64);
+  Hash.low64:= SwapEndian(Hash.high64);
+  Hash.high64:= Temp;
+  Move(Hash, Digest, Sizeof(XXH128_hash_t));
+  Burn;
+end;
+
+end.

+ 1104 - 0
components/kascrypt/Hashes/dcxxhash.pas

@@ -0,0 +1,1104 @@
+{
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2023 Yann Collet
+ *
+ * The Pascal translation by Alexander Koblov, 2024
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+}
+
+unit DCxxhash;
+
+{$mode objfpc}{$H+}
+{$inline on}{$Q-}
+{$macro on}{$R-}
+
+interface
+
+uses
+  SysUtils;
+
+const
+  XXH3_SECRET_DEFAULT_SIZE = 192;
+  XXH3_INTERNALBUFFER_SIZE = 256;
+
+type
+  XXH64_hash_t = UInt64;
+  XXH32_hash_t = UInt32;
+
+  XXH128_hash_t = record
+    low64: XXH64_hash_t;
+    high64: XXH64_hash_t;
+  end;
+
+  {$CODEALIGN RECORDMIN=64}
+  PXXH3_state_t = ^XXH3_state_t;
+  XXH3_state_t = record
+    acc: array[0..7] of XXH64_hash_t;
+    customSecret: array[0..Pred(XXH3_SECRET_DEFAULT_SIZE)] of Byte;
+    buffer: array[0..Pred(XXH3_INTERNALBUFFER_SIZE)] of Byte;
+    bufferedSize: XXH32_hash_t;
+    useSeed: XXH32_hash_t;
+    nbStripesSoFar: UIntPtr;
+    totalLen: XXH64_hash_t;
+    nbStripesPerBlock: UIntPtr;
+    secretLimit: UIntPtr;
+    seed: XXH64_hash_t;
+    reserved64: XXH64_hash_t;
+    extSecret: PByte;
+  end;
+
+function XXH3_createState: PXXH3_state_t;
+procedure XXH3_freeState(statePtr: PXXH3_state_t);
+procedure XXH3_128bits_reset(statePtr: PXXH3_state_t);
+procedure XXH3_128bits_update(state: PXXH3_state_t; const input: PByte; len: UIntPtr);
+function XXH3_128bits_digest (const state: PXXH3_state_t): XXH128_hash_t;
+
+implementation
+
+{$IF DEFINED(CPUX86_64)}
+uses
+  CPU;
+{$ENDIF}
+
+{$CODEALIGN CONSTMIN=64}
+
+const
+  XXH_PRIME32_1 = $9E3779B1;
+  XXH_PRIME32_2 = $85EBCA77;
+  XXH_PRIME32_3 = $C2B2AE3D;
+
+  XXH_PRIME64_1 = UInt64($9E3779B185EBCA87);
+  XXH_PRIME64_2 = UInt64($C2B2AE3D27D4EB4F);
+  XXH_PRIME64_3 = UInt64($165667B19E3779F9);
+  XXH_PRIME64_4 = UInt64($85EBCA77C2B2AE63);
+  XXH_PRIME64_5 = UInt64($27D4EB2F165667C5);
+
+  XXH3_MIDSIZE_MAX = 240;
+  XXH_SECRET_LASTACC_START = 7;
+  XXH_SECRET_MERGEACCS_START = 11;
+
+  XXH3_MIDSIZE_STARTOFFSET = 3;
+  XXH3_MIDSIZE_LASTOFFSET  = 17;
+
+  XXH_SECRET_CONSUME_RATE = 8;
+  XXH_STRIPE_LEN = 64;
+  XXH_ACC_SIZE = 64;
+  XXH_ACC_NB = 8;
+
+  XXH3_SECRET_SIZE_MIN = 136;
+  XXH_SECRET_DEFAULT_SIZE = 192;
+
+  PRIME_MX1 = UInt64($165667919E3779F9);
+  PRIME_MX2 = UInt64($9FB21C651E98DF25);
+
+  XXH_ACC_ALIGN = 64; //* for compatibility with avx512 */
+
+  XXH3_INTERNALBUFFER_STRIPES = (XXH3_INTERNALBUFFER_SIZE div XXH_STRIPE_LEN);
+
+  //*! Pseudorandom secret taken directly from FARSH. */
+  const XXH3_kSecret: array[0..Pred(XXH_SECRET_DEFAULT_SIZE)] of Byte = (
+      $b8, $fe, $6c, $39, $23, $a4, $4b, $be, $7c, $01, $81, $2c, $f7, $21, $ad, $1c,
+      $de, $d4, $6d, $e9, $83, $90, $97, $db, $72, $40, $a4, $a4, $b7, $b3, $67, $1f,
+      $cb, $79, $e6, $4e, $cc, $c0, $e5, $78, $82, $5a, $d0, $7d, $cc, $ff, $72, $21,
+      $b8, $08, $46, $74, $f7, $43, $24, $8e, $e0, $35, $90, $e6, $81, $3a, $26, $4c,
+      $3c, $28, $52, $bb, $91, $c3, $00, $cb, $88, $d0, $65, $8b, $1b, $53, $2e, $a3,
+      $71, $64, $48, $97, $a2, $0d, $f9, $4e, $38, $19, $ef, $46, $a9, $de, $ac, $d8,
+      $a8, $fa, $76, $3f, $e3, $9c, $34, $3f, $f9, $dc, $bb, $c7, $c7, $0b, $4f, $1d,
+      $8a, $51, $e0, $4b, $cd, $b4, $59, $31, $c8, $9f, $7e, $c9, $d9, $78, $73, $64,
+      $ea, $c5, $ac, $83, $34, $d3, $eb, $c3, $c5, $81, $a0, $ff, $fa, $13, $63, $eb,
+      $17, $0d, $dd, $51, $b7, $f0, $da, $49, $d3, $16, $55, $26, $29, $d4, $68, $9e,
+      $2b, $16, $be, $58, $7d, $47, $a1, $fc, $8f, $f8, $b8, $d1, $7a, $d0, $31, $ce,
+      $45, $cb, $3a, $8f, $95, $16, $04, $28, $af, $d7, $fb, $ca, $bb, $4b, $40, $7e
+  );
+
+type
+  TXXH3_scrambleAcc_f = procedure(acc: PByte; const secret: PByte);
+  TXXH3_accumulate_512_f = procedure(acc: PByte; const input: PByte; const secret: PByte);
+  TXXH3_accumulate_f = procedure(acc: PByte; const input: PByte; const secret: PByte; nbStripes: UIntPtr);
+
+var
+  XXH3_accumulate: TXXH3_accumulate_f;
+  XXH3_accumulate_512: TXXH3_accumulate_512_f;
+
+function XXH_readLE32(const ptr: Pointer): UInt32; inline;
+begin
+  Result:= PUInt32(ptr)^;
+end;
+
+function XXH_readLE64(const ptr: Pointer): UInt64; inline;
+begin
+  Result:= PUInt64(ptr)^;
+end;
+
+function XXH_mult32to64(x, y: UInt64): UInt64; inline;
+begin
+  Result:= (x and $FFFFFFFF) * (y and $FFFFFFFF);
+end;
+
+function XXH64_avalanche(hash: UInt64): UInt64;
+begin
+  hash := hash xor hash shr 33;
+  hash *= XXH_PRIME64_2;
+  hash := hash xor hash shr 29;
+  hash *= XXH_PRIME64_3;
+  hash := hash xor hash shr 32;
+  Result := hash;
+end;
+
+function XXH_alignedMalloc(s: UIntPtr; align: UIntPtr): Pointer;
+var
+  offset: UIntPtr;
+  base, ptr: PByte;
+begin
+  Assert((align <= 128) and (align >= 8)); //* range check */
+  Assert((align and (align-1)) = 0);       //* power of 2 */
+  Assert((s <> 0) and (s < (s + align)));  //* empty/overflow */
+  //* Overallocate to make room for manual realignment and an offset byte */
+  base := GetMem(s + align);
+  if (base <> nil) then
+  begin
+    {*
+     * Get the offset needed to align this pointer.
+     *
+     * Even if the returned pointer is aligned, there will always be
+     * at least one byte to store the offset to the original pointer.
+    *}
+    offset := align - (UIntPtr(base) and (align - 1)); //* base % align */
+    //* Add the offset for the now-aligned pointer */
+    ptr := base + offset;
+
+    Assert(UIntPtr(ptr) mod align = 0);
+
+    //* Store the offset immediately before the returned pointer. */
+    ptr[-1] := Byte(offset);
+    Exit(ptr);
+  end;
+  Result:= nil;
+end;
+
+procedure XXH_alignedFree(p: Pointer);
+var
+  offset: Byte;
+  base, ptr: PByte;
+begin
+  if (p <> nil) then
+  begin
+    ptr:= PByte(p);
+    //* Get the offset byte we added in XXH_malloc. */
+    offset:= ptr[-1];
+    //* Free the original malloc'd pointer */
+    base:= ptr - offset;
+    FreeMem(base);
+  end;
+end;
+
+function XXH3_createState: PXXH3_state_t;
+begin
+  Result:= XXH_alignedMalloc(SizeOf(XXH3_state_t), XXH_ACC_ALIGN);
+  if (Result = nil) then Exit(nil);
+  Result^.seed:= 0;
+  Result^.extSecret:= nil;
+end;
+
+procedure XXH3_freeState(statePtr: PXXH3_state_t);
+begin
+  XXH_alignedFree(statePtr);
+end;
+
+procedure XXH3_reset_internal(statePtr: PXXH3_state_t; seed: XXH64_hash_t;
+                              const secret: PByte; secretSize: UIntPtr);
+var
+  initStart: PByte;
+  initLength: UIntPtr;
+begin
+  Assert(statePtr <> nil);
+  initStart:= @statePtr^.bufferedSize;
+  initLength:= @statePtr^.nbStripesPerBlock - initStart;
+  //* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+  FillChar(initStart^, initLength, 0);
+  statePtr^.acc[0]:= XXH_PRIME32_3;
+  statePtr^.acc[1]:= XXH_PRIME64_1;
+  statePtr^.acc[2]:= XXH_PRIME64_2;
+  statePtr^.acc[3]:= XXH_PRIME64_3;
+  statePtr^.acc[4]:= XXH_PRIME64_4;
+  statePtr^.acc[5]:= XXH_PRIME32_2;
+  statePtr^.acc[6]:= XXH_PRIME64_5;
+  statePtr^.acc[7]:= XXH_PRIME32_1;
+  statePtr^.seed:= seed;
+  statePtr^.useSeed:= XXH32_hash_t(seed <> 0);
+  statePtr^.extSecret:= secret;
+  Assert(secretSize >= XXH3_SECRET_SIZE_MIN);
+  statePtr^.secretLimit:= secretSize - XXH_STRIPE_LEN;
+  statePtr^.nbStripesPerBlock:= statePtr^.secretLimit div XXH_SECRET_CONSUME_RATE;
+end;
+
+procedure XXH3_64bits_reset(statePtr: PXXH3_state_t);
+begin
+  XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+end;
+
+procedure XXH3_128bits_reset(statePtr: PXXH3_state_t);
+begin
+  XXH3_64bits_reset(statePtr);
+end;
+
+{$IF DEFINED(CPUX86_64)}
+
+procedure XXH3_accumulate_512_sse2(acc: PByte; const input: PByte; const secret: PByte); assembler; nostackframe;
+// UNIX    RDI, RSI, RDX
+// WIN64:  RCX, RDX, R8
+asm
+{$IF DEFINED(UNIX)}
+  movq     %rdx, %r8
+  movq     %rdi, %rcx
+  movq     %rsi, %rdx
+{$ENDIF}
+  movdqu	(%rdx), %xmm3
+  movdqu	(%r8), %xmm0
+  movdqu	(%rdx), %xmm4
+  movdqu	16(%rdx), %xmm5
+  pxor	%xmm3, %xmm0
+  movdqu	16(%rdx), %xmm2
+  movdqu	32(%rdx), %xmm3
+  pshufd	$49, %xmm0, %xmm1
+  pmuludq	%xmm1, %xmm0
+  pshufd	$78, %xmm4, %xmm1
+  movdqu	32(%rdx), %xmm4
+  paddq	%xmm1, %xmm0
+  paddq	(%rcx), %xmm0
+  movups	%xmm0, (%rcx)
+  movdqu	16(%r8), %xmm0
+  pxor	%xmm5, %xmm0
+  pshufd	$49, %xmm0, %xmm1
+  pmuludq	%xmm1, %xmm0
+  pshufd	$78, %xmm2, %xmm1
+  paddq	%xmm1, %xmm0
+  paddq	16(%rcx), %xmm0
+  movups	%xmm0, 16(%rcx)
+  movdqu	32(%r8), %xmm0
+  pxor	%xmm3, %xmm0
+  pshufd	$49, %xmm0, %xmm1
+  pmuludq	%xmm1, %xmm0
+  pshufd	$78, %xmm4, %xmm1
+  paddq	%xmm1, %xmm0
+  paddq	32(%rcx), %xmm0
+  movdqu	48(%rdx), %xmm1
+  movups	%xmm0, 32(%rcx)
+  movdqu	48(%r8), %xmm0
+  pxor	%xmm1, %xmm0
+  pshufd	$78, %xmm1, %xmm1
+  pshufd	$49, %xmm0, %xmm2
+  pmuludq	%xmm2, %xmm0
+  paddq	%xmm1, %xmm0
+  paddq	48(%rcx), %xmm0
+  movups	%xmm0, 48(%rcx)
+end;
+
+procedure XXH3_accumulate_sse2(acc: PByte; const input: PByte; const secret: PByte; nbStripes: UIntPtr); assembler; nostackframe;
+// UNIX    RDI, RSI, RDX, RCX
+// WIN64:  RCX, RDX, R8,  R9
+asm
+{$IF DEFINED(UNIX)}
+  movq     %rdx, %r8
+  movq     %rcx, %r9
+  movq     %rdi, %rcx
+  movq     %rsi, %rdx
+{$ENDIF}
+  testq	%r9, %r9
+  je	.L271
+  leaq	448(%rdx), %rax
+  prefetcht0	384(%rdx)
+  movdqu	(%rcx), %xmm4
+  movdqu	16(%rcx), %xmm3
+  movdqu	32(%rcx), %xmm2
+  movdqu	48(%rcx), %xmm1
+  xorl	%edx, %edx
+  jmp	.L276
+.L274:
+  prefetcht0	(%rax)
+  addq	$64, %rax
+.L276:
+  movdqu	(%r8,%rdx,8), %xmm0
+  movdqu	-448(%rax), %xmm5
+  pxor	%xmm5, %xmm0
+  pshufd	$49, %xmm0, %xmm5
+  pmuludq	%xmm5, %xmm0
+  movdqu	-448(%rax), %xmm5
+  pshufd	$78, %xmm5, %xmm5
+  paddq	%xmm5, %xmm0
+  movdqu	-432(%rax), %xmm5
+  paddq	%xmm0, %xmm4
+  movdqu	16(%r8,%rdx,8), %xmm0
+  pxor	%xmm5, %xmm0
+  pshufd	$49, %xmm0, %xmm5
+  pmuludq	%xmm5, %xmm0
+  movdqu	-432(%rax), %xmm5
+  pshufd	$78, %xmm5, %xmm5
+  paddq	%xmm5, %xmm0
+  movdqu	-416(%rax), %xmm5
+  paddq	%xmm0, %xmm3
+  movdqu	32(%r8,%rdx,8), %xmm0
+  pxor	%xmm5, %xmm0
+  pshufd	$49, %xmm0, %xmm5
+  pmuludq	%xmm5, %xmm0
+  movdqu	-416(%rax), %xmm5
+  pshufd	$78, %xmm5, %xmm5
+  paddq	%xmm5, %xmm0
+  movdqu	-400(%rax), %xmm5
+  paddq	%xmm0, %xmm2
+  movdqu	48(%r8,%rdx,8), %xmm0
+  addq	$1, %rdx
+  pxor	%xmm5, %xmm0
+  pshufd	$49, %xmm0, %xmm5
+  pmuludq	%xmm5, %xmm0
+  movdqu	-400(%rax), %xmm5
+  pshufd	$78, %xmm5, %xmm5
+  paddq	%xmm5, %xmm0
+  paddq	%xmm0, %xmm1
+  cmpq	%rdx, %r9
+  jne	.L274
+  movups	%xmm4, (%rcx)
+  movups	%xmm3, 16(%rcx)
+  movups	%xmm2, 32(%rcx)
+  movups	%xmm1, 48(%rcx)
+.L271:
+  ret
+end;
+
+procedure XXH3_accumulate_512_avx2(acc: PByte; const input: PByte; const secret: PByte); assembler; nostackframe;
+// UNIX    RDI, RSI, RDX
+// WIN64:  RCX, RDX, R8
+asm
+{$IF DEFINED(UNIX)}
+  movq     %rdx, %r8
+  movq     %rdi, %rcx
+  movq     %rsi, %rdx
+{$ENDIF}
+  vmovdqu	(%r8), %ymm3
+  vpxor	(%rdx), %ymm3, %ymm0
+  vpsrlq	$32, %ymm0, %ymm1
+  vpmuludq	%ymm1, %ymm0, %ymm0
+  vpshufd	$78, (%rdx), %ymm1
+  vpaddq	%ymm1, %ymm0, %ymm0
+  vpaddq	(%rcx), %ymm0, %ymm0
+  vmovdqu	32(%rdx), %ymm1
+  vmovdqu	%ymm0, (%rcx)
+  vpxor	32(%r8), %ymm1, %ymm0
+  vpshufd	$78, %ymm1, %ymm1
+  vpsrlq	$32, %ymm0, %ymm2
+  vpmuludq	%ymm2, %ymm0, %ymm0
+  vpaddq	%ymm1, %ymm0, %ymm0
+  vpaddq	32(%rcx), %ymm0, %ymm0
+  vmovdqu	%ymm0, 32(%rcx)
+  vzeroupper
+end;
+
+procedure XXH3_accumulate_avx2(acc: PByte; const input: PByte; const secret: PByte; nbStripes: UIntPtr); assembler; nostackframe;
+// UNIX    RDI, RSI, RDX, RCX
+// WIN64:  RCX, RDX, R8,  R9
+asm
+{$IF DEFINED(UNIX)}
+  movq     %rdx, %r8
+  movq     %rcx, %r9
+  movq     %rdi, %rcx
+  movq     %rsi, %rdx
+{$ENDIF}
+  testq	%r9, %r9
+  je	.L290
+  leaq	448(%rdx), %rax
+  prefetcht0	384(%rdx)
+  vmovdqu	(%rcx), %ymm3
+  xorl	%edx, %edx
+  vmovdqu	32(%rcx), %ymm2
+  jmp	.L288
+.L286:
+  prefetcht0	(%rax)
+  addq	$64, %rax
+.L288:
+  vmovdqu	(%r8,%rdx,8), %ymm4
+  vpxor	-448(%rax), %ymm4, %ymm0
+  vmovdqu	32(%r8,%rdx,8), %ymm5
+  addq	$1, %rdx
+  vpsrlq	$32, %ymm0, %ymm1
+  vpmuludq	%ymm1, %ymm0, %ymm0
+  vpshufd	$78, -448(%rax), %ymm1
+  vpaddq	%ymm1, %ymm0, %ymm0
+  vpaddq	%ymm3, %ymm0, %ymm3
+  vpxor	-416(%rax), %ymm5, %ymm0
+  vpsrlq	$32, %ymm0, %ymm1
+  vpmuludq	%ymm1, %ymm0, %ymm0
+  vpshufd	$78, -416(%rax), %ymm1
+  vpaddq	%ymm1, %ymm0, %ymm0
+  vpaddq	%ymm2, %ymm0, %ymm2
+  cmpq	%rdx, %r9
+  jne	.L286
+  vmovdqu	%ymm3, (%rcx)
+  vmovdqu	%ymm2, 32(%rcx)
+  vzeroupper
+.L290:
+  ret
+end;
+
+{$ELSE}
+
+function XXH_mult32to64_add64(lhs, rhs, acc: UInt64): UInt64; inline;
+begin
+  Result:= XXH_mult32to64(UInt32(lhs), UInt32(rhs)) + acc;
+end;
+
+procedure XXH3_scalarRound(acc: PByte; const input: PByte; const secret: PByte; lane: UIntPtr); inline;
+var
+  xinput, xsecret: PByte;
+  data_val, data_key: UInt64;
+  xacc: PUInt64 absolute acc;
+begin
+  xinput:= input;
+  xsecret:= secret;
+  Assert(lane < XXH_ACC_NB);
+  // XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+  data_val:= XXH_readLE64(xinput + lane * 8);
+  data_key:= data_val xor XXH_readLE64(xsecret + lane * 8);
+  xacc[lane xor 1] += data_val; //* swap adjacent lanes */
+  xacc[lane]:= XXH_mult32to64_add64(data_key, data_key shr 32, xacc[lane]);
+end;
+
+procedure XXH3_accumulate_512_scalar(acc: PByte; const input: PByte; const secret: PByte);
+begin
+  XXH3_scalarRound(acc, input, secret, 0);
+  XXH3_scalarRound(acc, input, secret, 1);
+  XXH3_scalarRound(acc, input, secret, 2);
+  XXH3_scalarRound(acc, input, secret, 3);
+  XXH3_scalarRound(acc, input, secret, 4);
+  XXH3_scalarRound(acc, input, secret, 5);
+  XXH3_scalarRound(acc, input, secret, 6);
+  XXH3_scalarRound(acc, input, secret, 7);
+end;
+
+procedure XXH3_accumulate_scalar(acc: PByte; const input: PByte; const secret: PByte; nbStripes: UIntPtr);
+var
+  n: UIntPtr;
+  in_: PByte;
+begin
+  for n:= 0 to nbStripes - 1 do
+  begin
+    in_:= input + n * XXH_STRIPE_LEN;
+    XXH3_accumulate_512_scalar(acc, in_, secret + n * XXH_SECRET_CONSUME_RATE);
+  end;
+end;
+
+{$ENDIF}
+
+function XXH_xorshift64(v64: UInt64; shift: Integer): UInt64; inline;
+begin
+  // XXH_ASSERT(0 <= shift && shift < 64);
+  Result:=  v64 xor (v64 shr shift);
+end;
+
+procedure XXH3_scalarScrambleRound(acc: PByte; const secret: PByte; lane: UIntPtr); inline;
+var
+  acc64: UInt64;
+  key64: UInt64;
+  xacc: PUInt64;
+  xsecret: PByte;
+begin
+  xacc:= PUInt64(acc); //* presumed aligned */
+  xsecret:= secret;    //* no alignment restriction */
+
+  // XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+  Assert(lane < XXH_ACC_NB);
+
+  key64:= XXH_readLE64(xsecret + lane * 8);
+  acc64:= xacc[lane];
+  acc64:= XXH_xorshift64(acc64, 47);
+  acc64:= acc64 xor key64;
+  acc64 *= XXH_PRIME32_1;
+  xacc[lane]:= acc64;
+end;
+
+procedure XXH3_scrambleAcc_scalar(acc: PByte; const secret: PByte); inline;
+begin
+  XXH3_scalarScrambleRound(acc, secret, 0);
+  XXH3_scalarScrambleRound(acc, secret, 1);
+  XXH3_scalarScrambleRound(acc, secret, 2);
+  XXH3_scalarScrambleRound(acc, secret, 3);
+  XXH3_scalarScrambleRound(acc, secret, 4);
+  XXH3_scalarScrambleRound(acc, secret, 5);
+  XXH3_scalarScrambleRound(acc, secret, 6);
+  XXH3_scalarScrambleRound(acc, secret, 7);
+end;
+
+function XXH3_consumeStripes(acc: PByte; nbStripesSoFarPtr: PUIntPtr; nbStripesPerBlock: UIntPtr;
+                             input: PByte; nbStripes: UIntPtr;
+                             const secret: PByte; secretLimit: UIntPtr;
+                             f_acc: TXXH3_accumulate_f;
+                             f_scramble: TXXH3_scrambleAcc_f): PByte; inline;
+var
+  initialSecret: PByte;
+  nbStripesThisIter: UIntPtr;
+begin
+  initialSecret:= secret + nbStripesSoFarPtr^ * XXH_SECRET_CONSUME_RATE;
+  //* Process full blocks */
+  if (nbStripes >= (nbStripesPerBlock - nbStripesSoFarPtr^)) then
+  begin
+    //* Process the initial partial block... */
+    nbStripesThisIter:= nbStripesPerBlock - nbStripesSoFarPtr^;
+    repeat
+      //* Accumulate and scramble */
+      f_acc(acc, input, initialSecret, nbStripesThisIter);
+      f_scramble(acc, secret + secretLimit);
+      input += nbStripesThisIter * XXH_STRIPE_LEN;
+      nbStripes -= nbStripesThisIter;
+      //* Then continue the loop with the full block size */
+      nbStripesThisIter:= nbStripesPerBlock;
+      initialSecret:= secret;
+    until not (nbStripes >= nbStripesPerBlock);
+    nbStripesSoFarPtr^:= 0;
+  end;
+  //* Process a partial block */
+  if (nbStripes > 0) then
+  begin
+    f_acc(acc, input, initialSecret, nbStripes);
+    input += nbStripes * XXH_STRIPE_LEN;
+    nbStripesSoFarPtr^ += nbStripes;
+  end;
+  //* Return end pointer */
+  Result:= input;
+end;
+
+procedure XXH3_update(const state: PXXH3_state_t; input: PByte; len: UIntPtr;
+                      f_acc: TXXH3_accumulate_f; f_scramble: TXXH3_scrambleAcc_f); // inline;
+var
+  bEnd: PByte;
+  acc: PUInt64;
+  secret: PByte;
+  loadSize: UIntPtr;
+  nbStripes: UIntPtr;
+begin
+  bEnd:= input + len;
+  if (state^.extSecret = nil) then
+    secret:= state^.customSecret
+  else begin
+    secret:= state^.extSecret;
+  end;
+  acc:= state^.acc;
+  state^.totalLen += len;
+  Assert(state^.bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+  //* small input : just fill in tmp buffer */
+  if (len <= XXH3_INTERNALBUFFER_SIZE - state^.bufferedSize) then
+  begin
+    Move(input^, state^.buffer[state^.bufferedSize], len);
+    state^.bufferedSize += XXH32_hash_t(len);
+    Exit;
+  end;
+
+  //* total input is now > XXH3_INTERNALBUFFER_SIZE */
+  Assert(XXH3_INTERNALBUFFER_SIZE mod XXH_STRIPE_LEN = 0); //* clean multiple */
+
+  (*
+   * Internal buffer is partially filled (always, except at beginning)
+   * Complete it, then consume it.
+   *)
+  if (state^.bufferedSize > 0) then
+  begin
+    loadSize:= XXH3_INTERNALBUFFER_SIZE - state^.bufferedSize;
+    Move(input^, state^.buffer[state^.bufferedSize], loadSize);
+    input += loadSize;
+    XXH3_consumeStripes(PByte(acc),
+                        @state^.nbStripesSoFar, state^.nbStripesPerBlock,
+                        state^.buffer, XXH3_INTERNALBUFFER_STRIPES,
+                        secret, state^.secretLimit,
+                        f_acc, f_scramble);
+    state^.bufferedSize:= 0;
+  end;
+  Assert(input < bEnd);
+  if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) then
+  begin
+    nbStripes:= UIntPtr(bEnd - 1 - input) div XXH_STRIPE_LEN;
+    input:= XXH3_consumeStripes(PByte(acc),
+                                @state^.nbStripesSoFar, state^.nbStripesPerBlock,
+                                input, nbStripes,
+                                secret, state^.secretLimit,
+                                f_acc, f_scramble);
+    Move((input - XXH_STRIPE_LEN)^, state^.buffer[ + sizeof(state^.buffer) - XXH_STRIPE_LEN], XXH_STRIPE_LEN);
+
+  end;
+  //* Some remaining input (always) : buffer it */
+  Assert(input < bEnd);
+  Assert(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+  Assert(state^.bufferedSize = 0);
+  Move(input^, state^.buffer[0], UIntPtr(bEnd - input));
+  state^.bufferedSize:= XXH32_hash_t(bEnd - input);
+end;
+
+procedure XXH3_64bits_update(state: PXXH3_state_t; const input: Pointer; len: UIntPtr); inline;
+begin
+  XXH3_update(state, input, len, XXH3_accumulate, @XXH3_scrambleAcc_scalar);
+end;
+
+procedure XXH3_128bits_update(state: PXXH3_state_t; const input: PByte; len: UIntPtr);
+begin
+  XXH3_64bits_update(state, input, len);
+end;
+
+procedure XXH3_digest_long(acc: PUInt64; const state: PXXH3_state_t;
+                           const secret: PByte); inline;
+var
+  lastStripePtr: PByte;
+  nbStripes, nbStripesSoFar, catchupSize: UIntPtr;
+  lastStripe: array[0..Pred(XXH_STRIPE_LEN)] of Byte;
+begin
+  (*
+   * Digest on a local copy. This way, the state remains unaltered, and it can
+   * continue ingesting more input afterwards.
+   *)
+  Move(state^.acc[0], acc^, sizeof(state^.acc));
+  if (state^.bufferedSize >= XXH_STRIPE_LEN) then
+  begin
+    //* Consume remaining stripes then point to remaining data in buffer */
+    nbStripes:= (state^.bufferedSize - 1) div XXH_STRIPE_LEN;
+    nbStripesSoFar:= state^.nbStripesSoFar;
+    XXH3_consumeStripes(PByte(acc),
+                       @nbStripesSoFar, state^.nbStripesPerBlock,
+                        state^.buffer, nbStripes,
+                        secret, state^.secretLimit,
+                        XXH3_accumulate, @XXH3_scrambleAcc_scalar);
+    lastStripePtr:= @state^.buffer[state^.bufferedSize - XXH_STRIPE_LEN];
+  end else begin  //* bufferedSize < XXH_STRIPE_LEN */
+    //* Copy to temp buffer */
+    catchupSize:= XXH_STRIPE_LEN - state^.bufferedSize;
+    Assert(state^.bufferedSize > 0);  //* there is always some input buffered */
+    Move(state^.buffer[sizeof(state^.buffer) - catchupSize], lastStripe[0], catchupSize);
+    Move(state^.buffer[0], lastStripe[catchupSize], state^.bufferedSize);
+    lastStripePtr:= lastStripe;
+  end;
+  //* Last stripe */
+  XXH3_accumulate_512(PByte(acc),
+                      lastStripePtr,
+                      secret + state^.secretLimit - XXH_SECRET_LASTACC_START);
+end;
+
+function XXH_mult64to128(lhs, rhs: UInt64): XXH128_hash_t;
+var
+  cross, upper, lower: UInt64;
+  lo_lo, hi_lo, lo_hi, hi_hi: UInt64;
+begin
+  //* First calculate all of the cross products. */
+  lo_lo:= XXH_mult32to64(lhs and $FFFFFFFF, rhs and $FFFFFFFF);
+  hi_lo:= XXH_mult32to64(lhs shr 32,        rhs and $FFFFFFFF);
+  lo_hi:= XXH_mult32to64(lhs and $FFFFFFFF, rhs shr 32);
+  hi_hi:= XXH_mult32to64(lhs shr 32,        rhs shr 32);
+
+  //* Now add the products together. These will never overflow. */
+  cross:= (lo_lo shr 32) + (hi_lo and $FFFFFFFF) + lo_hi;
+  upper:= (hi_lo shr 32) + (cross shr 32)        + hi_hi;
+  lower:= (cross shl 32) or (lo_lo and $FFFFFFFF);
+
+  Result.low64  := lower;
+  Result.high64 := upper;
+end;
+
+function XXH3_mul128_fold64(lhs, rhs: UInt64): UInt64;
+var
+  product: XXH128_hash_t;
+begin
+  product:= XXH_mult64to128(lhs, rhs);
+  Result:= product.low64 xor product.high64;
+end;
+
+function XXH3_mix2Accs(const acc: PUInt64; const secret: PByte): Uint64; inline;
+begin
+  Result:= XXH3_mul128_fold64(
+                              acc[0] xor XXH_readLE64(secret),
+                              acc[1] xor XXH_readLE64(secret + 8) );
+end;
+
+function XXH3_avalanche(h64: UInt64): XXH64_hash_t;
+begin
+  h64:= XXH_xorshift64(h64, 37);
+  h64 *= PRIME_MX1;
+  h64:= XXH_xorshift64(h64, 32);
+  Result:= h64;
+end;
+
+function XXH3_mergeAccs(const acc: PUInt64; const secret: PByte; start: UInt64): XXH64_hash_t;
+var
+  i: UIntPtr;
+begin
+  Result:= start;
+  for i:= 0 to 3 do
+  begin
+    result += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
+  end;
+  Result:= XXH3_avalanche(Result);
+end;
+
+function XXH3_len_9to16_128b(const input: PByte; len: UIntPtr; const secret: PByte; seed: XXH64_hash_t): XXH128_hash_t; inline;
+var
+  m128: XXH128_hash_t;
+  bitflipl, bitfliph, input_lo, input_hi: UInt64;
+begin
+  Assert(input <> nil);
+  Assert(secret <> nil);
+  Assert((9 <= len) and (len <= 16));
+  bitflipl := (XXH_readLE64(secret+32) xor XXH_readLE64(secret+40)) - seed;
+  bitfliph := (XXH_readLE64(secret+48) xor XXH_readLE64(secret+56)) + seed;
+  input_lo := XXH_readLE64(input);
+  input_hi := XXH_readLE64(input + len - 8);
+  m128:= XXH_mult64to128(input_lo xor input_hi xor bitflipl, XXH_PRIME64_1);
+  {*
+   * Put len in the middle of m128 to ensure that the length gets mixed to
+   * both the low and high bits in the 128x64 multiply below.
+   *}
+  m128.low64 += UInt64(len - 1) << 54;
+  input_hi   := input_hi xor bitfliph;
+  {*
+   * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+   * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+   * the high 64 bits of m128.
+   *
+   * The best approach to this operation is different on 32-bit and 64-bit.
+   *}
+{$IF DEFINED(CPU32)}
+  {*
+   * 32-bit optimized version, which is more readable.
+   *
+   * On 32-bit, it removes an ADC and delays a dependency between the two
+   * halves of m128.high64, but it generates an extra mask on 64-bit.
+   *}
+  m128.high64 += (input_hi and UInt64($FFFFFFFF00000000)) + XXH_mult32to64(UInt32(input_hi), XXH_PRIME32_2);
+{$ELSE}
+  {*
+   * 64-bit optimized (albeit more confusing) version.
+   *
+   * Uses some properties of addition and multiplication to remove the mask:
+   *
+   * Let:
+   *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+   *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+   *    c = XXH_PRIME32_2
+   *
+   *    a + (b * c)
+   * Inverse Property: x + y - x == y
+   *    a + (b * (1 + c - 1))
+   * Distributive Property: x * (y + z) == (x * y) + (x * z)
+   *    a + (b * 1) + (b * (c - 1))
+   * Identity Property: x * 1 == x
+   *    a + b + (b * (c - 1))
+   *
+   * Substitute a, b, and c:
+   *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+   *
+   * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+   *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+   *}
+  m128.high64 += input_hi + XXH_mult32to64(UInt32(input_hi), XXH_PRIME32_2 - 1);
+{$ENDIF}
+  //* m128 ^= XXH_swap64(m128 >> 64); */
+  m128.low64  := m128.low64 xor SwapEndian(m128.high64);
+
+  //* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+  Result:= XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+  Result.high64 += m128.high64 * XXH_PRIME64_2;
+
+  Result.low64   := XXH3_avalanche(Result.low64);
+  Result.high64  := XXH3_avalanche(Result.high64);
+end;
+
+function XXH3_len_4to8_128b(const input: PByte; len: UIntPtr; const secret: PByte; seed: XXH64_hash_t): XXH128_hash_t; inline;
+var
+  input_lo, input_hi: UInt32;
+  input_64, bitflip, keyed: UInt64;
+begin
+  Assert(input <> nil);
+  Assert(secret <> nil);
+  Assert((4 <= len) and (len <= 8));
+  seed := seed xor (UInt64(SwapEndian(UInt32(seed))) shl 32);
+  input_lo := XXH_readLE32(input);
+  input_hi := XXH_readLE32(input + len - 4);
+  input_64 := input_lo + (UInt64(input_hi) shl 32);
+  bitflip := (XXH_readLE64(secret+16) xor XXH_readLE64(secret+24)) + seed;
+  keyed := input_64 xor bitflip;
+
+  ///* Shift len to the left to ensure it is even, this avoids even multiplies. */
+  Result:= XXH_mult64to128(keyed, XXH_PRIME64_1 + (len shl 2));
+
+  Result.high64 += (Result.low64 shl 1);
+  Result.low64  := Result.low64 xor (Result.high64 shr 3);
+
+  Result.low64   := XXH_xorshift64(Result.low64, 35);
+  Result.low64  *= PRIME_MX2;
+  Result.low64   := XXH_xorshift64(Result.low64, 28);
+  Result.high64  := XXH3_avalanche(Result.high64);
+end;
+
+function XXH3_len_1to3_128b(const input: PByte; len: UIntPtr; const secret: PByte; seed: XXH64_hash_t): XXH128_hash_t; inline;
+var
+  c1, c2, c3: Byte;
+  combinedl, combinedh: UInt32;
+  bitflipl, bitfliph, keyed_lo, keyed_hi: UInt64;
+begin
+  //* A doubled version of 1to3_64b with different constants. */
+  Assert(input <> nil);
+  Assert((1 <= len) and (len <= 3));
+  Assert(secret <> nil);
+  (*
+   * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+   * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+   * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+   *)
+  c1 := input[0];
+  c2 := input[len >> 1];
+  c3 := input[len - 1];
+  combinedl := (UInt32(c1) shl 16) or (UInt32(c2) shl 24) or
+               (UInt32(c3) shl 0) or (UInt32(len) shl 8);
+  combinedh := RolDWord(SwapEndian(combinedl), 13);
+  bitflipl := (XXH_readLE32(secret) xor XXH_readLE32(secret + 4)) + seed;
+  bitfliph := (XXH_readLE32(secret+8) xor XXH_readLE32(secret + 12)) - seed;
+  keyed_lo := UInt64(combinedl) xor bitflipl;
+  keyed_hi := UInt64(combinedh) xor bitfliph;
+
+  Result.low64  := XXH64_avalanche(keyed_lo);
+  Result.high64 := XXH64_avalanche(keyed_hi);
+end;
+
+function XXH3_len_0to16_128b(const input: PByte; len: UIntPtr; const secret: PByte; seed: XXH64_hash_t): XXH128_hash_t; inline;
+var
+  bitflipl, bitfliph: UInt64;
+begin
+  Assert(len <= 16);
+  if (len > 8) then
+    Result:= XXH3_len_9to16_128b(input, len, secret, seed)
+  else if (len >= 4) then
+    Result:= XXH3_len_4to8_128b(input, len, secret, seed)
+  else if (len > 0) then
+    Result:= XXH3_len_1to3_128b(input, len, secret, seed)
+  else begin
+    bitflipl:= XXH_readLE64(secret+64) xor XXH_readLE64(secret+72);
+    bitfliph:= XXH_readLE64(secret+80) xor XXH_readLE64(secret+88);
+    Result.low64:= XXH64_avalanche(seed xor bitflipl);
+    Result.high64:= XXH64_avalanche( seed xor bitfliph);
+  end;
+end;
+
+function XXH3_mix16B(const input: PByte;
+                     const secret: PByte; seed64: UInt64): UInt64; inline;
+var
+  input_lo, input_hi: UInt64;
+begin
+  input_lo := XXH_readLE64(input);
+  input_hi := XXH_readLE64(input+8);
+  Result:= XXH3_mul128_fold64(
+            input_lo xor (XXH_readLE64(secret)   + seed64),
+            input_hi xor (XXH_readLE64(secret+8) - seed64)
+        );
+end;
+
+function XXH128_mix32B(var acc: XXH128_hash_t; const input_1: PByte; const input_2: PByte;
+                       const secret: PByte; seed: XXH64_hash_t): XXH128_hash_t; inline;
+begin
+  acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+  acc.low64  := acc.low64 xor (XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8));
+  acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+  acc.high64 := acc.high64 xor (XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8));
+  Result:= acc;
+end;
+
+function XXH3_len_17to128_128b(const input: PByte; len: UIntPtr;
+                               const secret: PByte; secretSize: UIntPtr;
+                               seed: XXH64_hash_t): XXH128_hash_t; inline;
+var
+  acc: XXH128_hash_t;
+begin
+  Assert(secretSize >= XXH3_SECRET_SIZE_MIN);
+  Assert((16 < len) and (len <= 128));
+
+  acc.low64 := len * XXH_PRIME64_1;
+  acc.high64 := 0;
+
+  if (len > 32) then
+  begin
+    if (len > 64) then
+    begin
+      if (len > 96) then
+      begin
+        acc := XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+      end;
+      acc := XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+    end;
+    acc := XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+  end;
+  acc := XXH128_mix32B(acc, input, input+len-16, secret, seed);
+
+  Result.low64  := acc.low64 + acc.high64;
+  Result.high64 := (acc.low64    * XXH_PRIME64_1)
+                 + (acc.high64   * XXH_PRIME64_4)
+                 + ((len - seed) * XXH_PRIME64_2);
+  Result.low64  := XXH3_avalanche(Result.low64);
+  Result.high64 := XXH64_hash_t(0) - XXH3_avalanche(Result.high64);
+end;
+
+function XXH3_len_129to240_128b(const input: PBYte; len: UIntPtr;
+                                const secret: PByte; secretSize: UIntPtr;
+                                seed: XXH64_hash_t): XXH128_hash_t; inline;
+var
+  i: UInt32;
+  acc: XXH128_hash_t;
+begin
+  Assert(secretSize >= XXH3_SECRET_SIZE_MIN);
+  Assert((128 < len) and (len <= XXH3_MIDSIZE_MAX));
+
+  acc.low64 := len * XXH_PRIME64_1;
+  acc.high64 := 0;
+  {*
+   *  We set as `i` as offset + 32. We do this so that unchanged
+   * `len` can be used as upper bound. This reaches a sweet spot
+   * where both x86 and aarch64 get simple agen and good codegen
+   * for the loop.
+   *}
+  i:= 32;
+  while (i < 160) do
+  begin
+    acc := XXH128_mix32B(acc,
+                         input  + i - 32,
+                         input  + i - 16,
+                         secret + i - 32,
+                         seed);
+    Inc(i, 32);
+  end;
+
+  acc.low64 := XXH3_avalanche(acc.low64);
+  acc.high64 := XXH3_avalanche(acc.high64);
+  {*
+   * NB: `i <= len` will duplicate the last 32-bytes if
+   * len % 32 was zero. This is an unfortunate necessity to keep
+   * the hash result stable.
+   *}
+  i:= 160;
+  while i <= len do
+  begin
+      acc := XXH128_mix32B(acc,
+                           input + i - 32,
+                           input + i - 16,
+                           secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                           seed);
+    Inc(i, 32);
+  end;
+  //* last bytes */
+  acc := XXH128_mix32B(acc,
+                       input + len - 16,
+                       input + len - 32,
+                       secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                       XXH64_hash_t(0) - seed);
+
+  Result.low64  := acc.low64 + acc.high64;
+  Result.high64 := (acc.low64    * XXH_PRIME64_1)
+                 + (acc.high64   * XXH_PRIME64_4)
+                 + ((len - seed) * XXH_PRIME64_2);
+  Result.low64  := XXH3_avalanche(Result.low64);
+  Result.high64 := XXH64_hash_t(0) - XXH3_avalanche(Result.high64);
+end;
+
+function XXH3_128bits_internal(const input: PByte; len: UIntPtr; seed64: XXH64_hash_t;
+                               const secret: PByte; secretLen: UIntPtr): XXH128_hash_t; inline;
+begin
+  Assert(len <= XXH3_MIDSIZE_MAX);
+  Assert(secretLen >= XXH3_SECRET_SIZE_MIN);
+  (*
+   * If an action is to be taken if `secret` conditions are not respected,
+   * it should be done here.
+   * For now, it's a contract pre-condition.
+   * Adding a check and a branch here would cost performance at every hash.
+   *)
+  if (len <= 16) then
+    Result:= XXH3_len_0to16_128b(input, len, secret, seed64)
+  else if (len <= 128) then
+    Result:= XXH3_len_17to128_128b(input, len, secret, secretLen, seed64)
+  else begin
+    Result:= XXH3_len_129to240_128b(input, len, secret, secretLen, seed64);
+  end;
+end;
+
+function XXH3_128bits_digest(const state: PXXH3_state_t): XXH128_hash_t;
+var
+  acc: PUInt64;
+  secret: PByte;
+  buffer: array[0..Pred(XXH_ACC_SIZE + XXH_ACC_ALIGN)] of Byte;
+begin
+  if (state^.extSecret = nil) then
+    secret:= state^.customSecret
+  else begin
+    secret:= state^.extSecret;
+  end;
+  if (state^.totalLen > XXH3_MIDSIZE_MAX) then
+  begin
+    acc:= System.Align(@buffer[0], XXH_ACC_ALIGN);
+    Assert(UIntPtr(acc) mod XXH_ACC_ALIGN = 0);
+
+    XXH3_digest_long(acc, state, secret);
+    Assert(state^.secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+
+    Result.low64  := XXH3_mergeAccs(acc,
+                                    secret + XXH_SECRET_MERGEACCS_START,
+                                    UInt64(state^.totalLen) * XXH_PRIME64_1);
+    Result.high64 := XXH3_mergeAccs(acc,
+                                    secret + state^.secretLimit + XXH_STRIPE_LEN -
+                                    XXH_ACC_SIZE - XXH_SECRET_MERGEACCS_START,
+                                    not (UInt64(state^.totalLen) * XXH_PRIME64_2));
+  end
+  else begin
+    //* len <= XXH3_MIDSIZE_MAX : short code */
+    {
+    if (state^.useSeed)
+      Result:= XXH3_128bits_withSeed(state^.buffer, UIntPtr(state^.totalLen), state^.seed);
+    else
+    }
+    Result:= XXH3_128bits_internal(state^.buffer, UIntPtr(state^.totalLen), 0,
+                                   secret, state^.secretLimit + XXH_STRIPE_LEN);
+  end;
+end;
+
+initialization
+{$IF DEFINED(CPUX86_64)}
+  if AVX2Support then
+  begin
+    XXH3_accumulate:= @XXH3_accumulate_avx2;
+    XXH3_accumulate_512:= @XXH3_accumulate_512_avx2;
+  end
+  else begin
+    XXH3_accumulate:= @XXH3_accumulate_sse2;
+    XXH3_accumulate_512:= @XXH3_accumulate_512_sse2;
+  end;
+{$ELSE}
+  XXH3_accumulate:= @XXH3_accumulate_scalar;
+  XXH3_accumulate_512:= @XXH3_accumulate_512_scalar;
+{$ENDIF}
+end.

+ 10 - 2
components/kascrypt/kascrypt.lpk

@@ -37,8 +37,8 @@
 "/>
 "/>
     <License Value="KAScrypt is open source software (released under the MIT license) and as such there is no charge for inclusion in other software.
     <License Value="KAScrypt is open source software (released under the MIT license) and as such there is no charge for inclusion in other software.
 "/>
 "/>
-    <Version Major="3" Minor="1" Release="1"/>
-    <Files Count="30">
+    <Version Major="3" Minor="2"/>
+    <Files Count="32">
       <Item1>
       <Item1>
         <Filename Value="dcpbase64.pas"/>
         <Filename Value="dcpbase64.pas"/>
         <UnitName Value="DCPbase64"/>
         <UnitName Value="DCPbase64"/>
@@ -159,6 +159,14 @@
         <Filename Value="Hashes/dcpchecksum32.pas"/>
         <Filename Value="Hashes/dcpchecksum32.pas"/>
         <UnitName Value="dcpchecksum32"/>
         <UnitName Value="dcpchecksum32"/>
       </Item30>
       </Item30>
+      <Item31>
+        <Filename Value="Hashes/dcpxxh3.pas"/>
+        <UnitName Value="DCPxxh3"/>
+      </Item31>
+      <Item32>
+        <Filename Value="Hashes/dcxxhash.pas"/>
+        <UnitName Value="DCxxhash"/>
+      </Item32>
     </Files>
     </Files>
     <CompatibilityMode Value="True"/>
     <CompatibilityMode Value="True"/>
     <RequiredPkgs Count="2">
     <RequiredPkgs Count="2">

+ 2 - 1
components/kascrypt/kascrypt.pas

@@ -11,7 +11,8 @@ uses
   DCPbase64, DCPblockciphers, DCPconst, DCPcrypt2, DCPhaval, DCPmd4, DCPmd5, 
   DCPbase64, DCPblockciphers, DCPconst, DCPcrypt2, DCPhaval, DCPmd4, DCPmd5, 
   DCPripemd128, DCPripemd160, DCPsha1, DCPsha256, DCPsha512, DCPtiger, 
   DCPripemd128, DCPripemd160, DCPsha1, DCPsha256, DCPsha512, DCPtiger, 
   DCPcrc32, DCcrc32, DCblake2, DCPblake2, DCPsha3, HMAC, SHA3, SHA3_512, 
   DCPcrc32, DCcrc32, DCblake2, DCPblake2, DCPsha3, HMAC, SHA3, SHA3_512, 
-  ISAAC, scrypt, DCPrijndael, SHA1, Argon2, DCPblake3, dcpchecksum32;
+  ISAAC, scrypt, DCPrijndael, SHA1, Argon2, DCPblake3, dcpchecksum32, DCPxxh3, 
+  DCxxhash;
 
 
 implementation
 implementation
 
 

+ 6 - 5
src/uhash.pas

@@ -4,7 +4,7 @@
     General Hash Unit: This unit defines the common types, functions,
     General Hash Unit: This unit defines the common types, functions,
     and procedures
     and procedures
 
 
-    Copyright (C) 2009-2023 Alexander Koblov ([email protected])
+    Copyright (C) 2009-2024 Alexander Koblov ([email protected])
 
 
     This program is free software; you can redistribute it and/or modify
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     it under the terms of the GNU General Public License as published by
@@ -35,7 +35,7 @@ type
   THashAlgorithm = (HASH_BLAKE2S, HASH_BLAKE2SP, HASH_BLAKE2B, HASH_BLAKE2BP, HASH_BLAKE3,
   THashAlgorithm = (HASH_BLAKE2S, HASH_BLAKE2SP, HASH_BLAKE2B, HASH_BLAKE2BP, HASH_BLAKE3,
                     HASH_CHECKSUM32, HASH_CRC32, HASH_HAVAL, HASH_MD4, HASH_MD5, HASH_RIPEMD128, HASH_RIPEMD160,
                     HASH_CHECKSUM32, HASH_CRC32, HASH_HAVAL, HASH_MD4, HASH_MD5, HASH_RIPEMD128, HASH_RIPEMD160,
                     HASH_SFV, HASH_SHA1, HASH_SHA224, HASH_SHA256, HASH_SHA384, HASH_SHA512,
                     HASH_SFV, HASH_SHA1, HASH_SHA224, HASH_SHA256, HASH_SHA384, HASH_SHA512,
-                    HASH_SHA3_224, HASH_SHA3_256, HASH_SHA3_384, HASH_SHA3_512, HASH_TIGER,
+                    HASH_SHA3_224, HASH_SHA3_256, HASH_SHA3_384, HASH_SHA3_512, HASH_TIGER, HASH_XXH3_128,
                     HASH_BEST
                     HASH_BEST
                     );
                     );
 
 
@@ -43,7 +43,7 @@ var
   HashFileExt: array[Low(THashAlgorithm)..Pred(High(THashAlgorithm))] of String = (
   HashFileExt: array[Low(THashAlgorithm)..Pred(High(THashAlgorithm))] of String = (
                  'blake2s', 'blake2sp', 'blake2b', 'blake2bp', 'blake3', 'checksum32', 'crc32', 'haval',
                  'blake2s', 'blake2sp', 'blake2b', 'blake2bp', 'blake3', 'checksum32', 'crc32', 'haval',
                  'md4', 'md5', 'ripemd128', 'ripemd160', 'sfv', 'sha', 'sha224', 'sha256',
                  'md4', 'md5', 'ripemd128', 'ripemd160', 'sfv', 'sha', 'sha224', 'sha256',
-                 'sha384', 'sha512', 'sha3', 'sha3', 'sha3', 'sha3', 'tiger'
+                 'sha384', 'sha512', 'sha3', 'sha3', 'sha3', 'sha3', 'tiger', 'xxh3'
                );
                );
 
 
 var
 var
@@ -51,7 +51,7 @@ var
                  'blake2s', 'blake2sp', 'blake2b', 'blake2bp', 'blake3', 'checksum32', 'crc32', 'haval',
                  'blake2s', 'blake2sp', 'blake2b', 'blake2bp', 'blake3', 'checksum32', 'crc32', 'haval',
                  'md4', 'md5', 'ripemd128', 'ripemd160', 'sfv', 'sha1_160', 'sha2_224',
                  'md4', 'md5', 'ripemd128', 'ripemd160', 'sfv', 'sha1_160', 'sha2_224',
                  'sha2_256', 'sha2_384', 'sha2_512', 'sha3_224', 'sha3_256',
                  'sha2_256', 'sha2_384', 'sha2_512', 'sha3_224', 'sha3_256',
-                 'sha3_384', 'sha3_512', 'tiger'
+                 'sha3_384', 'sha3_512', 'tiger', 'xxh3_128'
                );
                );
 
 
 procedure HashInit(out Context: THashContext; Algorithm: THashAlgorithm);
 procedure HashInit(out Context: THashContext; Algorithm: THashAlgorithm);
@@ -69,7 +69,7 @@ implementation
 
 
 uses
 uses
   LazUTF8, DCPhaval, DCPmd4, DCPmd5, DCPripemd128, DCPripemd160, DCPChecksum32, DCPcrc32,
   LazUTF8, DCPhaval, DCPmd4, DCPmd5, DCPripemd128, DCPripemd160, DCPChecksum32, DCPcrc32,
-  DCPsha1, DCPsha256, DCPsha512, DCPtiger, DCPblake2, DCPblake3, DCPsha3;
+  DCPsha1, DCPsha256, DCPsha512, DCPtiger, DCPblake2, DCPblake3, DCPsha3, DCPxxh3;
 
 
 procedure HashInit(out Context: THashContext; Algorithm: THashAlgorithm);
 procedure HashInit(out Context: THashContext; Algorithm: THashAlgorithm);
 begin
 begin
@@ -106,6 +106,7 @@ begin
     HASH_SHA3_384:   Context:= TDCP_sha3_384.Create(nil);
     HASH_SHA3_384:   Context:= TDCP_sha3_384.Create(nil);
     HASH_SHA3_512:   Context:= TDCP_sha3_512.Create(nil);
     HASH_SHA3_512:   Context:= TDCP_sha3_512.Create(nil);
     HASH_TIGER:      Context:= TDCP_tiger.Create(nil);
     HASH_TIGER:      Context:= TDCP_tiger.Create(nil);
+    HASH_XXH3_128:   Context:= TDCP_xxh3_128.Create(nil);
   end;
   end;
 
 
   Context.Init;
   Context.Init;