dcblake3.pp 27 KB


  1. {
  2. BLAKE3 - cryptographic hash function.
  3. The C code is copyright Samuel Neves and Jack O'Connor, 2019-2020.
  4. The assembly code is copyright Samuel Neves, 2019-2020.
  5. The Pascal translation by Alexander Koblov, 2020.
  6. This work is released into the public domain with CC0 1.0.
  7. Alternatively, it is licensed under the Apache License 2.0.
  8. }
  9. unit DCblake3;
  10. {$mode objfpc}{$H+}
  11. {$inline on}{$Q-}
  12. {$macro on}{$R-}
  13. interface
  14. uses
  15. Classes, SysUtils, CTypes;
  const
    // Byte sizes used throughout the unit: key, default digest,
    // compression block and chunk.
    BLAKE3_KEY_LEN   = 32;
    BLAKE3_OUT_LEN   = 32;
    BLAKE3_BLOCK_LEN = 64;
    BLAKE3_CHUNK_LEN = 1024;
    // The hasher's CV stack is sized for BLAKE3_MAX_DEPTH + 1 entries.
    BLAKE3_MAX_DEPTH = 54;
    BLAKE3_MAX_SIMD_DEGREE = 16;

    // Compile-time upper bound on the number of inputs hashed in parallel;
    // it sizes the pointer/CV scratch arrays in the implementation section.
  {$if defined(CPUX86_64)}
    MAX_SIMD_DEGREE = 16;
  {$elseif defined(CPUAARCH64)}
    MAX_SIMD_DEGREE = 4;
  {$else}
    MAX_SIMD_DEGREE = 1;
  {$endif}

    // Same bound, but never smaller than 2 (the subtree code always needs
    // room for two chaining values).
  {$if (MAX_SIMD_DEGREE <= 2)}
    MAX_SIMD_DEGREE_OR_2 = 2;
  {$else}
    MAX_SIMD_DEGREE_OR_2 = MAX_SIMD_DEGREE;
  {$endif}
  // Initial chaining value: the eight words blake3_hasher_init installs
  // as the key of an unkeyed hasher.
  const
    BLAKE3_IV: array[0..7] of cuint32 = (
      $6A09E667, $BB67AE85,
      $3C6EF372, $A54FF53A,
      $510E527F, $9B05688C,
      $1F83D9AB, $5BE0CD19
    );
  // Seven rows of sixteen message-word indices, one row per round.
  // Not referenced in this part of the unit; presumably consumed by the
  // included compression implementations (TODO confirm).
  const
    MSG_SCHEDULE: array[0..6, 0..15] of cuint8 = (
      ( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15),
      ( 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8),
      ( 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1),
      (10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6),
      (12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4),
      ( 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7),
      (11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13)
    );
  type
    // Pointer to an array of byte pointers (the "inputs" argument of
    // blake3_hash_many).
    ppcuint8 = ^pcuint8;

    // A chaining value held as eight 32-bit words.
    Tblake_cv = array[0..7] of cuint32;
  51. Pblake3_chunk_state = ^blake3_chunk_state;
  52. blake3_chunk_state = record
  53. cv: array[0..7] of cuint32;
  54. chunk_counter: cuint64;
  55. buf: array[0..Pred(BLAKE3_BLOCK_LEN)] of cuint8;
  56. buf_len: cuint8;
  57. blocks_compressed: cuint8;
  58. flags: cuint8;
  59. end;
  60. Pblake3_hasher = ^blake3_hasher;
  61. blake3_hasher = record
  62. key: array[0..7] of cuint32;
  63. chunk: blake3_chunk_state;
  64. cv_stack_len: cuint8;
  65. cv_stack: array[0..Pred((BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN)] of cuint8;
  66. end;
  // Prepares self^ for a new, unkeyed hash computation.
  procedure blake3_hasher_init(self: Pblake3_hasher);

  // Absorbs input_len bytes starting at input; a zero length is a no-op.
  procedure blake3_hasher_update(self: Pblake3_hasher;
                                 const input: Pointer;
                                 input_len: csize_t);

  // Writes out_len bytes of hash output to out_ without modifying self^;
  // a zero length is a no-op.
  procedure blake3_hasher_finalize(const self: Pblake3_hasher;
                                   out_: pcuint8;
                                   out_len: csize_t);
  70. implementation
  71. {$IF DEFINED(CPUX86_64)}
  72. uses
  73. CPU, KAScpu;
  74. {$ENDIF}
  type
    // Flag bits passed to the compression function; each member is a
    // distinct single bit so values can be OR-ed together after a
    // cuint8() cast.
    blake3_flags = (
      CHUNK_START         = $01,
      CHUNK_END           = $02,
      PARENT              = $04,
      ROOT                = $08,
      KEYED_HASH          = $10,
      DERIVE_KEY_CONTEXT  = $20,
      DERIVE_KEY_MATERIAL = $40
    );
  85. Poutput_t = ^output_t;
  86. output_t = record
  87. input_cv: array[0..7] of cuint32;
  88. counter: cuint64;
  89. block: array[0..Pred(BLAKE3_BLOCK_LEN)] of cuint8;
  90. block_len: cuint8;
  91. flags: cuint8;
  92. end;
  // Reads a 32-bit little-endian word from src and returns it in native
  // byte order.
  //
  // src is an arbitrary byte address, so the read is marked Unaligned():
  // a plain typed-pointer dereference may fault on CPUs that require
  // aligned 32-bit loads. LEtoN is the conversion that matches the
  // direction (little-endian storage -> native value).
  function load32( const src: Pointer ): cuint32; inline;
  begin
    Result := LEtoN(Unaligned(pcuint32(src)^));
  end;
  // Writes w to dst as a 32-bit little-endian word.
  //
  // dst is an arbitrary byte address, so the write is marked Unaligned():
  // a plain typed-pointer store may fault on CPUs that require aligned
  // 32-bit stores. NtoLE is the conversion that matches the direction
  // (native value -> little-endian storage).
  procedure store32( dst: pointer; w: cuint32 ); inline;
  begin
    Unaligned(pcuint32(dst)^) := NtoLE(w);
  end;
  // Serialises the eight words at cv_words into 32 bytes at bytes_out,
  // each word written with store32 at offset 4 * index.
  procedure store_cv_words(bytes_out: pcuint8; cv_words: pcuint32); inline;
  var
    word_index: Integer;
  begin
    for word_index := 0 to 7 do
      store32(@bytes_out[word_index * 4], cv_words[word_index]);
  end;
  // Returns the largest power of two that is <= x; x = 0 yields 1.
  function round_down_to_power_of_2(x: cuint64): cuint64; inline;
  var
    top_bit: Cardinal;
  begin
    // OR-ing in bit 0 guarantees at least one set bit for the scan.
    top_bit := BsrQWord(x or 1);
    Result := cuint64(1) shl top_bit;
  end;
  // Initialises self^ as an empty chunk number 0: cv is loaded from the
  // eight words at key, the block buffer is zeroed and flags is stored
  // for later compressions.
  procedure chunk_state_init(self: Pblake3_chunk_state; const key: pcuint32;
                             flags: cuint8); inline;
  begin
    self^.flags := flags;
    self^.chunk_counter := 0;
    self^.blocks_compressed := 0;
    self^.buf_len := 0;
    FillChar(self^.buf[0], SizeOf(self^.buf), 0);
    Move(key^, self^.cv[0], SizeOf(self^.cv));
  end;
  // Re-arms self^ for the chunk with index chunk_counter: cv is reloaded
  // from key and the buffer is emptied. The flags field is left untouched.
  procedure chunk_state_reset(self: Pblake3_chunk_state; const key: pcuint32;
                              chunk_counter: cuint64); inline;
  begin
    self^.chunk_counter := chunk_counter;
    self^.blocks_compressed := 0;
    self^.buf_len := 0;
    FillChar(self^.buf[0], SizeOf(self^.buf), 0);
    Move(key^, self^.cv[0], SizeOf(self^.cv));
  end;
  // Number of bytes absorbed into the current chunk so far: whole blocks
  // already compressed plus the bytes waiting in buf.
  function chunk_state_len(const self: Pblake3_chunk_state): csize_t; inline;
  var
    compressed_bytes: csize_t;
    buffered_bytes: csize_t;
  begin
    compressed_bytes := BLAKE3_BLOCK_LEN * csize_t(self^.blocks_compressed);
    buffered_bytes := csize_t(self^.buf_len);
    Result := compressed_bytes + buffered_bytes;
  end;
  // Appends as much of input as fits into the free part of self^.buf and
  // returns how many bytes were taken (min(free space, input_len)).
  function chunk_state_fill_buf(self: Pblake3_chunk_state;
                                const input: pcuint8; input_len: csize_t): csize_t; inline;
  var
    free_space: csize_t;
    write_pos: pcuint8;
  begin
    free_space := BLAKE3_BLOCK_LEN - csize_t(self^.buf_len);
    if input_len < free_space then
      Result := input_len
    else
      Result := free_space;
    write_pos := pcuint8(@self^.buf[0]) + csize_t(self^.buf_len);
    Move(input^, write_pos^, Result);
    Inc(self^.buf_len, cuint8(Result));
  end;
  // CHUNK_START while no block of the chunk has been compressed yet,
  // otherwise 0.
  function chunk_state_maybe_start_flag(const self: Pblake3_chunk_state): cuint8; inline;
  begin
    Result := 0;
    if self^.blocks_compressed = 0 then
      Result := cuint8(CHUNK_START);
  end;
  // Bundles the arguments into an output_t. The chaining value and a full
  // BLAKE3_BLOCK_LEN bytes from block are copied, so the result does not
  // alias the caller's buffers.
  function make_output(const input_cv: Tblake_cv; const block: pcuint8;
                       block_len: cuint8; counter: cuint64; flags: cuint8): output_t; inline;
  begin
    Result.flags := flags;
    Result.counter := counter;
    Result.block_len := block_len;
    Move(block^, Result.block[0], SizeOf(Result.block));
    Move(input_cv[0], Result.input_cv[0], SizeOf(Result.input_cv));
  end;
  169. {$IF DEFINED(CPUX86_64)}
  170. {$include blake3_sse2.inc}
  171. {$include blake3_sse41.inc}
  172. {$include blake3_avx2.inc}
  173. {$ELSE}
  174. {$include blake3_pas.inc}
  175. {$ENDIF}
  176. {$IF DEFINED(CPUAARCH64)}
  177. {$include blake3_neon.inc}
  178. {$ENDIF}
  // Runtime dispatch table, filled in by the unit's initialization section.
  var
    // The dynamically detected SIMD degree of the current platform
    blake3_simd_degree: csize_t;

    // Compresses one block, updating the chaining value at cv in place.
    blake3_compress_in_place: procedure(cv: pcuint32; const block: pcuint8;
                                        block_len: cuint8; counter: cuint64;
                                        flags: cuint8);

    // Compresses one block and writes the result to out_ (callers here
    // pass a 64-byte buffer); cv is not modified.
    blake3_compress_xof: procedure(const cv: pcuint32; const block: pcuint8;
                                   block_len: cuint8; counter: cuint64;
                                   flags: cuint8; out_: pcuint8);

    // Hashes num_inputs equally sized inputs of `blocks` blocks each and
    // writes one BLAKE3_OUT_LEN-byte chaining value per input to out_.
    blake3_hash_many: procedure(inputs: ppcuint8; num_inputs: csize_t;
                                blocks: csize_t; const key: pcuint32;
                                counter: cuint64; increment_counter: boolean32;
                                flags: cuint8; flags_start: cuint8;
                                flags_end: cuint8; out_: pcuint8);
  // Finishes self^ as a non-root node: compresses its block into a copy
  // of input_cv and stores the resulting eight words as 32 bytes at cv.
  procedure output_chaining_value(const self: Poutput_t; cv: pcuint8); inline;
  var
    state_words: Tblake_cv;
  begin
    // Work on a copy so self^ itself stays untouched.
    Move(self^.input_cv[0], state_words[0], SizeOf(state_words));
    blake3_compress_in_place(state_words, self^.block, self^.block_len,
                             self^.counter, self^.flags);
    store_cv_words(cv, state_words);
  end;
  // Produces out_len bytes of root output starting at byte offset seek of
  // the output stream. Output is generated in 64-byte blocks, one
  // compression (with the ROOT flag added) per block counter value.
  procedure output_root_bytes(const self: Poutput_t; seek: cuint64; out_: pcuint8;
                              out_len: csize_t); inline;
  var
    block_index: cuint64;
    skip: csize_t;
    usable: csize_t;
    portion: csize_t;
    block_bytes: array[0..63] of cuint8;
  begin
    block_index := seek div 64;
    skip := seek mod 64;
    while out_len > 0 do
    begin
      blake3_compress_xof(self^.input_cv, self^.block, self^.block_len,
                          block_index, self^.flags or cuint8(ROOT), block_bytes);
      usable := 64 - skip;
      if usable < out_len then
        portion := usable
      else
        portion := out_len;
      Move(block_bytes[skip], out_^, portion);
      Inc(out_, portion);
      Dec(out_len, portion);
      Inc(block_index);
      // Only the first block can start in the middle.
      skip := 0;
    end;
  end;
  // Absorbs input_len bytes into the chunk state. Full blocks are
  // compressed as they become available, but the last block (even when it
  // is exactly BLAKE3_BLOCK_LEN bytes) always stays in buf so that
  // chunk_state_output can apply the end flag to it.
  procedure chunk_state_update(self: Pblake3_chunk_state; input: pcuint8;
                               input_len: csize_t); inline;
  var
    consumed: csize_t;
  begin
    // First top up a partially filled buffer.
    if self^.buf_len > 0 then
    begin
      consumed := chunk_state_fill_buf(self, input, input_len);
      Inc(input, consumed);
      Dec(input_len, consumed);
      // More input follows, so the buffered block is not the last one.
      if input_len > 0 then
      begin
        blake3_compress_in_place(self^.cv, self^.buf, BLAKE3_BLOCK_LEN,
                                 self^.chunk_counter,
                                 self^.flags or chunk_state_maybe_start_flag(self));
        Inc(self^.blocks_compressed);
        self^.buf_len := 0;
        FillChar(self^.buf[0], BLAKE3_BLOCK_LEN, 0);
      end;
    end;

    // Compress whole blocks straight from the input; strictly "greater
    // than" keeps the final block back.
    while input_len > BLAKE3_BLOCK_LEN do
    begin
      blake3_compress_in_place(self^.cv, input, BLAKE3_BLOCK_LEN,
                               self^.chunk_counter,
                               self^.flags or chunk_state_maybe_start_flag(self));
      Inc(self^.blocks_compressed);
      Inc(input, BLAKE3_BLOCK_LEN);
      Dec(input_len, BLAKE3_BLOCK_LEN);
    end;

    // Whatever remains (at most one block) is buffered.
    chunk_state_fill_buf(self, input, input_len);
  end;
  // Describes the final compression of the current chunk: the buffered
  // block with the chunk's flags, CHUNK_END, and CHUNK_START when no block
  // has been compressed yet.
  function chunk_state_output(const self: Pblake3_chunk_state): output_t; inline;
  var
    final_flags: cuint8;
  begin
    final_flags := cuint8(CHUNK_END) or chunk_state_maybe_start_flag(self) or self^.flags;
    Result := make_output(self^.cv, self^.buf, self^.buf_len,
                          self^.chunk_counter, final_flags);
  end;
  // Describes a parent-node compression: block holds the two child
  // chaining values (one full block), key is the input CV, the counter is
  // 0 and PARENT is added to flags.
  function parent_output(const block: pcuint8; const key: pcuint32; flags: cuint8): output_t; inline;
  var
    parent_flags: cuint8;
  begin
    parent_flags := flags or cuint8(PARENT);
    Result := make_output(key, block, BLAKE3_BLOCK_LEN, 0, parent_flags);
  end;
  // Byte length of the left subtree for content_len bytes of input: the
  // largest power-of-two number of chunks that still leaves at least one
  // byte for the right side (hence the "- 1").
  function left_len(content_len: csize_t): csize_t; inline;
  var
    chunks_before_last_byte: csize_t;
  begin
    chunks_before_last_byte := (content_len - 1) div BLAKE3_CHUNK_LEN;
    Result := BLAKE3_CHUNK_LEN * round_down_to_power_of_2(chunks_before_last_byte);
  end;
  // Hashes up to MAX_SIMD_DEGREE chunks of input in one blake3_hash_many
  // call and writes one chaining value per chunk to out_. A trailing
  // partial chunk is hashed separately through a temporary chunk state.
  // Returns the number of chaining values written.
  function compress_chunks_parallel(const input: pcuint8; input_len: csize_t;
                                    const key: pcuint32;
                                    chunk_counter: cuint64; flags: cuint8;
                                    out_: pcuint8): csize_t; inline;
  var
    offset: csize_t;
    whole_chunks: csize_t;
    tail_state: blake3_chunk_state;
    tail_output: output_t;
    chunk_ptrs: array[0..Pred(MAX_SIMD_DEGREE)] of pcuint8;
  begin
    assert(0 < input_len);
    assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);

    // Collect a pointer to every complete chunk.
    offset := 0;
    whole_chunks := 0;
    while input_len - offset >= BLAKE3_CHUNK_LEN do
    begin
      chunk_ptrs[whole_chunks] := @input[offset];
      Inc(whole_chunks);
      Inc(offset, BLAKE3_CHUNK_LEN);
    end;

    blake3_hash_many(chunk_ptrs, whole_chunks,
                     BLAKE3_CHUNK_LEN div BLAKE3_BLOCK_LEN, key, chunk_counter,
                     true, flags, cuint8(CHUNK_START), cuint8(CHUNK_END), out_);

    Result := whole_chunks;

    // Hash the remaining partial chunk, if there is one. Note that the empty
    // chunk (meaning the empty message) is a different codepath.
    if offset < input_len then
    begin
      chunk_state_init(@tail_state, key, flags);
      tail_state.chunk_counter := chunk_counter + cuint64(whole_chunks);
      chunk_state_update(@tail_state, @input[offset], input_len - offset);
      tail_output := chunk_state_output(@tail_state);
      output_chaining_value(@tail_output, @out_[whole_chunks * BLAKE3_OUT_LEN]);
      Inc(Result);
    end;
  end;
  // Combines adjacent pairs of child chaining values into parent chaining
  // values with a single blake3_hash_many call (each pair is one 64-byte
  // block). An odd child left over is copied to the output unchanged.
  // Returns the number of chaining values written to out_.
  //
  // The pointer array uses pcuint8, the CTypes pointer type used by the
  // rest of the unit and by ppcuint8 (= ^pcuint8), the parameter type it
  // is passed as.
  function compress_parents_parallel(const child_chaining_values: pcuint8;
                                     num_chaining_values: csize_t;
                                     const key: pcuint32; flags: cuint8;
                                     out_: pcuint8): csize_t; inline;
  var
    parents_array_len: csize_t = 0;
    parents_array: array[0..Pred(MAX_SIMD_DEGREE_OR_2)] of pcuint8;
  begin
    assert(2 <= num_chaining_values);
    assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);

    // One entry per complete pair of children.
    while (num_chaining_values - (2 * parents_array_len) >= 2) do
    begin
      parents_array[parents_array_len] :=
        @child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
      parents_array_len += 1;
    end;

    blake3_hash_many(parents_array, parents_array_len, 1, key,
                     0, // Parents always use counter 0.
                     false, flags or cuint8(PARENT),
                     0, // Parents have no start flags.
                     0, // Parents have no end flags.
                     out_);

    // If there's an odd child left over, it becomes an output.
    if (num_chaining_values > 2 * parents_array_len) then
    begin
      Move(child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
           out_[parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
      Result := parents_array_len + 1;
    end
    else begin
      Result := parents_array_len;
    end;
  end;
  // Recursively hashes a subtree of input and writes a row of chaining
  // values to out_, returning how many were written. Inputs of at most
  // blake3_simd_degree chunks are hashed directly; larger inputs are split
  // into a left and a right part, each hashed recursively, followed by one
  // layer of parent compression.
  function blake3_compress_subtree_wide(const input: pcuint8;
                                        input_len: csize_t;
                                        const key: pcuint32;
                                        chunk_counter: cuint64;
                                        flags: cuint8; out_: pcuint8): csize_t;
  var
    left_bytes: csize_t;
    right_bytes: csize_t;
    right_start: pcuint8;
    right_counter: cuint64;
    slot_degree: csize_t;
    right_slot: pcuint8;
    left_count: csize_t;
    right_count: csize_t;
    cv_array: array[0..Pred(2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN)] of cuint8;
  begin
    // Note that the single chunk case does *not* bump the SIMD degree up to 2
    // when it is 1. If this implementation adds multi-threading in the future,
    // this gives us the option of multi-threading even the 2-chunk case, which
    // can help performance on smaller platforms.
    if input_len <= blake3_simd_degree * BLAKE3_CHUNK_LEN then
      Exit(compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out_));

    // With more than simd_degree chunks, we need to recurse. Start by dividing
    // the input into left and right subtrees. (Note that this is only optimal
    // as long as the SIMD degree is a power of 2.)
    left_bytes := left_len(input_len);
    right_bytes := input_len - left_bytes;
    right_start := @input[left_bytes];
    right_counter := chunk_counter + cuint64(left_bytes div BLAKE3_CHUNK_LEN);

    // Decide where the right half's CVs go inside cv_array. A degree of 1
    // is raised to 2 (unless the left side is a single chunk) so that two
    // outputs always fit; cv_array is sized with MAX_SIMD_DEGREE_OR_2 for
    // the same reason.
    slot_degree := blake3_simd_degree;
    if (slot_degree = 1) and (left_bytes > BLAKE3_CHUNK_LEN) then
      slot_degree := 2;
    right_slot := @cv_array[slot_degree * BLAKE3_OUT_LEN];

    // Recurse! If this implementation adds multi-threading support in the
    // future, this is where it will go.
    left_count := blake3_compress_subtree_wide(input, left_bytes, key,
                                               chunk_counter, flags, cv_array);
    right_count := blake3_compress_subtree_wide(right_start, right_bytes, key,
                                                right_counter, flags, right_slot);

    // The special case again. If simd_degree=1, then we'll have left_n=1 and
    // right_n=1. Rather than compressing them into a single output, return
    // them directly, to make sure we always have at least two outputs.
    if left_count = 1 then
    begin
      Move(cv_array[0], out_^, 2 * BLAKE3_OUT_LEN);
      Exit(2);
    end;

    // Otherwise, do one layer of parent node compression.
    Result := compress_parents_parallel(cv_array, left_count + right_count,
                                        key, flags, out_);
  end;
  // Hashes a subtree of more than one chunk and reduces the result to
  // exactly two chaining values (2 * BLAKE3_OUT_LEN bytes) written to out_.
  procedure compress_subtree_to_parent_node(
    const input: pcuint8; input_len: csize_t; const key: pcuint32;
    chunk_counter: cuint64; flags: cuint8; out_: pcuint8); inline;
  var
    cv_count: csize_t;
    level_cvs: array[0..Pred(MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN)] of cuint8;
    merged_cvs: array[0..Pred(MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN div 2)] of cuint8;
  begin
    assert(input_len > BLAKE3_CHUNK_LEN);
    cv_count := blake3_compress_subtree_wide(input, input_len, key,
                                             chunk_counter, flags, level_cvs);
    // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
    // compress_subtree_wide() returns more than 2 chaining values. Condense
    // them into 2 by forming parent nodes repeatedly.
    while cv_count > 2 do
    begin
      cv_count := compress_parents_parallel(level_cvs, cv_count, key, flags, merged_cvs);
      Move(merged_cvs[0], level_cvs[0], cv_count * BLAKE3_OUT_LEN);
    end;
    Move(level_cvs[0], out_^, 2 * BLAKE3_OUT_LEN);
  end;
  // Common initialiser: stores the key words, starts an empty chunk with
  // the given flags and empties the CV stack.
  procedure hasher_init_base(self: Pblake3_hasher; const key: pcuint32;
                             flags: cuint8); inline;
  begin
    self^.cv_stack_len := 0;
    Move(key^, self^.key[0], SizeOf(self^.key));
    chunk_state_init(@self^.chunk, key, flags);
  end;
  // Public initialiser for the plain (unkeyed) hash: key = BLAKE3_IV and
  // no mode flags.
  procedure blake3_hasher_init(self: Pblake3_hasher); inline;
  const
    NO_FLAGS = 0;
  begin
    hasher_init_base(self, BLAKE3_IV, NO_FLAGS);
  end;
  // Merges the top two stack entries into their parent, repeatedly, until
  // the stack holds popcnt(total_len) entries (one per set bit of the
  // chunk count passed in).
  procedure hasher_merge_cv_stack(self: Pblake3_hasher; total_len: cuint64); inline;
  var
    target_len: csize_t;
    top_pair: pcuint8;
    merged: output_t;
  begin
    target_len := csize_t(popcnt(total_len));
    while self^.cv_stack_len > target_len do
    begin
      top_pair := @self^.cv_stack[(self^.cv_stack_len - 2) * BLAKE3_OUT_LEN];
      // parent_output copies the pair, so overwriting the first slot with
      // the parent CV afterwards is safe.
      merged := parent_output(top_pair, self^.key, self^.chunk.flags);
      output_chaining_value(@merged, top_pair);
      Dec(self^.cv_stack_len);
    end;
  end;
  // Pushes a BLAKE3_OUT_LEN-byte chaining value onto the stack, after
  // first merging the entries that chunk_counter shows to be complete.
  procedure hasher_push_cv(self: Pblake3_hasher; new_cv: pcuint8;
                           chunk_counter: cuint64); inline;
  var
    slot: pcuint8;
  begin
    hasher_merge_cv_stack(self, chunk_counter);
    slot := @self^.cv_stack[self^.cv_stack_len * BLAKE3_OUT_LEN];
    Move(new_cv^, slot^, BLAKE3_OUT_LEN);
    Inc(self^.cv_stack_len);
  end;
  // Absorbs input_len bytes from input into the hasher.
  //
  // Three phases: (1) complete a chunk that is already partly filled,
  // (2) hash as many whole power-of-two subtrees as possible directly from
  // the input, (3) buffer the remaining tail (at most one chunk) in the
  // chunk state.
  procedure blake3_hasher_update(self: Pblake3_hasher; const input: Pointer;
                                 input_len: csize_t);
  var
    cursor: pcuint8;
    step: csize_t;
    span: csize_t;
    span_chunks: cuint64;
    bytes_hashed: cuint64;
    node: output_t;
    lone_chunk: blake3_chunk_state;
    single_cv: array[0..Pred(BLAKE3_OUT_LEN)] of cuint8;
    cv_pair: array[0..Pred(2 * BLAKE3_OUT_LEN)] of cuint8;
  begin
    // Nothing to do for empty input; this also means input is never
    // dereferenced when the caller passes a nil pointer with length 0.
    if input_len = 0 then
      Exit;

    cursor := pcuint8(input);

    // Phase 1: if we have some partial chunk bytes in the internal
    // chunk state, we need to finish that chunk first.
    if chunk_state_len(@self^.chunk) > 0 then
    begin
      step := BLAKE3_CHUNK_LEN - chunk_state_len(@self^.chunk);
      if input_len < step then
        step := input_len;
      chunk_state_update(@self^.chunk, cursor, step);
      Inc(cursor, step);
      Dec(input_len, step);

      // Input exhausted: the chunk stays open (it may still be the root).
      if input_len = 0 then
        Exit;

      // The chunk is full and more input follows, so it is not the root:
      // finalise it, push its CV and start the next chunk.
      node := chunk_state_output(@self^.chunk);
      output_chaining_value(@node, single_cv);
      hasher_push_cv(self, single_cv, self^.chunk.chunk_counter);
      chunk_state_reset(@self^.chunk, self^.key, self^.chunk.chunk_counter + 1);
    end;

    // Phase 2: the chunk state is clear and there is more than a single
    // chunk of input (so, definitely not the root chunk). Hash the largest
    // whole subtree we can. Two restrictions:
    // - The subtree has to be a power-of-2 number of chunks. Only subtrees
    //   along the right edge can be incomplete, and we don't know where the
    //   right edge is going to be until finalize.
    // - The subtree must evenly divide the total number of chunks so far
    //   (if that total is not 0): an incomplete subtree waiting for 1 more
    //   chunk has to be completed before a 4-chunk subtree can be hashed.
    // Because the input may have to be broken up to satisfy both, this
    // runs in a loop.
    while input_len > BLAKE3_CHUNK_LEN do
    begin
      span := round_down_to_power_of_2(input_len);
      bytes_hashed := self^.chunk.chunk_counter * BLAKE3_CHUNK_LEN;

      // Shrink span until it evenly divides the count so far. span is a
      // power of 2, so a bit mask replaces the remainder operation. (With
      // equally sized power-of-2 inputs this loop never runs.)
      //
      // An aside: span does not have to shrink quite this much; with a
      // count of 1 chunk we could still hash 2 chunks and get 2 CVs back.
      // That would keep the total odd forever and pin us to 2-chunk
      // subtrees. See https://github.com/BLAKE3-team/BLAKE3/issues/69.
      while (cuint64(span - 1) and bytes_hashed) <> 0 do
        span := span shr 1;

      span_chunks := span div BLAKE3_CHUNK_LEN;

      if span > BLAKE3_CHUNK_LEN then
      begin
        // This is the high-performance happy path, though getting here
        // depends on the caller giving us a long enough input. The subtree
        // comes back as two CVs, one per half.
        compress_subtree_to_parent_node(cursor, span, self^.key,
                                        self^.chunk.chunk_counter,
                                        self^.chunk.flags, cv_pair);
        hasher_push_cv(self, cv_pair, self^.chunk.chunk_counter);
        hasher_push_cv(self, @cv_pair[BLAKE3_OUT_LEN],
                       self^.chunk.chunk_counter + (span_chunks div 2));
      end
      else
      begin
        // The shrunken span is a single chunk: hash it by itself through
        // a temporary chunk state.
        chunk_state_init(@lone_chunk, self^.key, self^.chunk.flags);
        lone_chunk.chunk_counter := self^.chunk.chunk_counter;
        chunk_state_update(@lone_chunk, cursor, span);
        node := chunk_state_output(@lone_chunk);
        output_chaining_value(@node, single_cv);
        hasher_push_cv(self, single_cv, lone_chunk.chunk_counter);
      end;

      Inc(self^.chunk.chunk_counter, span_chunks);
      Inc(cursor, span);
      Dec(input_len, span);
    end;

    // Phase 3: add the remaining input (at most one chunk) to the chunk
    // state, then do a final merge loop so the stack holds no unmerged
    // pairs. The remaining input means these merges are non-root. The
    // merge is not strictly necessary here, because hasher_push_cv already
    // runs its own merge loop, but it simplifies
    // blake3_hasher_finalize_seek.
    if input_len > 0 then
    begin
      chunk_state_update(@self^.chunk, cursor, input_len);
      hasher_merge_cv_stack(self, self^.chunk.chunk_counter);
    end;
  end;
  // Writes out_len bytes of output, starting at byte offset seek of the
  // output stream, to out_. The hasher itself is not modified.
  procedure blake3_hasher_finalize_seek(const self: Pblake3_hasher; seek: cuint64;
                                        out_: pcuint8; out_len: csize_t);
  var
    node: output_t;
    pending: csize_t;
    parent_bytes: array[0..Pred(BLAKE3_BLOCK_LEN)] of cuint8;
  begin
    // Nothing requested: return before out_ is touched (it may be nil).
    if out_len = 0 then
      Exit;

    // If the subtree stack is empty, then the current chunk is the root.
    if self^.cv_stack_len = 0 then
    begin
      node := chunk_state_output(@self^.chunk);
      output_root_bytes(@node, seek, out_, out_len);
      Exit;
    end;

    // Choose the starting node of the roll-up merge.
    if chunk_state_len(@self^.chunk) = 0 then
    begin
      // No bytes in the chunk state: the top of the stack is a chunk hash,
      // so start from the parent of the top two entries. There are always
      // at least 2 CVs in the stack in this case.
      pending := self^.cv_stack_len - 2;
      node := parent_output(@self^.cv_stack[pending * BLAKE3_OUT_LEN], self^.key,
                            self^.chunk.flags);
    end
    else
    begin
      // Bytes are pending: finalise that chunk and merge it with every
      // subtree in the stack. The extra merge loop at the end of
      // blake3_hasher_update guarantees that none of the stacked subtrees
      // need to be merged with each other first.
      pending := self^.cv_stack_len;
      node := chunk_state_output(@self^.chunk);
    end;

    // Walk down the stack: stack entry on the left, running node on the
    // right, forming a new parent each time.
    while pending > 0 do
    begin
      Dec(pending);
      Move(self^.cv_stack[pending * BLAKE3_OUT_LEN], parent_bytes[0], BLAKE3_OUT_LEN);
      output_chaining_value(@node, @parent_bytes[BLAKE3_OUT_LEN]);
      node := parent_output(parent_bytes, self^.key, self^.chunk.flags);
    end;

    output_root_bytes(@node, seek, out_, out_len);
  end;
  // Writes the first out_len bytes of the output stream to out_
  // (blake3_hasher_finalize_seek with an offset of 0).
  procedure blake3_hasher_finalize(const self: Pblake3_hasher; out_: pcuint8; out_len: csize_t);
  const
    STREAM_START = 0;
  begin
    blake3_hasher_finalize_seek(self, STREAM_START, out_, out_len);
  end;
  // Selects the compression back end once, when the unit is loaded.
  initialization
  {$IF DEFINED(CPUX86_64)}
    if AVX2Support then
    begin
      // AVX2 only provides hash_many; single-block compression uses SSE4.1.
      blake3_hash_many         := @blake3_hash_many_avx2;
      blake3_compress_xof      := @blake3_compress_xof_sse41;
      blake3_compress_in_place := @blake3_compress_in_place_sse41;
      blake3_simd_degree       := 8;
    end
    else if SSE41Support then
    begin
      blake3_hash_many         := @blake3_hash_many_sse41;
      blake3_compress_xof      := @blake3_compress_xof_sse41;
      blake3_compress_in_place := @blake3_compress_in_place_sse41;
      blake3_simd_degree       := 4;
    end
    else
    begin
      // Fallback when neither AVX2 nor SSE4.1 is reported.
      blake3_hash_many         := @blake3_hash_many_sse2;
      blake3_compress_xof      := @blake3_compress_xof_sse2;
      blake3_compress_in_place := @blake3_compress_in_place_sse2;
      blake3_simd_degree       := 4;
    end;
  {$ELSEIF DEFINED(CPUAARCH64)}
    // NEON provides hash_many; single-block compression is portable code.
    blake3_hash_many         := @blake3_hash_many_neon;
    blake3_compress_xof      := @blake3_compress_xof_portable;
    blake3_compress_in_place := @blake3_compress_in_place_portable;
    blake3_simd_degree       := 4;
  {$ELSE}
    blake3_hash_many         := @blake3_hash_many_portable;
    blake3_compress_xof      := @blake3_compress_xof_portable;
    blake3_compress_in_place := @blake3_compress_in_place_portable;
    blake3_simd_degree       := 1;
  {$ENDIF}
  end.