package chacha20_ref

import "core:crypto/_chacha20"
import "core:encoding/endian"
import "core:math/bits"
stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
	// Enforce the maximum consumed keystream per IV.
	_chacha20.check_counter_limit(ctx, nr_blocks)

	dst, src := dst, src
	x := &ctx._s
	for n := 0; n < nr_blocks; n = n + 1 {
		x0, x1, x2, x3 :=
			_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 :=
			x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]

		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
			// Even when forcing inlining, manually inlining all of
			// these is decently faster.
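			// Each quarter round below follows the RFC 8439 sequence:
			//   a += b; d ~= a; d = rotl(d, 16)
			//   c += d; b ~= c; b = rotl(b, 12)
			//   a += b; d ~= a; d = rotl(d, 8)
			//   c += d; b ~= c; b = rotl(b, 7)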
			// quarterround(x, 0, 4, 8, 12)
			x0 += x4
			x12 ~= x0
			x12 = bits.rotate_left32(x12, 16)
			x8 += x12
			x4 ~= x8
			x4 = bits.rotate_left32(x4, 12)
			x0 += x4
			x12 ~= x0
			x12 = bits.rotate_left32(x12, 8)
			x8 += x12
			x4 ~= x8
			x4 = bits.rotate_left32(x4, 7)
			// quarterround(x, 1, 5, 9, 13)
			x1 += x5
			x13 ~= x1
			x13 = bits.rotate_left32(x13, 16)
			x9 += x13
			x5 ~= x9
			x5 = bits.rotate_left32(x5, 12)
			x1 += x5
			x13 ~= x1
			x13 = bits.rotate_left32(x13, 8)
			x9 += x13
			x5 ~= x9
			x5 = bits.rotate_left32(x5, 7)
			// quarterround(x, 2, 6, 10, 14)
			x2 += x6
			x14 ~= x2
			x14 = bits.rotate_left32(x14, 16)
			x10 += x14
			x6 ~= x10
			x6 = bits.rotate_left32(x6, 12)
			x2 += x6
			x14 ~= x2
			x14 = bits.rotate_left32(x14, 8)
			x10 += x14
			x6 ~= x10
			x6 = bits.rotate_left32(x6, 7)
			// quarterround(x, 3, 7, 11, 15)
			x3 += x7
			x15 ~= x3
			x15 = bits.rotate_left32(x15, 16)
			x11 += x15
			x7 ~= x11
			x7 = bits.rotate_left32(x7, 12)
			x3 += x7
			x15 ~= x3
			x15 = bits.rotate_left32(x15, 8)
			x11 += x15
			x7 ~= x11
			x7 = bits.rotate_left32(x7, 7)
			// quarterround(x, 0, 5, 10, 15)
			x0 += x5
			x15 ~= x0
			x15 = bits.rotate_left32(x15, 16)
			x10 += x15
			x5 ~= x10
			x5 = bits.rotate_left32(x5, 12)
			x0 += x5
			x15 ~= x0
			x15 = bits.rotate_left32(x15, 8)
			x10 += x15
			x5 ~= x10
			x5 = bits.rotate_left32(x5, 7)
			// quarterround(x, 1, 6, 11, 12)
			x1 += x6
			x12 ~= x1
			x12 = bits.rotate_left32(x12, 16)
			x11 += x12
			x6 ~= x11
			x6 = bits.rotate_left32(x6, 12)
			x1 += x6
			x12 ~= x1
			x12 = bits.rotate_left32(x12, 8)
			x11 += x12
			x6 ~= x11
			x6 = bits.rotate_left32(x6, 7)
			// quarterround(x, 2, 7, 8, 13)
			x2 += x7
			x13 ~= x2
			x13 = bits.rotate_left32(x13, 16)
			x8 += x13
			x7 ~= x8
			x7 = bits.rotate_left32(x7, 12)
			x2 += x7
			x13 ~= x2
			x13 = bits.rotate_left32(x13, 8)
			x8 += x13
			x7 ~= x8
			x7 = bits.rotate_left32(x7, 7)
			// quarterround(x, 3, 4, 9, 14)
			x3 += x4
			x14 ~= x3
			x14 = bits.rotate_left32(x14, 16)
			x9 += x14
			x4 ~= x9
			x4 = bits.rotate_left32(x4, 12)
			x3 += x4
			x14 ~= x3
			x14 = bits.rotate_left32(x14, 8)
			x9 += x14
			x4 ~= x9
			x4 = bits.rotate_left32(x4, 7)
		}
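		// Feed-forward: add the initial state back into the working
		// state, which makes the block function one-way.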
		x0 += _chacha20.SIGMA_0
		x1 += _chacha20.SIGMA_1
		x2 += _chacha20.SIGMA_2
		x3 += _chacha20.SIGMA_3
		x4 += x[4]
		x5 += x[5]
		x6 += x[6]
		x7 += x[7]
		x8 += x[8]
		x9 += x[9]
		x10 += x[10]
		x11 += x[11]
		x12 += x[12]
		x13 += x[13]
		x14 += x[14]
		x15 += x[15]
		// Bounds checking is skipped when serializing the keystream:
		// - The caller(s) ensure that src/dst are valid.
		// - The compiler knows if the target is picky about alignment.
		#no_bounds_check {
			if src != nil {
				endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0)
				endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1)
				endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2)
				endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3)
				endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4)
				endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5)
				endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6)
				endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7)
				endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8)
				endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9)
				endian.unchecked_put_u32le(
					dst[40:44],
					endian.unchecked_get_u32le(src[40:44]) ~ x10,
				)
				endian.unchecked_put_u32le(
					dst[44:48],
					endian.unchecked_get_u32le(src[44:48]) ~ x11,
				)
				endian.unchecked_put_u32le(
					dst[48:52],
					endian.unchecked_get_u32le(src[48:52]) ~ x12,
				)
				endian.unchecked_put_u32le(
					dst[52:56],
					endian.unchecked_get_u32le(src[52:56]) ~ x13,
				)
				endian.unchecked_put_u32le(
					dst[56:60],
					endian.unchecked_get_u32le(src[56:60]) ~ x14,
				)
				endian.unchecked_put_u32le(
					dst[60:64],
					endian.unchecked_get_u32le(src[60:64]) ~ x15,
				)

				src = src[_chacha20.BLOCK_SIZE:]
			} else {
				endian.unchecked_put_u32le(dst[0:4], x0)
				endian.unchecked_put_u32le(dst[4:8], x1)
				endian.unchecked_put_u32le(dst[8:12], x2)
				endian.unchecked_put_u32le(dst[12:16], x3)
				endian.unchecked_put_u32le(dst[16:20], x4)
				endian.unchecked_put_u32le(dst[20:24], x5)
				endian.unchecked_put_u32le(dst[24:28], x6)
				endian.unchecked_put_u32le(dst[28:32], x7)
				endian.unchecked_put_u32le(dst[32:36], x8)
				endian.unchecked_put_u32le(dst[36:40], x9)
				endian.unchecked_put_u32le(dst[40:44], x10)
				endian.unchecked_put_u32le(dst[44:48], x11)
				endian.unchecked_put_u32le(dst[48:52], x12)
				endian.unchecked_put_u32le(dst[52:56], x13)
				endian.unchecked_put_u32le(dst[56:60], x14)
				endian.unchecked_put_u32le(dst[60:64], x15)
			}

			dst = dst[_chacha20.BLOCK_SIZE:]
		}
		// Increment the block counter.  Overflow checking was already
		// done on entry into the routine, so a single 64-bit increment
		// safely covers both the 32-bit (IETF) and 64-bit (original)
		// counter variants.
		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
		x[12] = u32(new_ctr)
		x[13] = u32(new_ctr >> 32)
	}
}
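
// hchacha20 implements the HChaCha20 function, which derives a 256-bit
// subkey from a 256-bit key and a 128-bit IV.  It is used to construct
// XChaCha20 on top of the ChaCha20 block function.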
hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
	x0, x1, x2, x3 := _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
	x4 := endian.unchecked_get_u32le(key[0:4])
	x5 := endian.unchecked_get_u32le(key[4:8])
	x6 := endian.unchecked_get_u32le(key[8:12])
	x7 := endian.unchecked_get_u32le(key[12:16])
	x8 := endian.unchecked_get_u32le(key[16:20])
	x9 := endian.unchecked_get_u32le(key[20:24])
	x10 := endian.unchecked_get_u32le(key[24:28])
	x11 := endian.unchecked_get_u32le(key[28:32])
	x12 := endian.unchecked_get_u32le(iv[0:4])
	x13 := endian.unchecked_get_u32le(iv[4:8])
	x14 := endian.unchecked_get_u32le(iv[8:12])
	x15 := endian.unchecked_get_u32le(iv[12:16])
	for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
		// quarterround(x, 0, 4, 8, 12)
		x0 += x4
		x12 ~= x0
		x12 = bits.rotate_left32(x12, 16)
		x8 += x12
		x4 ~= x8
		x4 = bits.rotate_left32(x4, 12)
		x0 += x4
		x12 ~= x0
		x12 = bits.rotate_left32(x12, 8)
		x8 += x12
		x4 ~= x8
		x4 = bits.rotate_left32(x4, 7)
		// quarterround(x, 1, 5, 9, 13)
		x1 += x5
		x13 ~= x1
		x13 = bits.rotate_left32(x13, 16)
		x9 += x13
		x5 ~= x9
		x5 = bits.rotate_left32(x5, 12)
		x1 += x5
		x13 ~= x1
		x13 = bits.rotate_left32(x13, 8)
		x9 += x13
		x5 ~= x9
		x5 = bits.rotate_left32(x5, 7)
		// quarterround(x, 2, 6, 10, 14)
		x2 += x6
		x14 ~= x2
		x14 = bits.rotate_left32(x14, 16)
		x10 += x14
		x6 ~= x10
		x6 = bits.rotate_left32(x6, 12)
		x2 += x6
		x14 ~= x2
		x14 = bits.rotate_left32(x14, 8)
		x10 += x14
		x6 ~= x10
		x6 = bits.rotate_left32(x6, 7)
		// quarterround(x, 3, 7, 11, 15)
		x3 += x7
		x15 ~= x3
		x15 = bits.rotate_left32(x15, 16)
		x11 += x15
		x7 ~= x11
		x7 = bits.rotate_left32(x7, 12)
		x3 += x7
		x15 ~= x3
		x15 = bits.rotate_left32(x15, 8)
		x11 += x15
		x7 ~= x11
		x7 = bits.rotate_left32(x7, 7)
		// quarterround(x, 0, 5, 10, 15)
		x0 += x5
		x15 ~= x0
		x15 = bits.rotate_left32(x15, 16)
		x10 += x15
		x5 ~= x10
		x5 = bits.rotate_left32(x5, 12)
		x0 += x5
		x15 ~= x0
		x15 = bits.rotate_left32(x15, 8)
		x10 += x15
		x5 ~= x10
		x5 = bits.rotate_left32(x5, 7)
		// quarterround(x, 1, 6, 11, 12)
		x1 += x6
		x12 ~= x1
		x12 = bits.rotate_left32(x12, 16)
		x11 += x12
		x6 ~= x11
		x6 = bits.rotate_left32(x6, 12)
		x1 += x6
		x12 ~= x1
		x12 = bits.rotate_left32(x12, 8)
		x11 += x12
		x6 ~= x11
		x6 = bits.rotate_left32(x6, 7)
		// quarterround(x, 2, 7, 8, 13)
		x2 += x7
		x13 ~= x2
		x13 = bits.rotate_left32(x13, 16)
		x8 += x13
		x7 ~= x8
		x7 = bits.rotate_left32(x7, 12)
		x2 += x7
		x13 ~= x2
		x13 = bits.rotate_left32(x13, 8)
		x8 += x13
		x7 ~= x8
		x7 = bits.rotate_left32(x7, 7)
		// quarterround(x, 3, 4, 9, 14)
		x3 += x4
		x14 ~= x3
		x14 = bits.rotate_left32(x14, 16)
		x9 += x14
		x4 ~= x9
		x4 = bits.rotate_left32(x4, 12)
		x3 += x4
		x14 ~= x3
		x14 = bits.rotate_left32(x14, 8)
		x9 += x14
		x4 ~= x9
		x4 = bits.rotate_left32(x4, 7)
	}
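	// The HChaCha20 output is the first and last rows of the final state
	// (x0..x3 and x12..x15); unlike the stream cipher, no feed-forward
	// addition is performed.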
	endian.unchecked_put_u32le(dst[0:4], x0)
	endian.unchecked_put_u32le(dst[4:8], x1)
	endian.unchecked_put_u32le(dst[8:12], x2)
	endian.unchecked_put_u32le(dst[12:16], x3)
	endian.unchecked_put_u32le(dst[16:20], x12)
	endian.unchecked_put_u32le(dst[20:24], x13)
	endian.unchecked_put_u32le(dst[24:28], x14)
	endian.unchecked_put_u32le(dst[28:32], x15)
}