Browse Source

core/crypto: Add poly1305

This package implements the Poly1305 MAC algorithm as specified in RFC
8439, using routines taked from fiat-crypto and poly1305-donna.
Yawning Angel 3 years ago
parent
commit
64db286582

+ 45 - 0
core/crypto/_fiat/field_poly1305/field.odin

@@ -0,0 +1,45 @@
+package field_poly1305
+
+import "core:crypto/util"
+import "core:mem"
+
+fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element {
+	return transmute(^Loose_Field_Element)(arg1)
+}
+
+fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element {
+	return transmute(^Tight_Field_Element)(arg1)
+}
+
+fe_from_bytes :: proc (out1: ^Tight_Field_Element, arg1: []byte, arg2: byte, sanitize: bool = true) {
+	// fiat-crypto's deserialization routine wants 256-bits of input, but
+	// r/s are 128-bits long, and block processing works on 128-bits plus a
+	// final bit.
+	//
+	// This is more ergonomic, and while the copy is unfortunate, this avoids
+	// having to alter the fiat-crypto derived code.
+
+	assert(len(arg1) == 16)
+
+	tmp: [32]byte
+	copy_slice(tmp[0:16], arg1[:])
+	tmp[16] = arg2
+
+	_fe_from_bytes(out1, &tmp)
+
+	// Need to sanitize the temporary buffer when deserializing `s`.
+	if sanitize {
+		mem.zero_explicit(&tmp, size_of(tmp))
+	}
+}
+
+fe_from_u64s :: proc "contextless" (out1: ^Tight_Field_Element, lo, hi: u64) {
+	tmp: [32]byte
+	util.PUT_U64_LE(tmp[0:8], lo)
+	util.PUT_U64_LE(tmp[8:16], hi)
+
+	_fe_from_bytes(out1, &tmp)
+
+	// This routine is only used to deserialize `r` which is confidential.
+	mem.zero_explicit(&tmp, size_of(tmp))
+}

+ 356 - 0
core/crypto/_fiat/field_poly1305/field4344.odin

@@ -0,0 +1,356 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     1. Redistributions of source code must retain the above copyright
+//        notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package field_poly1305
+
+// This file provides arithmetic on the field Z/(2^130 - 5) using
+// unsaturated 64-bit integer arithmetic.  It is derived primarily
+// from the machine generate Golang output from the fiat-crypto project.
+//
+// While the base implementation is provably correct, this implementation
+// makes no such claims as the port and optimizations were done by hand.
+// At some point, it may be worth adding support to fiat-crypto for
+// generating Odin output.
+
+import fiat "core:crypto/_fiat"
+import "core:math/bits"
+
+Loose_Field_Element :: distinct [3]u64
+Tight_Field_Element :: distinct [3]u64
+
+_addcarryx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	x1 := ((u64(arg1) + arg2) + arg3)
+	x2 := (x1 & 0xfffffffffff)
+	x3 := fiat.u1((x1 >> 44))
+	out1 = x2
+	out2 = x3
+	return
+}
+
+_subborrowx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	x1 := ((i64(arg2) - i64(arg1)) - i64(arg3))
+	x2 := fiat.i1((x1 >> 44))
+	x3 := (u64(x1) & 0xfffffffffff)
+	out1 = x3
+	out2 = (0x0 - fiat.u1(x2))
+	return
+}
+
+_addcarryx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	x1 := ((u64(arg1) + arg2) + arg3)
+	x2 := (x1 & 0x7ffffffffff)
+	x3 := fiat.u1((x1 >> 43))
+	out1 = x2
+	out2 = x3
+	return
+}
+
+_subborrowx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	x1 := ((i64(arg2) - i64(arg1)) - i64(arg3))
+	x2 := fiat.i1((x1 >> 43))
+	x3 := (u64(x1) & 0x7ffffffffff)
+	out1 = x3
+	out2 = (0x0 - fiat.u1(x2))
+	return
+}
+
+fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
+	x2, x1 := bits.mul_u64(arg1[2], (arg2[2] * 0x5))
+	x4, x3 := bits.mul_u64(arg1[2], (arg2[1] * 0xa))
+	x6, x5 := bits.mul_u64(arg1[1], (arg2[2] * 0xa))
+	x8, x7 := bits.mul_u64(arg1[2], arg2[0])
+	x10, x9 := bits.mul_u64(arg1[1], (arg2[1] * 0x2))
+	x12, x11 := bits.mul_u64(arg1[1], arg2[0])
+	x14, x13 := bits.mul_u64(arg1[0], arg2[2])
+	x16, x15 := bits.mul_u64(arg1[0], arg2[1])
+	x18, x17 := bits.mul_u64(arg1[0], arg2[0])
+	x19, x20 := bits.add_u64(x5, x3, u64(0x0))
+	x21, _ := bits.add_u64(x6, x4, u64(fiat.u1(x20)))
+	x23, x24 := bits.add_u64(x17, x19, u64(0x0))
+	x25, _ := bits.add_u64(x18, x21, u64(fiat.u1(x24)))
+	x27 := ((x23 >> 44) | ((x25 << 20) & 0xffffffffffffffff))
+	x28 := (x23 & 0xfffffffffff)
+	x29, x30 := bits.add_u64(x9, x7, u64(0x0))
+	x31, _ := bits.add_u64(x10, x8, u64(fiat.u1(x30)))
+	x33, x34 := bits.add_u64(x13, x29, u64(0x0))
+	x35, _ := bits.add_u64(x14, x31, u64(fiat.u1(x34)))
+	x37, x38 := bits.add_u64(x11, x1, u64(0x0))
+	x39, _ := bits.add_u64(x12, x2, u64(fiat.u1(x38)))
+	x41, x42 := bits.add_u64(x15, x37, u64(0x0))
+	x43, _ := bits.add_u64(x16, x39, u64(fiat.u1(x42)))
+	x45, x46 := bits.add_u64(x27, x41, u64(0x0))
+	x47 := (u64(fiat.u1(x46)) + x43)
+	x48 := ((x45 >> 43) | ((x47 << 21) & 0xffffffffffffffff))
+	x49 := (x45 & 0x7ffffffffff)
+	x50, x51 := bits.add_u64(x48, x33, u64(0x0))
+	x52 := (u64(fiat.u1(x51)) + x35)
+	x53 := ((x50 >> 43) | ((x52 << 21) & 0xffffffffffffffff))
+	x54 := (x50 & 0x7ffffffffff)
+	x55 := (x53 * 0x5)
+	x56 := (x28 + x55)
+	x57 := (x56 >> 44)
+	x58 := (x56 & 0xfffffffffff)
+	x59 := (x57 + x49)
+	x60 := fiat.u1((x59 >> 43))
+	x61 := (x59 & 0x7ffffffffff)
+	x62 := (u64(x60) + x54)
+	out1[0] = x58
+	out1[1] = x61
+	out1[2] = x62
+}
+
+fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	x1 := (arg1[2] * 0x5)
+	x2 := (x1 * 0x2)
+	x3 := (arg1[2] * 0x2)
+	x4 := (arg1[1] * 0x2)
+	x6, x5 := bits.mul_u64(arg1[2], x1)
+	x8, x7 := bits.mul_u64(arg1[1], (x2 * 0x2))
+	x10, x9 := bits.mul_u64(arg1[1], (arg1[1] * 0x2))
+	x12, x11 := bits.mul_u64(arg1[0], x3)
+	x14, x13 := bits.mul_u64(arg1[0], x4)
+	x16, x15 := bits.mul_u64(arg1[0], arg1[0])
+	x17, x18 := bits.add_u64(x15, x7, u64(0x0))
+	x19, _ := bits.add_u64(x16, x8, u64(fiat.u1(x18)))
+	x21 := ((x17 >> 44) | ((x19 << 20) & 0xffffffffffffffff))
+	x22 := (x17 & 0xfffffffffff)
+	x23, x24 := bits.add_u64(x11, x9, u64(0x0))
+	x25, _ := bits.add_u64(x12, x10, u64(fiat.u1(x24)))
+	x27, x28 := bits.add_u64(x13, x5, u64(0x0))
+	x29, _ := bits.add_u64(x14, x6, u64(fiat.u1(x28)))
+	x31, x32 := bits.add_u64(x21, x27, u64(0x0))
+	x33 := (u64(fiat.u1(x32)) + x29)
+	x34 := ((x31 >> 43) | ((x33 << 21) & 0xffffffffffffffff))
+	x35 := (x31 & 0x7ffffffffff)
+	x36, x37 := bits.add_u64(x34, x23, u64(0x0))
+	x38 := (u64(fiat.u1(x37)) + x25)
+	x39 := ((x36 >> 43) | ((x38 << 21) & 0xffffffffffffffff))
+	x40 := (x36 & 0x7ffffffffff)
+	x41 := (x39 * 0x5)
+	x42 := (x22 + x41)
+	x43 := (x42 >> 44)
+	x44 := (x42 & 0xfffffffffff)
+	x45 := (x43 + x35)
+	x46 := fiat.u1((x45 >> 43))
+	x47 := (x45 & 0x7ffffffffff)
+	x48 := (u64(x46) + x40)
+	out1[0] = x44
+	out1[1] = x47
+	out1[2] = x48
+}
+
+fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	x1 := arg1[0]
+	x2 := ((x1 >> 44) + arg1[1])
+	x3 := ((x2 >> 43) + arg1[2])
+	x4 := ((x1 & 0xfffffffffff) + ((x3 >> 43) * 0x5))
+	x5 := (u64(fiat.u1((x4 >> 44))) + (x2 & 0x7ffffffffff))
+	x6 := (x4 & 0xfffffffffff)
+	x7 := (x5 & 0x7ffffffffff)
+	x8 := (u64(fiat.u1((x5 >> 43))) + (x3 & 0x7ffffffffff))
+	out1[0] = x6
+	out1[1] = x7
+	out1[2] = x8
+}
+
+fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	x1 := (arg1[0] + arg2[0])
+	x2 := (arg1[1] + arg2[1])
+	x3 := (arg1[2] + arg2[2])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+}
+
+fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	x1 := ((0x1ffffffffff6 + arg1[0]) - arg2[0])
+	x2 := ((0xffffffffffe + arg1[1]) - arg2[1])
+	x3 := ((0xffffffffffe + arg1[2]) - arg2[2])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+}
+
+fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	x1 := (0x1ffffffffff6 - arg1[0])
+	x2 := (0xffffffffffe - arg1[1])
+	x3 := (0xffffffffffe - arg1[2])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+}
+
+fe_cond_assign :: proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: bool) {
+	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0])
+	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
+	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+}
+
+fe_to_bytes :: proc "contextless" (out1: ^[32]byte, arg1: ^Tight_Field_Element) {
+	x1, x2 := _subborrowx_u44(0x0, arg1[0], 0xffffffffffb)
+	x3, x4 := _subborrowx_u43(x2, arg1[1], 0x7ffffffffff)
+	x5, x6 := _subborrowx_u43(x4, arg1[2], 0x7ffffffffff)
+	x7 := fiat.cmovznz_u64(x6, u64(0x0), 0xffffffffffffffff)
+	x8, x9 := _addcarryx_u44(0x0, x1, (x7 & 0xffffffffffb))
+	x10, x11 := _addcarryx_u43(x9, x3, (x7 & 0x7ffffffffff))
+	x12, _ := _addcarryx_u43(x11, x5, (x7 & 0x7ffffffffff))
+	x14 := (x12 << 7)
+	x15 := (x10 << 4)
+	x16 := (u8(x8) & 0xff)
+	x17 := (x8 >> 8)
+	x18 := (u8(x17) & 0xff)
+	x19 := (x17 >> 8)
+	x20 := (u8(x19) & 0xff)
+	x21 := (x19 >> 8)
+	x22 := (u8(x21) & 0xff)
+	x23 := (x21 >> 8)
+	x24 := (u8(x23) & 0xff)
+	x25 := u8((x23 >> 8))
+	x26 := (x15 + u64(x25))
+	x27 := (u8(x26) & 0xff)
+	x28 := (x26 >> 8)
+	x29 := (u8(x28) & 0xff)
+	x30 := (x28 >> 8)
+	x31 := (u8(x30) & 0xff)
+	x32 := (x30 >> 8)
+	x33 := (u8(x32) & 0xff)
+	x34 := (x32 >> 8)
+	x35 := (u8(x34) & 0xff)
+	x36 := u8((x34 >> 8))
+	x37 := (x14 + u64(x36))
+	x38 := (u8(x37) & 0xff)
+	x39 := (x37 >> 8)
+	x40 := (u8(x39) & 0xff)
+	x41 := (x39 >> 8)
+	x42 := (u8(x41) & 0xff)
+	x43 := (x41 >> 8)
+	x44 := (u8(x43) & 0xff)
+	x45 := (x43 >> 8)
+	x46 := (u8(x45) & 0xff)
+	x47 := (x45 >> 8)
+	x48 := (u8(x47) & 0xff)
+	x49 := u8((x47 >> 8))
+	out1[0] = x16
+	out1[1] = x18
+	out1[2] = x20
+	out1[3] = x22
+	out1[4] = x24
+	out1[5] = x27
+	out1[6] = x29
+	out1[7] = x31
+	out1[8] = x33
+	out1[9] = x35
+	out1[10] = x38
+	out1[11] = x40
+	out1[12] = x42
+	out1[13] = x44
+	out1[14] = x46
+	out1[15] = x48
+	out1[16] = x49
+}
+
+_fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
+	x1 := (u64(arg1[16]) << 41)
+	x2 := (u64(arg1[15]) << 33)
+	x3 := (u64(arg1[14]) << 25)
+	x4 := (u64(arg1[13]) << 17)
+	x5 := (u64(arg1[12]) << 9)
+	x6 := (u64(arg1[11]) * u64(0x2))
+	x7 := (u64(arg1[10]) << 36)
+	x8 := (u64(arg1[9]) << 28)
+	x9 := (u64(arg1[8]) << 20)
+	x10 := (u64(arg1[7]) << 12)
+	x11 := (u64(arg1[6]) << 4)
+	x12 := (u64(arg1[5]) << 40)
+	x13 := (u64(arg1[4]) << 32)
+	x14 := (u64(arg1[3]) << 24)
+	x15 := (u64(arg1[2]) << 16)
+	x16 := (u64(arg1[1]) << 8)
+	x17 := arg1[0]
+	x18 := (x16 + u64(x17))
+	x19 := (x15 + x18)
+	x20 := (x14 + x19)
+	x21 := (x13 + x20)
+	x22 := (x12 + x21)
+	x23 := (x22 & 0xfffffffffff)
+	x24 := u8((x22 >> 44))
+	x25 := (x11 + u64(x24))
+	x26 := (x10 + x25)
+	x27 := (x9 + x26)
+	x28 := (x8 + x27)
+	x29 := (x7 + x28)
+	x30 := (x29 & 0x7ffffffffff)
+	x31 := fiat.u1((x29 >> 43))
+	x32 := (x6 + u64(x31))
+	x33 := (x5 + x32)
+	x34 := (x4 + x33)
+	x35 := (x3 + x34)
+	x36 := (x2 + x35)
+	x37 := (x1 + x36)
+	out1[0] = x23
+	out1[1] = x30
+	out1[2] = x37
+}
+
+fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+}
+
+// The following routines were added by hand, and do not come from fiat-crypto.
+
+fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0] = 0
+	out1[1] = 0
+	out1[2] = 0
+}
+
+fe_set :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+}
+
+fe_cond_swap :: proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: bool) {
+	mask := -u64(arg1)
+	x := (out1[0] ~ out2[0]) & mask
+	x1, y1 := out1[0] ~ x, out2[0] ~ x
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	out1[0], out2[0] = x1, y1
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+}

+ 163 - 0
core/crypto/poly1305/poly1305.odin

@@ -0,0 +1,163 @@
+package poly1305
+
+import "core:crypto"
+import "core:crypto/util"
+import field "core:crypto/_fiat/field_poly1305"
+import "core:mem"
+
+KEY_SIZE :: 32
+TAG_SIZE :: 16
+
+_BLOCK_SIZE :: 16
+
+sum :: proc (dst, msg, key: []byte) {
+	ctx: Context = ---
+
+	init(&ctx, key)
+	update(&ctx, msg)
+	final(&ctx, dst)
+}
+
+verify :: proc (tag, msg, key: []byte) -> bool {
+	ctx: Context = ---
+	derived_tag: [16]byte = ---
+
+	if len(tag) != TAG_SIZE {
+		panic("crypto/poly1305: invalid tag size")
+	}
+
+	init(&ctx, key)
+	update(&ctx, msg)
+	final(&ctx, derived_tag[:])
+
+	return crypto.compare_constant_time(derived_tag[:], tag) == 1
+}
+
+Context :: struct {
+	_r: field.Tight_Field_Element,
+	_a: field.Tight_Field_Element,
+	_s: field.Tight_Field_Element,
+
+	_buffer: [_BLOCK_SIZE]byte,
+	_leftover: int,
+
+	_is_initialized: bool,
+}
+
+init :: proc (ctx: ^Context, key: []byte) {
+	if len(key) != KEY_SIZE {
+		panic("crypto/poly1305: invalid key size")
+	}
+
+	// r = le_bytes_to_num(key[0..15])
+	// r = clamp(r) (r &= 0xffffffc0ffffffc0ffffffc0fffffff)
+	tmp_lo := util.U64_LE(key[0:8]) & 0x0ffffffc0fffffff
+	tmp_hi := util.U64_LE(key[8:16]) & 0xffffffc0ffffffc
+	field.fe_from_u64s(&ctx._r, tmp_lo, tmp_hi)
+
+	// s = le_bytes_to_num(key[16..31])
+	field.fe_from_bytes(&ctx._s, key[16:32], 0)
+
+	// a = 0
+	field.fe_zero(&ctx._a)
+
+	// No leftover in buffer
+	ctx._leftover = 0
+
+	ctx._is_initialized = true
+}
+
+update :: proc (ctx: ^Context, data: []byte) {
+	assert(ctx._is_initialized)
+
+	msg := data
+	msg_len := len(data)
+
+	// Handle leftover
+	if ctx._leftover > 0 {
+		want := min(_BLOCK_SIZE - ctx._leftover, msg_len)
+		copy_slice(ctx._buffer[ctx._leftover:], msg[:want])
+		msg_len = msg_len - want
+		msg = msg[want:]
+		ctx._leftover = ctx._leftover + want
+		if ctx._leftover < _BLOCK_SIZE {
+			return
+		}
+		_blocks(ctx, ctx._buffer[:])
+		ctx._leftover = 0
+	}
+
+	// Process full blocks
+	if msg_len >= _BLOCK_SIZE {
+		want := msg_len & (~int(_BLOCK_SIZE - 1))
+		_blocks(ctx, msg[:want])
+		msg = msg[want:]
+		msg_len = msg_len - want
+	}
+
+	// Store leftover
+	if msg_len > 0 {
+		// TODO: While -donna does it this way, I'm fairly sure that
+		// `ctx._leftover == 0` is an invariant at this point.
+		copy(ctx._buffer[ctx._leftover:], msg)
+		ctx._leftover = ctx._leftover + msg_len
+	}
+}
+
+final :: proc (ctx: ^Context, dst: []byte) {
+	assert(ctx._is_initialized)
+
+	if len(dst) != TAG_SIZE {
+		panic("poly1305: invalid destination tag size")
+	}
+
+	// Process remaining block
+	if ctx._leftover > 0 {
+		ctx._buffer[ctx._leftover] = 1
+		for i := ctx._leftover + 1; i < _BLOCK_SIZE; i = i + 1 {
+			ctx._buffer[i] = 0
+		}
+		_blocks(ctx, ctx._buffer[:], true)
+	}
+
+	// a += s
+	field.fe_add(field.fe_relax_cast(&ctx._a), &ctx._a, &ctx._s) // _a unreduced
+	field.fe_carry(&ctx._a, field.fe_relax_cast(&ctx._a)) // _a reduced
+
+	// return num_to_16_le_bytes(a)
+	tmp: [32]byte = ---
+	field.fe_to_bytes(&tmp, &ctx._a)
+	copy_slice(dst, tmp[0:16])
+
+	reset(ctx)
+}
+
+reset :: proc (ctx: ^Context) {
+	mem.zero_explicit(&ctx._r, size_of(ctx._r))
+	mem.zero_explicit(&ctx._a, size_of(ctx._a))
+	mem.zero_explicit(&ctx._s, size_of(ctx._s))
+	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
+
+	ctx._is_initialized = false
+}
+
+_blocks :: proc (ctx: ^Context, msg: []byte, final := false) {
+	n: field.Tight_Field_Element = ---
+	final_byte := byte(!final)
+
+	data := msg
+	data_len := len(data)
+	for data_len >= _BLOCK_SIZE {
+		// n = le_bytes_to_num(msg[((i-1)*16)..*i*16] | [0x01])
+		field.fe_from_bytes(&n, data[:_BLOCK_SIZE], final_byte, false)
+
+		// a += n
+		field.fe_add(field.fe_relax_cast(&ctx._a), &ctx._a, &n) // _a unreduced
+
+		// a = (r * a) % p
+		field.fe_carry_mul(&ctx._a, field.fe_relax_cast(&ctx._a), field.fe_relax_cast(&ctx._r)) // _a reduced
+
+		data = data[_BLOCK_SIZE:]
+		data_len = data_len - _BLOCK_SIZE
+	}
+}

+ 1 - 0
tests/core/crypto/test_core_crypto.odin

@@ -116,6 +116,7 @@ main :: proc() {
     test_haval_256(&t)
 
     // "modern" crypto tests
+    test_poly1305(&t)
     test_x25519(&t)
 
     bench_modern(&t)

+ 131 - 0
tests/core/crypto/test_core_crypto_modern.odin

@@ -4,6 +4,7 @@ import "core:testing"
 import "core:fmt"
 import "core:time"
 
+import "core:crypto/poly1305"
 import "core:crypto/x25519"
 
 _digit_value :: proc(r: rune) -> int {
@@ -27,6 +28,70 @@ _decode_hex32 :: proc(s: string) -> [32]byte{
 	return b
 }
 
+@(test)
+test_poly1305 :: proc(t: ^testing.T) {
+	log(t, "Testing poly1305")
+
+	// Test cases taken from poly1305-donna.
+	key := [poly1305.KEY_SIZE]byte{
+		0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91,
+		0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25,
+		0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65,
+		0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80,
+	}
+
+	msg := [131]byte{
+		0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73,
+		0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce,
+		0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4,
+		0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a,
+		0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b,
+		0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72,
+		0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2,
+		0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38,
+		0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a,
+		0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae,
+		0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea,
+		0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda,
+		0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde,
+		0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3,
+		0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6,
+		0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74,
+		0xe3,0x55,0xa5,
+	}
+
+	tag := [poly1305.TAG_SIZE]byte{
+		0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5,
+		0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9,
+	}
+	tag_str := hex_string(tag[:])
+
+	// Verify - oneshot + compare
+	ok := poly1305.verify(tag[:], msg[:], key[:])
+	expect(t, ok, "oneshot verify call failed")
+
+	// Sum - oneshot
+	derived_tag: [poly1305.TAG_SIZE]byte
+	poly1305.sum(derived_tag[:], msg[:], key[:])
+	derived_tag_str := hex_string(derived_tag[:])
+	expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected %s for sum(msg, key), but got %s instead", tag_str, derived_tag_str))
+
+	// Incremental
+	mem.zero(&derived_tag, size_of(derived_tag))
+	ctx: poly1305.Context = ---
+	poly1305.init(&ctx, key[:])
+	read_lengths := [11]int{32, 64, 16, 8, 4, 2, 1, 1, 1, 1, 1}
+	off := 0
+	for read_length in read_lengths {
+		to_read := msg[off:off+read_length]
+		poly1305.update(&ctx, to_read)
+		off = off + read_length
+	}
+	poly1305.final(&ctx, derived_tag[:])
+	derived_tag_str = hex_string(derived_tag[:])
+	expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected %s for init/update/final - incremental, but got %s instead", tag_str, derived_tag_str))
+}
+
 TestECDH :: struct {
 	scalar:  string,
 	point:   string,
@@ -76,9 +141,75 @@ test_x25519 :: proc(t: ^testing.T) {
 bench_modern :: proc(t: ^testing.T) {
 	fmt.println("Starting benchmarks:")
 
+	bench_poly1305(t)
 	bench_x25519(t)
 }
 
+_setup_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	assert(options != nil)
+
+	options.input = make([]u8, options.bytes, allocator)
+	return nil if len(options.input) == options.bytes else .Allocation_Error
+}
+
+_teardown_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	assert(options != nil)
+
+	delete(options.input)
+	return nil
+}
+
+_benchmark_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	buf := options.input
+	key := [poly1305.KEY_SIZE]byte{
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+	}
+
+	tag: [poly1305.TAG_SIZE]byte = ---
+	for _ in 0..=options.rounds {
+		poly1305.sum(tag[:], buf, key[:])
+	}
+	options.count     = options.rounds
+	options.processed = options.rounds * options.bytes
+	//options.hash      = u128(h)
+	return nil
+}
+
+benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
+	fmt.printf("\t[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n",
+		name,
+		options.rounds,
+		options.processed,
+		time.duration_nanoseconds(options.duration),
+		options.rounds_per_second,
+		options.megabytes_per_second,
+	)
+}
+
+bench_poly1305 :: proc(t: ^testing.T) {
+	name    := "Poly1305 64 zero bytes"
+	options := &time.Benchmark_Options{
+		rounds   = 1_000,
+		bytes    = 64,
+		setup    = _setup_poly1305,
+		bench    = _benchmark_poly1305,
+		teardown = _teardown_poly1305,
+	}
+
+	err  := time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+
+	name = "Poly1305 1024 zero bytes"
+	options.bytes = 1024
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+}
+
 bench_x25519 :: proc(t: ^testing.T) {
 	point := _decode_hex32("deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef")
 	scalar := _decode_hex32("cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe")