Browse Source

core/crypto/x448: Initial import

Yawning Angel 1 year ago
parent
commit
9fdcc4e39a

+ 235 - 0
core/crypto/_fiat/field_curve448/field.odin

@@ -0,0 +1,235 @@
+package field_curve448
+
+import "core:mem"
+
+// fe_relax_cast reinterprets a pointer to a Tight_Field_Element as a
+// pointer to a Loose_Field_Element.  Both types share the identical
+// [8]u64 memory layout, so only the pointer type changes.
+fe_relax_cast :: #force_inline proc "contextless" (
+	arg1: ^Tight_Field_Element,
+) -> ^Loose_Field_Element {
+	return cast(^Loose_Field_Element)arg1
+}
+
+// fe_tighten_cast reinterprets a pointer to a Loose_Field_Element as a
+// pointer to a Tight_Field_Element.  Both types share the identical
+// [8]u64 memory layout; this does not re-carry the limbs, so the
+// caller is responsible for the value actually meeting the tighter
+// bounds.
+fe_tighten_cast :: #force_inline proc "contextless" (
+	arg1: ^Loose_Field_Element,
+) -> ^Tight_Field_Element {
+	return cast(^Tight_Field_Element)arg1
+}
+
+// fe_clear zeroes a field element, using an explicit zeroing routine
+// that the compiler will not elide, so sensitive limbs do not linger
+// in memory.
+fe_clear :: proc "contextless" (
+	arg1: $T,
+) where T == ^Tight_Field_Element || T == ^Loose_Field_Element {
+	sz := size_of(arg1^)
+	mem.zero_explicit(arg1, sz)
+}
+
+// fe_clear_vec zeroes each field element in a slice via fe_clear.
+fe_clear_vec :: proc "contextless" (
+	arg1: $T,
+) where T == []^Tight_Field_Element || T == []^Loose_Field_Element {
+	for i in 0 ..< len(arg1) {
+		fe_clear(arg1[i])
+	}
+}
+
+// fe_carry_mul_small multiplies a field element by a small scalar, by
+// widening the scalar into a field element whose only non-zero limb is
+// the lowest, then doing a full carry-multiply.
+// NOTE(review): assumes arg2 fits the loose limb bounds — confirm at
+// call sites.
+fe_carry_mul_small :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element,
+	arg2: u64,
+) {
+	tmp: Loose_Field_Element // Zero-initialized by declaration.
+	tmp[0] = arg2
+	fe_carry_mul(out1, arg1, &tmp)
+}
+
+// fe_carry_pow2k squares arg1 repeatedly, arg2 times.
+fe_carry_pow2k :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element,
+	arg2: uint,
+) {
+	// Special case: zero squarings yields 1 (ie: treated as raising
+	// to the 0th power).  No caller is expected to ever do this.
+	if arg2 == 0 {
+		fe_one(out1)
+		return
+	}
+
+	// First squaring reads arg1, the remaining arg2-1 squarings
+	// operate on out1 in place.
+	fe_carry_square(out1, arg1)
+	for i := uint(1); i < arg2; i += 1 {
+		fe_carry_square(out1, fe_relax_cast(out1))
+	}
+}
+
+// fe_carry_inv computes out1 = arg1^-1 (mod p), p = 2^448 - 2^224 - 1,
+// via Fermat's little theorem (arg1^(p-2)).  The result is 0 when
+// arg1 is 0.  Temporaries are explicitly cleared before returning.
+fe_carry_inv :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element,
+) {
+	// Inversion computation is derived from the addition chain:
+	//
+	//	_10     = 2*1
+	//	_11     = 1 + _10
+	//	_110    = 2*_11
+	//	_111    = 1 + _110
+	//	_111000 = _111 << 3
+	//	_111111 = _111 + _111000
+	//	x12     = _111111 << 6 + _111111
+	//	x24     = x12 << 12 + x12
+	//	i34     = x24 << 6
+	//	x30     = _111111 + i34
+	//	x48     = i34 << 18 + x24
+	//	x96     = x48 << 48 + x48
+	//	x192    = x96 << 96 + x96
+	//	x222    = x192 << 30 + x30
+	//	x223    = 2*x222 + 1
+	//	return    (x223 << 223 + x222) << 2 + 1
+	//
+	// Operations: 447 squares 13 multiplies
+	//
+	// Generated by github.com/mmcloughlin/addchain v0.4.0.
+
+	t0, t1, t2: Tight_Field_Element = ---, ---, ---
+
+	// Step 1: t0 = x^0x2
+	fe_carry_square(&t0, arg1)
+
+	// Step 2: t0 = x^0x3
+	fe_carry_mul(&t0, arg1, fe_relax_cast(&t0))
+
+	// Step 3: t0 = x^0x6
+	fe_carry_square(&t0, fe_relax_cast(&t0))
+
+	// Step 4: t0 = x^0x7
+	fe_carry_mul(&t0, arg1, fe_relax_cast(&t0))
+
+	// Step 7: t1 = x^0x38
+	fe_carry_pow2k(&t1, fe_relax_cast(&t0), 3)
+
+	// Step 8: t0 = x^0x3f
+	fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1))
+
+	// Step 14: t1 = x^0xfc0
+	fe_carry_pow2k(&t1, fe_relax_cast(&t0), 6)
+
+	// Step 15: t1 = x^0xfff
+	fe_carry_mul(&t1, fe_relax_cast(&t0), fe_relax_cast(&t1))
+
+	// Step 27: t2 = x^0xfff000
+	fe_carry_pow2k(&t2, fe_relax_cast(&t1), 12)
+
+	// Step 28: t1 = x^0xffffff
+	fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2))
+
+	// Step 34: t2 = x^0x3fffffc0
+	fe_carry_pow2k(&t2, fe_relax_cast(&t1), 6)
+
+	// Step 35: t0 = x^0x3fffffff
+	fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t2))
+
+	// Step 53: t2 = x^0xffffff000000
+	fe_carry_pow2k(&t2, fe_relax_cast(&t2), 18)
+
+	// Step 54: t1 = x^0xffffffffffff
+	fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2))
+
+	// Step 102: t2 = x^0xffffffffffff000000000000
+	fe_carry_pow2k(&t2, fe_relax_cast(&t1), 48)
+
+	// Step 103: t1 = x^0xffffffffffffffffffffffff
+	fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2))
+
+	// Step 199: t2 = x^0xffffffffffffffffffffffff000000000000000000000000
+	fe_carry_pow2k(&t2, fe_relax_cast(&t1), 96)
+
+	// Step 200: t1 = x^0xffffffffffffffffffffffffffffffffffffffffffffffff
+	fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2))
+
+	// Step 230: t1 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffc0000000
+	fe_carry_pow2k(&t1, fe_relax_cast(&t1), 30)
+
+	// Step 231: t0 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffff
+	fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1))
+
+	// Step 232: t1 = x^0x7ffffffffffffffffffffffffffffffffffffffffffffffffffffffe
+	fe_carry_square(&t1, fe_relax_cast(&t0))
+
+	// Step 233: t1 = x^0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffff
+	fe_carry_mul(&t1, arg1, fe_relax_cast(&t1))
+
+	// Step 456: t1 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffff80000000000000000000000000000000000000000000000000000000
+	fe_carry_pow2k(&t1, fe_relax_cast(&t1), 223)
+
+	// Step 457: t0 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffffffffffffffffffffffffffffffffffffffffffffffffffffff
+	fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1))
+
+	// Step 459: t0 = x^0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffffffffffffffffffffffffffffffffffffffffffffffffffffc
+	fe_carry_pow2k(&t0, fe_relax_cast(&t0), 2)
+
+	// Step 460: z = x^0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffffffffffffffffffffffffffffffffffffffffffffffffffffd
+	fe_carry_mul(out1, arg1, fe_relax_cast(&t0))
+
+	fe_clear_vec([]^Tight_Field_Element{&t0, &t1, &t2})
+}
+
+// fe_zero sets out1 to 0.
+fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1^ = Tight_Field_Element{}
+}
+
+// fe_one sets out1 to 1.
+fe_one :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1^ = Tight_Field_Element{1, 0, 0, 0, 0, 0, 0, 0}
+}
+
+// fe_set copies arg1 into out1.  The copy is done by value, so it is
+// safe even when out1 and arg1 alias.
+fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	out1^ = arg1^
+}
+
+// fe_cond_swap conditionally swaps out1 and out2 in constant time:
+// arg1 = 1 swaps the two elements, arg1 = 0 leaves both unchanged.
+// Behavior for any other value of arg1 is undefined, as the mask
+// would be ill-formed.
+//
+// NOTE(review): optimizations are disabled and inlining is forbidden,
+// presumably to keep the compiler from rewriting the masked
+// arithmetic into data-dependent branches — confirm this matches the
+// other fiat-derived field implementations.
+@(optimization_mode = "none")
+fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) {
+	// mask is all-ones when arg1 == 1, all-zeroes when arg1 == 0.
+	mask := (u64(arg1) * 0xffffffffffffffff)
+	// Branchless XOR-swap (~ is XOR in Odin): per limb, x is either
+	// the XOR difference of the two limbs (swap) or 0 (no-op).
+	x := (out1[0] ~ out2[0]) & mask
+	x1, y1 := out1[0] ~ x, out2[0] ~ x
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	x = (out1[3] ~ out2[3]) & mask
+	x4, y4 := out1[3] ~ x, out2[3] ~ x
+	x = (out1[4] ~ out2[4]) & mask
+	x5, y5 := out1[4] ~ x, out2[4] ~ x
+	x = (out1[5] ~ out2[5]) & mask
+	x6, y6 := out1[5] ~ x, out2[5] ~ x
+	x = (out1[6] ~ out2[6]) & mask
+	x7, y7 := out1[6] ~ x, out2[6] ~ x
+	x = (out1[7] ~ out2[7]) & mask
+	x8, y8 := out1[7] ~ x, out2[7] ~ x
+	// Write all limbs back after every difference has been computed.
+	out1[0], out2[0] = x1, y1
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+	out1[3], out2[3] = x4, y4
+	out1[4], out2[4] = x5, y5
+	out1[5], out2[5] = x6, y6
+	out1[6], out2[6] = x7, y7
+	out1[7], out2[7] = x8, y8
+}

+ 1060 - 0
core/crypto/_fiat/field_curve448/field51.odin

@@ -0,0 +1,1060 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     1. Redistributions of source code must retain the above copyright
+//        notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package field_curve448
+
+// The file provides arithmetic on the field Z/(2^448 - 2^224 - 1) using
+// unsaturated 64-bit integer arithmetic.  It is derived primarily
+// from the machine generated Golang output from the fiat-crypto project.
+//
+// While the base implementation is provably correct, this implementation
+// makes no such claims as the port and optimizations were done by hand.
+//
+// TODO:
+//  * When fiat-crypto supports it, using saturated 64-bit limbs
+//    instead of 56-bit limbs will be faster, though the gains are
+//    minimal unless adcx/adox/mulx are used.
+
+import fiat "core:crypto/_fiat"
+import "core:math/bits"
+
+// Field elements are 8 unsaturated 56-bit limbs stored in u64s (the
+// file header notes the 56-bit limb choice; the 0xffffffffffffff
+// masks below confirm it).  Loose elements carry looser per-limb
+// bounds than tight elements; the two share an identical memory
+// layout and differ only in the bounds the arithmetic assumes.
+Loose_Field_Element :: distinct [8]u64
+Tight_Field_Element :: distinct [8]u64
+
+// FE_ZERO is the field element 0.
+@(rodata)
+FE_ZERO := Tight_Field_Element{0, 0, 0, 0, 0, 0, 0, 0}
+// FE_ONE is the field element 1.
+@(rodata)
+FE_ONE := Tight_Field_Element{1, 0, 0, 0, 0, 0, 0, 0}
+
+// _addcarryx_u56 is addition with carry over 56-bit limbs:
+// out1 = (arg1 + arg2 + arg3) mod 2^56, out2 = the carry out.
+_addcarryx_u56 :: #force_inline proc "contextless" (
+	arg1: fiat.u1,
+	arg2, arg3: u64,
+) -> (
+	out1: u64,
+	out2: fiat.u1,
+) {
+	// The full sum cannot overflow u64, as the limbs are at most
+	// 56 bits wide.
+	v := ((u64(arg1) + arg2) + arg3)
+	out1 = v & 0xffffffffffffff
+	out2 = fiat.u1(v >> 56)
+	return
+}
+
+// _subborrowx_u56 is subtraction with borrow over 56-bit limbs:
+// out1 = (arg2 - arg1 - arg3) mod 2^56, out2 = the borrow out
+// (0 or 1).
+_subborrowx_u56 :: #force_inline proc "contextless" (
+	arg1: fiat.u1,
+	arg2, arg3: u64,
+) -> (
+	out1: u64,
+	out2: fiat.u1,
+) {
+	// Compute the difference as a signed value; underflow makes x1
+	// negative.
+	x1 := ((i64(arg2) - i64(arg1)) - i64(arg3))
+	// Arithmetic shift: x2 is 0 when no borrow occurred, and the
+	// truncation of -1 (all-ones in fiat.u1) on underflow.
+	x2 := fiat.u1((x1 >> 56))
+	x3 := (u64(x1) & 0xffffffffffffff)
+	out1 = x3
+	// Negating the truncated sign turns all-ones back into a 0/1
+	// borrow flag (0 - all-ones wraps to 1).
+	out2 = (0x0 - fiat.u1(x2))
+	return
+}
+
+// fe_carry_mul multiplies two loosely-bounded field elements modulo
+// p = 2^448 - 2^224 - 1, producing a result with tight limb bounds.
+// Machine-generated (fiat-crypto derived); statement order is
+// load-bearing — do not reorder.
+fe_carry_mul :: proc "contextless" (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
+	// 64x64 -> 128-bit partial products of the input limbs.  Several
+	// high-limb products are computed more than once; presumably these
+	// are the duplicated terms introduced by the reduction identity
+	// 2^448 == 2^224 + 1 (mod p) — TODO confirm against the fiat
+	// generator output.
+	x2, x1 := bits.mul_u64(arg1[7], arg2[7])
+	x4, x3 := bits.mul_u64(arg1[7], arg2[6])
+	x6, x5 := bits.mul_u64(arg1[7], arg2[5])
+	x8, x7 := bits.mul_u64(arg1[6], arg2[7])
+	x10, x9 := bits.mul_u64(arg1[6], arg2[6])
+	x12, x11 := bits.mul_u64(arg1[5], arg2[7])
+	x14, x13 := bits.mul_u64(arg1[7], arg2[7])
+	x16, x15 := bits.mul_u64(arg1[7], arg2[6])
+	x18, x17 := bits.mul_u64(arg1[7], arg2[5])
+	x20, x19 := bits.mul_u64(arg1[6], arg2[7])
+	x22, x21 := bits.mul_u64(arg1[6], arg2[6])
+	x24, x23 := bits.mul_u64(arg1[5], arg2[7])
+	x26, x25 := bits.mul_u64(arg1[7], arg2[7])
+	x28, x27 := bits.mul_u64(arg1[7], arg2[6])
+	x30, x29 := bits.mul_u64(arg1[7], arg2[5])
+	x32, x31 := bits.mul_u64(arg1[7], arg2[4])
+	x34, x33 := bits.mul_u64(arg1[7], arg2[3])
+	x36, x35 := bits.mul_u64(arg1[7], arg2[2])
+	x38, x37 := bits.mul_u64(arg1[7], arg2[1])
+	x40, x39 := bits.mul_u64(arg1[6], arg2[7])
+	x42, x41 := bits.mul_u64(arg1[6], arg2[6])
+	x44, x43 := bits.mul_u64(arg1[6], arg2[5])
+	x46, x45 := bits.mul_u64(arg1[6], arg2[4])
+	x48, x47 := bits.mul_u64(arg1[6], arg2[3])
+	x50, x49 := bits.mul_u64(arg1[6], arg2[2])
+	x52, x51 := bits.mul_u64(arg1[5], arg2[7])
+	x54, x53 := bits.mul_u64(arg1[5], arg2[6])
+	x56, x55 := bits.mul_u64(arg1[5], arg2[5])
+	x58, x57 := bits.mul_u64(arg1[5], arg2[4])
+	x60, x59 := bits.mul_u64(arg1[5], arg2[3])
+	x62, x61 := bits.mul_u64(arg1[4], arg2[7])
+	x64, x63 := bits.mul_u64(arg1[4], arg2[6])
+	x66, x65 := bits.mul_u64(arg1[4], arg2[5])
+	x68, x67 := bits.mul_u64(arg1[4], arg2[4])
+	x70, x69 := bits.mul_u64(arg1[3], arg2[7])
+	x72, x71 := bits.mul_u64(arg1[3], arg2[6])
+	x74, x73 := bits.mul_u64(arg1[3], arg2[5])
+	x76, x75 := bits.mul_u64(arg1[2], arg2[7])
+	x78, x77 := bits.mul_u64(arg1[2], arg2[6])
+	x80, x79 := bits.mul_u64(arg1[1], arg2[7])
+	x82, x81 := bits.mul_u64(arg1[7], arg2[4])
+	x84, x83 := bits.mul_u64(arg1[7], arg2[3])
+	x86, x85 := bits.mul_u64(arg1[7], arg2[2])
+	x88, x87 := bits.mul_u64(arg1[7], arg2[1])
+	x90, x89 := bits.mul_u64(arg1[6], arg2[5])
+	x92, x91 := bits.mul_u64(arg1[6], arg2[4])
+	x94, x93 := bits.mul_u64(arg1[6], arg2[3])
+	x96, x95 := bits.mul_u64(arg1[6], arg2[2])
+	x98, x97 := bits.mul_u64(arg1[5], arg2[6])
+	x100, x99 := bits.mul_u64(arg1[5], arg2[5])
+	x102, x101 := bits.mul_u64(arg1[5], arg2[4])
+	x104, x103 := bits.mul_u64(arg1[5], arg2[3])
+	x106, x105 := bits.mul_u64(arg1[4], arg2[7])
+	x108, x107 := bits.mul_u64(arg1[4], arg2[6])
+	x110, x109 := bits.mul_u64(arg1[4], arg2[5])
+	x112, x111 := bits.mul_u64(arg1[4], arg2[4])
+	x114, x113 := bits.mul_u64(arg1[3], arg2[7])
+	x116, x115 := bits.mul_u64(arg1[3], arg2[6])
+	x118, x117 := bits.mul_u64(arg1[3], arg2[5])
+	x120, x119 := bits.mul_u64(arg1[2], arg2[7])
+	x122, x121 := bits.mul_u64(arg1[2], arg2[6])
+	x124, x123 := bits.mul_u64(arg1[1], arg2[7])
+	x126, x125 := bits.mul_u64(arg1[7], arg2[0])
+	x128, x127 := bits.mul_u64(arg1[6], arg2[1])
+	x130, x129 := bits.mul_u64(arg1[6], arg2[0])
+	x132, x131 := bits.mul_u64(arg1[5], arg2[2])
+	x134, x133 := bits.mul_u64(arg1[5], arg2[1])
+	x136, x135 := bits.mul_u64(arg1[5], arg2[0])
+	x138, x137 := bits.mul_u64(arg1[4], arg2[3])
+	x140, x139 := bits.mul_u64(arg1[4], arg2[2])
+	x142, x141 := bits.mul_u64(arg1[4], arg2[1])
+	x144, x143 := bits.mul_u64(arg1[4], arg2[0])
+	x146, x145 := bits.mul_u64(arg1[3], arg2[4])
+	x148, x147 := bits.mul_u64(arg1[3], arg2[3])
+	x150, x149 := bits.mul_u64(arg1[3], arg2[2])
+	x152, x151 := bits.mul_u64(arg1[3], arg2[1])
+	x154, x153 := bits.mul_u64(arg1[3], arg2[0])
+	x156, x155 := bits.mul_u64(arg1[2], arg2[5])
+	x158, x157 := bits.mul_u64(arg1[2], arg2[4])
+	x160, x159 := bits.mul_u64(arg1[2], arg2[3])
+	x162, x161 := bits.mul_u64(arg1[2], arg2[2])
+	x164, x163 := bits.mul_u64(arg1[2], arg2[1])
+	x166, x165 := bits.mul_u64(arg1[2], arg2[0])
+	x168, x167 := bits.mul_u64(arg1[1], arg2[6])
+	x170, x169 := bits.mul_u64(arg1[1], arg2[5])
+	x172, x171 := bits.mul_u64(arg1[1], arg2[4])
+	x174, x173 := bits.mul_u64(arg1[1], arg2[3])
+	x176, x175 := bits.mul_u64(arg1[1], arg2[2])
+	x178, x177 := bits.mul_u64(arg1[1], arg2[1])
+	x180, x179 := bits.mul_u64(arg1[1], arg2[0])
+	x182, x181 := bits.mul_u64(arg1[0], arg2[7])
+	x184, x183 := bits.mul_u64(arg1[0], arg2[6])
+	x186, x185 := bits.mul_u64(arg1[0], arg2[5])
+	x188, x187 := bits.mul_u64(arg1[0], arg2[4])
+	x190, x189 := bits.mul_u64(arg1[0], arg2[3])
+	x192, x191 := bits.mul_u64(arg1[0], arg2[2])
+	x194, x193 := bits.mul_u64(arg1[0], arg2[1])
+	x196, x195 := bits.mul_u64(arg1[0], arg2[0])
+	// Sum the partial products into 128-bit (hi, lo pair) per-column
+	// accumulators, one chain per output limb position.
+	x197, x198 := bits.add_u64(x43, x31, u64(0x0))
+	x199, _ := bits.add_u64(x44, x32, u64(fiat.u1(x198)))
+	x201, x202 := bits.add_u64(x53, x197, u64(0x0))
+	x203, _ := bits.add_u64(x54, x199, u64(fiat.u1(x202)))
+	x205, x206 := bits.add_u64(x61, x201, u64(0x0))
+	x207, _ := bits.add_u64(x62, x203, u64(fiat.u1(x206)))
+	x209, x210 := bits.add_u64(x153, x205, u64(0x0))
+	x211, _ := bits.add_u64(x154, x207, u64(fiat.u1(x210)))
+	x213, x214 := bits.add_u64(x163, x209, u64(0x0))
+	x215, _ := bits.add_u64(x164, x211, u64(fiat.u1(x214)))
+	x217, x218 := bits.add_u64(x175, x213, u64(0x0))
+	x219, _ := bits.add_u64(x176, x215, u64(fiat.u1(x218)))
+	x221, x222 := bits.add_u64(x189, x217, u64(0x0))
+	x223, _ := bits.add_u64(x190, x219, u64(fiat.u1(x222)))
+	x225 := ((x221 >> 56) | ((x223 << 8) & 0xffffffffffffffff))
+	x226 := (x221 & 0xffffffffffffff)
+	x227, x228 := bits.add_u64(x89, x81, u64(0x0))
+	x229, _ := bits.add_u64(x90, x82, u64(fiat.u1(x228)))
+	x231, x232 := bits.add_u64(x97, x227, u64(0x0))
+	x233, _ := bits.add_u64(x98, x229, u64(fiat.u1(x232)))
+	x235, x236 := bits.add_u64(x105, x231, u64(0x0))
+	x237, _ := bits.add_u64(x106, x233, u64(fiat.u1(x236)))
+	x239, x240 := bits.add_u64(x125, x235, u64(0x0))
+	x241, _ := bits.add_u64(x126, x237, u64(fiat.u1(x240)))
+	x243, x244 := bits.add_u64(x127, x239, u64(0x0))
+	x245, _ := bits.add_u64(x128, x241, u64(fiat.u1(x244)))
+	x247, x248 := bits.add_u64(x131, x243, u64(0x0))
+	x249, _ := bits.add_u64(x132, x245, u64(fiat.u1(x248)))
+	x251, x252 := bits.add_u64(x137, x247, u64(0x0))
+	x253, _ := bits.add_u64(x138, x249, u64(fiat.u1(x252)))
+	x255, x256 := bits.add_u64(x145, x251, u64(0x0))
+	x257, _ := bits.add_u64(x146, x253, u64(fiat.u1(x256)))
+	x259, x260 := bits.add_u64(x155, x255, u64(0x0))
+	x261, _ := bits.add_u64(x156, x257, u64(fiat.u1(x260)))
+	x263, x264 := bits.add_u64(x167, x259, u64(0x0))
+	x265, _ := bits.add_u64(x168, x261, u64(fiat.u1(x264)))
+	x267, x268 := bits.add_u64(x181, x263, u64(0x0))
+	x269, _ := bits.add_u64(x182, x265, u64(fiat.u1(x268)))
+	x271, x272 := bits.add_u64(x25, x13, u64(0x0))
+	x273, _ := bits.add_u64(x26, x14, u64(fiat.u1(x272)))
+	x275, x276 := bits.add_u64(x83, x271, u64(0x0))
+	x277, _ := bits.add_u64(x84, x273, u64(fiat.u1(x276)))
+	x279, x280 := bits.add_u64(x91, x275, u64(0x0))
+	x281, _ := bits.add_u64(x92, x277, u64(fiat.u1(x280)))
+	x283, x284 := bits.add_u64(x99, x279, u64(0x0))
+	x285, _ := bits.add_u64(x100, x281, u64(fiat.u1(x284)))
+	x287, x288 := bits.add_u64(x107, x283, u64(0x0))
+	x289, _ := bits.add_u64(x108, x285, u64(fiat.u1(x288)))
+	x291, x292 := bits.add_u64(x113, x287, u64(0x0))
+	x293, _ := bits.add_u64(x114, x289, u64(fiat.u1(x292)))
+	x295, x296 := bits.add_u64(x129, x291, u64(0x0))
+	x297, _ := bits.add_u64(x130, x293, u64(fiat.u1(x296)))
+	x299, x300 := bits.add_u64(x133, x295, u64(0x0))
+	x301, _ := bits.add_u64(x134, x297, u64(fiat.u1(x300)))
+	x303, x304 := bits.add_u64(x139, x299, u64(0x0))
+	x305, _ := bits.add_u64(x140, x301, u64(fiat.u1(x304)))
+	x307, x308 := bits.add_u64(x147, x303, u64(0x0))
+	x309, _ := bits.add_u64(x148, x305, u64(fiat.u1(x308)))
+	x311, x312 := bits.add_u64(x157, x307, u64(0x0))
+	x313, _ := bits.add_u64(x158, x309, u64(fiat.u1(x312)))
+	x315, x316 := bits.add_u64(x169, x311, u64(0x0))
+	x317, _ := bits.add_u64(x170, x313, u64(fiat.u1(x316)))
+	x319, x320 := bits.add_u64(x183, x315, u64(0x0))
+	x321, _ := bits.add_u64(x184, x317, u64(fiat.u1(x320)))
+	x323, x324 := bits.add_u64(x19, x15, u64(0x0))
+	x325, _ := bits.add_u64(x20, x16, u64(fiat.u1(x324)))
+	x327, x328 := bits.add_u64(x27, x323, u64(0x0))
+	x329, _ := bits.add_u64(x28, x325, u64(fiat.u1(x328)))
+	x331, x332 := bits.add_u64(x39, x327, u64(0x0))
+	x333, _ := bits.add_u64(x40, x329, u64(fiat.u1(x332)))
+	x335, x336 := bits.add_u64(x85, x331, u64(0x0))
+	x337, _ := bits.add_u64(x86, x333, u64(fiat.u1(x336)))
+	x339, x340 := bits.add_u64(x93, x335, u64(0x0))
+	x341, _ := bits.add_u64(x94, x337, u64(fiat.u1(x340)))
+	x343, x344 := bits.add_u64(x101, x339, u64(0x0))
+	x345, _ := bits.add_u64(x102, x341, u64(fiat.u1(x344)))
+	x347, x348 := bits.add_u64(x109, x343, u64(0x0))
+	x349, _ := bits.add_u64(x110, x345, u64(fiat.u1(x348)))
+	x351, x352 := bits.add_u64(x115, x347, u64(0x0))
+	x353, _ := bits.add_u64(x116, x349, u64(fiat.u1(x352)))
+	x355, x356 := bits.add_u64(x119, x351, u64(0x0))
+	x357, _ := bits.add_u64(x120, x353, u64(fiat.u1(x356)))
+	x359, x360 := bits.add_u64(x135, x355, u64(0x0))
+	x361, _ := bits.add_u64(x136, x357, u64(fiat.u1(x360)))
+	x363, x364 := bits.add_u64(x141, x359, u64(0x0))
+	x365, _ := bits.add_u64(x142, x361, u64(fiat.u1(x364)))
+	x367, x368 := bits.add_u64(x149, x363, u64(0x0))
+	x369, _ := bits.add_u64(x150, x365, u64(fiat.u1(x368)))
+	x371, x372 := bits.add_u64(x159, x367, u64(0x0))
+	x373, _ := bits.add_u64(x160, x369, u64(fiat.u1(x372)))
+	x375, x376 := bits.add_u64(x171, x371, u64(0x0))
+	x377, _ := bits.add_u64(x172, x373, u64(fiat.u1(x376)))
+	x379, x380 := bits.add_u64(x185, x375, u64(0x0))
+	x381, _ := bits.add_u64(x186, x377, u64(fiat.u1(x380)))
+	x383, x384 := bits.add_u64(x21, x17, u64(0x0))
+	x385, _ := bits.add_u64(x22, x18, u64(fiat.u1(x384)))
+	x387, x388 := bits.add_u64(x23, x383, u64(0x0))
+	x389, _ := bits.add_u64(x24, x385, u64(fiat.u1(x388)))
+	x391, x392 := bits.add_u64(x29, x387, u64(0x0))
+	x393, _ := bits.add_u64(x30, x389, u64(fiat.u1(x392)))
+	x395, x396 := bits.add_u64(x41, x391, u64(0x0))
+	x397, _ := bits.add_u64(x42, x393, u64(fiat.u1(x396)))
+	x399, x400 := bits.add_u64(x51, x395, u64(0x0))
+	x401, _ := bits.add_u64(x52, x397, u64(fiat.u1(x400)))
+	x403, x404 := bits.add_u64(x87, x399, u64(0x0))
+	x405, _ := bits.add_u64(x88, x401, u64(fiat.u1(x404)))
+	x407, x408 := bits.add_u64(x95, x403, u64(0x0))
+	x409, _ := bits.add_u64(x96, x405, u64(fiat.u1(x408)))
+	x411, x412 := bits.add_u64(x103, x407, u64(0x0))
+	x413, _ := bits.add_u64(x104, x409, u64(fiat.u1(x412)))
+	x415, x416 := bits.add_u64(x111, x411, u64(0x0))
+	x417, _ := bits.add_u64(x112, x413, u64(fiat.u1(x416)))
+	x419, x420 := bits.add_u64(x117, x415, u64(0x0))
+	x421, _ := bits.add_u64(x118, x417, u64(fiat.u1(x420)))
+	x423, x424 := bits.add_u64(x121, x419, u64(0x0))
+	x425, _ := bits.add_u64(x122, x421, u64(fiat.u1(x424)))
+	x427, x428 := bits.add_u64(x123, x423, u64(0x0))
+	x429, _ := bits.add_u64(x124, x425, u64(fiat.u1(x428)))
+	x431, x432 := bits.add_u64(x143, x427, u64(0x0))
+	x433, _ := bits.add_u64(x144, x429, u64(fiat.u1(x432)))
+	x435, x436 := bits.add_u64(x151, x431, u64(0x0))
+	x437, _ := bits.add_u64(x152, x433, u64(fiat.u1(x436)))
+	x439, x440 := bits.add_u64(x161, x435, u64(0x0))
+	x441, _ := bits.add_u64(x162, x437, u64(fiat.u1(x440)))
+	x443, x444 := bits.add_u64(x173, x439, u64(0x0))
+	x445, _ := bits.add_u64(x174, x441, u64(fiat.u1(x444)))
+	x447, x448 := bits.add_u64(x187, x443, u64(0x0))
+	x449, _ := bits.add_u64(x188, x445, u64(fiat.u1(x448)))
+	x451, x452 := bits.add_u64(x33, x1, u64(0x0))
+	x453, _ := bits.add_u64(x34, x2, u64(fiat.u1(x452)))
+	x455, x456 := bits.add_u64(x45, x451, u64(0x0))
+	x457, _ := bits.add_u64(x46, x453, u64(fiat.u1(x456)))
+	x459, x460 := bits.add_u64(x55, x455, u64(0x0))
+	x461, _ := bits.add_u64(x56, x457, u64(fiat.u1(x460)))
+	x463, x464 := bits.add_u64(x63, x459, u64(0x0))
+	x465, _ := bits.add_u64(x64, x461, u64(fiat.u1(x464)))
+	x467, x468 := bits.add_u64(x69, x463, u64(0x0))
+	x469, _ := bits.add_u64(x70, x465, u64(fiat.u1(x468)))
+	x471, x472 := bits.add_u64(x165, x467, u64(0x0))
+	x473, _ := bits.add_u64(x166, x469, u64(fiat.u1(x472)))
+	x475, x476 := bits.add_u64(x177, x471, u64(0x0))
+	x477, _ := bits.add_u64(x178, x473, u64(fiat.u1(x476)))
+	x479, x480 := bits.add_u64(x191, x475, u64(0x0))
+	x481, _ := bits.add_u64(x192, x477, u64(fiat.u1(x480)))
+	x483, x484 := bits.add_u64(x7, x3, u64(0x0))
+	x485, _ := bits.add_u64(x8, x4, u64(fiat.u1(x484)))
+	x487, x488 := bits.add_u64(x35, x483, u64(0x0))
+	x489, _ := bits.add_u64(x36, x485, u64(fiat.u1(x488)))
+	x491, x492 := bits.add_u64(x47, x487, u64(0x0))
+	x493, _ := bits.add_u64(x48, x489, u64(fiat.u1(x492)))
+	x495, x496 := bits.add_u64(x57, x491, u64(0x0))
+	x497, _ := bits.add_u64(x58, x493, u64(fiat.u1(x496)))
+	x499, x500 := bits.add_u64(x65, x495, u64(0x0))
+	x501, _ := bits.add_u64(x66, x497, u64(fiat.u1(x500)))
+	x503, x504 := bits.add_u64(x71, x499, u64(0x0))
+	x505, _ := bits.add_u64(x72, x501, u64(fiat.u1(x504)))
+	x507, x508 := bits.add_u64(x75, x503, u64(0x0))
+	x509, _ := bits.add_u64(x76, x505, u64(fiat.u1(x508)))
+	x511, x512 := bits.add_u64(x179, x507, u64(0x0))
+	x513, _ := bits.add_u64(x180, x509, u64(fiat.u1(x512)))
+	x515, x516 := bits.add_u64(x193, x511, u64(0x0))
+	x517, _ := bits.add_u64(x194, x513, u64(fiat.u1(x516)))
+	x519, x520 := bits.add_u64(x9, x5, u64(0x0))
+	x521, _ := bits.add_u64(x10, x6, u64(fiat.u1(x520)))
+	x523, x524 := bits.add_u64(x11, x519, u64(0x0))
+	x525, _ := bits.add_u64(x12, x521, u64(fiat.u1(x524)))
+	x527, x528 := bits.add_u64(x37, x523, u64(0x0))
+	x529, _ := bits.add_u64(x38, x525, u64(fiat.u1(x528)))
+	x531, x532 := bits.add_u64(x49, x527, u64(0x0))
+	x533, _ := bits.add_u64(x50, x529, u64(fiat.u1(x532)))
+	x535, x536 := bits.add_u64(x59, x531, u64(0x0))
+	x537, _ := bits.add_u64(x60, x533, u64(fiat.u1(x536)))
+	x539, x540 := bits.add_u64(x67, x535, u64(0x0))
+	x541, _ := bits.add_u64(x68, x537, u64(fiat.u1(x540)))
+	x543, x544 := bits.add_u64(x73, x539, u64(0x0))
+	x545, _ := bits.add_u64(x74, x541, u64(fiat.u1(x544)))
+	x547, x548 := bits.add_u64(x77, x543, u64(0x0))
+	x549, _ := bits.add_u64(x78, x545, u64(fiat.u1(x548)))
+	x551, x552 := bits.add_u64(x79, x547, u64(0x0))
+	x553, _ := bits.add_u64(x80, x549, u64(fiat.u1(x552)))
+	x555, x556 := bits.add_u64(x195, x551, u64(0x0))
+	x557, _ := bits.add_u64(x196, x553, u64(fiat.u1(x556)))
+	// Propagate carries across the columns (shift by 56 extracts each
+	// column's overflow into the next) and fold the high columns back
+	// into the low limbs, reducing modulo p.
+	x559, x560 := bits.add_u64(x225, x447, u64(0x0))
+	x561 := (u64(fiat.u1(x560)) + x449)
+	x562 := ((x267 >> 56) | ((x269 << 8) & 0xffffffffffffffff))
+	x563 := (x267 & 0xffffffffffffff)
+	x564, x565 := bits.add_u64(x559, x562, u64(0x0))
+	x566 := (u64(fiat.u1(x565)) + x561)
+	x567 := ((x564 >> 56) | ((x566 << 8) & 0xffffffffffffffff))
+	x568 := (x564 & 0xffffffffffffff)
+	x569, x570 := bits.add_u64(x555, x562, u64(0x0))
+	x571 := (u64(fiat.u1(x570)) + x557)
+	x572, x573 := bits.add_u64(x567, x379, u64(0x0))
+	x574 := (u64(fiat.u1(x573)) + x381)
+	x575 := ((x569 >> 56) | ((x571 << 8) & 0xffffffffffffffff))
+	x576 := (x569 & 0xffffffffffffff)
+	x577, x578 := bits.add_u64(x575, x515, u64(0x0))
+	x579 := (u64(fiat.u1(x578)) + x517)
+	x580 := ((x572 >> 56) | ((x574 << 8) & 0xffffffffffffffff))
+	x581 := (x572 & 0xffffffffffffff)
+	x582, x583 := bits.add_u64(x580, x319, u64(0x0))
+	x584 := (u64(fiat.u1(x583)) + x321)
+	x585 := ((x577 >> 56) | ((x579 << 8) & 0xffffffffffffffff))
+	x586 := (x577 & 0xffffffffffffff)
+	x587, x588 := bits.add_u64(x585, x479, u64(0x0))
+	x589 := (u64(fiat.u1(x588)) + x481)
+	x590 := ((x582 >> 56) | ((x584 << 8) & 0xffffffffffffffff))
+	x591 := (x582 & 0xffffffffffffff)
+	x592 := (x590 + x563)
+	x593 := ((x587 >> 56) | ((x589 << 8) & 0xffffffffffffffff))
+	x594 := (x587 & 0xffffffffffffff)
+	x595 := (x593 + x226)
+	x596 := (x592 >> 56)
+	x597 := (x592 & 0xffffffffffffff)
+	x598 := (x595 >> 56)
+	x599 := (x595 & 0xffffffffffffff)
+	x600 := (x568 + x596)
+	x601 := (x576 + x596)
+	x602 := (x598 + x600)
+	x603 := fiat.u1((x602 >> 56))
+	x604 := (x602 & 0xffffffffffffff)
+	x605 := (u64(x603) + x581)
+	x606 := fiat.u1((x601 >> 56))
+	x607 := (x601 & 0xffffffffffffff)
+	x608 := (u64(x606) + x586)
+	// Store the 8 output limbs.
+	out1[0] = x607
+	out1[1] = x608
+	out1[2] = x594
+	out1[3] = x599
+	out1[4] = x604
+	out1[5] = x605
+	out1[6] = x591
+	out1[7] = x597
+}
+
+fe_carry_square :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	x1 := arg1[7]
+	x2 := arg1[7]
+	x3 := (x1 * 0x2)
+	x4 := (x2 * 0x2)
+	x5 := (arg1[7] * 0x2)
+	x6 := arg1[6]
+	x7 := arg1[6]
+	x8 := (x6 * 0x2)
+	x9 := (x7 * 0x2)
+	x10 := (arg1[6] * 0x2)
+	x11 := arg1[5]
+	x12 := arg1[5]
+	x13 := (x11 * 0x2)
+	x14 := (x12 * 0x2)
+	x15 := (arg1[5] * 0x2)
+	x16 := arg1[4]
+	x17 := arg1[4]
+	x18 := (arg1[4] * 0x2)
+	x19 := (arg1[3] * 0x2)
+	x20 := (arg1[2] * 0x2)
+	x21 := (arg1[1] * 0x2)
+	x23, x22 := bits.mul_u64(arg1[7], x1)
+	x25, x24 := bits.mul_u64(arg1[6], x3)
+	x27, x26 := bits.mul_u64(arg1[6], x6)
+	x29, x28 := bits.mul_u64(arg1[5], x3)
+	x31, x30 := bits.mul_u64(arg1[7], x1)
+	x33, x32 := bits.mul_u64(arg1[6], x3)
+	x35, x34 := bits.mul_u64(arg1[6], x6)
+	x37, x36 := bits.mul_u64(arg1[5], x3)
+	x39, x38 := bits.mul_u64(arg1[7], x2)
+	x41, x40 := bits.mul_u64(arg1[6], x4)
+	x43, x42 := bits.mul_u64(arg1[6], x7)
+	x45, x44 := bits.mul_u64(arg1[5], x4)
+	x47, x46 := bits.mul_u64(arg1[5], x9)
+	x49, x48 := bits.mul_u64(arg1[5], x8)
+	x51, x50 := bits.mul_u64(arg1[5], x12)
+	x53, x52 := bits.mul_u64(arg1[5], x11)
+	x55, x54 := bits.mul_u64(arg1[4], x4)
+	x57, x56 := bits.mul_u64(arg1[4], x3)
+	x59, x58 := bits.mul_u64(arg1[4], x9)
+	x61, x60 := bits.mul_u64(arg1[4], x8)
+	x63, x62 := bits.mul_u64(arg1[4], x14)
+	x65, x64 := bits.mul_u64(arg1[4], x13)
+	x67, x66 := bits.mul_u64(arg1[4], x17)
+	x69, x68 := bits.mul_u64(arg1[4], x16)
+	x71, x70 := bits.mul_u64(arg1[3], x4)
+	x73, x72 := bits.mul_u64(arg1[3], x3)
+	x75, x74 := bits.mul_u64(arg1[3], x9)
+	x77, x76 := bits.mul_u64(arg1[3], x8)
+	x79, x78 := bits.mul_u64(arg1[3], x14)
+	x81, x80 := bits.mul_u64(arg1[3], x13)
+	x83, x82 := bits.mul_u64(arg1[3], x18)
+	x85, x84 := bits.mul_u64(arg1[3], arg1[3])
+	x87, x86 := bits.mul_u64(arg1[2], x4)
+	x89, x88 := bits.mul_u64(arg1[2], x3)
+	x91, x90 := bits.mul_u64(arg1[2], x9)
+	x93, x92 := bits.mul_u64(arg1[2], x8)
+	x95, x94 := bits.mul_u64(arg1[2], x15)
+	x97, x96 := bits.mul_u64(arg1[2], x18)
+	x99, x98 := bits.mul_u64(arg1[2], x19)
+	x101, x100 := bits.mul_u64(arg1[2], arg1[2])
+	x103, x102 := bits.mul_u64(arg1[1], x4)
+	x105, x104 := bits.mul_u64(arg1[1], x3)
+	x107, x106 := bits.mul_u64(arg1[1], x10)
+	x109, x108 := bits.mul_u64(arg1[1], x15)
+	x111, x110 := bits.mul_u64(arg1[1], x18)
+	x113, x112 := bits.mul_u64(arg1[1], x19)
+	x115, x114 := bits.mul_u64(arg1[1], x20)
+	x117, x116 := bits.mul_u64(arg1[1], arg1[1])
+	x119, x118 := bits.mul_u64(arg1[0], x5)
+	x121, x120 := bits.mul_u64(arg1[0], x10)
+	x123, x122 := bits.mul_u64(arg1[0], x15)
+	x125, x124 := bits.mul_u64(arg1[0], x18)
+	x127, x126 := bits.mul_u64(arg1[0], x19)
+	x129, x128 := bits.mul_u64(arg1[0], x20)
+	x131, x130 := bits.mul_u64(arg1[0], x21)
+	x133, x132 := bits.mul_u64(arg1[0], arg1[0])
+	x134, x135 := bits.add_u64(x54, x46, u64(0x0))
+	x136, _ := bits.add_u64(x55, x47, u64(fiat.u1(x135)))
+	x138, x139 := bits.add_u64(x114, x134, u64(0x0))
+	x140, _ := bits.add_u64(x115, x136, u64(fiat.u1(x139)))
+	x142, x143 := bits.add_u64(x126, x138, u64(0x0))
+	x144, _ := bits.add_u64(x127, x140, u64(fiat.u1(x143)))
+	x146 := ((x142 >> 56) | ((x144 << 8) & 0xffffffffffffffff))
+	x147 := (x142 & 0xffffffffffffff)
+	x148, x149 := bits.add_u64(x56, x48, u64(0x0))
+	x150, _ := bits.add_u64(x57, x49, u64(fiat.u1(x149)))
+	x152, x153 := bits.add_u64(x82, x148, u64(0x0))
+	x154, _ := bits.add_u64(x83, x150, u64(fiat.u1(x153)))
+	x156, x157 := bits.add_u64(x94, x152, u64(0x0))
+	x158, _ := bits.add_u64(x95, x154, u64(fiat.u1(x157)))
+	x160, x161 := bits.add_u64(x106, x156, u64(0x0))
+	x162, _ := bits.add_u64(x107, x158, u64(fiat.u1(x161)))
+	x164, x165 := bits.add_u64(x118, x160, u64(0x0))
+	x166, _ := bits.add_u64(x119, x162, u64(fiat.u1(x165)))
+	x168, x169 := bits.add_u64(x38, x30, u64(0x0))
+	x170, _ := bits.add_u64(x39, x31, u64(fiat.u1(x169)))
+	x172, x173 := bits.add_u64(x52, x168, u64(0x0))
+	x174, _ := bits.add_u64(x53, x170, u64(fiat.u1(x173)))
+	x176, x177 := bits.add_u64(x60, x172, u64(0x0))
+	x178, _ := bits.add_u64(x61, x174, u64(fiat.u1(x177)))
+	x180, x181 := bits.add_u64(x72, x176, u64(0x0))
+	x182, _ := bits.add_u64(x73, x178, u64(fiat.u1(x181)))
+	x184, x185 := bits.add_u64(x84, x180, u64(0x0))
+	x186, _ := bits.add_u64(x85, x182, u64(fiat.u1(x185)))
+	x188, x189 := bits.add_u64(x96, x184, u64(0x0))
+	x190, _ := bits.add_u64(x97, x186, u64(fiat.u1(x189)))
+	x192, x193 := bits.add_u64(x108, x188, u64(0x0))
+	x194, _ := bits.add_u64(x109, x190, u64(fiat.u1(x193)))
+	x196, x197 := bits.add_u64(x120, x192, u64(0x0))
+	x198, _ := bits.add_u64(x121, x194, u64(fiat.u1(x197)))
+	x200, x201 := bits.add_u64(x40, x32, u64(0x0))
+	x202, _ := bits.add_u64(x41, x33, u64(fiat.u1(x201)))
+	x204, x205 := bits.add_u64(x64, x200, u64(0x0))
+	x206, _ := bits.add_u64(x65, x202, u64(fiat.u1(x205)))
+	x208, x209 := bits.add_u64(x76, x204, u64(0x0))
+	x210, _ := bits.add_u64(x77, x206, u64(fiat.u1(x209)))
+	x212, x213 := bits.add_u64(x88, x208, u64(0x0))
+	x214, _ := bits.add_u64(x89, x210, u64(fiat.u1(x213)))
+	x216, x217 := bits.add_u64(x98, x212, u64(0x0))
+	x218, _ := bits.add_u64(x99, x214, u64(fiat.u1(x217)))
+	x220, x221 := bits.add_u64(x110, x216, u64(0x0))
+	x222, _ := bits.add_u64(x111, x218, u64(fiat.u1(x221)))
+	x224, x225 := bits.add_u64(x122, x220, u64(0x0))
+	x226, _ := bits.add_u64(x123, x222, u64(fiat.u1(x225)))
+	x228, x229 := bits.add_u64(x36, x34, u64(0x0))
+	x230, _ := bits.add_u64(x37, x35, u64(fiat.u1(x229)))
+	x232, x233 := bits.add_u64(x42, x228, u64(0x0))
+	x234, _ := bits.add_u64(x43, x230, u64(fiat.u1(x233)))
+	x236, x237 := bits.add_u64(x44, x232, u64(0x0))
+	x238, _ := bits.add_u64(x45, x234, u64(fiat.u1(x237)))
+	x240, x241 := bits.add_u64(x68, x236, u64(0x0))
+	x242, _ := bits.add_u64(x69, x238, u64(fiat.u1(x241)))
+	x244, x245 := bits.add_u64(x80, x240, u64(0x0))
+	x246, _ := bits.add_u64(x81, x242, u64(fiat.u1(x245)))
+	x248, x249 := bits.add_u64(x92, x244, u64(0x0))
+	x250, _ := bits.add_u64(x93, x246, u64(fiat.u1(x249)))
+	x252, x253 := bits.add_u64(x100, x248, u64(0x0))
+	x254, _ := bits.add_u64(x101, x250, u64(fiat.u1(x253)))
+	x256, x257 := bits.add_u64(x104, x252, u64(0x0))
+	x258, _ := bits.add_u64(x105, x254, u64(fiat.u1(x257)))
+	x260, x261 := bits.add_u64(x112, x256, u64(0x0))
+	x262, _ := bits.add_u64(x113, x258, u64(fiat.u1(x261)))
+	x264, x265 := bits.add_u64(x124, x260, u64(0x0))
+	x266, _ := bits.add_u64(x125, x262, u64(fiat.u1(x265)))
+	x268, x269 := bits.add_u64(x50, x22, u64(0x0))
+	x270, _ := bits.add_u64(x51, x23, u64(fiat.u1(x269)))
+	x272, x273 := bits.add_u64(x58, x268, u64(0x0))
+	x274, _ := bits.add_u64(x59, x270, u64(fiat.u1(x273)))
+	x276, x277 := bits.add_u64(x70, x272, u64(0x0))
+	x278, _ := bits.add_u64(x71, x274, u64(fiat.u1(x277)))
+	x280, x281 := bits.add_u64(x116, x276, u64(0x0))
+	x282, _ := bits.add_u64(x117, x278, u64(fiat.u1(x281)))
+	x284, x285 := bits.add_u64(x128, x280, u64(0x0))
+	x286, _ := bits.add_u64(x129, x282, u64(fiat.u1(x285)))
+	x288, x289 := bits.add_u64(x62, x24, u64(0x0))
+	x290, _ := bits.add_u64(x63, x25, u64(fiat.u1(x289)))
+	x292, x293 := bits.add_u64(x74, x288, u64(0x0))
+	x294, _ := bits.add_u64(x75, x290, u64(fiat.u1(x293)))
+	x296, x297 := bits.add_u64(x86, x292, u64(0x0))
+	x298, _ := bits.add_u64(x87, x294, u64(fiat.u1(x297)))
+	x300, x301 := bits.add_u64(x130, x296, u64(0x0))
+	x302, _ := bits.add_u64(x131, x298, u64(fiat.u1(x301)))
+	x304, x305 := bits.add_u64(x28, x26, u64(0x0))
+	x306, _ := bits.add_u64(x29, x27, u64(fiat.u1(x305)))
+	x308, x309 := bits.add_u64(x66, x304, u64(0x0))
+	x310, _ := bits.add_u64(x67, x306, u64(fiat.u1(x309)))
+	x312, x313 := bits.add_u64(x78, x308, u64(0x0))
+	x314, _ := bits.add_u64(x79, x310, u64(fiat.u1(x313)))
+	x316, x317 := bits.add_u64(x90, x312, u64(0x0))
+	x318, _ := bits.add_u64(x91, x314, u64(fiat.u1(x317)))
+	x320, x321 := bits.add_u64(x102, x316, u64(0x0))
+	x322, _ := bits.add_u64(x103, x318, u64(fiat.u1(x321)))
+	x324, x325 := bits.add_u64(x132, x320, u64(0x0))
+	x326, _ := bits.add_u64(x133, x322, u64(fiat.u1(x325)))
+	x328, x329 := bits.add_u64(x146, x264, u64(0x0))
+	x330 := (u64(fiat.u1(x329)) + x266)
+	x331 := ((x164 >> 56) | ((x166 << 8) & 0xffffffffffffffff))
+	x332 := (x164 & 0xffffffffffffff)
+	x333, x334 := bits.add_u64(x328, x331, u64(0x0))
+	x335 := (u64(fiat.u1(x334)) + x330)
+	x336 := ((x333 >> 56) | ((x335 << 8) & 0xffffffffffffffff))
+	x337 := (x333 & 0xffffffffffffff)
+	x338, x339 := bits.add_u64(x324, x331, u64(0x0))
+	x340 := (u64(fiat.u1(x339)) + x326)
+	x341, x342 := bits.add_u64(x336, x224, u64(0x0))
+	x343 := (u64(fiat.u1(x342)) + x226)
+	x344 := ((x338 >> 56) | ((x340 << 8) & 0xffffffffffffffff))
+	x345 := (x338 & 0xffffffffffffff)
+	x346, x347 := bits.add_u64(x344, x300, u64(0x0))
+	x348 := (u64(fiat.u1(x347)) + x302)
+	x349 := ((x341 >> 56) | ((x343 << 8) & 0xffffffffffffffff))
+	x350 := (x341 & 0xffffffffffffff)
+	x351, x352 := bits.add_u64(x349, x196, u64(0x0))
+	x353 := (u64(fiat.u1(x352)) + x198)
+	x354 := ((x346 >> 56) | ((x348 << 8) & 0xffffffffffffffff))
+	x355 := (x346 & 0xffffffffffffff)
+	x356, x357 := bits.add_u64(x354, x284, u64(0x0))
+	x358 := (u64(fiat.u1(x357)) + x286)
+	x359 := ((x351 >> 56) | ((x353 << 8) & 0xffffffffffffffff))
+	x360 := (x351 & 0xffffffffffffff)
+	x361 := (x359 + x332)
+	x362 := ((x356 >> 56) | ((x358 << 8) & 0xffffffffffffffff))
+	x363 := (x356 & 0xffffffffffffff)
+	x364 := (x362 + x147)
+	x365 := (x361 >> 56)
+	x366 := (x361 & 0xffffffffffffff)
+	x367 := (x364 >> 56)
+	x368 := (x364 & 0xffffffffffffff)
+	x369 := (x337 + x365)
+	x370 := (x345 + x365)
+	x371 := (x367 + x369)
+	x372 := fiat.u1((x371 >> 56))
+	x373 := (x371 & 0xffffffffffffff)
+	x374 := (u64(x372) + x350)
+	x375 := fiat.u1((x370 >> 56))
+	x376 := (x370 & 0xffffffffffffff)
+	x377 := (u64(x375) + x355)
+	out1[0] = x376
+	out1[1] = x377
+	out1[2] = x363
+	out1[3] = x368
+	out1[4] = x373
+	out1[5] = x374
+	out1[6] = x360
+	out1[7] = x366
+}
+
+// fe_carry reduces a loose (unreduced) field element into a tight one by
+// propagating the carry out of each 56-bit limb.  The carry out of the
+// top limb wraps around into limbs 0 and 4 — the reduction step for the
+// Curve448 prime p = 2^448 - 2^224 - 1.
+fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	x1 := arg1[3]
+	x2 := arg1[7]
+	// Carry out of the top limb feeds both limb 0 (x5) and limb 4 (x4).
+	x3 := (x2 >> 56)
+	x4 := (((x1 >> 56) + arg1[4]) + x3)
+	x5 := (arg1[0] + x3)
+	x6 := ((x4 >> 56) + arg1[5])
+	x7 := ((x5 >> 56) + arg1[1])
+	x8 := ((x6 >> 56) + arg1[6])
+	x9 := ((x7 >> 56) + arg1[2])
+	x10 := ((x8 >> 56) + (x2 & 0xffffffffffffff))
+	x11 := ((x9 >> 56) + (x1 & 0xffffffffffffff))
+	// Second carry pass: the residual carries are at most one bit wide.
+	x12 := fiat.u1((x10 >> 56))
+	x13 := ((x5 & 0xffffffffffffff) + u64(x12))
+	x14 := (u64(fiat.u1((x11 >> 56))) + ((x4 & 0xffffffffffffff) + u64(x12)))
+	x15 := (x13 & 0xffffffffffffff)
+	x16 := (u64(fiat.u1((x13 >> 56))) + (x7 & 0xffffffffffffff))
+	x17 := (x9 & 0xffffffffffffff)
+	x18 := (x11 & 0xffffffffffffff)
+	x19 := (x14 & 0xffffffffffffff)
+	x20 := (u64(fiat.u1((x14 >> 56))) + (x6 & 0xffffffffffffff))
+	x21 := (x8 & 0xffffffffffffff)
+	x22 := (x10 & 0xffffffffffffff)
+	out1[0] = x15
+	out1[1] = x16
+	out1[2] = x17
+	out1[3] = x18
+	out1[4] = x19
+	out1[5] = x20
+	out1[6] = x21
+	out1[7] = x22
+}
+
+// fe_add adds two tight field elements limb-wise.  The sum of two
+// reduced limbs can exceed 56 bits, so the result is a loose
+// (unreduced) element.
+fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	for i in 0 ..< 8 {
+		out1[i] = arg1[i] + arg2[i]
+	}
+}
+
+// fe_sub subtracts arg2 from arg1 limb-wise, producing a loose
+// (unreduced) result.  Each limb is biased by the corresponding limb of
+// 2*p so the per-limb u64 subtraction cannot underflow; limb 4 uses a
+// smaller constant, matching the 2^224 term of the prime (compare the
+// per-limb constants of p used in fe_to_bytes).
+fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	x1 := ((0x1fffffffffffffe + arg1[0]) - arg2[0])
+	x2 := ((0x1fffffffffffffe + arg1[1]) - arg2[1])
+	x3 := ((0x1fffffffffffffe + arg1[2]) - arg2[2])
+	x4 := ((0x1fffffffffffffe + arg1[3]) - arg2[3])
+	x5 := ((0x1fffffffffffffc + arg1[4]) - arg2[4])
+	x6 := ((0x1fffffffffffffe + arg1[5]) - arg2[5])
+	x7 := ((0x1fffffffffffffe + arg1[6]) - arg2[6])
+	x8 := ((0x1fffffffffffffe + arg1[7]) - arg2[7])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+	out1[5] = x6
+	out1[6] = x7
+	out1[7] = x8
+}
+
+// fe_opp negates arg1, computing (2*p - arg1) limb-wise as a loose
+// (unreduced) result.  The 2*p bias keeps every limb non-negative for
+// any reduced input; limb 4's constant differs because of the 2^224
+// term of the prime.
+fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	x1 := (0x1fffffffffffffe - arg1[0])
+	x2 := (0x1fffffffffffffe - arg1[1])
+	x3 := (0x1fffffffffffffe - arg1[2])
+	x4 := (0x1fffffffffffffe - arg1[3])
+	x5 := (0x1fffffffffffffc - arg1[4])
+	x6 := (0x1fffffffffffffe - arg1[5])
+	x7 := (0x1fffffffffffffe - arg1[6])
+	x8 := (0x1fffffffffffffe - arg1[7])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+	out1[5] = x6
+	out1[6] = x7
+	out1[7] = x8
+}
+
+// fe_cond_assign sets out1 = arg1 when arg2 is non-zero and leaves out1
+// unchanged when arg2 is zero, using branchless cmovznz selects.
+//
+// NOTE(review): optimization is disabled and inlining forbidden,
+// presumably so the compiler cannot rewrite the selects into
+// data-dependent branches (constant-time hygiene) — confirm against the
+// other fiat-based field packages.
+@(optimization_mode = "none")
+fe_cond_assign :: #force_no_inline proc "contextless" (
+	out1, arg1: ^Tight_Field_Element,
+	arg2: int,
+) {
+	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0])
+	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
+	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
+	x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3])
+	x5 := fiat.cmovznz_u64(fiat.u1(arg2), out1[4], arg1[4])
+	x6 := fiat.cmovznz_u64(fiat.u1(arg2), out1[5], arg1[5])
+	x7 := fiat.cmovznz_u64(fiat.u1(arg2), out1[6], arg1[6])
+	x8 := fiat.cmovznz_u64(fiat.u1(arg2), out1[7], arg1[7])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+	out1[5] = x6
+	out1[6] = x7
+	out1[7] = x8
+}
+
+// fe_to_bytes serializes a tight field element into its canonical
+// 56-byte little-endian representation: the value is first fully
+// reduced modulo p by a conditional subtraction, then each 56-bit limb
+// is emitted as 7 bytes.
+fe_to_bytes :: proc "contextless" (out1: ^[56]byte, arg1: ^Tight_Field_Element) {
+	// Subtract p limb-wise; the final borrow x16 is set iff arg1 < p.
+	x1, x2 := _subborrowx_u56(0x0, arg1[0], 0xffffffffffffff)
+	x3, x4 := _subborrowx_u56(x2, arg1[1], 0xffffffffffffff)
+	x5, x6 := _subborrowx_u56(x4, arg1[2], 0xffffffffffffff)
+	x7, x8 := _subborrowx_u56(x6, arg1[3], 0xffffffffffffff)
+	x9, x10 := _subborrowx_u56(x8, arg1[4], 0xfffffffffffffe)
+	x11, x12 := _subborrowx_u56(x10, arg1[5], 0xffffffffffffff)
+	x13, x14 := _subborrowx_u56(x12, arg1[6], 0xffffffffffffff)
+	x15, x16 := _subborrowx_u56(x14, arg1[7], 0xffffffffffffff)
+	// If the subtraction borrowed, x17 is an all-ones mask and p is
+	// added back, yielding the canonical representative.
+	x17 := fiat.cmovznz_u64(x16, u64(0x0), 0xffffffffffffffff)
+	x18, x19 := _addcarryx_u56(0x0, x1, (x17 & 0xffffffffffffff))
+	x20, x21 := _addcarryx_u56(x19, x3, (x17 & 0xffffffffffffff))
+	x22, x23 := _addcarryx_u56(x21, x5, (x17 & 0xffffffffffffff))
+	x24, x25 := _addcarryx_u56(x23, x7, (x17 & 0xffffffffffffff))
+	x26, x27 := _addcarryx_u56(x25, x9, (x17 & 0xfffffffffffffe))
+	x28, x29 := _addcarryx_u56(x27, x11, (x17 & 0xffffffffffffff))
+	x30, x31 := _addcarryx_u56(x29, x13, (x17 & 0xffffffffffffff))
+	x32, _ := _addcarryx_u56(x31, x15, (x17 & 0xffffffffffffff))
+	// Split each reduced 56-bit limb into 7 little-endian bytes.
+	x34 := (u8(x18) & 0xff)
+	x35 := (x18 >> 8)
+	x36 := (u8(x35) & 0xff)
+	x37 := (x35 >> 8)
+	x38 := (u8(x37) & 0xff)
+	x39 := (x37 >> 8)
+	x40 := (u8(x39) & 0xff)
+	x41 := (x39 >> 8)
+	x42 := (u8(x41) & 0xff)
+	x43 := (x41 >> 8)
+	x44 := (u8(x43) & 0xff)
+	x45 := u8((x43 >> 8))
+	x46 := (u8(x20) & 0xff)
+	x47 := (x20 >> 8)
+	x48 := (u8(x47) & 0xff)
+	x49 := (x47 >> 8)
+	x50 := (u8(x49) & 0xff)
+	x51 := (x49 >> 8)
+	x52 := (u8(x51) & 0xff)
+	x53 := (x51 >> 8)
+	x54 := (u8(x53) & 0xff)
+	x55 := (x53 >> 8)
+	x56 := (u8(x55) & 0xff)
+	x57 := u8((x55 >> 8))
+	x58 := (u8(x22) & 0xff)
+	x59 := (x22 >> 8)
+	x60 := (u8(x59) & 0xff)
+	x61 := (x59 >> 8)
+	x62 := (u8(x61) & 0xff)
+	x63 := (x61 >> 8)
+	x64 := (u8(x63) & 0xff)
+	x65 := (x63 >> 8)
+	x66 := (u8(x65) & 0xff)
+	x67 := (x65 >> 8)
+	x68 := (u8(x67) & 0xff)
+	x69 := u8((x67 >> 8))
+	x70 := (u8(x24) & 0xff)
+	x71 := (x24 >> 8)
+	x72 := (u8(x71) & 0xff)
+	x73 := (x71 >> 8)
+	x74 := (u8(x73) & 0xff)
+	x75 := (x73 >> 8)
+	x76 := (u8(x75) & 0xff)
+	x77 := (x75 >> 8)
+	x78 := (u8(x77) & 0xff)
+	x79 := (x77 >> 8)
+	x80 := (u8(x79) & 0xff)
+	x81 := u8((x79 >> 8))
+	x82 := (u8(x26) & 0xff)
+	x83 := (x26 >> 8)
+	x84 := (u8(x83) & 0xff)
+	x85 := (x83 >> 8)
+	x86 := (u8(x85) & 0xff)
+	x87 := (x85 >> 8)
+	x88 := (u8(x87) & 0xff)
+	x89 := (x87 >> 8)
+	x90 := (u8(x89) & 0xff)
+	x91 := (x89 >> 8)
+	x92 := (u8(x91) & 0xff)
+	x93 := u8((x91 >> 8))
+	x94 := (u8(x28) & 0xff)
+	x95 := (x28 >> 8)
+	x96 := (u8(x95) & 0xff)
+	x97 := (x95 >> 8)
+	x98 := (u8(x97) & 0xff)
+	x99 := (x97 >> 8)
+	x100 := (u8(x99) & 0xff)
+	x101 := (x99 >> 8)
+	x102 := (u8(x101) & 0xff)
+	x103 := (x101 >> 8)
+	x104 := (u8(x103) & 0xff)
+	x105 := u8((x103 >> 8))
+	x106 := (u8(x30) & 0xff)
+	x107 := (x30 >> 8)
+	x108 := (u8(x107) & 0xff)
+	x109 := (x107 >> 8)
+	x110 := (u8(x109) & 0xff)
+	x111 := (x109 >> 8)
+	x112 := (u8(x111) & 0xff)
+	x113 := (x111 >> 8)
+	x114 := (u8(x113) & 0xff)
+	x115 := (x113 >> 8)
+	x116 := (u8(x115) & 0xff)
+	x117 := u8((x115 >> 8))
+	x118 := (u8(x32) & 0xff)
+	x119 := (x32 >> 8)
+	x120 := (u8(x119) & 0xff)
+	x121 := (x119 >> 8)
+	x122 := (u8(x121) & 0xff)
+	x123 := (x121 >> 8)
+	x124 := (u8(x123) & 0xff)
+	x125 := (x123 >> 8)
+	x126 := (u8(x125) & 0xff)
+	x127 := (x125 >> 8)
+	x128 := (u8(x127) & 0xff)
+	x129 := u8((x127 >> 8))
+	out1[0] = x34
+	out1[1] = x36
+	out1[2] = x38
+	out1[3] = x40
+	out1[4] = x42
+	out1[5] = x44
+	out1[6] = x45
+	out1[7] = x46
+	out1[8] = x48
+	out1[9] = x50
+	out1[10] = x52
+	out1[11] = x54
+	out1[12] = x56
+	out1[13] = x57
+	out1[14] = x58
+	out1[15] = x60
+	out1[16] = x62
+	out1[17] = x64
+	out1[18] = x66
+	out1[19] = x68
+	out1[20] = x69
+	out1[21] = x70
+	out1[22] = x72
+	out1[23] = x74
+	out1[24] = x76
+	out1[25] = x78
+	out1[26] = x80
+	out1[27] = x81
+	out1[28] = x82
+	out1[29] = x84
+	out1[30] = x86
+	out1[31] = x88
+	out1[32] = x90
+	out1[33] = x92
+	out1[34] = x93
+	out1[35] = x94
+	out1[36] = x96
+	out1[37] = x98
+	out1[38] = x100
+	out1[39] = x102
+	out1[40] = x104
+	out1[41] = x105
+	out1[42] = x106
+	out1[43] = x108
+	out1[44] = x110
+	out1[45] = x112
+	out1[46] = x114
+	out1[47] = x116
+	out1[48] = x117
+	out1[49] = x118
+	out1[50] = x120
+	out1[51] = x122
+	out1[52] = x124
+	out1[53] = x126
+	out1[54] = x128
+	out1[55] = x129
+}
+
+// fe_from_bytes deserializes 56 little-endian bytes into a tight field
+// element: each of the eight 56-bit limbs is assembled from 7 bytes, so
+// every limb is below 2^56 by construction.  The input is not checked
+// for canonicity (values >= p are accepted as-is).
+fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[56]byte) {
+	// Shift each byte into position within its limb (7 bytes per limb,
+	// most-significant byte first in program order).
+	x1 := (u64(arg1[55]) << 48)
+	x2 := (u64(arg1[54]) << 40)
+	x3 := (u64(arg1[53]) << 32)
+	x4 := (u64(arg1[52]) << 24)
+	x5 := (u64(arg1[51]) << 16)
+	x6 := (u64(arg1[50]) << 8)
+	x7 := arg1[49]
+	x8 := (u64(arg1[48]) << 48)
+	x9 := (u64(arg1[47]) << 40)
+	x10 := (u64(arg1[46]) << 32)
+	x11 := (u64(arg1[45]) << 24)
+	x12 := (u64(arg1[44]) << 16)
+	x13 := (u64(arg1[43]) << 8)
+	x14 := arg1[42]
+	x15 := (u64(arg1[41]) << 48)
+	x16 := (u64(arg1[40]) << 40)
+	x17 := (u64(arg1[39]) << 32)
+	x18 := (u64(arg1[38]) << 24)
+	x19 := (u64(arg1[37]) << 16)
+	x20 := (u64(arg1[36]) << 8)
+	x21 := arg1[35]
+	x22 := (u64(arg1[34]) << 48)
+	x23 := (u64(arg1[33]) << 40)
+	x24 := (u64(arg1[32]) << 32)
+	x25 := (u64(arg1[31]) << 24)
+	x26 := (u64(arg1[30]) << 16)
+	x27 := (u64(arg1[29]) << 8)
+	x28 := arg1[28]
+	x29 := (u64(arg1[27]) << 48)
+	x30 := (u64(arg1[26]) << 40)
+	x31 := (u64(arg1[25]) << 32)
+	x32 := (u64(arg1[24]) << 24)
+	x33 := (u64(arg1[23]) << 16)
+	x34 := (u64(arg1[22]) << 8)
+	x35 := arg1[21]
+	x36 := (u64(arg1[20]) << 48)
+	x37 := (u64(arg1[19]) << 40)
+	x38 := (u64(arg1[18]) << 32)
+	x39 := (u64(arg1[17]) << 24)
+	x40 := (u64(arg1[16]) << 16)
+	x41 := (u64(arg1[15]) << 8)
+	x42 := arg1[14]
+	x43 := (u64(arg1[13]) << 48)
+	x44 := (u64(arg1[12]) << 40)
+	x45 := (u64(arg1[11]) << 32)
+	x46 := (u64(arg1[10]) << 24)
+	x47 := (u64(arg1[9]) << 16)
+	x48 := (u64(arg1[8]) << 8)
+	x49 := arg1[7]
+	x50 := (u64(arg1[6]) << 48)
+	x51 := (u64(arg1[5]) << 40)
+	x52 := (u64(arg1[4]) << 32)
+	x53 := (u64(arg1[3]) << 24)
+	x54 := (u64(arg1[2]) << 16)
+	x55 := (u64(arg1[1]) << 8)
+	x56 := arg1[0]
+	// Sum the shifted bytes of each limb (no carries are possible).
+	x57 := (x55 + u64(x56))
+	x58 := (x54 + x57)
+	x59 := (x53 + x58)
+	x60 := (x52 + x59)
+	x61 := (x51 + x60)
+	x62 := (x50 + x61)
+	x63 := (x48 + u64(x49))
+	x64 := (x47 + x63)
+	x65 := (x46 + x64)
+	x66 := (x45 + x65)
+	x67 := (x44 + x66)
+	x68 := (x43 + x67)
+	x69 := (x41 + u64(x42))
+	x70 := (x40 + x69)
+	x71 := (x39 + x70)
+	x72 := (x38 + x71)
+	x73 := (x37 + x72)
+	x74 := (x36 + x73)
+	x75 := (x34 + u64(x35))
+	x76 := (x33 + x75)
+	x77 := (x32 + x76)
+	x78 := (x31 + x77)
+	x79 := (x30 + x78)
+	x80 := (x29 + x79)
+	x81 := (x27 + u64(x28))
+	x82 := (x26 + x81)
+	x83 := (x25 + x82)
+	x84 := (x24 + x83)
+	x85 := (x23 + x84)
+	x86 := (x22 + x85)
+	x87 := (x20 + u64(x21))
+	x88 := (x19 + x87)
+	x89 := (x18 + x88)
+	x90 := (x17 + x89)
+	x91 := (x16 + x90)
+	x92 := (x15 + x91)
+	x93 := (x13 + u64(x14))
+	x94 := (x12 + x93)
+	x95 := (x11 + x94)
+	x96 := (x10 + x95)
+	x97 := (x9 + x96)
+	x98 := (x8 + x97)
+	x99 := (x6 + u64(x7))
+	x100 := (x5 + x99)
+	x101 := (x4 + x100)
+	x102 := (x3 + x101)
+	x103 := (x2 + x102)
+	x104 := (x1 + x103)
+	out1[0] = x62
+	out1[1] = x68
+	out1[2] = x74
+	out1[3] = x80
+	out1[4] = x86
+	out1[5] = x92
+	out1[6] = x98
+	out1[7] = x104
+}
+
+// fe_relax copies a tight field element into a loose one.  This is a
+// plain widening copy: every tight element is also a valid loose
+// element.
+fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	for i in 0 ..< 8 {
+		out1[i] = arg1[i]
+	}
+}

+ 161 - 0
core/crypto/x448/x448.odin

@@ -0,0 +1,161 @@
+/*
+package x448 implements the X448 (aka curve448) Elliptic-Curve
+Diffie-Hellman key exchange protocol.
+
+See:
+- [[ https://www.rfc-editor.org/rfc/rfc7748 ]]
+*/
+package x448
+
+import field "core:crypto/_fiat/field_curve448"
+import "core:mem"
+
+// SCALAR_SIZE is the size of a X448 scalar (private key) in bytes.
+SCALAR_SIZE :: 56
+// POINT_SIZE is the size of a X448 point (public key/shared secret) in bytes.
+POINT_SIZE :: 56
+
+// _BASE_POINT is the canonical X448 base point, the u-coordinate u = 5,
+// in little-endian byte form (RFC 7748).
+@(private, rodata)
+_BASE_POINT: [56]byte = {
+	5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+}
+
+// _scalar_bit returns bit i of the little-endian scalar s (bit 0 is the
+// least-significant bit of s[0]).  Negative indices read as zero, which
+// lets the ladder treat out-of-range lookups uniformly.
+@(private)
+_scalar_bit :: #force_inline proc "contextless" (s: ^[56]byte, i: int) -> u8 {
+	if i >= 0 {
+		return (s[i >> 3] >> uint(i & 7)) & 1
+	}
+	return 0
+}
+
+// _scalarmult computes the X448 function: the Montgomery ladder over
+// the (already clamped) scalar and the u-coordinate `point`, writing
+// the resulting u-coordinate to `out` (RFC 7748, Section 5).
+@(private)
+_scalarmult :: proc "contextless" (out, scalar, point: ^[56]byte) {
+	// Montgomery pseudo-multiplication, using the RFC 7748 formula.
+	t1, t2: field.Loose_Field_Element = ---, ---
+
+	// x_1 = u
+	// x_2 = 1
+	// z_2 = 0
+	// x_3 = u
+	// z_3 = 1
+	x1: field.Tight_Field_Element = ---
+	field.fe_from_bytes(&x1, point)
+
+	x2, x3, z2, z3: field.Tight_Field_Element = ---, ---, ---, ---
+	field.fe_one(&x2)
+	field.fe_zero(&z2)
+	field.fe_set(&x3, &x1)
+	field.fe_one(&z3)
+
+	// swap = 0
+	swap: int
+
+	// For t = bits-1 down to 0:
+	for t := 448 - 1; t >= 0; t -= 1 {
+		// k_t = (k >> t) & 1
+		k_t := int(_scalar_bit(scalar, t))
+		// swap ^= k_t
+		swap ~= k_t
+		// Conditional swap; see text below.
+		// (x_2, x_3) = cswap(swap, x_2, x_3)
+		field.fe_cond_swap(&x2, &x3, swap)
+		// (z_2, z_3) = cswap(swap, z_2, z_3)
+		field.fe_cond_swap(&z2, &z3, swap)
+		// swap = k_t
+		swap = k_t
+
+		// Note: This deliberately omits reductions after add/sub operations
+		// if the result is only ever used as the input to a mul/square since
+		// the implementations of those can deal with non-reduced inputs.
+		//
+		// fe_tighten_cast is only used to store a fully reduced
+		// output in a Loose_Field_Element, or to provide such a
+		// Loose_Field_Element as a Tight_Field_Element argument.
+
+		// A = x_2 + z_2
+		field.fe_add(&t1, &x2, &z2)
+		// B = x_2 - z_2
+		field.fe_sub(&t2, &x2, &z2)
+		// D = x_3 - z_3
+		field.fe_sub(field.fe_relax_cast(&z2), &x3, &z3) // (z2 unreduced)
+		// DA = D * A
+		field.fe_carry_mul(&x2, field.fe_relax_cast(&z2), &t1)
+		// C = x_3 + z_3
+		field.fe_add(field.fe_relax_cast(&z3), &x3, &z3) // (z3 unreduced)
+		// CB = C * B
+		field.fe_carry_mul(&x3, &t2, field.fe_relax_cast(&z3))
+		// z_3 = x_1 * (DA - CB)^2
+		field.fe_sub(field.fe_relax_cast(&z3), &x2, &x3) // (z3 unreduced)
+		field.fe_carry_square(&z3, field.fe_relax_cast(&z3))
+		field.fe_carry_mul(&z3, field.fe_relax_cast(&x1), field.fe_relax_cast(&z3))
+		// x_3 = (DA + CB)^2
+		field.fe_add(field.fe_relax_cast(&z2), &x2, &x3) // (z2 unreduced)
+		field.fe_carry_square(&x3, field.fe_relax_cast(&z2))
+
+		// AA = A^2
+		field.fe_carry_square(&z2, &t1)
+		// BB = B^2
+		field.fe_carry_square(field.fe_tighten_cast(&t1), &t2) // (t1 reduced)
+		// x_2 = AA * BB
+		field.fe_carry_mul(&x2, field.fe_relax_cast(&z2), &t1)
+		// E = AA - BB
+		field.fe_sub(&t2, &z2, field.fe_tighten_cast(&t1)) // (t1 (input) is reduced)
+		// z_2 = E * (AA + a24 * E)
+		field.fe_carry_mul_small(field.fe_tighten_cast(&t1), &t2, 39081) // (t1 reduced)
+		field.fe_add(&t1, &z2, field.fe_tighten_cast(&t1)) // (t1 (input) is reduced)
+		field.fe_carry_mul(&z2, &t2, &t1)
+	}
+
+	// Conditional swap; see text below.
+	// (x_2, x_3) = cswap(swap, x_2, x_3)
+	field.fe_cond_swap(&x2, &x3, swap)
+	// (z_2, z_3) = cswap(swap, z_2, z_3)
+	field.fe_cond_swap(&z2, &z3, swap)
+
+	// Return x_2 * (z_2^(p - 2))
+	field.fe_carry_inv(&z2, field.fe_relax_cast(&z2))
+	field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2))
+	field.fe_to_bytes(out, &x2)
+
+	// Scrub the intermediaries; x448 outputs are typically secrets.
+	field.fe_clear_vec([]^field.Tight_Field_Element{&x1, &x2, &x3, &z2, &z3})
+	field.fe_clear_vec([]^field.Loose_Field_Element{&t1, &t2})
+}
+
+// scalarmult "multiplies" the provided scalar and point, and writes the
+// resulting point to dst.
+scalarmult :: proc(dst, scalar, point: []byte) {
+	switch {
+	case len(scalar) != SCALAR_SIZE:
+		panic("crypto/x448: invalid scalar size")
+	case len(point) != POINT_SIZE:
+		panic("crypto/x448: invalid point size")
+	case len(dst) != POINT_SIZE:
+		panic("crypto/x448: invalid destination point size")
+	}
+
+	// "clamp" the scalar (clear the low 2 bits, set the top bit).
+	clamped: [56]byte = ---
+	copy_slice(clamped[:], scalar)
+	clamped[0] &= 252
+	clamped[55] |= 128
+
+	u: [56]byte = ---
+	copy_slice(u[:], point)
+
+	product: [56]byte = ---
+	_scalarmult(&product, &clamped, &u)
+	copy_slice(dst, product[:])
+
+	// Scrub the clamped scalar and the raw product from the stack.
+	mem.zero_explicit(&clamped, size_of(clamped))
+	mem.zero_explicit(&product, size_of(product))
+}
+
+// scalarmult_basepoint "multiplies" the provided scalar with the X448
+// base point and writes the resulting point to dst.
+//
+// This is equivalent to scalarmult(dst, scalar, _BASE_POINT[:]).
+scalarmult_basepoint :: proc(dst, scalar: []byte) {
+	scalarmult(dst, scalar, _BASE_POINT[:])
+}

+ 2 - 0
examples/all/all_main.odin

@@ -48,6 +48,7 @@ import shake            "core:crypto/shake"
 import sm3              "core:crypto/sm3"
 import tuplehash        "core:crypto/tuplehash"
 import x25519           "core:crypto/x25519"
+import x448             "core:crypto/x448"
 
 import pe               "core:debug/pe"
 import trace            "core:debug/trace"
@@ -190,6 +191,7 @@ _ :: shake
 _ :: sm3
 _ :: tuplehash
 _ :: x25519
+_ :: x448
 _ :: pe
 _ :: trace
 _ :: dynlib

+ 21 - 0
tests/benchmark/crypto/benchmark_crypto.odin

@@ -14,6 +14,7 @@ import "core:crypto/chacha20poly1305"
 import "core:crypto/ed25519"
 import "core:crypto/poly1305"
 import "core:crypto/x25519"
+import "core:crypto/x448"
 
 // Cryptographic primitive benchmarks.
 
@@ -237,6 +238,26 @@ benchmark_crypto :: proc(t: ^testing.T) {
 			time.duration_microseconds(elapsed) / iters,
 		)
 	}
+	{
+		point_str := "deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef"
+		scalar_str := "cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe"
+
+		point, _ := hex.decode(transmute([]byte)(point_str), context.temp_allocator)
+		scalar, _ := hex.decode(transmute([]byte)(scalar_str), context.temp_allocator)
+		out: [x448.POINT_SIZE]byte = ---
+
+		iters :: 10000
+		start := time.now()
+		for i := 0; i < iters; i = i + 1 {
+			x448.scalarmult(out[:], scalar[:], point[:])
+		}
+		elapsed := time.since(start)
+
+		fmt.sbprintfln(&str,
+			"x448.scalarmult: ~%f us/op",
+			time.duration_microseconds(elapsed) / iters,
+		)
+	}
 }
 
 @(private)

+ 63 - 0
tests/core/crypto/test_core_crypto_ecc25519.odin → tests/core/crypto/test_core_crypto_edwards.odin

@@ -7,6 +7,7 @@ import field "core:crypto/_fiat/field_curve25519"
 import "core:crypto/ed25519"
 import "core:crypto/ristretto255"
 import "core:crypto/x25519"
+import "core:crypto/x448"
 
 @(test)
 test_sqrt_ratio_m1 :: proc(t: ^testing.T) {
@@ -684,6 +685,68 @@ test_x25519 :: proc(t: ^testing.T) {
 	}
 }
 
+@(test)
+test_x448 :: proc(t: ^testing.T) {
+	// Local copy of this so that the base point doesn't need to be exported.
+	_BASE_POINT: [56]byte = {
+		5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+	}
+
+	test_vectors := []struct {
+		scalar:  string,
+		point:   string,
+		product: string,
+	} {
+		// Test vectors from RFC 7748
+		{
+			"3d262fddf9ec8e88495266fea19a34d28882acef045104d0d1aae121700a779c984c24f8cdd78fbff44943eba368f54b29259a4f1c600ad3",
+			"06fce640fa3487bfda5f6cf2d5263f8aad88334cbd07437f020f08f9814dc031ddbdc38c19c6da2583fa5429db94ada18aa7a7fb4ef8a086",
+			"ce3e4ff95a60dc6697da1db1d85e6afbdf79b50a2412d7546d5f239fe14fbaadeb445fc66a01b0779d98223961111e21766282f73dd96b6f",
+		},
+		{
+			"203d494428b8399352665ddca42f9de8fef600908e0d461cb021f8c538345dd77c3e4806e25f46d3315c44e0a5b4371282dd2c8d5be3095f",
+			"0fbcc2f993cd56d3305b0b7d9e55d4c1a8fb5dbb52f8e9a1e9b6201b165d015894e56c4d3570bee52fe205e28a78b91cdfbde71ce8d157db",
+			"884a02576239ff7a2f2f63b2db6a9ff37047ac13568e1e30fe63c4a7ad1b3ee3a5700df34321d62077e63633c575c1c954514e99da7c179d",
+		},
+	}
+	for v in test_vectors {
+		scalar, _ := hex.decode(transmute([]byte)(v.scalar), context.temp_allocator)
+		point, _ := hex.decode(transmute([]byte)(v.point), context.temp_allocator)
+
+		derived_point: [x448.POINT_SIZE]byte
+		x448.scalarmult(derived_point[:], scalar, point)
+		derived_point_str := string(hex.encode(derived_point[:], context.temp_allocator))
+
+		testing.expectf(
+			t,
+			derived_point_str == v.product,
+			"Expected %s for %s * %s, but got %s instead",
+			v.product,
+			v.scalar,
+			v.point,
+			derived_point_str,
+		)
+
+		// Abuse the test vectors to sanity-check the scalar-basepoint multiply.
+		p1, p2: [x448.POINT_SIZE]byte
+		x448.scalarmult_basepoint(p1[:], scalar)
+		x448.scalarmult(p2[:], scalar, _BASE_POINT[:])
+		p1_str := string(hex.encode(p1[:], context.temp_allocator))
+		p2_str := string(hex.encode(p2[:], context.temp_allocator))
+		testing.expectf(
+			t,
+			p1_str == p2_str,
+			"Expected %s for %s * basepoint, but got %s instead",
+			p2_str,
+			v.scalar,
+			p1_str,
+		)
+	}
+}
+
 @(private)
 ge_str :: proc(ge: ^ristretto255.Group_Element) -> string {
 	b: [ristretto255.ELEMENT_SIZE]byte