Преглед изворни кода

core/crypto/_edwards25519: Initial import

Yawning Angel пре 1 година
родитељ
комит
563c527419

+ 428 - 0
core/crypto/_edwards25519/edwards25519.odin

@@ -0,0 +1,428 @@
+package _edwards25519
+
+/*
+This implements the edwards25519 composite-order group, primarily for
+the purpose of implementing X25519, Ed25519, and ristretto255.  Use of
+this package for other purposes is NOT RECOMMENDED.
+
+See:
+- https://eprint.iacr.org/2011/368.pdf
+- https://datatracker.ietf.org/doc/html/rfc8032
+- https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html
+*/
+
+import "base:intrinsics"
+import "core:crypto"
+import field "core:crypto/_fiat/field_curve25519"
+import "core:mem"
+
+// Group_Element is an edwards25519 group element, in extended homogeneous
+// coordinates, which represents the affine point `(x, y)` as `(X, Y, Z, T)`,
+// with the relations `x = X/Z`, `y = Y/Z`, and `x * y = T/Z`.
+//
+// d = -121665/121666 = 37095705934669439343138083508754565189542113879843219016388785533085940283555
+// a = -1
+//
+// Notes:
+// - There is considerable scope for optimization, however that
+//   will not change the external API, and this is simple and reasonably
+//   performant.
+// - The API deliberately makes it hard to create arbitrary group
+//   elements that are not on the curve.
+// - The group element decoding routine takes the opinionated stance of
+//   rejecting non-canonical encodings.
+
+// FE_D is the curve constant `d`, used when recovering x from y in
+// `ge_set_bytes` (`den = d * y^2 + 1`).
+FE_D := field.Tight_Field_Element {
+	929955233495203,
+	466365720129213,
+	1662059464998953,
+	2033849074728123,
+	1442794654840575,
+}
+// FE_A is the curve constant `a`, used by `ge_double` (`D = a*A`).
+@(private)
+FE_A := field.Tight_Field_Element {
+	2251799813685228,
+	2251799813685247,
+	2251799813685247,
+	2251799813685247,
+	2251799813685247,
+}
+// FE_D2 is the constant `k` of the addition formula (`k = 2*d`), used by
+// `ge_addend_set` to precompute `k * T`.
+@(private)
+FE_D2 := field.Tight_Field_Element {
+	1859910466990425,
+	932731440258426,
+	1072319116312658,
+	1815898335770999,
+	633789495995903,
+}
+// GE_BASEPOINT is the element copied out by `ge_generator`, stored in
+// `Group_Element` field order (x, y, z, t) with z = 1.
+@(private)
+GE_BASEPOINT := Group_Element {
+	field.Tight_Field_Element {
+		1738742601995546,
+		1146398526822698,
+		2070867633025821,
+		562264141797630,
+		587772402128613,
+	},
+	field.Tight_Field_Element {
+		1801439850948184,
+		1351079888211148,
+		450359962737049,
+		900719925474099,
+		1801439850948198,
+	},
+	field.Tight_Field_Element{1, 0, 0, 0, 0},
+	field.Tight_Field_Element {
+		1841354044333475,
+		16398895984059,
+		755974180946558,
+		900171276175154,
+		1821297809914039,
+	},
+}
+// GE_IDENTITY is the identity element (x, y, z, t) = (0, 1, 1, 0), the same
+// value that `ge_identity` writes.
+GE_IDENTITY := Group_Element {
+	field.Tight_Field_Element{0, 0, 0, 0, 0},
+	field.Tight_Field_Element{1, 0, 0, 0, 0},
+	field.Tight_Field_Element{1, 0, 0, 0, 0},
+	field.Tight_Field_Element{0, 0, 0, 0, 0},
+}
+
+// Group_Element holds a point as the four coordinates (X, Y, Z, T), each a
+// tight field element.
+Group_Element :: struct {
+	x: field.Tight_Field_Element, // X, with affine x = X/Z
+	y: field.Tight_Field_Element, // Y, with affine y = Y/Z
+	z: field.Tight_Field_Element, // Z, the common denominator
+	t: field.Tight_Field_Element, // T, with x * y = T/Z
+}
+
+// ge_clear overwrites every byte of `ge` with zero via `mem.zero_explicit`.
+ge_clear :: proc "contextless" (ge: ^Group_Element) {
+	mem.zero_explicit(ge, size_of(ge^))
+}
+
+// ge_set copies the group element `a` into `ge`, one coordinate at a time.
+ge_set :: proc "contextless" (ge, a: ^Group_Element) {
+	// Each coordinate is copied independently of the others.
+	field.fe_set(&ge.t, &a.t)
+	field.fe_set(&ge.z, &a.z)
+	field.fe_set(&ge.y, &a.y)
+	field.fe_set(&ge.x, &a.x)
+}
+
+// ge_set_bytes decodes the 32-byte encoding `b` into `ge`.
+//
+// Returns true and writes `ge` on success.  Returns false and leaves `ge`
+// unchanged when no x-coordinate exists for the encoded y, or when `b` is
+// not the canonical encoding of the decoded point.  Traps if `len(b) != 32`.
+@(require_results)
+ge_set_bytes :: proc "contextless" (ge: ^Group_Element, b: []byte) -> bool {
+	if len(b) != 32 {
+		intrinsics.trap()
+	}
+	b_ := transmute(^[32]byte)(raw_data(b))
+
+	// Do the work in a scratch element, so that ge is unchanged on
+	// failure.
+	tmp: Group_Element = ---
+	defer ge_clear(&tmp)
+	field.fe_one(&tmp.z) // Z = 1
+
+	// The encoding is the y-coordinate, with the x-coordinate polarity
+	// (odd/even) encoded in the MSB.
+	field.fe_from_bytes(&tmp.y, b_) // ignores high bit
+
+	// Recover the candidate x-coordinate via the curve equation:
+	// x^2 = (y^2 - 1) / (d * y^2 + 1) (mod p)
+
+	// tmp.t is not needed until the very end, so it doubles as scratch
+	// space, and tmp.z (already set to 1) doubles as the constant one.
+	fe_tmp := &tmp.t // Use this to store intermediaries.
+	fe_one := &tmp.z
+
+	// x = num = y^2 - 1
+	field.fe_carry_square(fe_tmp, field.fe_relax_cast(&tmp.y)) // fe_tmp = y^2
+	field.fe_carry_sub(&tmp.x, fe_tmp, fe_one)
+
+	// den = d * y^2 + 1
+	field.fe_carry_mul(fe_tmp, field.fe_relax_cast(fe_tmp), field.fe_relax_cast(&FE_D))
+	field.fe_carry_add(fe_tmp, fe_tmp, fe_one)
+
+	// x = sqrt(num/den), computed as SQRT_RATIO_M1(u = num, v = den).
+	is_square := field.fe_carry_sqrt_ratio_m1(
+		&tmp.x,
+		field.fe_relax_cast(&tmp.x),
+		field.fe_relax_cast(fe_tmp),
+	)
+	if is_square == 0 {
+		return false
+	}
+
+	// Pick the right x-coordinate.
+	field.fe_cond_negate(&tmp.x, &tmp.x, int(b[31] >> 7))
+
+	// t = x * y
+	field.fe_carry_mul(&tmp.t, field.fe_relax_cast(&tmp.x), field.fe_relax_cast(&tmp.y))
+
+	// Reject non-canonical encodings of ge, by re-encoding the decoded
+	// point and comparing it with the input.
+	buf: [32]byte = ---
+	field.fe_to_bytes(&buf, &tmp.y)
+	buf[31] |= byte(field.fe_is_negative(&tmp.x)) << 7
+	is_canonical := crypto.compare_constant_time(b, buf[:])
+
+	// Only copy the scratch element to the destination if it is canonical.
+	ge_cond_assign(ge, &tmp, is_canonical)
+
+	mem.zero_explicit(&buf, size_of(buf))
+
+	return is_canonical == 1
+}
+
+// ge_bytes writes the 32-byte encoding of `ge` to `dst`: the affine
+// y-coordinate, with the sign (parity) of the affine x-coordinate stored
+// in the most significant bit of the last byte.  Traps if `len(dst) != 32`.
+ge_bytes :: proc "contextless" (ge: ^Group_Element, dst: []byte) {
+	if len(dst) != 32 {
+		intrinsics.trap()
+	}
+	dst_ := transmute(^[32]byte)(raw_data(dst))
+
+	// Convert the element to affine (x, y) representation.
+	x, y, z_inv: field.Tight_Field_Element = ---, ---, ---
+	field.fe_carry_inv(&z_inv, field.fe_relax_cast(&ge.z))
+	field.fe_carry_mul(&x, field.fe_relax_cast(&ge.x), field.fe_relax_cast(&z_inv))
+	field.fe_carry_mul(&y, field.fe_relax_cast(&ge.y), field.fe_relax_cast(&z_inv))
+
+	// Encode the y-coordinate.
+	field.fe_to_bytes(dst_, &y)
+
+	// Copy the least significant bit of the x-coordinate to the most
+	// significant bit of the encoded y-coordinate.
+	//
+	// The bit must come from the serialized form of x (which is what
+	// `fe_is_negative` inspects), not from the raw limb: the in-memory
+	// limbs of `x` are not the serialized bytes, so `x[0] & 1` is not
+	// guaranteed to be the parity that `ge_set_bytes` checks against.
+	dst_[31] |= byte(field.fe_is_negative(&x)) << 7
+
+	field.fe_clear_vec([]^field.Tight_Field_Element{&x, &y, &z_inv})
+}
+
+// ge_identity sets `ge` to the identity element (x, y, z, t) = (0, 1, 1, 0).
+ge_identity :: proc "contextless" (ge: ^Group_Element) {
+	// y = z = 1
+	field.fe_one(&ge.y)
+	field.fe_one(&ge.z)
+
+	// x = t = 0
+	field.fe_zero(&ge.x)
+	field.fe_zero(&ge.t)
+}
+
+// ge_generator sets `ge` to a copy of `GE_BASEPOINT`.
+ge_generator :: proc "contextless" (ge: ^Group_Element) {
+	ge_set(ge, &GE_BASEPOINT)
+}
+
+// Addend_Group_Element is a group element in the precomputed form consumed
+// by `ge_add_addend` as the second operand.  The fields are the values
+// t1, t3, t4, and t5 of the addition formula, as produced by
+// `ge_addend_set`.
+@(private)
+Addend_Group_Element :: struct {
+	y2_minus_x2:  field.Loose_Field_Element, // t1
+	y2_plus_x2:   field.Loose_Field_Element, // t3
+	k_times_t2:   field.Tight_Field_Element, // t4
+	two_times_z2: field.Loose_Field_Element, // t5
+}
+
+// ge_addend_set converts `ge` into the precomputed addend form `ge_a`.
+@(private)
+ge_addend_set :: proc "contextless" (ge_a: ^Addend_Group_Element, ge: ^Group_Element) {
+	// t5 = 2 * Z
+	field.fe_add(&ge_a.two_times_z2, &ge.z, &ge.z)
+	// t4 = k * T
+	field.fe_carry_mul(&ge_a.k_times_t2, field.fe_relax_cast(&FE_D2), field.fe_relax_cast(&ge.t))
+	// t3 = Y + X
+	field.fe_add(&ge_a.y2_plus_x2, &ge.y, &ge.x)
+	// t1 = Y - X
+	field.fe_sub(&ge_a.y2_minus_x2, &ge.y, &ge.x)
+}
+
+// ge_addend_conditional_assign replaces every field of `ge_a` with the
+// matching field of `a` via `fe_cond_select`, controlled by `ctrl`
+// (callers in this package pass 0 to keep `ge_a`, or 1 to take `a`).
+@(private)
+ge_addend_conditional_assign :: proc "contextless" (ge_a, a: ^Addend_Group_Element, ctrl: int) {
+	field.fe_cond_select(&ge_a.two_times_z2, &ge_a.two_times_z2, &a.two_times_z2, ctrl)
+	field.fe_cond_select(&ge_a.k_times_t2, &ge_a.k_times_t2, &a.k_times_t2, ctrl)
+	field.fe_cond_select(&ge_a.y2_plus_x2, &ge_a.y2_plus_x2, &a.y2_plus_x2, ctrl)
+	field.fe_cond_select(&ge_a.y2_minus_x2, &ge_a.y2_minus_x2, &a.y2_minus_x2, ctrl)
+}
+
+// Add_Scratch holds the intermediate values of `ge_add_addend`, so that a
+// caller performing many additions can reuse (and later clear) one buffer.
+@(private)
+Add_Scratch :: struct {
+	A, B, C, D: field.Tight_Field_Element,
+	E, F, G, H: field.Loose_Field_Element,
+	t0, t2:     field.Loose_Field_Element,
+}
+
+// ge_add sets `ge = a + b`.  The temporary addend and scratch space are
+// zeroed before returning.
+ge_add :: proc "contextless" (ge, a, b: ^Group_Element) {
+	work: Add_Scratch = ---
+	addend: Addend_Group_Element = ---
+
+	// Convert b to the precomputed form, then run the shared addition.
+	ge_addend_set(&addend, b)
+	ge_add_addend(ge, a, &addend, &work)
+
+	mem.zero_explicit(&addend, size_of(addend))
+	mem.zero_explicit(&work, size_of(work))
+}
+
+// ge_add_addend sets `ge = a + b`, where `b` is in precomputed addend form.
+//
+// All intermediates are written to `scratch`, which is NOT cleared here;
+// that is left to the caller.  `ge` may be the same element as `a` (as in
+// `mul_tbl_add`): every coordinate of `a` is read before any coordinate of
+// `ge` is written.
+@(private)
+ge_add_addend :: proc "contextless" (
+	ge, a: ^Group_Element,
+	b: ^Addend_Group_Element,
+	scratch: ^Add_Scratch,
+) {
+	// https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html#addition-add-2008-hwcd-3
+	// Assumptions: k=2*d.
+	//
+	// t0 = Y1-X1
+	// t1 = Y2-X2
+	// A = t0*t1
+	// t2 = Y1+X1
+	// t3 = Y2+X2
+	// B = t2*t3
+	// t4 = k*T2
+	// C = T1*t4
+	// t5 = 2*Z2
+	// D = Z1*t5
+	// E = B-A
+	// F = D-C
+	// G = D+C
+	// H = B+A
+	// X3 = E*F
+	// Y3 = G*H
+	// T3 = E*H
+	// Z3 = F*G
+	//
+	// In order to make the scalar multiply faster, the addend is provided
+	// as a `Addend_Group_Element` with t1, t3, t4, and t5 precomputed, as
+	// it is trivially obvious that those are the only values used by the
+	// formula that are directly dependent on `b`, and are only dependent
+	// on `b` and constants.  This saves 1 sub, 2 adds, and 1 multiply,
+	// each time the intermediate representation can be reused.
+
+	A, B, C, D := &scratch.A, &scratch.B, &scratch.C, &scratch.D
+	E, F, G, H := &scratch.E, &scratch.F, &scratch.G, &scratch.H
+	t0, t2 := &scratch.t0, &scratch.t2
+
+	field.fe_sub(t0, &a.y, &a.x)
+	t1 := &b.y2_minus_x2
+	field.fe_carry_mul(A, t0, t1)
+	field.fe_add(t2, &a.y, &a.x)
+	t3 := &b.y2_plus_x2
+	field.fe_carry_mul(B, t2, t3)
+	t4 := &b.k_times_t2
+	field.fe_carry_mul(C, field.fe_relax_cast(&a.t), field.fe_relax_cast(t4))
+	t5 := &b.two_times_z2
+	field.fe_carry_mul(D, field.fe_relax_cast(&a.z), t5)
+	field.fe_sub(E, B, A)
+	field.fe_sub(F, D, C)
+	field.fe_add(G, D, C)
+	field.fe_add(H, B, A)
+	// From here on `a` is no longer read, so writing `ge` is safe even
+	// when it aliases `a`.
+	field.fe_carry_mul(&ge.x, E, F)
+	field.fe_carry_mul(&ge.y, G, H)
+	field.fe_carry_mul(&ge.t, E, H)
+	field.fe_carry_mul(&ge.z, F, G)
+}
+
+// Double_Scratch holds the intermediate values of `ge_double`, so that a
+// caller performing many doublings can reuse (and later clear) one buffer.
+@(private)
+Double_Scratch :: struct {
+	A, B, C, D, G: field.Tight_Field_Element,
+	t0, t2, t3:    field.Tight_Field_Element,
+	E, F, H:       field.Loose_Field_Element,
+	t1:            field.Loose_Field_Element,
+}
+
+// ge_double sets `ge = 2 * a`.
+//
+// If `scratch` is nil, a local scratch buffer is used and zeroed before
+// returning.  If the caller supplies `scratch`, it is used for all
+// intermediates and is NOT cleared here.  `ge` may be the same element as
+// `a`: every coordinate of `a` is read before `ge` is written.
+ge_double :: proc "contextless" (ge, a: ^Group_Element, scratch: ^Double_Scratch = nil) {
+	// https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html#doubling-dbl-2008-hwcd
+	//
+	// A = X1^2
+	// B = Y1^2
+	// t0 = Z1^2
+	// C = 2*t0
+	// D = a*A
+	// t1 = X1+Y1
+	// t2 = t1^2
+	// t3 = t2-A
+	// E = t3-B
+	// G = D+B
+	// F = G-C
+	// H = D-B
+	// X3 = E*F
+	// Y3 = G*H
+	// T3 = E*H
+	// Z3 = F*G
+
+	// Shadow the parameter so it can be re-pointed at a local buffer.
+	// NOTE(review): `tmp` is declared inside the `if` block, but is used
+	// through `scratch` after that block ends -- confirm the storage is
+	// guaranteed to remain valid for the rest of the procedure.
+	sanitize, scratch := scratch == nil, scratch
+	if sanitize {
+		tmp: Double_Scratch = ---
+		scratch = &tmp
+	}
+
+	A, B, C, D, G := &scratch.A, &scratch.B, &scratch.C, &scratch.D, &scratch.G
+	t0, t2, t3 := &scratch.t0, &scratch.t2, &scratch.t3
+	E, F, H := &scratch.E, &scratch.F, &scratch.H
+	t1 := &scratch.t1
+
+	field.fe_carry_square(A, field.fe_relax_cast(&a.x))
+	field.fe_carry_square(B, field.fe_relax_cast(&a.y))
+	field.fe_carry_square(t0, field.fe_relax_cast(&a.z))
+	field.fe_carry_add(C, t0, t0)
+	field.fe_carry_mul(D, field.fe_relax_cast(&FE_A), field.fe_relax_cast(A))
+	field.fe_add(t1, &a.x, &a.y)
+	field.fe_carry_square(t2, t1)
+	field.fe_carry_sub(t3, t2, A)
+	field.fe_sub(E, t3, B)
+	field.fe_carry_add(G, D, B)
+	field.fe_sub(F, G, C)
+	field.fe_sub(H, D, B)
+	// G is stored tight (it is an input to the subtraction above), but
+	// the multiplications below take loose operands.
+	G_ := field.fe_relax_cast(G)
+	field.fe_carry_mul(&ge.x, E, F)
+	field.fe_carry_mul(&ge.y, G_, H)
+	field.fe_carry_mul(&ge.t, E, H)
+	field.fe_carry_mul(&ge.z, F, G_)
+
+	// Only the locally owned scratch buffer is wiped.
+	if sanitize {
+		mem.zero_explicit(scratch, size_of(Double_Scratch))
+	}
+}
+
+// ge_negate sets `ge = -a`, by negating the x and t coordinates and
+// copying y and z unchanged.
+ge_negate :: proc "contextless" (ge, a: ^Group_Element) {
+	// Copied as-is.
+	field.fe_set(&ge.y, &a.y)
+	field.fe_set(&ge.z, &a.z)
+
+	// Negated.
+	field.fe_carry_opp(&ge.x, &a.x)
+	field.fe_carry_opp(&ge.t, &a.t)
+}
+
+// ge_cond_negate computes `-a` and conditionally assigns it to `ge` via
+// `ge_cond_assign`, controlled by `ctrl`.  The negation is always
+// computed, and the temporary is cleared before returning.
+ge_cond_negate :: proc "contextless" (ge, a: ^Group_Element, ctrl: int) {
+	negated: Group_Element = ---
+	defer ge_clear(&negated)
+
+	ge_negate(&negated, a)
+	ge_cond_assign(ge, &negated, ctrl)
+}
+
+// ge_cond_assign applies `field.fe_cond_assign` with `ctrl` to each of the
+// four coordinates, conditionally replacing `ge` with `a`.
+ge_cond_assign :: proc "contextless" (ge, a: ^Group_Element, ctrl: int) {
+	field.fe_cond_assign(&ge.t, &a.t, ctrl)
+	field.fe_cond_assign(&ge.z, &a.z, ctrl)
+	field.fe_cond_assign(&ge.y, &a.y, ctrl)
+	field.fe_cond_assign(&ge.x, &a.x, ctrl)
+}
+
+// ge_cond_select applies `field.fe_cond_select` with `ctrl` to each of the
+// four coordinates, writing to `ge` either `a` or `b`.
+ge_cond_select :: proc "contextless" (ge, a, b: ^Group_Element, ctrl: int) {
+	field.fe_cond_select(&ge.t, &a.t, &b.t, ctrl)
+	field.fe_cond_select(&ge.z, &a.z, &b.z, ctrl)
+	field.fe_cond_select(&ge.y, &a.y, &b.y, ctrl)
+	field.fe_cond_select(&ge.x, &a.x, &b.x, ctrl)
+}
+
+// ge_equal returns 1 iff `a` and `b` represent the same affine point, and
+// 0 otherwise.
+@(require_results)
+ge_equal :: proc "contextless" (a, b: ^Group_Element) -> int {
+	// Comparing x = X/Z and y = Y/Z directly would need inversions, so
+	// cross-multiply instead:
+	//   X/Z == X'/Z'  <=>  X*Z' == X'*Z
+	//   Y/Z == Y'/Z'  <=>  Y*Z' == Y'*Z
+	lhs_x, rhs_x, lhs_y, rhs_y: field.Tight_Field_Element = ---, ---, ---, ---
+	a_z, b_z := field.fe_relax_cast(&a.z), field.fe_relax_cast(&b.z)
+
+	field.fe_carry_mul(&lhs_x, field.fe_relax_cast(&a.x), b_z)
+	field.fe_carry_mul(&rhs_x, field.fe_relax_cast(&b.x), a_z)
+	field.fe_carry_mul(&lhs_y, field.fe_relax_cast(&a.y), b_z)
+	field.fe_carry_mul(&rhs_y, field.fe_relax_cast(&b.y), a_z)
+
+	x_matches := field.fe_equal(&lhs_x, &rhs_x)
+	y_matches := field.fe_equal(&lhs_y, &rhs_y)
+	both := x_matches & y_matches
+
+	field.fe_clear_vec([]^field.Tight_Field_Element{&lhs_x, &lhs_y, &rhs_x, &rhs_y})
+
+	return both
+}
+
+// ge_is_small_order returns true iff `8 * ge` is the identity element.
+@(require_results)
+ge_is_small_order :: proc "contextless" (ge: ^Group_Element) -> bool {
+	// Three successive doublings give 8 * ge.
+	eight_ge: Group_Element = ---
+	ge_double(&eight_ge, ge) // 2 * ge
+	ge_double(&eight_ge, &eight_ge) // 4 * ge
+	ge_double(&eight_ge, &eight_ge) // 8 * ge
+
+	is_identity := ge_equal(&eight_ge, &GE_IDENTITY)
+	return is_identity == 1
+}
+
+// ge_in_prime_order_subgroup_vartime returns true iff multiplying `ge` by
+// `SC_ELL` yields the identity element.  The multiplication is done with
+// the variable-time code path.
+@(require_results)
+ge_in_prime_order_subgroup_vartime :: proc "contextless" (ge: ^Group_Element) -> bool {
+	// This is currently *very* expensive.  The faster method would be
+	// something like (https://eprint.iacr.org/2022/1164.pdf), however
+	// that is a ~50% speedup, and a lot of added complexity for something
+	// that is better solved by "just use ristretto255".
+	ell_ge: Group_Element = ---
+	_ge_scalarmult(&ell_ge, ge, &SC_ELL, true)
+
+	is_identity := ge_equal(&ell_ge, &GE_IDENTITY)
+	return is_identity == 1
+}

+ 61 - 0
core/crypto/_edwards25519/edwards25519_scalar.odin

@@ -0,0 +1,61 @@
+package _edwards25519
+
+import "base:intrinsics"
+import field "core:crypto/_fiat/field_scalar25519"
+import "core:mem"
+
+// Scalar is an alias for the Montgomery-domain scalar field element type.
+Scalar :: field.Montgomery_Domain_Field_Element
+
+// SC_ELL holds the four limbs of `field.ELL`, outside the Montgomery
+// domain, for use as the multiplier in
+// `ge_in_prime_order_subgroup_vartime`.
+//
+// WARNING: This is non-canonical and only to be used when checking if
+// a group element is on the prime-order subgroup.
+@(private)
+SC_ELL := field.Non_Montgomery_Domain_Field_Element {
+	field.ELL[0],
+	field.ELL[1],
+	field.ELL[2],
+	field.ELL[3],
+}
+
+// sc_set_u64 sets `sc` to the integer `i`, converted to the Montgomery
+// domain.  The intermediate value is zeroed before returning.
+sc_set_u64 :: proc "contextless" (sc: ^Scalar, i: u64) {
+	// `i` occupies the least significant limb; the rest are zero.
+	plain := field.Non_Montgomery_Domain_Field_Element{i, 0, 0, 0}
+	defer mem.zero_explicit(&plain, size_of(plain))
+
+	field.fe_to_montgomery(sc, &plain)
+}
+
+// sc_set_bytes decodes the 32-byte slice `b` into `sc` via
+// `field.fe_from_bytes`, and returns that call's result.  Traps if
+// `len(b) != 32`.
+@(require_results)
+sc_set_bytes :: proc "contextless" (sc: ^Scalar, b: []byte) -> bool {
+	if len(b) != 32 {
+		intrinsics.trap()
+	}
+
+	// The length was checked above, so view the slice as a fixed array.
+	fixed := transmute(^[32]byte)(raw_data(b))
+	return field.fe_from_bytes(sc, fixed)
+}
+
+// sc_set_bytes_rfc8032 decodes the 32-byte slice `b` into `sc` via
+// `field.fe_from_bytes_rfc8032`.  Traps if `len(b) != 32`.
+sc_set_bytes_rfc8032 :: proc "contextless" (sc: ^Scalar, b: []byte) {
+	if len(b) != 32 {
+		intrinsics.trap()
+	}
+
+	// The length was checked above, so view the slice as a fixed array.
+	fixed := transmute(^[32]byte)(raw_data(b))
+	field.fe_from_bytes_rfc8032(sc, fixed)
+}
+
+// sc_clear overwrites every byte of `sc` with zero via `mem.zero_explicit`.
+sc_clear :: proc "contextless" (sc: ^Scalar) {
+	mem.zero_explicit(sc, size_of(sc^))
+}
+
+// The remaining scalar operations are direct aliases of the underlying
+// scalar field procedures.
+
+// Copying, wide decoding, and encoding.
+sc_set :: field.fe_set
+sc_set_bytes_wide :: field.fe_from_bytes_wide
+sc_bytes :: field.fe_to_bytes
+
+// Constants.
+sc_zero :: field.fe_zero
+sc_one :: field.fe_one
+
+// Arithmetic.
+sc_add :: field.fe_add
+sc_sub :: field.fe_sub
+sc_negate :: field.fe_opp
+sc_mul :: field.fe_mul
+sc_square :: field.fe_square
+
+// Conditional assignment and comparison.
+sc_cond_assign :: field.fe_cond_assign
+sc_equal :: field.fe_equal

+ 288 - 0
core/crypto/_edwards25519/edwards25519_scalar_mul.odin

@@ -0,0 +1,288 @@
+package _edwards25519
+
+import field "core:crypto/_fiat/field_scalar25519"
+import "core:math/bits"
+import "core:mem"
+
+// GE_BASEPOINT_TABLE is 1 * G, ... 15 * G, in precomputed
+// (`Addend_Group_Element`) format.
+//
+// Note: When generating, the values were reduced to Tight_Field_Element
+// ranges, even though that is not required.
+@(private)
+GE_BASEPOINT_TABLE := Multiply_Table {
+	{
+		{62697248952638, 204681361388450, 631292143396476, 338455783676468, 1213667448819585},
+		{1288382639258501, 245678601348599, 269427782077623, 1462984067271730, 137412439391563},
+		{301289933810280, 1259582250014073, 1422107436869536, 796239922652654, 1953934009299142},
+		{2, 0, 0, 0, 0},
+	},
+	{
+		{1519297034332653, 1098796920435767, 1823476547744119, 808144629470969, 2110930855619772},
+		{338005982828284, 1667856962156925, 100399270107451, 1604566703601691, 1950338038771369},
+		{1920505767731247, 1443759578976892, 1659852098357048, 1484431291070208, 275018744912646},
+		{763163817085987, 2195095074806923, 2167883174351839, 1868059999999762, 911071066608705},
+	},
+	{
+		{960627541894068, 1314966688943942, 1126875971034044, 2059608312958945, 605975666152586},
+		{1714478358025626, 2209607666607510, 1600912834284834, 496072478982142, 481970031861896},
+		{851735079403194, 1088965826757164, 141569479297499, 602804610059257, 2004026468601520},
+		{197585529552380, 324719066578543, 564481854250498, 1173818332764578, 35452976395676},
+	},
+	{
+		{1152980410747203, 2196804280851952, 25745194962557, 1915167295473129, 1266299690309224},
+		{809905889679060, 979732230071345, 1509972345538142, 188492426534402, 818965583123815},
+		{997685409185036, 1451818320876327, 2126681166774509, 2000509606057528, 235432372486854},
+		{887734189279642, 1460338685162044, 877378220074262, 102436391401299, 153369156847490},
+	},
+	{
+		{2056621900836770, 1821657694132497, 1627986892909426, 1163363868678833, 1108873376459226},
+		{1187697490593623, 1066539945237335, 885654531892000, 1357534489491782, 359370291392448},
+		{1509033452137525, 1305318174298508, 613642471748944, 1987256352550234, 1044283663101541},
+		{220105720697037, 387661783287620, 328296827867762, 360035589590664, 795213236824054},
+	},
+	{
+		{1820794733038396, 1612235121681074, 757405923441402, 1094031020892801, 231025333128907},
+		{1639067873254194, 1484176557946322, 300800382144789, 1329915446659183, 1211704578730455},
+		{641900794791527, 1711751746971612, 179044712319955, 576455585963824, 1852617592509865},
+		{743549047192397, 685091042550147, 1952415336873496, 1965124675654685, 513364998442917},
+	},
+	{
+		{1004557076870448, 1762911374844520, 1330807633622723, 384072910939787, 953849032243810},
+		{2178275058221458, 257933183722891, 376684351537894, 2010189102001786, 1981824297484148},
+		{1332915663881114, 1286540505502549, 1741691283561518, 977214932156314, 1764059494778091},
+		{429702949064027, 1368332611650677, 2019867176450999, 2212258376161746, 526160996742554},
+	},
+	{
+		{2098932988258576, 2203688382075948, 2120400160059479, 1748488020948146, 1203264167282624},
+		{677131386735829, 1850249298025188, 672782146532031, 2144145693078904, 2088656272813787},
+		{1065622343976192, 1573853211848116, 223560413590068, 333846833073379, 27832122205830},
+		{1781008836504573, 917619542051793, 544322748939913, 882577394308384, 1720521246471195},
+	},
+	{
+		{660120928379860, 2081944024858618, 1878411111349191, 424587356517195, 2111317439894005},
+		{1834193977811532, 1864164086863319, 797334633289424, 150410812403062, 2085177078466389},
+		{1438117271371866, 783915531014482, 388731514584658, 292113935417795, 1945855002546714},
+		{1678140823166658, 679103239148744, 614102761596238, 1052962498997885, 1863983323810390},
+	},
+	{
+		{1690309392496233, 1116333140326275, 1377242323631039, 717196888780674, 82724646713353},
+		{1722370213432106, 74265192976253, 264239578448472, 1714909985012994, 2216984958602173},
+		{2010482366920922, 1294036471886319, 566466395005815, 1631955803657320, 1751698647538458},
+		{1073230604155753, 1159087041338551, 1664057985455483, 127472702826203, 1339591128522371},
+	},
+	{
+		{478053307175577, 2179515791720985, 21146535423512, 1831683844029536, 462805561553981},
+		{1945267486565588, 1298536818409655, 2214511796262989, 1904981051429012, 252904800782086},
+		{268945954671210, 222740425595395, 1208025911856230, 1080418823003555, 75929831922483},
+		{1884784014268948, 643868448202966, 978736549726821, 46385971089796, 1296884812292320},
+	},
+	{
+		{1861159462859103, 7077532564710, 963010365896826, 1938780006785270, 766241051941647},
+		{1778966986051906, 1713995999765361, 1394565822271816, 1366699246468722, 1213407027149475},
+		{1978989286560907, 2135084162045594, 1951565508865477, 671788336314416, 293123929458176},
+		{902608944504080, 2167765718046481, 1285718473078022, 1222562171329269, 492109027844479},
+	},
+	{
+		{1820807832746213, 1029220580458586, 1101997555432203, 1039081975563572, 202477981158221},
+		{1866134980680205, 2222325502763386, 1830284629571201, 1046966214478970, 418381946936795},
+		{1783460633291322, 1719505443254998, 1810489639976220, 877049370713018, 2187801198742619},
+		{197118243000763, 305493867565736, 518814410156522, 1656246186645170, 901894734874934},
+	},
+	{
+		{225454942125915, 478410476654509, 600524586037746, 643450007230715, 1018615928259319},
+		{1733330584845708, 881092297970296, 507039890129464, 496397090721598, 2230888519577628},
+		{690155664737246, 1010454785646677, 753170144375012, 1651277613844874, 1622648796364156},
+		{1321310321891618, 1089655277873603, 235891750867089, 815878279563688, 1709264240047556},
+	},
+	{
+		{805027036551342, 1387174275567452, 1156538511461704, 1465897486692171, 1208567094120903},
+		{2228417017817483, 202885584970535, 2182114782271881, 2077405042592934, 1029684358182774},
+		{460447547653983, 627817697755692, 524899434670834, 1228019344939427, 740684787777653},
+		{849757462467675, 447476306919899, 422618957298818, 302134659227815, 675831828440895},
+	},
+}
+
+// ge_scalarmult sets `ge = sc * p`, using the non-variable-time code path
+// of `_ge_scalarmult`.  The converted scalar is zeroed before returning.
+ge_scalarmult :: proc "contextless" (ge, p: ^Group_Element, sc: ^Scalar) {
+	// The multiplication routine works on the scalar's plain limbs, so
+	// leave the Montgomery domain first.
+	plain: field.Non_Montgomery_Domain_Field_Element
+	defer mem.zero_explicit(&plain, size_of(plain))
+
+	field.fe_from_montgomery(&plain, sc)
+	_ge_scalarmult(ge, p, &plain)
+}
+
+// ge_scalarmult_basepoint sets `ge = sc * GE_BASEPOINT`, by delegating to
+// the generic `ge_scalarmult`.
+ge_scalarmult_basepoint :: proc "contextless" (ge: ^Group_Element, sc: ^Scalar) {
+	// Something like the comb method from "Fast and compact elliptic-curve
+	// cryptography" Section 3.3, would be more performant, but more
+	// complex.
+	//
+	// - https://eprint.iacr.org/2012/309
+	ge_scalarmult(ge, &GE_BASEPOINT, sc)
+}
+
+// ge_scalarmult_vartime sets `ge = sc * p`, using the variable-time code
+// path of `_ge_scalarmult`.  No temporaries are cleared.
+ge_scalarmult_vartime :: proc "contextless" (ge, p: ^Group_Element, sc: ^Scalar) {
+	// The multiplication routine works on the scalar's plain limbs, so
+	// leave the Montgomery domain first.
+	plain: field.Non_Montgomery_Domain_Field_Element
+	field.fe_from_montgomery(&plain, sc)
+
+	_ge_scalarmult(ge, p, &plain, true)
+}
+
+// ge_double_scalarmult_basepoint_vartime sets `ge = a * A + b * G`, where
+// `G` is the basepoint (via `GE_BASEPOINT_TABLE`).
+//
+// All table lookups use the variable-time path, and no temporaries are
+// cleared before returning.
+ge_double_scalarmult_basepoint_vartime :: proc "contextless" (
+	ge: ^Group_Element,
+	a: ^Scalar,
+	A: ^Group_Element,
+	b: ^Scalar,
+) {
+	// Strauss-Shamir, commonly referred to as the "Shamir trick",
+	// saves half the doublings, relative to doing this the naive way.
+	//
+	// ABGLSV-Pornin (https://eprint.iacr.org/2020/454) is faster,
+	// but significantly more complex, and has incompatibilities with
+	// mixed-order group elements.
+
+	tmp_add: Add_Scratch = ---
+	tmp_addend: Addend_Group_Element = ---
+	tmp_dbl: Double_Scratch = ---
+	tmp: Group_Element = ---
+
+	// Precompute 1 * A, ... 15 * A; the basepoint table is a constant.
+	A_tbl: Multiply_Table = ---
+	mul_tbl_set(&A_tbl, A, &tmp_add)
+
+	sc_a, sc_b: field.Non_Montgomery_Domain_Field_Element
+	field.fe_from_montgomery(&sc_a, a)
+	field.fe_from_montgomery(&sc_b, b)
+
+	// Walk both scalars together, one byte at a time from the most
+	// significant byte down, handling each byte as two 4-bit windows.
+	ge_identity(&tmp)
+	for i := 31; i >= 0; i = i - 1 {
+		// Byte i lives in 64-bit limb i / 8, at bit offset (i % 8) * 8.
+		limb := i / 8
+		shift := uint(i & 7) * 8
+
+		limb_byte_a := sc_a[limb] >> shift
+		limb_byte_b := sc_b[limb] >> shift
+
+		hi_a, lo_a := (limb_byte_a >> 4) & 0x0f, limb_byte_a & 0x0f
+		hi_b, lo_b := (limb_byte_b >> 4) & 0x0f, limb_byte_b & 0x0f
+
+		// The accumulator is still the identity on the first iteration,
+		// so the leading four doublings are skipped.
+		if i != 31 {
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+		}
+		mul_tbl_add(&tmp, &A_tbl, hi_a, &tmp_add, &tmp_addend, true)
+		mul_tbl_add(&tmp, &GE_BASEPOINT_TABLE, hi_b, &tmp_add, &tmp_addend, true)
+
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		mul_tbl_add(&tmp, &A_tbl, lo_a, &tmp_add, &tmp_addend, true)
+		mul_tbl_add(&tmp, &GE_BASEPOINT_TABLE, lo_b, &tmp_add, &tmp_addend, true)
+	}
+
+	ge_set(ge, &tmp)
+}
+
+// _ge_scalarmult sets `ge = sc * p`, where `sc` is given as plain
+// (non-Montgomery) limbs.
+//
+// When `unsafe_is_vartime` is false, table lookups use the constant-time
+// path of `mul_tbl_add`, and the accumulator and scratch buffers are
+// zeroed before returning.  When it is true, lookups are variable-time and
+// nothing is cleared.
+@(private)
+_ge_scalarmult :: proc "contextless" (
+	ge, p: ^Group_Element,
+	sc: ^field.Non_Montgomery_Domain_Field_Element,
+	unsafe_is_vartime := false,
+) {
+	// Do the simplest possible thing that works and provides adequate
+	// performance, which is a 4-bit windowed double-and-add.
+
+	tmp_add: Add_Scratch = ---
+	tmp_addend: Addend_Group_Element = ---
+	tmp_dbl: Double_Scratch = ---
+	tmp: Group_Element = ---
+
+	// Precompute 1 * p, ... 15 * p.
+	p_tbl: Multiply_Table = ---
+	mul_tbl_set(&p_tbl, p, &tmp_add)
+
+	// Walk the scalar one byte at a time from the most significant byte
+	// down, handling each byte as two 4-bit windows.
+	ge_identity(&tmp)
+	for i := 31; i >= 0; i = i - 1 {
+		// Byte i lives in 64-bit limb i / 8, at bit offset (i % 8) * 8.
+		limb := i / 8
+		shift := uint(i & 7) * 8
+		limb_byte := sc[limb] >> shift
+
+		hi, lo := (limb_byte >> 4) & 0x0f, limb_byte & 0x0f
+
+		// The accumulator is still the identity on the first iteration,
+		// so the leading four doublings are skipped.
+		if i != 31 {
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+		}
+		mul_tbl_add(&tmp, &p_tbl, hi, &tmp_add, &tmp_addend, unsafe_is_vartime)
+
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		mul_tbl_add(&tmp, &p_tbl, lo, &tmp_add, &tmp_addend, unsafe_is_vartime)
+	}
+
+	ge_set(ge, &tmp)
+
+	if !unsafe_is_vartime {
+		ge_clear(&tmp)
+		mem.zero_explicit(&tmp_add, size_of(Add_Scratch))
+		mem.zero_explicit(&tmp_addend, size_of(Addend_Group_Element))
+		mem.zero_explicit(&tmp_dbl, size_of(Double_Scratch))
+	}
+}
+
+// Multiply_Table holds the multiples 1 * P, ... 15 * P of a point in
+// precomputed addend form; entry `i` is `(i + 1) * P`.
+@(private)
+Multiply_Table :: [15]Addend_Group_Element // 0 = inf, which is implicit.
+
+// mul_tbl_set fills `tbl` with 1 * ge, ... 15 * ge in precomputed addend
+// form.  `tmp_add` is used as scratch space for the additions and is not
+// cleared here; the running multiple is cleared before returning.
+@(private)
+mul_tbl_set :: proc "contextless" (
+	tbl: ^Multiply_Table,
+	ge: ^Group_Element,
+	tmp_add: ^Add_Scratch,
+) {
+	// running = 1 * ge
+	running: Group_Element = ---
+	ge_set(&running, ge)
+
+	// The first entry doubles as the addend for every later step.
+	first := &tbl[0]
+	ge_addend_set(first, ge)
+
+	for i in 1 ..< 15 {
+		// running = (i + 1) * ge
+		ge_add_addend(&running, &running, first, tmp_add)
+		ge_addend_set(&tbl[i], &running)
+	}
+
+	ge_clear(&running)
+}
+
+// mul_tbl_add sets `ge = ge + idx * P`, where `tbl` holds the multiples of
+// `P` and `idx` is a 4-bit window value (callers mask it with 0x0f).
+//
+// With `unsafe_is_vartime` set, the table is indexed directly and the
+// addition is skipped for `idx == 0`.  Otherwise every table entry is
+// visited and exactly one addition is always performed.
+@(private)
+mul_tbl_add :: proc "contextless" (
+	ge: ^Group_Element,
+	tbl: ^Multiply_Table,
+	idx: u64,
+	tmp_add: ^Add_Scratch,
+	tmp_addend: ^Addend_Group_Element,
+	unsafe_is_vartime: bool,
+) {
+	// Variable time lookup, with the addition omitted entirely if idx == 0.
+	if unsafe_is_vartime {
+		// Skip adding the point at infinity.
+		if idx != 0 {
+			ge_add_addend(ge, ge, &tbl[idx - 1], tmp_add)
+		}
+		return
+	}
+
+	// Constant time lookup.
+	// Start from the point at infinity, which stays selected if idx == 0.
+	tmp_addend^ = {
+		// Point at infinity (0, 1, 1, 0) in precomputed form
+		{1, 0, 0, 0, 0}, // y - x
+		{1, 0, 0, 0, 0}, // y + x
+		{0, 0, 0, 0, 0}, // t * 2d
+		{2, 0, 0, 0, 0}, // z * 2
+	}
+	for i := u64(1); i < 16; i = i + 1 {
+		// `0 - (i ~ idx)` borrows iff i != idx, so the inverted borrow is
+		// 1 exactly for the matching entry.
+		_, ctrl := bits.sub_u64(0, (i ~ idx), 0)
+		ge_addend_conditional_assign(tmp_addend, &tbl[i - 1], int(~ctrl) & 1)
+	}
+	ge_add_addend(ge, ge, tmp_addend, tmp_add)
+}

+ 100 - 42
core/crypto/_fiat/field_curve25519/field.odin

@@ -15,6 +15,20 @@ fe_tighten_cast :: #force_inline proc "contextless" (
 	return transmute(^Tight_Field_Element)(arg1)
 }
 
+// fe_clear overwrites the pointed-to field element (tight or loose) with
+// zero via `mem.zero_explicit`.
+fe_clear :: proc "contextless" (
+	arg1: $T,
+) where T == ^Tight_Field_Element || T == ^Loose_Field_Element {
+	mem.zero_explicit(arg1, size_of(arg1^))
+}
+
+// fe_clear_vec calls `fe_clear` on every field element pointer in `arg1`.
+fe_clear_vec :: proc "contextless" (
+	arg1: $T,
+) where T == []^Tight_Field_Element || T == []^Loose_Field_Element {
+	for i in 0 ..< len(arg1) {
+		fe_clear(arg1[i])
+	}
+}
+
 fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
 	// Ignore the unused bit by copying the input and masking the bit off
 	// prior to deserialization.
@@ -27,12 +41,25 @@ fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte
 	mem.zero_explicit(&tmp1, size_of(tmp1))
 }
 
+// fe_is_negative returns the least significant bit (0 or 1) of the
+// serialized form of `arg1`, as produced by `fe_to_bytes`.
+fe_is_negative :: proc "contextless" (arg1: ^Tight_Field_Element) -> int {
+	encoded: [32]byte = ---
+	fe_to_bytes(&encoded, arg1)
+
+	// Extract the bit before wiping the serialized copy.
+	lsb := int(encoded[0] & 1)
+	mem.zero_explicit(&encoded, size_of(encoded))
+
+	return lsb
+}
+
 fe_equal :: proc "contextless" (arg1, arg2: ^Tight_Field_Element) -> int {
-	tmp2: [32]byte = ---
+	tmp1, tmp2: [32]byte = ---, ---
 
+	fe_to_bytes(&tmp1, arg1)
 	fe_to_bytes(&tmp2, arg2)
-	ret := fe_equal_bytes(arg1, &tmp2)
+	ret := crypto.compare_constant_time(tmp1[:], tmp2[:])
 
+	mem.zero_explicit(&tmp1, size_of(tmp1))
 	mem.zero_explicit(&tmp2, size_of(tmp2))
 
 	return ret
@@ -67,25 +94,37 @@ fe_carry_pow2k :: proc "contextless" (
 	}
 }
 
+// fe_carry_add sets `out1 = arg1 + arg2`, followed by a carry pass so that
+// the result is stored as a tight field element.
+fe_carry_add :: #force_inline proc "contextless" (out1, arg1, arg2: ^Tight_Field_Element) {
+	// The raw sum is written into out1's storage viewed as a loose
+	// element, then carried in place.
+	fe_add(fe_relax_cast(out1), arg1, arg2)
+	fe_carry(out1, fe_relax_cast(out1))
+}
+
+// fe_carry_sub sets `out1 = arg1 - arg2`, followed by a carry pass so that
+// the result is stored as a tight field element.
+fe_carry_sub :: #force_inline proc "contextless" (out1, arg1, arg2: ^Tight_Field_Element) {
+	// The raw difference is written into out1's storage viewed as a
+	// loose element, then carried in place.
+	fe_sub(fe_relax_cast(out1), arg1, arg2)
+	fe_carry(out1, fe_relax_cast(out1))
+}
+
 fe_carry_opp :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
 	fe_opp(fe_relax_cast(out1), arg1)
 	fe_carry(out1, fe_relax_cast(out1))
 }
 
-fe_carry_invsqrt :: proc "contextless" (
+fe_carry_sqrt_ratio_m1 :: proc "contextless" (
 	out1: ^Tight_Field_Element,
-	arg1: ^Loose_Field_Element,
+	arg1: ^Loose_Field_Element, // u
+	arg2: ^Loose_Field_Element, // v
 ) -> int {
-	// Inverse square root taken from Monocypher.
+	// SQRT_RATIO_M1(u, v) from RFC 9496 - 4.2, based on the inverse
+	// square root from Monocypher.
 
-	tmp1, tmp2, tmp3: Tight_Field_Element = ---, ---, ---
+	w: Tight_Field_Element = ---
+	fe_carry_mul(&w, arg1, arg2) // u * v
 
-	// t0 = x^((p-5)/8)
-	// Can be achieved with a simple double & add ladder,
-	// but it would be slower.
-	fe_carry_pow2k(&tmp1, arg1, 1)
+	// r = tmp1 = u * w^((p-5)/8)
+	tmp1, tmp2, tmp3: Tight_Field_Element = ---, ---, ---
+	fe_carry_pow2k(&tmp1, fe_relax_cast(&w), 1)
 	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 2)
-	fe_carry_mul(&tmp2, arg1, fe_relax_cast(&tmp2))
+	fe_carry_mul(&tmp2, fe_relax_cast(&w), fe_relax_cast(&tmp2))
 	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), fe_relax_cast(&tmp2))
 	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 1)
 	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
@@ -104,48 +143,49 @@ fe_carry_invsqrt :: proc "contextless" (
 	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 50)
 	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
 	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 2)
-	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), arg1)
-
-	// quartic = x^((p-1)/4)
-	quartic := &tmp2
-	fe_carry_square(quartic, fe_relax_cast(&tmp1))
-	fe_carry_mul(quartic, fe_relax_cast(quartic), arg1)
-
-	// Serialize quartic once to save on repeated serialization/sanitization.
-	quartic_buf: [32]byte = ---
-	fe_to_bytes(&quartic_buf, quartic)
-	check := &tmp3
-
-	fe_one(check)
-	p1 := fe_equal_bytes(check, &quartic_buf)
-	fe_carry_opp(check, check)
-	m1 := fe_equal_bytes(check, &quartic_buf)
-	fe_carry_opp(check, &SQRT_M1)
-	ms := fe_equal_bytes(check, &quartic_buf)
-
-	// if quartic == -1 or sqrt(-1)
-	// then  isr = x^((p-1)/4) * sqrt(-1)
-	// else  isr = x^((p-1)/4)
-	fe_carry_mul(out1, fe_relax_cast(&tmp1), fe_relax_cast(&SQRT_M1))
-	fe_cond_assign(out1, &tmp1, (m1 | ms) ~ 1)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), fe_relax_cast(&w)) // w^((p-5)/8)
 
-	mem.zero_explicit(&tmp1, size_of(tmp1))
-	mem.zero_explicit(&tmp2, size_of(tmp2))
-	mem.zero_explicit(&tmp3, size_of(tmp3))
-	mem.zero_explicit(&quartic_buf, size_of(quartic_buf))
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), arg1) // u * w^((p-5)/8)
+
+	// Serialize `check` once to save on repeated serialization.
+	r, check := &tmp1, &tmp2
+	b: [32]byte = ---
+	fe_carry_square(check, fe_relax_cast(r))
+	fe_carry_mul(check, fe_relax_cast(check), arg2) // check * v
+	fe_to_bytes(&b, check)
+
+	u, neg_u, neg_u_i := &tmp3, &w, check
+	fe_carry(u, arg1)
+	fe_carry_opp(neg_u, u)
+	fe_carry_mul(neg_u_i, fe_relax_cast(neg_u), fe_relax_cast(&FE_SQRT_M1))
+
+	correct_sign_sqrt := fe_equal_bytes(u, &b)
+	flipped_sign_sqrt := fe_equal_bytes(neg_u, &b)
+	flipped_sign_sqrt_i := fe_equal_bytes(neg_u_i, &b)
+
+	r_prime := check
+	fe_carry_mul(r_prime, fe_relax_cast(r), fe_relax_cast(&FE_SQRT_M1))
+	fe_cond_assign(r, r_prime, flipped_sign_sqrt | flipped_sign_sqrt_i)
+
+	// Pick the non-negative square root.
+	fe_carry_opp(r_prime, r)
+	fe_cond_select(out1, r, r_prime, fe_is_negative(r))
 
-	return p1 | m1
+	fe_clear_vec([]^Tight_Field_Element{&w, &tmp1, &tmp2, &tmp3})
+	mem.zero_explicit(&b, size_of(b))
+
+	return correct_sign_sqrt | flipped_sign_sqrt
 }
 
 fe_carry_inv :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
 	tmp1: Tight_Field_Element
 
 	fe_carry_square(&tmp1, arg1)
-	_ = fe_carry_invsqrt(&tmp1, fe_relax_cast(&tmp1))
+	_ = fe_carry_sqrt_ratio_m1(&tmp1, fe_relax_cast(&FE_ONE), fe_relax_cast(&tmp1))
 	fe_carry_square(&tmp1, fe_relax_cast(&tmp1))
 	fe_carry_mul(out1, fe_relax_cast(&tmp1), arg1)
 
-	mem.zero_explicit(&tmp1, size_of(tmp1))
+	fe_clear(&tmp1)
 }
 
 fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
@@ -196,3 +236,21 @@ fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Tight_Field_El
 	out1[3], out2[3] = x4, y4
 	out1[4], out2[4] = x5, y5
 }
+
+// fe_cond_select sets `out1` to `arg1` when `arg3 == 0`, and to `arg2`
+// when `arg3 == 1`, using bit masks instead of a branch.
+//
+// NOTE(review): only 0 and 1 produce an all-zero / all-one mask; other
+// values of `arg3` would mix the limbs of both inputs -- confirm that all
+// callers pass 0 or 1.
+@(optimization_mode = "none")
+fe_cond_select :: #force_no_inline proc "contextless" (
+	out1, arg1, arg2: $T,
+	arg3: int,
+) where T == ^Tight_Field_Element || T == ^Loose_Field_Element {
+	// mask is 0 for arg3 == 0, and all ones for arg3 == 1.
+	mask := (u64(arg3) * 0xffffffffffffffff)
+	// All limbs are computed before any is stored, so out1 may alias
+	// either input.
+	x1 := ((mask & arg2[0]) | ((~mask) & arg1[0]))
+	x2 := ((mask & arg2[1]) | ((~mask) & arg1[1]))
+	x3 := ((mask & arg2[2]) | ((~mask) & arg1[2]))
+	x4 := ((mask & arg2[3]) | ((~mask) & arg1[3]))
+	x5 := ((mask & arg2[4]) | ((~mask) & arg1[4]))
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}

+ 4 - 1
core/crypto/_fiat/field_curve25519/field51.odin

@@ -42,7 +42,10 @@ import "core:math/bits"
 Loose_Field_Element :: distinct [5]u64
 Tight_Field_Element :: distinct [5]u64
 
-SQRT_M1 := Tight_Field_Element {
+FE_ZERO := Tight_Field_Element{0, 0, 0, 0, 0}
+FE_ONE := Tight_Field_Element{1, 0, 0, 0, 0}
+
+FE_SQRT_M1 := Tight_Field_Element {
 	1718705420411056,
 	234908883556509,
 	2233514472574048,

+ 6 - 2
core/crypto/_fiat/field_scalar25519/field.odin

@@ -20,6 +20,10 @@ _TWO_336 := Montgomery_Domain_Field_Element {
 	0x3d217f5be65cb5c,
 }
 
+fe_clear :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) {
+	mem.zero_explicit(arg1, size_of(Montgomery_Domain_Field_Element))
+}
+
 fe_from_bytes :: proc "contextless" (
 	out1: ^Montgomery_Domain_Field_Element,
 	arg1: ^[32]byte,
@@ -85,7 +89,7 @@ fe_from_bytes_wide :: proc "contextless" (
 	fe_mul(&tmp, &tmp, &_TWO_336) // c * 2^336
 	fe_add(out1, out1, &tmp) // a + b * 2^168 + c * 2^336
 
-	mem.zero_explicit(&tmp, size_of(tmp))
+	fe_clear(&tmp)
 }
 
 @(private)
@@ -125,7 +129,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
 	// which will be 1.
 	_, borrow := bits.sub_u64(fe_non_zero(&tmp), 1, 0)
 
-	mem.zero_explicit(&tmp, size_of(tmp))
+	fe_clear(&tmp)
 
 	return int(borrow)
 }

+ 2 - 8
core/crypto/x25519/x25519.odin

@@ -94,13 +94,8 @@ _scalarmult :: proc "contextless" (out, scalar, point: ^[32]byte) {
 	field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2))
 	field.fe_to_bytes(out, &x2)
 
-	mem.zero_explicit(&x1, size_of(x1))
-	mem.zero_explicit(&x2, size_of(x2))
-	mem.zero_explicit(&x3, size_of(x3))
-	mem.zero_explicit(&z2, size_of(z2))
-	mem.zero_explicit(&z3, size_of(z3))
-	mem.zero_explicit(&t0, size_of(t0))
-	mem.zero_explicit(&t1, size_of(t1))
+	field.fe_clear_vec([]^field.Tight_Field_Element{&x1, &x2, &x3, &z2, &z3})
+	field.fe_clear_vec([]^field.Loose_Field_Element{&t0, &t1})
 }
 
 // scalarmult "multiplies" the provided scalar and point, and writes the
@@ -137,6 +132,5 @@ scalarmult :: proc(dst, scalar, point: []byte) {
 // scalarmult_basepoint "multiplies" the provided scalar with the X25519
 // base point and writes the resulting point to dst.
 scalarmult_basepoint :: proc(dst, scalar: []byte) {
-	// TODO/perf: Switch to using a precomputed table.
 	scalarmult(dst, scalar, _BASE_POINT[:])
 }