4 anni fa · a056e19434
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -218,7 +218,7 @@ demo :: proc() {
 
				 	set(b, 6);
			
 
				 	set(c, 131);
			
 
				 
			
 
				-	if err := internal_int_exponent_mod_fast(res, a, b, c, 1); err != nil {
			
 
				+	if err := _private_int_exponent_mod(res, a, b, c, 1); err != nil {
			
 
				 		fmt.printf("Error: %v\n", err);
			
 
				 	}
			
 
				 
			
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -10,8 +10,6 @@
 
				 */
			
 
				 package math_big
			
 
				 
			
 
				-import "core:mem"
			
 
				-
			
 
				 /*
			
 
				 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
			
 
				 	Returns true if it is, false if not. 
			
@@ -35,1027 +33,82 @@ internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator)
 
				 }
			
 
				 
			
 
				 /*
			
 
				-	Computes xR**-1 == x (mod N) via Montgomery Reduction.
			
 
				-*/
			
 
				-internal_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-	/*
			
 
				-		Can the fast reduction [comba] method be used?
			
 
				-		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
			
 
				-		since carries are fixed up in the inner loop.
			
 
				-	*/
			
 
				-	digs := (n.used * 2) + 1;
			
 
				-	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
			
 
				-		return _private_montgomery_reduce_comba(x, n, rho);
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Grow the input as required
			
 
				-	*/
			
 
				-	internal_grow(x, digs)                                           or_return;
			
 
				-	x.used = digs;
			
 
				-
			
 
				-	for ix := 0; ix < n.used; ix += 1 {
			
 
				-		/*
			
 
				-			`mu = ai * rho mod b`
			
 
				-			The value of rho must be precalculated via `int_montgomery_setup()`,
			
 
				-			such that it equals -1/n0 mod b this allows the following inner loop
			
 
				-			to reduce the input one digit at a time.
			
 
				-		*/
			
 
				-
			
 
				-		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK));
			
 
				-
			
 
				-		/*
			
 
				-			a = a + mu * m * b**i
			
 
				-			Multiply and add in place.
			
 
				-		*/
			
 
				-		u  := DIGIT(0);
			
 
				-		iy := int(0);
			
 
				-		for ; iy < n.used; iy += 1 {
			
 
				-			/*
			
 
				-				Compute product and sum.
			
 
				-			*/
			
 
				-			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]));
			
 
				-
			
 
				-			/*
			
 
				-				Get carry.
			
 
				-			*/
			
 
				-			u = DIGIT(r >> _DIGIT_BITS);
			
 
				-
			
 
				-			/*
			
 
				-				Fix digit.
			
 
				-			*/
			
 
				-			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-			At this point the ix'th digit of x should be zero.
			
 
				-			Propagate carries upwards as required.
			
 
				-		*/
			
 
				-		for u != 0 {
			
 
				-			x.digit[ix + iy] += u;
			
 
				-			u = x.digit[ix + iy] >> _DIGIT_BITS;
			
 
				-			x.digit[ix + iy] &= _MASK;
			
 
				-			iy += 1;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		At this point the n.used'th least significant digits of x are all zero,
			
 
				-		which means we can shift x to the right by n.used digits and the
			
 
				-		residue is unchanged.
			
 
				-
			
 
				-		x = x/b**n.used.
			
 
				-	*/
			
 
				-	internal_clamp(x);
			
 
				-	internal_shr_digit(x, n.used);
			
 
				-
			
 
				-	/*
			
 
				-		if x >= n then x = x - n
			
 
				-	*/
			
 
				-	if internal_cmp_mag(x, n) != -1 {
			
 
				-		return internal_sub(x, x, n);
			
 
				-	}
			
 
				-
			
 
				-	return nil;
			
 
				-}
			
 
				-
			
 
				-int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
			
 
				-	assert_if_nil(x, n);
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	internal_clear_if_uninitialized(x, n) or_return;
			
 
				-
			
 
				-	return #force_inline internal_int_montgomery_reduce(x, n, rho);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Shifts with subtractions when the result is greater than b.
			
 
				-
			
 
				-	The method is slightly modified to shift B unconditionally upto just under
			
 
				-	the leading bit of b.  This saves alot of multiple precision shifting.
			
 
				-*/
			
 
				-internal_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-	/*
			
 
				-		How many bits of last digit does b use.
			
 
				-	*/
			
 
				-	bits := internal_count_bits(b) % _DIGIT_BITS;
			
 
				-
			
 
				-	if b.used > 1 {
			
 
				-		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
			
 
				-		internal_int_power_of_two(a, power)                          or_return;
			
 
				-	} else {
			
 
				-		internal_one(a)                                              or_return;
			
 
				-		bits = 1;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Now compute C = A * B mod b.
			
 
				-	*/
			
 
				-	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
			
 
				-		internal_int_shl1(a, a)                                      or_return;
			
 
				-		if internal_cmp_mag(a, b) != -1 {
			
 
				-			internal_sub(a, a, b)                                    or_return;
			
 
				-		}
			
 
				-	}
			
 
				-	return nil;
			
 
				-}
			
 
				-
			
 
				-int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
			
 
				-	assert_if_nil(a, b);
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	internal_clear_if_uninitialized(a, b) or_return;
			
 
				-
			
 
				-	return #force_inline internal_int_montgomery_calc_normalization(a, b);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Sets up the Montgomery reduction stuff.
			
 
				-*/
			
 
				-internal_int_montgomery_setup :: proc(n: ^Int) -> (rho: DIGIT, err: Error) {
			
 
				-	/*
			
 
				-		Fast inversion mod 2**k
			
 
				-		Based on the fact that:
			
 
				-
			
 
				-		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
			
 
				-		                  =>  2*X*A - X*X*A*A = 1
			
 
				-		                  =>  2*(1) - (1)     = 1
			
 
				-	*/
			
 
				-	b := n.digit[0];
			
 
				-	if b & 1 == 0 { return 0, .Invalid_Argument; }
			
 
				-
			
 
				-	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
			
 
				-	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
			
 
				-	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
			
 
				-
			
 
				-	when _DIGIT_TYPE_BITS == 64 {
			
 
				-		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
			
 
				-		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		rho = -1/m mod b
			
 
				-	*/
			
 
				-	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK));
			
 
				-	return rho, nil;
			
 
				-}
			
 
				-
			
 
				-int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
			
 
				-	assert_if_nil(n);
			
 
				-	internal_clear_if_uninitialized(n, allocator) or_return;
			
 
				-
			
 
				-	return #force_inline internal_int_montgomery_setup(n);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
			
 
				-	From HAC pp.604 Algorithm 14.42
			
 
				-
			
 
				-	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
			
 
				-*/
			
 
				-internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	q := &Int{};
			
 
				-	defer internal_destroy(q);
			
 
				-	um := m.used;
			
 
				-
			
 
				-	/*
			
 
				-		q = x
			
 
				-	*/
			
 
				-	internal_copy(q, x)                                              or_return;
			
 
				-
			
 
				-	/*
			
 
				-		q1 = x / b**(k-1)
			
 
				-	*/
			
 
				-	internal_shr_digit(q, um - 1);
			
 
				-
			
 
				-	/*
			
 
				-		According to HAC this optimization is ok.
			
 
				-	*/
			
 
				-	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
			
 
				-		internal_mul(q, q, mu)                                       or_return;
			
 
				-	} else {
			
 
				-		_private_int_mul_high(q, q, mu, um)                          or_return;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		q3 = q2 / b**(k+1)
			
 
				-	*/
			
 
				-	internal_shr_digit(q, um + 1);
			
 
				-
			
 
				-	/*
			
 
				-		x = x mod b**(k+1), quick (no division)
			
 
				-	*/
			
 
				-	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;
			
 
				-
			
 
				-	/*
			
 
				-		q = q * m mod b**(k+1), quick (no division)
			
 
				-	*/
			
 
				-	_private_int_mul(q, q, m, um + 1)                                or_return;
			
 
				-
			
 
				-	/*
			
 
				-		x = x - q
			
 
				-	*/
			
 
				-	internal_sub(x, x, q)                                            or_return;
			
 
				-
			
 
				-	/*
			
 
				-		If x < 0, add b**(k+1) to it.
			
 
				-	*/
			
 
				-	if internal_cmp(x, 0) == -1 {
			
 
				-		internal_set(q, 1)                                           or_return;
			
 
				-		internal_shl_digit(q, um + 1)                                or_return;
			
 
				-		internal_add(x, x, q)                                        or_return;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Back off if it's too big.
			
 
				-	*/
			
 
				-	for internal_cmp(x, m) != -1 {
			
 
				-		internal_sub(x, x, m)                                        or_return;
			
 
				-	}
			
 
				-
			
 
				-	return nil;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
			
 
				-*/
			
 
				-internal_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	q := &Int{};
			
 
				-	defer internal_destroy(q);
			
 
				-
			
 
				-	internal_zero(q)                                                 or_return;
			
 
				-
			
 
				-	p := internal_count_bits(n);
			
 
				-
			
 
				-	for {
			
 
				-		/*
			
 
				-			q = a/2**p, a = a mod 2**p
			
 
				-		*/
			
 
				-		internal_shrmod(q, a, a, p)                                  or_return;
			
 
				-
			
 
				-		if d != 1 {
			
 
				-			/*
			
 
				-				q = q * d
			
 
				-			*/
			
 
				-			internal_mul(q, q, d)                                    or_return;
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-			a = a + q
			
 
				-		*/
			
 
				-		internal_add(a, a, q)                                        or_return;
			
 
				-		if internal_cmp_mag(a, n) == -1                              { break; }
			
 
				-		internal_sub(a, a, n)                                        or_return;
			
 
				-	}
			
 
				-
			
 
				-	return nil;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Reduces `a` modulo `n` where `n` is of the form 2**p - d
			
 
				-	This differs from reduce_2k since "d" can be larger than a single digit.
			
 
				-*/
			
 
				-internal_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	q := &Int{};
			
 
				-	defer internal_destroy(q);
			
 
				-
			
 
				-	internal_zero(q)                                                 or_return;
			
 
				-
			
 
				-	p := internal_count_bits(n);
			
 
				-
			
 
				-	for {
			
 
				-		/*
			
 
				-			q = a/2**p, a = a mod 2**p
			
 
				-		*/
			
 
				-		internal_shrmod(q, a, a, p)                                  or_return;
			
 
				-
			
 
				-		/*
			
 
				-			q = q * d
			
 
				-		*/
			
 
				-		internal_mul(q, q, d)                                        or_return;
			
 
				-
			
 
				-		/*
			
 
				-			a = a + q
			
 
				-		*/
			
 
				-		internal_add(a, a, q)                                        or_return;
			
 
				-		if internal_cmp_mag(a, n) == -1                              { break; }
			
 
				-		internal_sub(a, a, n)                                        or_return;
			
 
				-	}
			
 
				-
			
 
				-	return nil;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Determines if `internal_int_reduce_2k` can be used.
			
 
				-	Asssumes `a` not to be `nil` and to have been initialized.
			
 
				-*/
			
 
				-internal_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
			
 
				-	assert_if_nil(a);
			
 
				-
			
 
				-	if internal_is_zero(a) {
			
 
				-		return false, nil;
			
 
				-	} else if a.used == 1 {
			
 
				-		return true, nil;
			
 
				-	} else if a.used  > 1 {
			
 
				-		iy := internal_count_bits(a);
			
 
				-		iw := 1;
			
 
				-		iz := DIGIT(1);
			
 
				-
			
 
				-		/*
			
 
				-			Test every bit from the second digit up, must be 1.
			
 
				-		*/
			
 
				-		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
			
 
				-			if a.digit[iw] & iz == 0 {
			
 
				-				return false, nil;
			
 
				-			}
			
 
				-
			
 
				-			iz <<= 1;
			
 
				-			if iz > _DIGIT_MAX {
			
 
				-				iw += 1;
			
 
				-				iz  = 1;
			
 
				-			}
			
 
				-		}
			
 
				-		return true, nil;
			
 
				-	} else {
			
 
				-		return true, nil;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Determines if `internal_int_reduce_2k_l` can be used.
			
 
				-	Asssumes `a` not to be `nil` and to have been initialized.
			
 
				-*/
			
 
				-internal_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
			
 
				-	assert_if_nil(a);
			
 
				-
			
 
				-	if internal_int_is_zero(a) {
			
 
				-		return false, nil;
			
 
				-	} else if a.used == 1 {
			
 
				-		return true, nil;
			
 
				-	} else if a.used  > 1 {
			
 
				-		/*
			
 
				-			If more than half of the digits are -1 we're sold.
			
 
				-		*/
			
 
				-		ix := 0;
			
 
				-		iy := 0;
			
 
				-
			
 
				-		for ; ix < a.used; ix += 1 {
			
 
				-			if a.digit[ix] == _DIGIT_MAX {
			
 
				-				iy += 1;
			
 
				-			}
			
 
				-		}
			
 
				-		return iy >= (a.used / 2), nil;
			
 
				-	} else {
			
 
				-		return false, nil;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Determines the setup value.
			
 
				-	Assumes `a` is not `nil`.
			
 
				-*/
			
 
				-internal_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	tmp := &Int{};
			
 
				-	defer internal_destroy(tmp);
			
 
				-	internal_zero(tmp)                                               or_return;
			
 
				-
			
 
				-	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
			
 
				-	internal_sub(tmp, tmp, a)                                        or_return;
			
 
				-
			
 
				-	return tmp.digit[0], nil;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Determines the setup value.
			
 
				-	Assumes `mu` and `P` are not `nil`.
			
 
				-
			
 
				-	d := (1 << a.bits) - a;
			
 
				-*/
			
 
				-internal_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	tmp := &Int{};
			
 
				-	defer internal_destroy(tmp);
			
 
				-	internal_zero(tmp)                                               or_return;
			
 
				-
			
 
				-	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;
			
 
				-	internal_sub(mu, tmp, P)                                         or_return;
			
 
				-
			
 
				-	return nil;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Pre-calculate the value required for Barrett reduction.
			
 
				-	For a given modulus "P" it calulates the value required in "mu"
			
 
				-	Assumes `mu` and `P` are not `nil`.
			
 
				-*/
			
 
				-internal_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;
			
 
				-	return internal_int_div(mu, mu, P);
			
 
				-}
			
 
				+	This is a shell function that calls either the normal or Montgomery exptmod functions.
			
 
				+	Originally the call to the Montgomery code was embedded in the normal function but that
			
 
				+	wasted alot of stack space for nothing (since 99% of the time the Montgomery code would be called).
			
 
				 
			
 
				-/*
			
 
				 	Computes res == G**X mod P.
			
 
				 	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
			
 
				 */
			
 
				-internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
			
 
				+internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.allocator) -> (err: Error) {
			
 
				 	context.allocator = allocator;
			
 
				 
			
 
				-	M := [_TAB_SIZE]Int{};
			
 
				-	winsize: uint;
			
 
				-
			
 
				-	/*
			
 
				-		Use a pointer to the reduction algorithm.
			
 
				-		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
			
 
				-	*/
			
 
				-	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
			
 
				-
			
 
				-	defer {
			
 
				-		internal_destroy(&M[1]);
			
 
				-		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
			
 
				-			internal_destroy(&M[x]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Find window size.
			
 
				-	*/
			
 
				-	x := internal_count_bits(X);
			
 
				-	switch {
			
 
				-	case x <= 7:
			
 
				-		winsize = 2;
			
 
				-	case x <= 36:
			
 
				-		winsize = 3;
			
 
				-	case x <= 140:
			
 
				-		winsize = 4;
			
 
				-	case x <= 450:
			
 
				-		winsize = 5;
			
 
				-	case x <= 1303:
			
 
				-		winsize = 6;
			
 
				-	case x <= 3529:
			
 
				-		winsize = 7;
			
 
				-	case:
			
 
				-		winsize = 8;
			
 
				-	}
			
 
				-
			
 
				-	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
			
 
				-
			
 
				-	/*
			
 
				-		Init M array.
			
 
				-		Init first cell.
			
 
				-	*/
			
 
				-	internal_zero(&M[1])                                             or_return;
			
 
				-
			
 
				-	/*
			
 
				-		Now init the second half of the array.
			
 
				-	*/
			
 
				-	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
			
 
				-		internal_zero(&M[x])                                         or_return;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Create `mu`, used for Barrett reduction.
			
 
				-	*/
			
 
				-	mu := &Int{};
			
 
				-	defer internal_destroy(mu);
			
 
				-	internal_zero(mu)                                                or_return;
			
 
				-
			
 
				-	if redmode == 0 {
			
 
				-		internal_int_reduce_setup(mu, P)                             or_return;
			
 
				-		redux = internal_int_reduce;
			
 
				-	} else {
			
 
				-		internal_int_reduce_2k_setup_l(mu, P)                        or_return;
			
 
				-		redux = internal_int_reduce_2k_l;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Create M table.
			
 
				-
			
 
				-		The M table contains powers of the base, e.g. M[x] = G**x mod P.
			
 
				-		The first half of the table is not computed, though, except for M[0] and M[1].
			
 
				-	*/
			
 
				-	internal_int_mod(&M[1], G, P)                                    or_return;
			
 
				-
			
 
				-	/*
			
 
				-		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
			
 
				-
			
 
				-		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
			
 
				-		instead of repeated squaring.
			
 
				-	*/
			
 
				-	slot := 1 << (winsize - 1);
			
 
				-	internal_copy(&M[slot], &M[1])                                   or_return;
			
 
				-
			
 
				-	for x = 0; x < int(winsize - 1); x += 1 {
			
 
				-		/*
			
 
				-			Square it.
			
 
				-		*/
			
 
				-		internal_sqr(&M[slot], &M[slot])                             or_return;
			
 
				-
			
 
				-		/*
			
 
				-			Reduce modulo P
			
 
				-		*/
			
 
				-		redux(&M[slot], P, mu)                                       or_return;
			
 
				-	}
			
 
				+/*
			
 
				+	int dr;
			
 
				 
			
 
				-	/*
			
 
				-		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
			
 
				-		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
			
 
				-	*/
			
 
				-	for x = slot + 1; x < (1 << winsize); x += 1 {
			
 
				-		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
			
 
				-		redux(&M[x], P, mu)                                          or_return;
			
 
				+	/* modulus P must be positive */
			
 
				+	if (mp_isneg(P)) {
			
 
				+		return MP_VAL;
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-		Setup result.
			
 
				-	*/
			
 
				-	internal_one(res)                                                or_return;
			
 
				-
			
 
				-	/*
			
 
				-		Set initial mode and bit cnt.
			
 
				-	*/
			
 
				-	mode   := 0;
			
 
				-	bitcnt := 1;
			
 
				-	buf    := DIGIT(0);
			
 
				-	digidx := X.used - 1;
			
 
				-	bitcpy := uint(0);
			
 
				-	bitbuf := DIGIT(0);
			
 
				-
			
 
				-	for {
			
 
				-		/*
			
 
				-			Grab next digit as required.
			
 
				-		*/
			
 
				-		bitcnt -= 1;
			
 
				-		if bitcnt == 0 {
			
 
				-			/*
			
 
				-				If digidx == -1 we are out of digits.
			
 
				-			*/
			
 
				-			if digidx == -1 { break; }
			
 
				+	/* if exponent X is negative we have to recurse */
			
 
				+	if (mp_isneg(X)) {
			
 
				+		mp_int tmpG, tmpX;
			
 
				+		mp_err err;
			
 
				 
			
 
				-			/*
			
 
				-				Read next digit and reset the bitcnt.
			
 
				-			*/
			
 
				-			buf    = X.digit[digidx];
			
 
				-			digidx -= 1;
			
 
				-			bitcnt = _DIGIT_BITS;
			
 
				+		if (!MP_HAS(MP_INVMOD)) {
			
 
				+			return MP_VAL;
			
 
				 		}
			
 
				 
			
 
				-		/*
			
 
				-			Grab the next msb from the exponent.
			
 
				-		*/
			
 
				-		y := buf >> (_DIGIT_BITS - 1) & 1;
			
 
				-		buf <<= 1;
			
 
				-
			
 
				-		/*
			
 
				-			If the bit is zero and mode == 0 then we ignore it.
			
 
				-			These represent the leading zero bits before the first 1 bit
			
 
				-			in the exponent.  Technically this opt is not required but it
			
 
				-			does lower the # of trivial squaring/reductions used.
			
 
				-		*/
			
 
				-		if mode == 0 && y == 0 {
			
 
				-			continue;
			
 
				+		if ((err = mp_init_multi(&tmpG, &tmpX, NULL)) != MP_OKAY) {
			
 
				+			return err;
			
 
				 		}
			
 
				 
			
 
				-		/*
			
 
				-			If the bit is zero and mode == 1 then we square.
			
 
				-		*/
			
 
				-		if mode == 1 && y == 0 {
			
 
				-			internal_sqr(res, res)                                   or_return;
			
 
				-			redux(res, P, mu)                                        or_return;
			
 
				-			continue;
			
 
				+		/* first compute 1/G mod P */
			
 
				+		if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
			
 
				+			goto LBL_ERR;
			
 
				 		}
			
 
				 
			
 
				-		/*
			
 
				-			Else we add it to the window.
			
 
				-		*/
			
 
				-		bitcpy += 1;
			
 
				-		bitbuf |= (y << (winsize - bitcpy));
			
 
				-		mode    = 2;
			
 
				-
			
 
				-		if (bitcpy == winsize) {
			
 
				-			/*
			
 
				-				Window is filled so square as required and multiply.
			
 
				-				Square first.
			
 
				-			*/
			
 
				-			for x = 0; x < int(winsize); x += 1 {
			
 
				-				internal_sqr(res, res)                               or_return;
			
 
				-				redux(res, P, mu)                                    or_return;
			
 
				-			}
			
 
				-
			
 
				-			/*
			
 
				-				Then multiply.
			
 
				-			*/
			
 
				-			internal_mul(res, res, &M[bitbuf])                       or_return;
			
 
				-			redux(res, P, mu)                                        or_return;
			
 
				-
			
 
				-			/*
			
 
				-				Empty window and reset.
			
 
				-			*/
			
 
				-			bitcpy = 0;
			
 
				-			bitbuf = 0;
			
 
				-			mode   = 1;
			
 
				+		/* now get |X| */
			
 
				+		if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
			
 
				+			goto LBL_ERR;
			
 
				 		}
			
 
				-	}
			
 
				 
			
 
				-	/*
			
 
				-		If bits remain then square/multiply.
			
 
				-	*/
			
 
				-	if mode == 2 && bitcpy > 0 {
			
 
				-		/*
			
 
				-			Square then multiply if the bit is set.
			
 
				-		*/
			
 
				-		for x = 0; x < int(bitcpy); x += 1 {
			
 
				-			internal_sqr(res, res)                                   or_return;
			
 
				-			redux(res, P, mu)                                        or_return;
			
 
				-
			
 
				-			bitbuf <<= 1;
			
 
				-			if ((bitbuf & (1 << winsize)) != 0) {
			
 
				-				/*
			
 
				-					Then multiply.
			
 
				-				*/
			
 
				-				internal_mul(res, res, &M[1])                        or_return;
			
 
				-				redux(res, P, mu)                                    or_return;
			
 
				-			}
			
 
				-		}
			
 
				+		/* and now compute (1/G)**|X| instead of G**X [X < 0] */
			
 
				+		err = mp_exptmod(&tmpG, &tmpX, P, Y);
			
 
				+LBL_ERR:
			
 
				+		mp_clear_multi(&tmpG, &tmpX, NULL);
			
 
				+		return err;
			
 
				 	}
			
 
				-	return err;
			
 
				-}
			
 
				 
			
 
				-/*
			
 
				-	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
			
 
				-
			
 
				-	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
			
 
				-	The value of `k` changes based on the size of the exponent.
			
 
				-
			
 
				-	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
			
 
				-
			
 
				-	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
			
 
				-*/
			
 
				-internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
			
 
				-	context.allocator = allocator;
			
 
				-
			
 
				-	M := [_TAB_SIZE]Int{};
			
 
				-	winsize: uint;
			
 
				-
			
 
				-	/*
			
 
				-		Use a pointer to the reduction algorithm.
			
 
				-		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
			
 
				-	*/
			
 
				-	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
			
 
				-
			
 
				-	defer {
			
 
				-		internal_destroy(&M[1]);
			
 
				-		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
			
 
				-			internal_destroy(&M[x]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Find window size.
			
 
				-	*/
			
 
				-	x := internal_count_bits(X);
			
 
				-	switch {
			
 
				-	case x <= 7:
			
 
				-		winsize = 2;
			
 
				-	case x <= 36:
			
 
				-		winsize = 3;
			
 
				-	case x <= 140:
			
 
				-		winsize = 4;
			
 
				-	case x <= 450:
			
 
				-		winsize = 5;
			
 
				-	case x <= 1303:
			
 
				-		winsize = 6;
			
 
				-	case x <= 3529:
			
 
				-		winsize = 7;
			
 
				-	case:
			
 
				-		winsize = 8;
			
 
				+	/* modified diminished radix reduction */
			
 
				+	if (MP_HAS(MP_REDUCE_IS_2K_L) && MP_HAS(MP_REDUCE_2K_L) && MP_HAS(S_MP_EXPTMOD) &&
			
 
				+		 mp_reduce_is_2k_l(P)) {
			
 
				+		return s_mp_exptmod(G, X, P, Y, 1);
			
 
				 	}
			
 
				 
			
 
				-	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
			
 
				-
			
 
				-	/*
			
 
				-		Init M array
			
 
				-		Init first cell.
			
 
				-	*/
			
 
				-	cap := internal_int_allocated_cap(P);
			
 
				-	internal_grow(&M[1], cap)                                        or_return;
			
 
				+	/* is it a DR modulus? default to no */
			
 
				+	dr = (MP_HAS(MP_DR_IS_MODULUS) && mp_dr_is_modulus(P)) ? 1 : 0;
			
 
				 
			
 
				-	/*
			
 
				-		Now init the second half of the array.
			
 
				-	*/
			
 
				-	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
			
 
				-		internal_grow(&M[x], cap)                                    or_return;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Determine and setup reduction code.
			
 
				-	*/
			
 
				-	rho: DIGIT;
			
 
				-
			
 
				-	if redmode == 0 {
			
 
				-		/*
			
 
				-			Now setup Montgomery.
			
 
				-		*/
			
 
				-		rho = internal_int_montgomery_setup(P)                       or_return;
			
 
				-
			
 
				-		/*
			
 
				-			Automatically pick the comba one if available (saves quite a few calls/ifs).
			
 
				-		*/
			
 
				-		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
			
 
				-			redux = _private_montgomery_reduce_comba;
			
 
				-		} else {
			
 
				-			/*
			
 
				-				Use slower baseline Montgomery method.
			
 
				-			*/
			
 
				-			redux = internal_int_montgomery_reduce;
			
 
				-		}
			
 
				-	} else if redmode == 1 {
			
 
				-		/*
			
 
				-			Setup DR reduction for moduli of the form B**k - b.
			
 
				-		*/
			
 
				-		rho = internal_int_dr_setup(P);
			
 
				-		redux = internal_int_dr_reduce;
			
 
				-	} else {
			
 
				-		/*
			
 
				-			Setup DR reduction for moduli of the form 2**k - b.
			
 
				-		*/
			
 
				-		rho = internal_int_reduce_2k_setup(P)                        or_return;
			
 
				-		redux = internal_int_reduce_2k;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Setup result.
			
 
				-	*/
			
 
				-	internal_grow(res, cap)                                          or_return;
			
 
				-
			
 
				-	/*
			
 
				-		Create M table
			
 
				-		The first half of the table is not computed, though, except for M[0] and M[1]
			
 
				-	*/
			
 
				-
			
 
				-	if redmode == 0 {
			
 
				-		/*
			
 
				-			Now we need R mod m.
			
 
				-		*/
			
 
				-		internal_int_montgomery_calc_normalization(res, P)           or_return;
			
 
				-
			
 
				-		/*
			
 
				-			Now set M[1] to G * R mod m.
			
 
				-		*/
			
 
				-		internal_mulmod(&M[1], G, res, P)                            or_return;
			
 
				-	} else {
			
 
				-		internal_one(res)                                            or_return;
			
 
				-		internal_mod(&M[1], G, P)                                    or_return;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
			
 
				-	*/
			
 
				-	slot := 1 << (winsize - 1);
			
 
				-	internal_copy(&M[slot], &M[1])                                   or_return;
			
 
				-
			
 
				-	for x = 0; x < int(winsize - 1); x += 1 {
			
 
				-		internal_sqr(&M[slot], &M[slot])                             or_return;
			
 
				-		redux(&M[slot], P, rho)                                      or_return;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Create upper table.
			
 
				-	*/
			
 
				-	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
			
 
				-		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
			
 
				-		redux(&M[x], P, rho)                                         or_return;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-		Set initial mode and bit cnt.
			
 
				-	*/
			
 
				-	mode   := 0;
			
 
				-	bitcnt := 1;
			
 
				-	buf    := DIGIT(0);
			
 
				-	digidx := X.used - 1;
			
 
				-	bitcpy := 0;
			
 
				-	bitbuf := DIGIT(0);
			
 
				-
			
 
				-	for {
			
 
				-		/*
			
 
				-			Grab next digit as required.
			
 
				-		*/
			
 
				-		bitcnt -= 1;
			
 
				-		if bitcnt == 0 {
			
 
				-			/*
			
 
				-				If digidx == -1 we are out of digits so break.
			
 
				-			*/
			
 
				-			if digidx == -1 { break; }
			
 
				-
			
 
				-			/*
			
 
				-				Read next digit and reset the bitcnt.
			
 
				-			*/
			
 
				-			buf    = X.digit[digidx];
			
 
				-			digidx -= 1;
			
 
				-			bitcnt = _DIGIT_BITS;
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-			Grab the next msb from the exponent.
			
 
				-		*/
			
 
				-		y := (buf >> (_DIGIT_BITS - 1)) & 1;
			
 
				-		buf <<= 1;
			
 
				-
			
 
				-		/*
			
 
				-			If the bit is zero and mode == 0 then we ignore it.
			
 
				-			These represent the leading zero bits before the first 1 bit in the exponent.
			
 
				-			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
			
 
				-		*/
			
 
				-		if mode == 0 && y == 0 { continue; }
			
 
				-
			
 
				-		/*
			
 
				-			If the bit is zero and mode == 1 then we square.
			
 
				-		*/
			
 
				-		if mode == 1 && y == 0 {
			
 
				-			internal_sqr(res, res)                                   or_return;
			
 
				-			redux(res, P, rho)                                       or_return;
			
 
				-			continue;
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-			Else we add it to the window.
			
 
				-		*/
			
 
				-		bitcpy += 1;
			
 
				-		bitbuf |= (y << (winsize - uint(bitcpy)));
			
 
				-		mode    = 2;
			
 
				-
			
 
				-		if bitcpy == int(winsize) {
			
 
				-			/*
			
 
				-				Window is filled so square as required and multiply
			
 
				-				Square first.
			
 
				-			*/
			
 
				-			for x = 0; x < int(winsize); x += 1 {
			
 
				-				internal_sqr(res, res)                               or_return;
			
 
				-				redux(res, P, rho)                                   or_return;
			
 
				-			}
			
 
				-
			
 
				-			/*
			
 
				-				Then multiply.
			
 
				-			*/
			
 
				-			internal_mul(res, res, &M[bitbuf])                       or_return;
			
 
				-			redux(res, P, rho)                                       or_return;
			
 
				-
			
 
				-			/*
			
 
				-				Empty window and reset.
			
 
				-			*/
			
 
				-			bitcpy = 0;
			
 
				-			bitbuf = 0;
			
 
				-			mode   = 1;
			
 
				-		}
			
 
				+	/* if not, is it a unrestricted DR modulus? */
			
 
				+	if (MP_HAS(MP_REDUCE_IS_2K) && (dr == 0)) {
			
 
				+		dr = (mp_reduce_is_2k(P)) ? 2 : 0;
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-		If bits remain then square/multiply.
			
 
				-	*/
			
 
				-	if mode == 2 && bitcpy > 0 {
			
 
				-		/*
			
 
				-			Square then multiply if the bit is set.
			
 
				-		*/
			
 
				-		for x = 0; x < bitcpy; x += 1 {
			
 
				-			internal_sqr(res, res)                                   or_return;
			
 
				-			redux(res, P, rho)                                       or_return;
			
 
				-
			
 
				-			/*
			
 
				-				Get next bit of the window.
			
 
				-			*/
			
 
				-			bitbuf <<= 1;
			
 
				-			if bitbuf & (1 << winsize) != 0 {
			
 
				-				/*
			
 
				-					Then multiply.
			
 
				-				*/
			
 
				-				internal_mul(res, res, &M[1])                        or_return;
			
 
				-				redux(res, P, rho)                                   or_return;
			
 
				-			}
			
 
				-		}
			
 
				+	/* if the modulus is odd or dr != 0 use the montgomery method */
			
 
				+	if (MP_HAS(S_MP_EXPTMOD_FAST) && (mp_isodd(P) || (dr != 0))) {
			
 
				+		return s_mp_exptmod_fast(G, X, P, Y, dr);
			
 
				 	}
			
 
				 
			
 
				-	if redmode == 0 {
			
 
				-		/*
			
 
				-			Fixup result if Montgomery reduction is used.
			
 
				-			Recall that any value in a Montgomery system is actually multiplied by R mod n.
			
 
				-			So we have to reduce one more time to cancel out the factor of R.
			
 
				-		*/
			
 
				-		redux(res, P, rho)                                           or_return;
			
 
				+	/* otherwise use the generic Barrett reduction technique */
			
 
				+	if (MP_HAS(S_MP_EXPTMOD)) {
			
 
				+		return s_mp_exptmod(G, X, P, Y, 0);
			
 
				 	}
			
 
				 
			
 
				-	return nil;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Determines the setup value.
			
 
				-	Assumes `a` to not be `nil` and to have been initialized.
			
 
				-*/
			
 
				-internal_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
			
 
				-	/*
			
 
				-		The casts are required if _DIGIT_BITS is one less than
			
 
				-		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
			
 
				-	*/
			
 
				-	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Determines if a number is a valid DR modulus.
			
 
				-	Assumes `a` to not be `nil` and to have been initialized.
			
 
				-*/
			
 
				-internal_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
			
 
				-	/*
			
 
				-		Must be at least two digits.
			
 
				-	*/
			
 
				-	if a.used < 2 { return false; }
			
 
				+	/* no exptmod for evens */
			
 
				+	return MP_VAL;
			
 
				 
			
 
				-	/*
			
 
				-		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
			
 
				 	*/
			
 
				-	for ix := 1; ix < a.used; ix += 1 {
			
 
				-		if a.digit[ix] != _MASK {
			
 
				-			return false;
			
 
				-		}
			
 
				-	}
			
 
				-	return true;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				-	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
			
 
				-	Based on algorithm from the paper
			
 
				-
			
 
				-		"Generating Efficient Primes for Discrete Log Cryptosystems"
			
 
				-					Chae Hoon Lim, Pil Joong Lee,
			
 
				-			POSTECH Information Research Laboratories
			
 
				-
			
 
				-	The modulus must be of a special format [see manual].
			
 
				-	Has been modified to use algorithm 7.10 from the LTM book instead
			
 
				-
			
 
				-	Input x must be in the range 0 <= x <= (n-1)**2
			
 
				-	Assumes `x` and `n` to not be `nil` and to have been initialized.
			
 
				-*/
			
 
				-internal_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
			
 
				-	/*
			
 
				-		m = digits in modulus.
			
 
				-	*/
			
 
				-	m := n.used;
			
 
				-
			
 
				-	/*
			
 
				-		Ensure that "x" has at least 2m digits.
			
 
				-	*/
			
 
				-	internal_grow(x, m + m)                                          or_return;
			
 
				-
			
 
				-	/*
			
 
				-		Top of loop, this is where the code resumes if another reduction pass is required.
			
 
				-	*/
			
 
				-	for {
			
 
				-		i: int;
			
 
				-		mu := DIGIT(0);
			
 
				-
			
 
				-		/*
			
 
				-			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
			
 
				-		*/
			
 
				-		for i = 0; i < m; i += 1 {
			
 
				-			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);
			
 
				-			x.digit[i] = DIGIT(r & _WORD(_MASK));
			
 
				-			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-			Set final carry.
			
 
				-		*/
			
 
				-		x.digit[i] = mu;
			
 
				-
			
 
				-		/*
			
 
				-			Zero words above m.
			
 
				-		*/
			
 
				-		mem.zero_slice(x.digit[m + 1:][:x.used - m]);
			
 
				-
			
 
				-		/*
			
 
				-			Clamp, sub and return.
			
 
				-		*/
			
 
				-		internal_clamp(x)                                            or_return;
			
 
				-
			
 
				-		/*
			
 
				-			If x >= n then subtract and reduce again.
			
 
				-			Each successive "recursion" makes the input smaller and smaller.
			
 
				-		*/
			
 
				-		if internal_cmp_mag(x, n) == -1 { break; }
			
 
				-
			
 
				-		internal_sub(x, x, n)                                        or_return;
			
 
				-	}
			
 
				 	return nil;
			
 
				 }
			
 
				 
			
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1850,6 +1850,1015 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 
				 	return nil;

			
 
				 }

			
 
				 

			
 
				+/*

			
 
				+	Computes xR**-1 == x (mod N) via Montgomery Reduction.

			
 
				+	Assumes `x` and `n` not to be nil.

			
 
				+*/

			
 
				+_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+	/*

			
 
				+		Can the fast reduction [comba] method be used?

			
 
				+		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],

			
 
				+		since carries are fixed up in the inner loop.

			
 
				+	*/

			
 
				+	internal_clear_if_uninitialized(x, n) or_return;

			
 
				+

			
 
				+	digs := (n.used * 2) + 1;

			
 
				+	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {

			
 
				+		return _private_montgomery_reduce_comba(x, n, rho);

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Grow the input as required

			
 
				+	*/

			
 
				+	internal_grow(x, digs)                                           or_return;

			
 
				+	x.used = digs;

			
 
				+

			
 
				+	for ix := 0; ix < n.used; ix += 1 {

			
 
				+		/*

			
 
				+			`mu = ai * rho mod b`

			
 
				+			The value of rho must be precalculated via `int_montgomery_setup()`,

			
 
				+			such that it equals -1/n0 mod b this allows the following inner loop

			
 
				+			to reduce the input one digit at a time.

			
 
				+		*/

			
 
				+

			
 
				+		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK));

			
 
				+

			
 
				+		/*

			
 
				+			a = a + mu * m * b**i

			
 
				+			Multiply and add in place.

			
 
				+		*/

			
 
				+		u  := DIGIT(0);

			
 
				+		iy := int(0);

			
 
				+		for ; iy < n.used; iy += 1 {

			
 
				+			/*

			
 
				+				Compute product and sum.

			
 
				+			*/

			
 
				+			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]));

			
 
				+

			
 
				+			/*

			
 
				+				Get carry.

			
 
				+			*/

			
 
				+			u = DIGIT(r >> _DIGIT_BITS);

			
 
				+

			
 
				+			/*

			
 
				+				Fix digit.

			
 
				+			*/

			
 
				+			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK));

			
 
				+		}

			
 
				+

			
 
				+		/*

			
 
				+			At this point the ix'th digit of x should be zero.

			
 
				+			Propagate carries upwards as required.

			
 
				+		*/

			
 
				+		for u != 0 {

			
 
				+			x.digit[ix + iy] += u;

			
 
				+			u = x.digit[ix + iy] >> _DIGIT_BITS;

			
 
				+			x.digit[ix + iy] &= _MASK;

			
 
				+			iy += 1;

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		At this point the n.used'th least significant digits of x are all zero,

			
 
				+		which means we can shift x to the right by n.used digits and the

			
 
				+		residue is unchanged.

			
 
				+

			
 
				+		x = x/b**n.used.

			
 
				+	*/

			
 
				+	internal_clamp(x);

			
 
				+	internal_shr_digit(x, n.used);

			
 
				+

			
 
				+	/*

			
 
				+		if x >= n then x = x - n

			
 
				+	*/

			
 
				+	if internal_cmp_mag(x, n) != -1 {

			
 
				+		return internal_sub(x, x, n);

			
 
				+	}

			
 
				+

			
 
				+	return nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Shifts with subtractions when the result is greater than b.

			
 
				+

			
 
				+	The method is slightly modified to shift B unconditionally upto just under

			
 
				+	the leading bit of b.  This saves alot of multiple precision shifting.

			
 
				+

			
 
				+	Assumes `a` and `b` not to be `nil`.

			
 
				+*/

			
 
				+_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+	/*

			
 
				+		How many bits of last digit does b use.

			
 
				+	*/

			
 
				+	internal_clear_if_uninitialized(a, b) or_return;

			
 
				+

			
 
				+	bits := internal_count_bits(b) % _DIGIT_BITS;

			
 
				+

			
 
				+	if b.used > 1 {

			
 
				+		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;

			
 
				+		internal_int_power_of_two(a, power)                          or_return;

			
 
				+	} else {

			
 
				+		internal_one(a)                                              or_return;

			
 
				+		bits = 1;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Now compute C = A * B mod b.

			
 
				+	*/

			
 
				+	for x := bits - 1; x < _DIGIT_BITS; x += 1 {

			
 
				+		internal_int_shl1(a, a)                                      or_return;

			
 
				+		if internal_cmp_mag(a, b) != -1 {

			
 
				+			internal_sub(a, a, b)                                    or_return;

			
 
				+		}

			
 
				+	}

			
 
				+	return nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Sets up the Montgomery reduction stuff.

			
 
				+*/

			
 
				+_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {

			
 
				+	/*

			
 
				+		Fast inversion mod 2**k

			
 
				+		Based on the fact that:

			
 
				+

			
 
				+		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)

			
 
				+		                  =>  2*X*A - X*X*A*A = 1

			
 
				+		                  =>  2*(1) - (1)     = 1

			
 
				+	*/

			
 
				+	internal_clear_if_uninitialized(n, allocator) or_return;

			
 
				+

			
 
				+	b := n.digit[0];

			
 
				+	if b & 1 == 0 { return 0, .Invalid_Argument; }

			
 
				+

			
 
				+	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */

			
 
				+	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */

			
 
				+	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */

			
 
				+

			
 
				+	when _DIGIT_TYPE_BITS == 64 {

			
 
				+		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */

			
 
				+		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		rho = -1/m mod b

			
 
				+	*/

			
 
				+	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK));

			
 
				+	return rho, nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.

			
 
				+	From HAC pp.604 Algorithm 14.42

			
 
				+

			
 
				+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.

			
 
				+*/

			
 
				+_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+

			
 
				+	q := &Int{};

			
 
				+	defer internal_destroy(q);

			
 
				+	um := m.used;

			
 
				+

			
 
				+	/*

			
 
				+		q = x

			
 
				+	*/

			
 
				+	internal_copy(q, x)                                              or_return;

			
 
				+

			
 
				+	/*

			
 
				+		q1 = x / b**(k-1)

			
 
				+	*/

			
 
				+	internal_shr_digit(q, um - 1);

			
 
				+

			
 
				+	/*

			
 
				+		According to HAC this optimization is ok.

			
 
				+	*/

			
 
				+	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {

			
 
				+		internal_mul(q, q, mu)                                       or_return;

			
 
				+	} else {

			
 
				+		_private_int_mul_high(q, q, mu, um)                          or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		q3 = q2 / b**(k+1)

			
 
				+	*/

			
 
				+	internal_shr_digit(q, um + 1);

			
 
				+

			
 
				+	/*

			
 
				+		x = x mod b**(k+1), quick (no division)

			
 
				+	*/

			
 
				+	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;

			
 
				+

			
 
				+	/*

			
 
				+		q = q * m mod b**(k+1), quick (no division)

			
 
				+	*/

			
 
				+	_private_int_mul(q, q, m, um + 1)                                or_return;

			
 
				+

			
 
				+	/*

			
 
				+		x = x - q

			
 
				+	*/

			
 
				+	internal_sub(x, x, q)                                            or_return;

			
 
				+

			
 
				+	/*

			
 
				+		If x < 0, add b**(k+1) to it.

			
 
				+	*/

			
 
				+	if internal_cmp(x, 0) == -1 {

			
 
				+		internal_set(q, 1)                                           or_return;

			
 
				+		internal_shl_digit(q, um + 1)                                or_return;

			
 
				+		internal_add(x, x, q)                                        or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Back off if it's too big.

			
 
				+	*/

			
 
				+	for internal_cmp(x, m) != -1 {

			
 
				+		internal_sub(x, x, m)                                        or_return;

			
 
				+	}

			
 
				+

			
 
				+	return nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.

			
 
				+*/

			
 
				+_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+

			
 
				+	q := &Int{};

			
 
				+	defer internal_destroy(q);

			
 
				+

			
 
				+	internal_zero(q)                                                 or_return;

			
 
				+

			
 
				+	p := internal_count_bits(n);

			
 
				+

			
 
				+	for {

			
 
				+		/*

			
 
				+			q = a/2**p, a = a mod 2**p

			
 
				+		*/

			
 
				+		internal_shrmod(q, a, a, p)                                  or_return;

			
 
				+

			
 
				+		if d != 1 {

			
 
				+			/*

			
 
				+				q = q * d

			
 
				+			*/

			
 
				+			internal_mul(q, q, d)                                    or_return;

			
 
				+		}

			
 
				+

			
 
				+		/*

			
 
				+			a = a + q

			
 
				+		*/

			
 
				+		internal_add(a, a, q)                                        or_return;

			
 
				+		if internal_cmp_mag(a, n) == -1                              { break; }

			
 
				+		internal_sub(a, a, n)                                        or_return;

			
 
				+	}

			
 
				+

			
 
				+	return nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Reduces `a` modulo `n` where `n` is of the form 2**p - d

			
 
				+	This differs from reduce_2k since "d" can be larger than a single digit.

			
 
				+*/

			
 
				+_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+

			
 
				+	q := &Int{};

			
 
				+	defer internal_destroy(q);

			
 
				+

			
 
				+	internal_zero(q)                                                 or_return;

			
 
				+

			
 
				+	p := internal_count_bits(n);

			
 
				+

			
 
				+	for {

			
 
				+		/*

			
 
				+			q = a/2**p, a = a mod 2**p

			
 
				+		*/

			
 
				+		internal_shrmod(q, a, a, p)                                  or_return;

			
 
				+

			
 
				+		/*

			
 
				+			q = q * d

			
 
				+		*/

			
 
				+		internal_mul(q, q, d)                                        or_return;

			
 
				+

			
 
				+		/*

			
 
				+			a = a + q

			
 
				+		*/

			
 
				+		internal_add(a, a, q)                                        or_return;

			
 
				+		if internal_cmp_mag(a, n) == -1                              { break; }

			
 
				+		internal_sub(a, a, n)                                        or_return;

			
 
				+	}

			
 
				+

			
 
				+	return nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Determines if `internal_int_reduce_2k` can be used.

			
 
				+	Asssumes `a` not to be `nil` and to have been initialized.

			
 
				+*/

			
 
				+_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {

			
 
				+	assert_if_nil(a);

			
 
				+

			
 
				+	if internal_is_zero(a) {

			
 
				+		return false, nil;

			
 
				+	} else if a.used == 1 {

			
 
				+		return true, nil;

			
 
				+	} else if a.used  > 1 {

			
 
				+		iy := internal_count_bits(a);

			
 
				+		iw := 1;

			
 
				+		iz := DIGIT(1);

			
 
				+

			
 
				+		/*

			
 
				+			Test every bit from the second digit up, must be 1.

			
 
				+		*/

			
 
				+		for ix := _DIGIT_BITS; ix < iy; ix += 1 {

			
 
				+			if a.digit[iw] & iz == 0 {

			
 
				+				return false, nil;

			
 
				+			}

			
 
				+

			
 
				+			iz <<= 1;

			
 
				+			if iz > _DIGIT_MAX {

			
 
				+				iw += 1;

			
 
				+				iz  = 1;

			
 
				+			}

			
 
				+		}

			
 
				+		return true, nil;

			
 
				+	} else {

			
 
				+		return true, nil;

			
 
				+	}

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Determines if `internal_int_reduce_2k_l` can be used.

			
 
				+	Asssumes `a` not to be `nil` and to have been initialized.

			
 
				+*/

			
 
				+_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {

			
 
				+	assert_if_nil(a);

			
 
				+

			
 
				+	if internal_int_is_zero(a) {

			
 
				+		return false, nil;

			
 
				+	} else if a.used == 1 {

			
 
				+		return true, nil;

			
 
				+	} else if a.used  > 1 {

			
 
				+		/*

			
 
				+			If more than half of the digits are -1 we're sold.

			
 
				+		*/

			
 
				+		ix := 0;

			
 
				+		iy := 0;

			
 
				+

			
 
				+		for ; ix < a.used; ix += 1 {

			
 
				+			if a.digit[ix] == _DIGIT_MAX {

			
 
				+				iy += 1;

			
 
				+			}

			
 
				+		}

			
 
				+		return iy >= (a.used / 2), nil;

			
 
				+	} else {

			
 
				+		return false, nil;

			
 
				+	}

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Determines the setup value.

			
 
				+	Assumes `a` is not `nil`.

			
 
				+*/

			
 
				+_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+

			
 
				+	tmp := &Int{};

			
 
				+	defer internal_destroy(tmp);

			
 
				+	internal_zero(tmp)                                               or_return;

			
 
				+

			
 
				+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;

			
 
				+	internal_sub(tmp, tmp, a)                                        or_return;

			
 
				+

			
 
				+	return tmp.digit[0], nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Determines the setup value.

			
 
				+	Assumes `mu` and `P` are not `nil`.

			
 
				+

			
 
				+	d := (1 << a.bits) - a;

			
 
				+*/

			
 
				+_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+

			
 
				+	tmp := &Int{};

			
 
				+	defer internal_destroy(tmp);

			
 
				+	internal_zero(tmp)                                               or_return;

			
 
				+

			
 
				+	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;

			
 
				+	internal_sub(mu, tmp, P)                                         or_return;

			
 
				+

			
 
				+	return nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Pre-calculate the value required for Barrett reduction.

			
 
				+	For a given modulus "P" it calulates the value required in "mu"

			
 
				+	Assumes `mu` and `P` are not `nil`.

			
 
				+*/

			
 
				+_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+

			
 
				+	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;

			
 
				+	return internal_int_div(mu, mu, P);

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Determines the setup value.

			
 
				+	Assumes `a` to not be `nil` and to have been initialized.

			
 
				+*/

			
 
				+_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {

			
 
				+	/*

			
 
				+		The casts are required if _DIGIT_BITS is one less than

			
 
				+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].

			
 
				+	*/

			
 
				+	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Determines if a number is a valid DR modulus.

			
 
				+	Assumes `a` to not be `nil` and to have been initialized.

			
 
				+*/

			
 
				+_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {

			
 
				+	/*

			
 
				+		Must be at least two digits.

			
 
				+	*/

			
 
				+	if a.used < 2 { return false; }

			
 
				+

			
 
				+	/*

			
 
				+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).

			
 
				+	*/

			
 
				+	for ix := 1; ix < a.used; ix += 1 {

			
 
				+		if a.digit[ix] != _MASK {

			
 
				+			return false;

			
 
				+		}

			
 
				+	}

			
 
				+	return true;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.

			
 
				+	Based on algorithm from the paper

			
 
				+

			
 
				+		"Generating Efficient Primes for Discrete Log Cryptosystems"

			
 
				+					Chae Hoon Lim, Pil Joong Lee,

			
 
				+			POSTECH Information Research Laboratories

			
 
				+

			
 
				+	The modulus must be of a special format [see manual].

			
 
				+	Has been modified to use algorithm 7.10 from the LTM book instead

			
 
				+

			
 
				+	Input x must be in the range 0 <= x <= (n-1)**2

			
 
				+	Assumes `x` and `n` to not be `nil` and to have been initialized.

			
 
				+*/

			
 
				+_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {

			
 
				+	/*

			
 
				+		m = digits in modulus.

			
 
				+	*/

			
 
				+	m := n.used;

			
 
				+

			
 
				+	/*

			
 
				+		Ensure that "x" has at least 2m digits.

			
 
				+	*/

			
 
				+	internal_grow(x, m + m)                                          or_return;

			
 
				+

			
 
				+	/*

			
 
				+		Top of loop, this is where the code resumes if another reduction pass is required.

			
 
				+	*/

			
 
				+	for {

			
 
				+		i: int;

			
 
				+		mu := DIGIT(0);

			
 
				+

			
 
				+		/*

			
 
				+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.

			
 
				+		*/

			
 
				+		for i = 0; i < m; i += 1 {

			
 
				+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);

			
 
				+			x.digit[i] = DIGIT(r & _WORD(_MASK));

			
 
				+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));

			
 
				+		}

			
 
				+

			
 
				+		/*

			
 
				+			Set final carry.

			
 
				+		*/

			
 
				+		x.digit[i] = mu;

			
 
				+

			
 
				+		/*

			
 
				+			Zero words above m.

			
 
				+		*/

			
 
				+		mem.zero_slice(x.digit[m + 1:][:x.used - m]);

			
 
				+

			
 
				+		/*

			
 
				+			Clamp, sub and return.

			
 
				+		*/

			
 
				+		internal_clamp(x)                                            or_return;

			
 
				+

			
 
				+		/*

			
 
				+			If x >= n then subtract and reduce again.

			
 
				+			Each successive "recursion" makes the input smaller and smaller.

			
 
				+		*/

			
 
				+		if internal_cmp_mag(x, n) == -1 { break; }

			
 
				+

			
 
				+		internal_sub(x, x, n)                                        or_return;

			
 
				+	}

			
 
				+	return nil;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Computes res == G**X mod P.

			
 
				+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.

			
 
				+*/

			
 
				+_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+

			
 
				+	M := [_TAB_SIZE]Int{};

			
 
				+	winsize: uint;

			
 
				+

			
 
				+	/*

			
 
				+		Use a pointer to the reduction algorithm.

			
 
				+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.

			
 
				+	*/

			
 
				+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);

			
 
				+

			
 
				+	defer {

			
 
				+		internal_destroy(&M[1]);

			
 
				+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {

			
 
				+			internal_destroy(&M[x]);

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Find window size.

			
 
				+	*/

			
 
				+	x := internal_count_bits(X);

			
 
				+	switch {

			
 
				+	case x <= 7:

			
 
				+		winsize = 2;

			
 
				+	case x <= 36:

			
 
				+		winsize = 3;

			
 
				+	case x <= 140:

			
 
				+		winsize = 4;

			
 
				+	case x <= 450:

			
 
				+		winsize = 5;

			
 
				+	case x <= 1303:

			
 
				+		winsize = 6;

			
 
				+	case x <= 3529:

			
 
				+		winsize = 7;

			
 
				+	case:

			
 
				+		winsize = 8;

			
 
				+	}

			
 
				+

			
 
				+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;

			
 
				+

			
 
				+	/*

			
 
				+		Init M array.

			
 
				+		Init first cell.

			
 
				+	*/

			
 
				+	internal_zero(&M[1])                                             or_return;

			
 
				+

			
 
				+	/*

			
 
				+		Now init the second half of the array.

			
 
				+	*/

			
 
				+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {

			
 
				+		internal_zero(&M[x])                                         or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Create `mu`, used for Barrett reduction.

			
 
				+	*/

			
 
				+	mu := &Int{};

			
 
				+	defer internal_destroy(mu);

			
 
				+	internal_zero(mu)                                                or_return;

			
 
				+

			
 
				+	if redmode == 0 {

			
 
				+		_private_int_reduce_setup(mu, P)                             or_return;

			
 
				+		redux = _private_int_reduce;

			
 
				+	} else {

			
 
				+		_private_int_reduce_2k_setup_l(mu, P)                        or_return;

			
 
				+		redux = _private_int_reduce_2k_l;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Create M table.

			
 
				+

			
 
				+		The M table contains powers of the base, e.g. M[x] = G**x mod P.

			
 
				+		The first half of the table is not computed, though, except for M[0] and M[1].

			
 
				+	*/

			
 
				+	internal_int_mod(&M[1], G, P)                                    or_return;

			
 
				+

			
 
				+	/*

			
 
				+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.

			
 
				+

			
 
				+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it

			
 
				+		instead of repeated squaring.

			
 
				+	*/

			
 
				+	slot := 1 << (winsize - 1);

			
 
				+	internal_copy(&M[slot], &M[1])                                   or_return;

			
 
				+

			
 
				+	for x = 0; x < int(winsize - 1); x += 1 {

			
 
				+		/*

			
 
				+			Square it.

			
 
				+		*/

			
 
				+		internal_sqr(&M[slot], &M[slot])                             or_return;

			
 
				+

			
 
				+		/*

			
 
				+			Reduce modulo P

			
 
				+		*/

			
 
				+		redux(&M[slot], P, mu)                                       or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)

			
 
				+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)

			
 
				+	*/

			
 
				+	for x = slot + 1; x < (1 << winsize); x += 1 {

			
 
				+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;

			
 
				+		redux(&M[x], P, mu)                                          or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Setup result.

			
 
				+	*/

			
 
				+	internal_one(res)                                                or_return;

			
 
				+

			
 
				+	/*

			
 
				+		Set initial mode and bit cnt.

			
 
				+	*/

			
 
				+	mode   := 0;

			
 
				+	bitcnt := 1;

			
 
				+	buf    := DIGIT(0);

			
 
				+	digidx := X.used - 1;

			
 
				+	bitcpy := uint(0);

			
 
				+	bitbuf := DIGIT(0);

			
 
				+

			
 
				+	for {

			
 
				+		/*

			
 
				+			Grab next digit as required.

			
 
				+		*/

			
 
				+		bitcnt -= 1;

			
 
				+		if bitcnt == 0 {

			
 
				+			/*

			
 
				+				If digidx == -1 we are out of digits.

			
 
				+			*/

			
 
				+			if digidx == -1 { break; }

			
 
				+

			
 
				+			/*

			
 
				+				Read next digit and reset the bitcnt.

			
 
				+			*/

			
 
				+			buf    = X.digit[digidx];

			
 
				+			digidx -= 1;

			
 
				+			bitcnt = _DIGIT_BITS;

			
 
				+		}

			
 
				+

			
 
				+		/*

			
 
				+			Grab the next msb from the exponent.

			
 
				+		*/

			
 
				+		y := buf >> (_DIGIT_BITS - 1) & 1;

			
 
				+		buf <<= 1;

			
 
				+

			
 
				+		/*

			
 
				+			If the bit is zero and mode == 0 then we ignore it.

			
 
				+			These represent the leading zero bits before the first 1 bit

			
 
				+			in the exponent.  Technically this opt is not required but it

			
 
				+			does lower the # of trivial squaring/reductions used.

			
 
				+		*/

			
 
				+		if mode == 0 && y == 0 {

			
 
				+			continue;

			
 
				+		}

			
 
				+

			
 
				+		/*

			
 
				+			If the bit is zero and mode == 1 then we square.

			
 
				+		*/

			
 
				+		if mode == 1 && y == 0 {

			
 
				+			internal_sqr(res, res)                                   or_return;

			
 
				+			redux(res, P, mu)                                        or_return;

			
 
				+			continue;

			
 
				+		}

			
 
				+

			
 
				+		/*

			
 
				+			Else we add it to the window.

			
 
				+		*/

			
 
				+		bitcpy += 1;

			
 
				+		bitbuf |= (y << (winsize - bitcpy));

			
 
				+		mode    = 2;

			
 
				+

			
 
				+		if (bitcpy == winsize) {

			
 
				+			/*

			
 
				+				Window is filled so square as required and multiply.

			
 
				+				Square first.

			
 
				+			*/

			
 
				+			for x = 0; x < int(winsize); x += 1 {

			
 
				+				internal_sqr(res, res)                               or_return;

			
 
				+				redux(res, P, mu)                                    or_return;

			
 
				+			}

			
 
				+

			
 
				+			/*

			
 
				+				Then multiply.

			
 
				+			*/

			
 
				+			internal_mul(res, res, &M[bitbuf])                       or_return;

			
 
				+			redux(res, P, mu)                                        or_return;

			
 
				+

			
 
				+			/*

			
 
				+				Empty window and reset.

			
 
				+			*/

			
 
				+			bitcpy = 0;

			
 
				+			bitbuf = 0;

			
 
				+			mode   = 1;

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		If bits remain then square/multiply.

			
 
				+	*/

			
 
				+	if mode == 2 && bitcpy > 0 {

			
 
				+		/*

			
 
				+			Square then multiply if the bit is set.

			
 
				+		*/

			
 
				+		for x = 0; x < int(bitcpy); x += 1 {

			
 
				+			internal_sqr(res, res)                                   or_return;

			
 
				+			redux(res, P, mu)                                        or_return;

			
 
				+

			
 
				+			bitbuf <<= 1;

			
 
				+			if ((bitbuf & (1 << winsize)) != 0) {

			
 
				+				/*

			
 
				+					Then multiply.

			
 
				+				*/

			
 
				+				internal_mul(res, res, &M[1])                        or_return;

			
 
				+				redux(res, P, mu)                                    or_return;

			
 
				+			}

			
 
				+		}

			
 
				+	}

			
 
				+	return err;

			
 
				+}

			
 
				+

			
 
				+/*

			
 
				+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85

			
 
				+

			
 
				+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.

			
 
				+	The value of `k` changes based on the size of the exponent.

			
 
				+

			
 
				+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]

			
 
				+

			
 
				+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.

			
 
				+*/

			
 
				+_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {

			
 
				+	context.allocator = allocator;

			
 
				+

			
 
				+	M := [_TAB_SIZE]Int{};

			
 
				+	winsize: uint;

			
 
				+

			
 
				+	/*

			
 
				+		Use a pointer to the reduction algorithm.

			
 
				+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.

			
 
				+	*/

			
 
				+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);

			
 
				+

			
 
				+	defer {

			
 
				+		internal_destroy(&M[1]);

			
 
				+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {

			
 
				+			internal_destroy(&M[x]);

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Find window size.

			
 
				+	*/

			
 
				+	x := internal_count_bits(X);

			
 
				+	switch {

			
 
				+	case x <= 7:

			
 
				+		winsize = 2;

			
 
				+	case x <= 36:

			
 
				+		winsize = 3;

			
 
				+	case x <= 140:

			
 
				+		winsize = 4;

			
 
				+	case x <= 450:

			
 
				+		winsize = 5;

			
 
				+	case x <= 1303:

			
 
				+		winsize = 6;

			
 
				+	case x <= 3529:

			
 
				+		winsize = 7;

			
 
				+	case:

			
 
				+		winsize = 8;

			
 
				+	}

			
 
				+

			
 
				+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;

			
 
				+

			
 
				+	/*

			
 
				+		Init M array

			
 
				+		Init first cell.

			
 
				+	*/

			
 
				+	cap := internal_int_allocated_cap(P);

			
 
				+	internal_grow(&M[1], cap)                                        or_return;

			
 
				+

			
 
				+	/*

			
 
				+		Now init the second half of the array.

			
 
				+	*/

			
 
				+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {

			
 
				+		internal_grow(&M[x], cap)                                    or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Determine and setup reduction code.

			
 
				+	*/

			
 
				+	rho: DIGIT;

			
 
				+

			
 
				+	if redmode == 0 {

			
 
				+		/*

			
 
				+			Now setup Montgomery.

			
 
				+		*/

			
 
				+		rho = _private_int_montgomery_setup(P)                       or_return;

			
 
				+

			
 
				+		/*

			
 
				+			Automatically pick the comba one if available (saves quite a few calls/ifs).

			
 
				+		*/

			
 
				+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {

			
 
				+			redux = _private_montgomery_reduce_comba;

			
 
				+		} else {

			
 
				+			/*

			
 
				+				Use slower baseline Montgomery method.

			
 
				+			*/

			
 
				+			redux = _private_int_montgomery_reduce;

			
 
				+		}

			
 
				+	} else if redmode == 1 {

			
 
				+		/*

			
 
				+			Setup DR reduction for moduli of the form B**k - b.

			
 
				+		*/

			
 
				+		rho = _private_int_dr_setup(P);

			
 
				+		redux = _private_int_dr_reduce;

			
 
				+	} else {

			
 
				+		/*

			
 
				+			Setup DR reduction for moduli of the form 2**k - b.

			
 
				+		*/

			
 
				+		rho = _private_int_reduce_2k_setup(P)                        or_return;

			
 
				+		redux = _private_int_reduce_2k;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Setup result.

			
 
				+	*/

			
 
				+	internal_grow(res, cap)                                          or_return;

			
 
				+

			
 
				+	/*

			
 
				+		Create M table

			
 
				+		The first half of the table is not computed, though, except for M[0] and M[1]

			
 
				+	*/

			
 
				+

			
 
				+	if redmode == 0 {

			
 
				+		/*

			
 
				+			Now we need R mod m.

			
 
				+		*/

			
 
				+		_private_int_montgomery_calc_normalization(res, P)           or_return;

			
 
				+

			
 
				+		/*

			
 
				+			Now set M[1] to G * R mod m.

			
 
				+		*/

			
 
				+		internal_mulmod(&M[1], G, res, P)                            or_return;

			
 
				+	} else {

			
 
				+		internal_one(res)                                            or_return;

			
 
				+		internal_mod(&M[1], G, P)                                    or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.

			
 
				+	*/

			
 
				+	slot := 1 << (winsize - 1);

			
 
				+	internal_copy(&M[slot], &M[1])                                   or_return;

			
 
				+

			
 
				+	for x = 0; x < int(winsize - 1); x += 1 {

			
 
				+		internal_sqr(&M[slot], &M[slot])                             or_return;

			
 
				+		redux(&M[slot], P, rho)                                      or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Create upper table.

			
 
				+	*/

			
 
				+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {

			
 
				+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;

			
 
				+		redux(&M[x], P, rho)                                         or_return;

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		Set initial mode and bit cnt.

			
 
				+	*/

			
 
				+	mode   := 0;

			
 
				+	bitcnt := 1;

			
 
				+	buf    := DIGIT(0);

			
 
				+	digidx := X.used - 1;

			
 
				+	bitcpy := 0;

			
 
				+	bitbuf := DIGIT(0);

			
 
				+

			
 
				+	for {

			
 
				+		/*

			
 
				+			Grab next digit as required.

			
 
				+		*/

			
 
				+		bitcnt -= 1;

			
 
				+		if bitcnt == 0 {

			
 
				+			/*

			
 
				+				If digidx == -1 we are out of digits so break.

			
 
				+			*/

			
 
				+			if digidx == -1 { break; }

			
 
				+

			
 
				+			/*

			
 
				+				Read next digit and reset the bitcnt.

			
 
				+			*/

			
 
				+			buf    = X.digit[digidx];

			
 
				+			digidx -= 1;

			
 
				+			bitcnt = _DIGIT_BITS;

			
 
				+		}

			
 
				+

			
 
				+		/*

			
 
				+			Grab the next msb from the exponent.

			
 
				+		*/

			
 
				+		y := (buf >> (_DIGIT_BITS - 1)) & 1;

			
 
				+		buf <<= 1;

			
 
				+

			
 
				+		/*

			
 
				+			If the bit is zero and mode == 0 then we ignore it.

			
 
				+			These represent the leading zero bits before the first 1 bit in the exponent.

			
 
				+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.

			
 
				+		*/

			
 
				+		if mode == 0 && y == 0 { continue; }

			
 
				+

			
 
				+		/*

			
 
				+			If the bit is zero and mode == 1 then we square.

			
 
				+		*/

			
 
				+		if mode == 1 && y == 0 {

			
 
				+			internal_sqr(res, res)                                   or_return;

			
 
				+			redux(res, P, rho)                                       or_return;

			
 
				+			continue;

			
 
				+		}

			
 
				+

			
 
				+		/*

			
 
				+			Else we add it to the window.

			
 
				+		*/

			
 
				+		bitcpy += 1;

			
 
				+		bitbuf |= (y << (winsize - uint(bitcpy)));

			
 
				+		mode    = 2;

			
 
				+

			
 
				+		if bitcpy == int(winsize) {

			
 
				+			/*

			
 
				+				Window is filled so square as required and multiply

			
 
				+				Square first.

			
 
				+			*/

			
 
				+			for x = 0; x < int(winsize); x += 1 {

			
 
				+				internal_sqr(res, res)                               or_return;

			
 
				+				redux(res, P, rho)                                   or_return;

			
 
				+			}

			
 
				+

			
 
				+			/*

			
 
				+				Then multiply.

			
 
				+			*/

			
 
				+			internal_mul(res, res, &M[bitbuf])                       or_return;

			
 
				+			redux(res, P, rho)                                       or_return;

			
 
				+

			
 
				+			/*

			
 
				+				Empty window and reset.

			
 
				+			*/

			
 
				+			bitcpy = 0;

			
 
				+			bitbuf = 0;

			
 
				+			mode   = 1;

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	/*

			
 
				+		If bits remain then square/multiply.

			
 
				+	*/

			
 
				+	if mode == 2 && bitcpy > 0 {

			
 
				+		/*

			
 
				+			Square then multiply if the bit is set.

			
 
				+		*/

			
 
				+		for x = 0; x < bitcpy; x += 1 {

			
 
				+			internal_sqr(res, res)                                   or_return;

			
 
				+			redux(res, P, rho)                                       or_return;

			
 
				+

			
 
				+			/*

			
 
				+				Get next bit of the window.

			
 
				+			*/

			
 
				+			bitbuf <<= 1;

			
 
				+			if bitbuf & (1 << winsize) != 0 {

			
 
				+				/*

			
 
				+					Then multiply.

			
 
				+				*/

			
 
				+				internal_mul(res, res, &M[1])                        or_return;

			
 
				+				redux(res, P, rho)                                   or_return;

			
 
				+			}

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	if redmode == 0 {

			
 
				+		/*

			
 
				+			Fixup result if Montgomery reduction is used.

			
 
				+			Recall that any value in a Montgomery system is actually multiplied by R mod n.

			
 
				+			So we have to reduce one more time to cancel out the factor of R.

			
 
				+		*/

			
 
				+		redux(res, P, rho)                                           or_return;

			
 
				+	}

			
 
				+

			
 
				+	return nil;

			
 
				+}

			
 
				+

			
 
				 /*

			
 
				 	hac 14.61, pp608

			
 
				 */