|
@@ -1,3402 +1,3355 @@
|
|
|
-/*
|
|
|
- Copyright 2021 Jeroen van Rijn <[email protected]>.
|
|
|
- Made available under Odin's BSD-3 license.
|
|
|
-
|
|
|
- An arbitrary precision mathematics implementation in Odin.
|
|
|
- For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
|
|
|
- The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
|
|
|
-
|
|
|
- ============================= Private procedures =============================
|
|
|
-
|
|
|
- Private procedures used by the above low-level routines follow.
|
|
|
-
|
|
|
- Don't call these yourself unless you really know what you're doing.
|
|
|
- They include implementations that are optimal for certain ranges of input only.
|
|
|
-
|
|
|
- These aren't exported for the same reasons.
|
|
|
-*/
|
|
|
-
|
|
|
-
|
|
|
-package math_big
|
|
|
-
|
|
|
-import "base:intrinsics"
|
|
|
-import "core:mem"
|
|
|
-
|
|
|
/*
	Schoolbook multiplication: `dest` = |a| * |b|, keeping only the lowest `digits` digits of the product.

	Follows HAC pp. 595, Algorithm 14.12, adapted so the caller decides how many output digits are produced.
	When the operands are small enough, the work is handed to `_private_int_mul_comba` instead.
*/
_private_int_mul :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		Prefer the comba multiplier whenever its column array is large enough.
	*/
	use_comba := digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA
	if use_comba {
		return #force_inline _private_int_mul_comba(dest, a, b, digits)
	}

	/*
		The product is accumulated in a scratch `Int`, which is exchanged with `dest` once complete.
	*/
	product := &Int{}
	internal_grow(product, max(digits, _DEFAULT_DIGIT_COUNT)) or_return
	product.used = digits

	for row in 0 ..< a.used {
		/*
			Never touch more than `digits` output digits for this row.
		*/
		limit := min(b.used, digits - row)
		carry: _WORD = 0
		col   := 0

		/*
			Multiply-accumulate one row of the product, carrying as we go.
		*/
		#no_bounds_check for col < limit {
			acc := _WORD(product.digit[row + col]) + _WORD(a.digit[row]) * _WORD(b.digit[col]) + carry

			/* Low part stays in the column, high part becomes the next carry. */
			product.digit[row + col] = DIGIT(acc & _WORD(_MASK))
			carry = acc >> _DIGIT_BITS

			col += 1
		}

		/*
			Store the final carry of the row, but only if it lands below `digits`.
		*/
		if row + col < digits {
			product.digit[row + limit] = DIGIT(carry)
		}
	}

	internal_swap(dest, product)
	internal_destroy(product)
	return internal_clamp(dest)
}
|
|
|
-
|
|
|
-
|
|
|
/*
	Toom-Cook 3-way multiplication.

	Considerably more involved than Karatsuba, but with a lower asymptotic running time of O(N**1.464).
	It only pays off for VERY large inputs (think thousands of digits).

	This file contains code from J. Arndt's book "Matters Computational"
	and the accompanying FXT-library with permission of the author.

	Setup from:
		Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
		18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.

	The interpolation from the paper above needs one more temporary than the one used here, which is from:
		Bodrato, Marco, and Alberto Zanoni. "What about Toom-Cook matrices optimality."
		Centro Vito Volterra Universita di Roma Tor Vergata (2006)
*/
_private_int_mul_toom :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		Cuts `src` into three limbs: `lo` and `mid` receive `limb` digits each, `hi` receives the remainder.
		In other words: src = hi * x^2 + mid * x + lo.
	*/
	split3 :: proc(lo, mid, hi, src: ^Int, limb: int) -> Error {
		internal_grow(lo,  limb)                or_return
		internal_grow(mid, limb)                or_return
		internal_grow(hi,  src.used - 2 * limb) or_return

		lo.used, mid.used = limb, limb
		hi.used = src.used - 2 * limb

		internal_copy_digits(lo,  src, lo.used)            or_return
		internal_copy_digits(mid, src, mid.used, limb)     or_return
		internal_copy_digits(hi,  src, hi.used,  2 * limb) or_return

		internal_clamp(lo)
		internal_clamp(mid)
		internal_clamp(hi)
		return nil
	}

	S1, S2, T1, a0, a1, a2, b0, b1, b2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2)

	/*
		Initialize the temporaries.
	*/
	internal_init_multi(S1, S2, T1) or_return

	/*
		Limb size: a third of the shorter operand.
	*/
	B := min(a.used, b.used) / 3

	/*
		a = a2 * x^2 + a1 * x + a0
		b = b2 * x^2 + b1 * x + b0
	*/
	split3(a0, a1, a2, a, B) or_return
	split3(b0, b1, b2, b, B) or_return

	/*
		S1 = (a2 + a1 + a0) * (b2 + b1 + b0)
	*/
	internal_add(T1,   a2,   a1) or_return /* T1   = a2 + a1   */
	internal_add(S2,   T1,   a0) or_return /* S2   = T1 + a0   */
	internal_add(dest, b2,   b1) or_return /* dest = b2 + b1   */
	internal_add(S1,   dest, b0) or_return /* S1   = dest + b0 */
	internal_mul(S1,   S1,   S2) or_return /* S1   = S1 * S2   */

	/*
		S2 = (4*a2 + 2*a1 + a0) * (4*b2 + 2*b1 + b0)
	*/
	internal_add(T1, T1, a2)       or_return /* T1   = T1 + a2   */
	internal_int_shl1(T1, T1)      or_return /* T1   = T1 << 1   */
	internal_add(T1, T1, a0)       or_return /* T1   = T1 + a0   */
	internal_add(dest, dest, b2)   or_return /* dest = dest + b2 */
	internal_int_shl1(dest, dest)  or_return /* dest = dest << 1 */
	internal_add(dest, dest, b0)   or_return /* dest = dest + b0 */
	internal_mul(S2, T1, dest)     or_return /* S2   = T1 * dest */

	/*
		S3 = (a2 - a1 + a0) * (b2 - b1 + b0), kept in `a1`. Also: b1 = a2 * b2.
	*/
	internal_sub(a1, a2, a1) or_return /* a1 = a2 - a1 */
	internal_add(a1, a1, a0) or_return /* a1 = a1 + a0 */
	internal_sub(b1, b2, b1) or_return /* b1 = b2 - b1 */
	internal_add(b1, b1, b0) or_return /* b1 = b1 + b0 */
	internal_mul(a1, a1, b1) or_return /* a1 = a1 * b1 */
	internal_mul(b1, a2, b2) or_return /* b1 = a2 * b2 */

	/*
		Interpolation, starting with S2 = (S2 - S3) / 3.
	*/
	internal_sub(S2, S2, a1)       or_return /* S2 = S2 - a1                         */
	_private_int_div_3(S2, S2)     or_return /* S2 = S2 / 3, which divides exactly   */
	internal_sub(a1, S1, a1)       or_return /* a1 = S1 - a1                         */
	internal_int_shr1(a1, a1)      or_return /* a1 = a1 >> 1                         */
	internal_mul(a0, a0, b0)       or_return /* a0 = a0 * b0                         */
	internal_sub(S1, S1, a0)       or_return /* S1 = S1 - a0                         */
	internal_sub(S2, S2, S1)       or_return /* S2 = S2 - S1                         */
	internal_int_shr1(S2, S2)      or_return /* S2 = S2 >> 1                         */
	internal_sub(S1, S1, a1)       or_return /* S1 = S1 - a1                         */
	internal_sub(S1, S1, b1)       or_return /* S1 = S1 - b1                         */
	internal_int_shl1(T1, b1)      or_return /* T1 = b1 << 1                         */
	internal_sub(S2, S2, T1)       or_return /* S2 = S2 - T1                         */
	internal_sub(a1, a1, S2)       or_return /* a1 = a1 - S2                         */

	/*
		Recompose: P = b1*x^4 + S2*x^3 + S1*x^2 + a1*x + a0, with x = radix**B.
	*/
	_private_int_shl_leg(b1, 4 * B) or_return
	_private_int_shl_leg(S2, 3 * B) or_return
	internal_add(b1, b1, S2)        or_return
	_private_int_shl_leg(S1, 2 * B) or_return
	internal_add(b1, b1, S1)        or_return
	_private_int_shl_leg(a1, 1 * B) or_return
	internal_add(b1, b1, a1)        or_return
	internal_add(dest, b1, a0)      or_return

	/*
		At this point a * b - P == 0.
	*/
	return nil
}
|
|
|
-
|
|
|
/*
	Karatsuba multiplication: `dest` = |a| * |b| using three half-size multiplications.

	Let `R` be the radix [e.g. 2**_DIGIT_BITS] and let `n` be half the number of digits
	of the shorter operand (the local `B` in the code below holds this `n`).

		a = a1 * R**n + a0
		b = b1 * R**n + b0

	Then
		a * b = a1b1 * R**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * R**n + a0b0

	Note that a1b1 and a0b0 are used twice and only need to be computed once.
	So in total three half-size (half # of digit) multiplications are performed:
		a0b0, a1b1 and (a1 + a0)(b1 + b0)

	A multiplication of half the digits requires 1/4th the number of single precision
	multiplications, so after one call 25% of the single precision multiplications are saved.

	The calls to `internal_mul` can end up back in this procedure if a0, a1, b0 or b1 are
	above the threshold. This is known as divide-and-conquer and leads to the famous
	O(N**lg(3)) or O(N**1.584) work, which is asymptotically lower than the standard O(N**2)
	of the baseline/comba methods. Generally though, the overhead of this method doesn't pay
	off until a certain size is reached, of around 80 used DIGITs.

	In the code `x` stands for `a` and `y` for `b`.
*/
_private_int_mul_karatsuba :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	x0, x1, y0, y1, t1, x0y0, x1y1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(x0, x1, y0, y1, t1, x0y0, x1y1)

	/*
		Min # of digits, divided by two. This is the split point for both operands.
	*/
	B := min(a.used, b.used) >> 1

	/*
		Init all the temps.
	*/
	internal_grow(x0,   B)          or_return
	internal_grow(x1,   a.used - B) or_return
	internal_grow(y0,   B)          or_return
	internal_grow(y1,   b.used - B) or_return
	internal_grow(t1,   B * 2)      or_return
	internal_grow(x0y0, B * 2)      or_return
	internal_grow(x1y1, B * 2)      or_return

	/*
		Lower halves get `B` digits, upper halves whatever remains.
	*/
	x0.used, y0.used = B, B
	x1.used = a.used - B
	y1.used = b.used - B

	/*
		We copy the digits directly instead of using higher level functions
		since we also need to shift the digits.
		Copy failures are propagated, as the Toom-Cook multiplier does for the same calls.
	*/
	internal_copy_digits(x0, a, x0.used)    or_return
	internal_copy_digits(y0, b, y0.used)    or_return
	internal_copy_digits(x1, a, x1.used, B) or_return
	internal_copy_digits(y1, b, y1.used, B) or_return

	/*
		Only need to clamp the lower words since by definition the
		upper words x1/y1 must have a known number of digits.
	*/
	internal_clamp(x0)
	internal_clamp(y0)

	/*
		Calculate the products x0y0 and x1y1, then (x1 + x0) * (y1 + y0).
		Once x0y0 has been computed `x0` is free, so it doubles as the second temporary.
	*/
	internal_mul(x0y0, x0, y0) or_return /* x0y0 = x0 * y0               */
	internal_mul(x1y1, x1, y1) or_return /* x1y1 = x1 * y1               */
	internal_add(t1,   x1, x0) or_return /* t1   = x1 + x0               */
	internal_add(x0,   y1, y0) or_return /* x0   = y1 + y0               */
	internal_mul(t1,   t1, x0) or_return /* t1   = (x1 + x0) * (y1 + y0) */

	/*
		Subtract both outer products to isolate the middle term.
	*/
	internal_add(x0, x0y0, x1y1) or_return /* x0 = x0y0 + x1y1                          */
	internal_sub(t1, t1,   x0)   or_return /* t1 = (x1 + x0)*(y1 + y0) - (x1y1 + x0y0)  */

	/*
		Shift the middle term by B digits and the high term by 2*B digits.
	*/
	_private_int_shl_leg(t1,   B)     or_return /* t1   = t1   << B   digits */
	_private_int_shl_leg(x1y1, B * 2) or_return /* x1y1 = x1y1 << 2*B digits */

	internal_add(t1,   x0y0, t1)   or_return /* t1   = x0y0 + t1        */
	internal_add(dest, t1,   x1y1) or_return /* dest = x0y0 + t1 + x1y1 */

	return nil
}
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
/*
	Fast column-array (comba) multiplier.

	All products belonging to one output column are summed first and the carry is
	resolved once per column. That keeps the nested loops very simple and easy to
	schedule on super-scalar processors.

	The number of output digits is capped at `digits`, so a caller that only needs
	a partial product does not pay for the upper digits (a feature required for
	fast Barrett reduction).

	Based on Algorithm 14.12 on pp.595 of HAC.
*/
_private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		Column results are staged here before being copied into `dest`.
	*/
	columns: [_WARRAY]DIGIT = ---

	/*
		Make sure `dest` can hold the requested number of digits.
	*/
	internal_grow(dest, digits) or_return

	/*
		Number of output digits to produce.
	*/
	count := min(digits, a.used + b.used)

	/*
		Running accumulator; after each column it holds the carry into the next one.
	*/
	acc: _WORD = 0

	for col in 0 ..< count {
		/*
			Starting offsets into the two operands for this column.
		*/
		b_idx := min(b.used - 1, col)
		a_idx := col - b_idx

		/*
			How many products feed this column, essentially
			while (tx++ < a->used && ty-- >= 0) { ... }
		*/
		terms := min(a.used - a_idx, b_idx + 1)

		#no_bounds_check for k in 0 ..< terms {
			acc += _WORD(a.digit[a_idx + k]) * _WORD(b.digit[b_idx - k])
		}

		/*
			Low bits are the column digit, the rest carries over.
		*/
		columns[col] = DIGIT(acc) & _MASK
		acc >>= _WORD(_DIGIT_BITS)
	}

	/*
		Publish the result into `dest`.
	*/
	old_used := dest.used
	dest.used = count
	copy_slice(dest.digit[:], columns[:count])

	/*
		Clear digits left over from the previous contents of `dest`.
	*/
	internal_zero_unused(dest, old_used)

	/*
		Trim leading zeroes.
	*/
	return internal_clamp(dest)
}
|
|
|
-
|
|
|
/*
	Multiplies |a| * |b| and does not compute the lower `digits` digits
	[meant to get the higher part of the product].

	The product is accumulated in a zero-initialized temporary and swapped into `dest`
	at the end, the same way `_private_int_mul` does it. Accumulating straight into
	`dest` would add whatever digits `dest` already held into the result, and would
	corrupt the inputs whenever `dest` aliases `a` or `b`.

	NOTE(review): because of the swap, `dest` ends up with the temporary's default sign,
	as it does in `_private_int_mul` — callers are presumably expected to fix up the sign.
*/
_private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		Can we use the fast multiplier?
	*/
	if a.used + b.used + 1 < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
		return _private_int_mul_high_comba(dest, a, b, digits)
	}

	width := a.used + b.used + 1

	/*
		Temporary output `Int`, swapped for `dest` when done.
	*/
	t := &Int{}
	defer internal_destroy(t)

	internal_grow(t, max(width, _DEFAULT_DIGIT_COUNT)) or_return
	t.used = width

	pa := a.used
	pb := b.used
	for ix := 0; ix < pa; ix += 1 {
		carry := DIGIT(0)

		/*
			Skip the products that only contribute below `digits`.
			Once `ix` exceeds `digits` every product of this row is needed,
			so the start index is clamped at zero rather than going negative.
		*/
		for iy := max(digits - ix, 0); iy < pb; iy += 1 {
			/*
				Calculate the double precision result.
			*/
			r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)

			/*
				Get the lower part.
			*/
			t.digit[ix + iy] = DIGIT(r & _WORD(_MASK))

			/*
				Carry the carry.
			*/
			carry = DIGIT(r >> _WORD(_DIGIT_BITS))
		}
		t.digit[ix + pb] = carry
	}

	internal_clamp(t) or_return
	internal_swap(dest, t)
	return nil
}
|
|
|
-
|
|
|
/*
	Variant of `_private_int_mul_comba` that only produces the output digits from index `digits` upwards.
	See the comments on `_private_int_mul_comba` for how the column method works.

	Barrett reduction uses this because one of its multiplications only needs
	the higher digits, which essentially halves the work.

	Based on Algorithm 14.12 on pp.595 of HAC.
*/
_private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	columns: [_WARRAY]DIGIT = ---
	acc: _WORD = 0

	/*
		Total number of digits in the full product; grow `dest` to fit.
	*/
	total := a.used + b.used
	internal_grow(dest, total) or_return

	for col in digits ..< total {
		/*
			Starting offsets into the two operands for this column.
		*/
		b_idx := min(b.used - 1, col)
		a_idx := col - b_idx

		/*
			Number of products feeding this column, essentially
			while (tx++ < a->used && ty-- >= 0) { ... }
		*/
		terms := min(a.used - a_idx, b_idx + 1)

		for k in 0 ..< terms {
			acc += _WORD(a.digit[a_idx + k]) * _WORD(b.digit[b_idx - k])
		}

		/*
			Keep the low bits as the column digit, carry the rest.
		*/
		columns[col] = DIGIT(acc) & DIGIT(_MASK)
		acc >>= _WORD(_DIGIT_BITS)
	}

	/*
		Publish the computed columns into `dest`; digits below `digits` are left as they were.
	*/
	old_used := dest.used
	dest.used = total

	for col in digits ..< total {
		dest.digit[col] = columns[col]
	}

	/*
		Zero whatever is left over from the previous contents of `dest`.
	*/
	internal_zero_unused(dest, old_used)

	/*
		Trim leading zeroes.
	*/
	return internal_clamp(dest)
}
|
|
|
-
|
|
|
/*
	Balanced multiplication for operands of very different sizes.

	The larger operand is cut into blocks as long as the smaller operand. Each block is
	multiplied with the smaller operand, shifted into position and added to the running total,
	i.e. a single-digit multiplication with the smaller number acting as the single "digit".

	If either operand has no used digits, the product is zero and `dest` is zeroed directly.
	That also avoids dividing by a block size of zero.
*/
_private_int_mul_balance :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator
	a, b := a, b

	/*
		Block size is the length of the shorter operand.
	*/
	b_size := min(a.used, b.used)
	if b_size == 0 {
		return internal_zero(dest)
	}
	n_blocks := max(a.used, b.used) / b_size

	a0, tmp, r := &Int{}, &Int{}, &Int{}
	defer internal_destroy(a0, tmp, r)

	internal_grow(a0, b_size + 2) or_return
	internal_init_multi(tmp, r)   or_return

	/*
		Make sure that `a` is the larger one.
	*/
	if a.used < b.used {
		a, b = b, a
	}
	assert(a.used >= b.used)

	i, j := 0, 0
	for ; i < n_blocks; i += 1 {
		/*
			Cut a slice off of `a`.
		*/
		a0.used = b_size
		internal_copy_digits(a0, a, a0.used, j) or_return
		j += a0.used
		internal_clamp(a0)

		/*
			Multiply with `b`.
		*/
		internal_mul(tmp, a0, b) or_return

		/*
			Shift `tmp` to the correct position.
		*/
		_private_int_shl_leg(tmp, b_size * i) or_return

		/*
			Add to output. No carry needed.
		*/
		internal_add(r, r, tmp) or_return
	}

	/*
		The left-overs: whatever part of `a` did not fill a whole block.
		`i` equals `n_blocks` here, so the shift places it above all full blocks.
	*/
	if j < a.used {
		a0.used = a.used - j
		internal_copy_digits(a0, a, a0.used, j) or_return
		j += a0.used
		internal_clamp(a0)

		internal_mul(tmp, a0, b)              or_return
		_private_int_shl_leg(tmp, b_size * i) or_return
		internal_add(r, r, tmp)               or_return
	}

	internal_swap(dest, r)
	return
}
|
|
|
-
|
|
|
/*
	Baseline squaring, `dest` = `src` * `src`. HAC pp.596-597, Algorithm 14.16.

	Expects `dest` and `src` to be non-`nil`, and `src` to have been initialized.
*/
_private_int_sqr :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	count := src.used
	width := (2 * count) + 1

	/*
		Scratch result, sized for the largest possible square or `_DEFAULT_DIGIT_COUNT`, whichever is bigger.
	*/
	square := &Int{}
	internal_grow(square, max(width, _DEFAULT_DIGIT_COUNT)) or_return
	square.used = width

	#no_bounds_check for i in 0 ..< count {
		/*
			Diagonal term: digit `i` squared lands at position 2*i.
		*/
		acc := _WORD(square.digit[i + i]) + (_WORD(src.digit[i]) * _WORD(src.digit[i]))
		square.digit[i + i] = DIGIT(acc & _WORD(_MASK))
		carry := DIGIT(acc >> _DIGIT_BITS)

		/*
			Off-diagonal terms each occur twice in the square.
		*/
		j := i + 1
		#no_bounds_check for ; j < count; j += 1 {
			acc = _WORD(src.digit[i]) * _WORD(src.digit[j])

			/*
				Doubling is done by adding the product twice, which is easier to optimize than `* 2`.
			*/
			acc = _WORD(square.digit[i + j]) + acc + acc + _WORD(carry)

			square.digit[i + j] = DIGIT(acc & _WORD(_MASK))
			carry = DIGIT(acc >> _DIGIT_BITS)
		}

		/*
			Ripple any remaining carry into the higher digits.
		*/
		#no_bounds_check for carry != 0 {
			acc = _WORD(square.digit[i + j]) + _WORD(carry)
			square.digit[i + j] = DIGIT(acc & _WORD(_MASK))
			carry = DIGIT(acc >> _WORD(_DIGIT_BITS))
			j += 1
		}
	}

	err = internal_clamp(square)
	internal_swap(dest, square)
	internal_destroy(square)
	return err
}
|
|
|
-
|
|
|
/*
	Comba squaring.

	The gist: proceed as in comba multiplication, except that the two offsets walking
	towards each other within a column must never meet. The iteration count is therefore
	additionally capped at half the distance between them. Every product summed in the
	inner loop is doubled afterwards, and the square term is added for even columns.

	Expects `dest` and `src` to be non-`nil`, and `src` to have been initialized.
*/
_private_int_sqr_comba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	columns: [_WARRAY]DIGIT = ---

	/*
		The square has twice as many digits as the source; grow `dest` to fit.
	*/
	count := uint(src.used) + uint(src.used)
	internal_grow(dest, int(count)) or_return

	carry := _WORD(0)
	acc: _WORD = ---

	#no_bounds_check for col := uint(0); col < count; col += 1 {
		/*
			Start each column with an empty accumulator.
		*/
		acc = 0

		/*
			Offsets of the two digits that start this column.
		*/
		hi := min(uint(src.used) - 1, col)
		lo := col - hi

		/*
			Number of products in this column, essentially
			while (tx++ < a->used && ty-- >= 0) { ... }
		*/
		terms := min(uint(src.used) - lo, hi + 1)

		/*
			For squaring the offsets may never meet. They approach each other at a rate of 2x,
			so halve the distance, rounding up because odd cases need to be executed.
		*/
		terms = min(terms, ((hi - lo) + 1) >> 1)

		#no_bounds_check for k := uint(0); k < terms; k += 1 {
			acc += _WORD(src.digit[lo + k]) * _WORD(src.digit[hi - k])
		}

		/*
			Double the inner product and add the carry from the previous column.
		*/
		acc = acc + acc + carry

		/*
			Even columns also contain a square term.
		*/
		if col & 1 == 0 {
			acc += _WORD(src.digit[col >> 1]) * _WORD(src.digit[col >> 1])
		}

		/*
			Store the column digit and derive the next carry.
		*/
		columns[col] = DIGIT(acc & _WORD(_MASK))
		carry = acc >> _DIGIT_BITS
	}

	/*
		Publish the result into `dest`.
	*/
	old_used := dest.used
	dest.used = src.used + src.used

	#no_bounds_check for col := uint(0); col < count; col += 1 {
		dest.digit[col] = columns[col] & _MASK
	}

	/*
		Clear digits left over from the previous contents of `dest`.
	*/
	internal_zero_unused(dest, old_used)

	return internal_clamp(dest)
}
|
|
|
-
|
|
|
/*
	Karatsuba squaring, computes `dest` = `src` * `src` using three half-size squarings.

	With `src` = x1 * R**B + x0 (R being the radix, B half the digit count):

		src**2 = x1x1 * R**2B + ((x1 + x0)**2 - (x0x0 + x1x1)) * R**B + x0x0

	See comments of `_private_int_mul_karatsuba` for details.
	It is essentially the same algorithm but merely tuned to perform recursive squarings.
*/
_private_int_sqr_karatsuba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	x0, x1, t1, t2, x0x0, x1x1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(x0, x1, t1, t2, x0x0, x1x1)

	/*
		# of digits, divided by two. This is the split point.
	*/
	B := src.used >> 1

	/*
		Init temps.
	*/
	internal_grow(x0,   B)                  or_return
	internal_grow(x1,   src.used - B)       or_return
	internal_grow(t1,   src.used * 2)       or_return
	internal_grow(t2,   src.used * 2)       or_return
	internal_grow(x0x0, B * 2)              or_return
	internal_grow(x1x1, (src.used - B) * 2) or_return

	/*
		Split `src`: the lower `B` digits go to `x0`, the remainder to `x1`.
		Both halves use `internal_copy_digits`, like `_private_int_mul_karatsuba` does,
		and copy failures are propagated instead of being dropped.
	*/
	x0.used = B
	x1.used = src.used - B

	internal_copy_digits(x0, src, x0.used)    or_return
	internal_copy_digits(x1, src, x1.used, B) or_return

	/*
		Only the lower half can have leading zeroes.
	*/
	internal_clamp(x0)

	/*
		Now calc the products x0*x0 and x1*x1.
	*/
	internal_sqr(x0x0, x0) or_return
	internal_sqr(x1x1, x1) or_return

	/*
		Now calc (x1 + x0)^2.
	*/
	internal_add(t1, x0, x1) or_return
	internal_sqr(t1, t1)     or_return

	/*
		Subtract x0x0 + x1x1 to isolate the middle term 2 * x0 * x1.
	*/
	internal_add(t2, x0x0, x1x1) or_return
	internal_sub(t1, t1,   t2)   or_return

	/*
		Shift the middle term by B digits and the high term by 2*B digits, then sum all three.
	*/
	_private_int_shl_leg(t1,   B)     or_return
	_private_int_shl_leg(x1x1, B * 2) or_return
	internal_add(t1,   t1, x0x0)      or_return
	internal_add(dest, t1, x1x1)      or_return

	return #force_inline internal_clamp(dest)
}
|
|
|
-
|
|
|
/*
	Toom-Cook 3-way squaring.

	Setup and interpolation follow algorithm SQR_3 in
		Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
		18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.

	In terms of the paper's names the temporaries are reused as follows:
		S0 = a0^2                  (kept in `S0`)
		S1 = (a2 + a1 + a0)^2      (kept in `a0`)
		S2 = (a2 - a1 + a0)^2      (kept in `dest`)
		S3 = 2 * a1 * a2           (kept in `a1`)
		S4 = a2^2                  (kept in `a2`)
*/
_private_int_sqr_toom :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	S0, a0, a1, a2 := &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(S0, a0, a1, a2)

	/*
		Initialize the temporary.
	*/
	internal_zero(S0) or_return

	/*
		Limb size: a third of the digit count.
	*/
	B := src.used / 3

	/*
		Split: src = a2 * x^2 + a1 * x + a0
	*/
	internal_grow(a0, B)                  or_return
	internal_grow(a1, B)                  or_return
	internal_grow(a2, src.used - (2 * B)) or_return

	a0.used, a1.used = B, B
	a2.used = src.used - 2 * B

	#force_inline mem.copy_non_overlapping(&a0.digit[0], &src.digit[    0], size_of(DIGIT) * a0.used)
	#force_inline mem.copy_non_overlapping(&a1.digit[0], &src.digit[    B], size_of(DIGIT) * a1.used)
	#force_inline mem.copy_non_overlapping(&a2.digit[0], &src.digit[2 * B], size_of(DIGIT) * a2.used)

	internal_clamp(a0)
	internal_clamp(a1)
	internal_clamp(a2)

	/*
		Evaluation.
	*/
	internal_sqr(S0, a0)         or_return /* S0   = a0^2                        */
	internal_add(a0, a0, a2)     or_return /* a0   = a0 + a2                     */
	internal_sub(dest, a0, a1)   or_return /* dest = a0 - a1        (S2's root)  */
	internal_add(a0, a0, a1)     or_return /* a0   = a0 + a1        (S1's root)  */
	internal_sqr(a0, a0)         or_return /* a0   = a0^2           (S1)         */
	internal_sqr(dest, dest)     or_return /* dest = dest^2         (S2)         */
	internal_mul(a1, a1, a2)     or_return /* a1   = a1 * a2                     */
	internal_shl(a1, a1, 1)      or_return /* a1   = a1 << 1        (S3)         */
	internal_sqr(a2, a2)         or_return /* a2   = a2^2           (S4)         */

	/*
		Interpolation.
	*/
	internal_add(dest, a0, dest) or_return /* dest = S1 + S2                     */
	internal_shr(dest, dest, 1)  or_return /* dest = dest >> 1      (tmp)        */
	internal_sub(a0, a0, dest)   or_return /* a0   = S1 - tmp                    */
	internal_sub(a0, a0, a1)     or_return /* a0   = a0 - S3                     */
	internal_sub(dest, dest, a2) or_return /* dest = tmp - S4                    */
	internal_sub(dest, dest, S0) or_return /* dest = dest - S0                   */

	/*
		Recomposition: P = a2*x^4 + a1*x^3 + dest*x^2 + a0*x + S0, with x = radix**B.
	*/
	_private_int_shl_leg(a2,   4 * B) or_return
	_private_int_shl_leg(a1,   3 * B) or_return
	_private_int_shl_leg(dest, 2 * B) or_return
	_private_int_shl_leg(a0,   1 * B) or_return

	internal_add(a2,   a2,   a1) or_return
	internal_add(dest, dest, a2) or_return
	internal_add(dest, dest, a0) or_return
	internal_add(dest, dest, S0) or_return

	/*
		At this point src^2 - P == 0.
	*/
	return #force_inline internal_clamp(dest)
}
|
|
|
-
|
|
|
/*
	Divide by three (based on routine from MPI and the GMP manual).

	Computes `numerator` / 3. The remainder (0, 1 or 2) is returned, and the quotient is
	stored in `quotient` unless that is `nil`. `quotient` may alias `numerator`: the result
	is built in a temporary and swapped in at the end.

	Digits are processed from the most significant used digit (index `used - 1`) down to
	digit 0, so nothing outside the used range of `numerator` or the temporary is touched.
*/
_private_int_div_3 :: proc(quotient, numerator: ^Int, allocator := context.allocator) -> (remainder: DIGIT, err: Error) {
	context.allocator = allocator

	/*
		b = 2^_DIGIT_BITS / 3
	*/
	b := (_WORD(1) << _WORD(_DIGIT_BITS)) / _WORD(3)

	q := &Int{}
	defer internal_destroy(q)

	internal_grow(q, numerator.used) or_return
	q.used = numerator.used
	q.sign = numerator.sign

	w, t: _WORD
	#no_bounds_check for ix := numerator.used - 1; ix >= 0; ix -= 1 {
		/*
			Bring down the next digit below the running remainder.
		*/
		w = (w << _WORD(_DIGIT_BITS)) | _WORD(numerator.digit[ix])
		if w >= 3 {
			/*
				Multiply w by [1/3].
			*/
			t = (w * b) >> _WORD(_DIGIT_BITS)

			/*
				Now subtract 3 * [w/3] from w, to get the remainder.
			*/
			w -= t+t+t

			/*
				Fixup the remainder as required since the optimization is not exact.
			*/
			for w >= 3 {
				t += 1
				w -= 3
			}
		} else {
			t = 0
		}
		q.digit[ix] = DIGIT(t)
	}
	remainder = DIGIT(w)

	/*
		[optional] store the quotient. A clamp failure is reported to the caller.
	*/
	if quotient != nil {
		internal_clamp(q) or_return
		internal_swap(q, quotient)
	}
	return remainder, nil
}
|
|
|
-
|
|
|
/*
	Signed Integer Division

	c*b + d == a [i.e. a/b, c=quotient, d=remainder], HAC pp.598 Algorithm 14.20

	Note that the description in HAC is horribly incomplete.
	For example, it doesn't consider the case where digits are removed from 'x' in
	the inner loop.

	It also doesn't consider the case that y has fewer than three digits, etc.
	The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.

	`quotient` and `remainder` are each optional (`nil` means "not wanted").
	The quotient is negative when the operand signs differ; a non-zero remainder
	takes the sign of `numerator`.

	Returns `.Max_Iterations_Reached` if the quotient digit correction loop
	does not settle within 100 iterations.
*/
_private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	error_if_immutable(quotient, remainder) or_return

	q, x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(q, x, y, t1, t2)

	internal_grow(q, numerator.used + 2) or_return
	q.used = numerator.used + 2

	internal_init_multi(t1, t2) or_return
	internal_copy(x, numerator) or_return
	internal_copy(y, denominator) or_return

	/*
		Fix the sign. We work on magnitudes and re-apply the signs at the end.
	*/
	neg := numerator.sign != denominator.sign
	x.sign = .Zero_or_Positive
	y.sign = .Zero_or_Positive

	/*
		Normalize both x and y, ensure that y >= b/2, [b == 2**MP_DIGIT_BIT]
	*/
	norm := internal_count_bits(y) % _DIGIT_BITS

	if norm < _DIGIT_BITS - 1 {
		norm = (_DIGIT_BITS - 1) - norm
		internal_shl(x, x, norm) or_return
		internal_shl(y, y, norm) or_return
	} else {
		norm = 0
	}

	/*
		Note: HAC does 0 based, so if used==5 then it's 0,1,2,3,4, i.e. use 4
	*/
	n := x.used - 1
	t := y.used - 1

	/*
		while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} }
		y = y*b**{n-t}
	*/
	_private_int_shl_leg(y, n - t) or_return

	gte := internal_gte(x, y)
	for gte {
		q.digit[n - t] += 1
		internal_sub(x, x, y) or_return
		gte = internal_gte(x, y)
	}

	/*
		Reset y by shifting it back down.
	*/
	_private_int_shr_leg(y, n - t)

	/*
		Step 3. for i from n down to (t + 1).
	*/
	#no_bounds_check for i := n; i >= (t + 1); i -= 1 {
		if i > x.used { continue }

		/*
			Index of the quotient digit produced by this iteration.
		*/
		qi := (i - t) - 1

		/*
			step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt

			b-1 is `_MASK`. The estimate has to be an upper bound for the true
			quotient digit, because the correction loop below only ever decrements it.
		*/
		if x.digit[i] == y.digit[t] {
			q.digit[qi] = _MASK
		} else {
			tmp := _WORD(x.digit[i]) << _DIGIT_BITS
			tmp |= _WORD(x.digit[i - 1])
			tmp /= _WORD(y.digit[t])
			if tmp > _WORD(_MASK) {
				tmp = _WORD(_MASK)
			}
			q.digit[qi] = DIGIT(tmp & _WORD(_MASK))
		}

		/* while (q{i-t-1} * (yt * b + y{t-1})) >
					xi * b**2 + xi-1 * b + xi-2

			do q{i-t-1} -= 1;
		*/

		iter := 0

		/*
			Pre-increment (mod b) so that the first pass of the loop below,
			which decrements first, tests the estimate itself.
		*/
		q.digit[qi] = (q.digit[qi] + 1) & _MASK
		#no_bounds_check for {
			q.digit[qi] = (q.digit[qi] - 1) & _MASK

			/*
				Find left hand.
			*/
			internal_zero(t1)
			t1.digit[0] = ((t - 1) < 0) ? 0 : y.digit[t - 1]
			t1.digit[1] = y.digit[t]
			t1.used = 2
			internal_mul(t1, t1, q.digit[qi]) or_return

			/*
				Find right hand.
			*/
			t2.digit[0] = ((i - 2) < 0) ? 0 : x.digit[i - 2]
			t2.digit[1] = x.digit[i - 1] /* i >= 1 always holds */
			t2.digit[2] = x.digit[i]
			t2.used = 3

			if internal_lte(t1, t2) {
				break
			}
			iter += 1; if iter > 100 {
				return .Max_Iterations_Reached
			}
		}

		/*
			Step 3.3 x = x - q{i-t-1} * y * b**{i-t-1}
		*/
		int_mul_digit(t1, y, q.digit[qi]) or_return
		_private_int_shl_leg(t1, qi) or_return
		internal_sub(x, x, t1) or_return

		/*
			if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; }
		*/
		if x.sign == .Negative {
			internal_copy(t1, y) or_return
			_private_int_shl_leg(t1, qi) or_return
			internal_add(x, x, t1) or_return

			q.digit[qi] = (q.digit[qi] - 1) & _MASK
		}
	}

	/*
		Now q is the quotient and x is the remainder, [which we have to normalize]
		Get sign before writing to c.
	*/
	z, _ := is_zero(x)
	x.sign = .Zero_or_Positive if z else numerator.sign

	if quotient != nil {
		internal_clamp(q)
		internal_swap(q, quotient)
		quotient.sign = .Negative if neg else .Zero_or_Positive
	}

	if remainder != nil {
		/*
			Undo the normalization shift applied to x above.
		*/
		internal_shr(x, x, norm) or_return
		internal_swap(x, remainder)
	}

	return nil
}
|
|
|
-
|
|
|
/*
	Direct implementation of algorithms 1.8 "RecursiveDivRem" and 1.9 "UnbalancedDivision" from:

		Brent, Richard P., and Paul Zimmermann. "Modern computer arithmetic"
		Vol. 18. Cambridge University Press, 2010
		Available online at https://arxiv.org/pdf/1004.4710

	pages 19ff. in the above online document.

	This procedure is the recursive step (RecursiveDivRem). It splits `b` at `k`
	digits, where `k` is half the digit-count difference between `a` and `b`,
	and falls back to `_private_int_div_school` once that difference is below
	`MUL_KARATSUBA_CUTOFF`.

	`quotient` and `remainder` are both written unconditionally on the recursive path.
*/
_private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	A1, A2, B1, B0, Q1, Q0, R1, R0, t := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(A1, A2, B1, B0, Q1, Q0, R1, R0, t)

	m := a.used - b.used
	k := m / 2

	if m < MUL_KARATSUBA_CUTOFF {
		return _private_int_div_school(quotient, remainder, a, b)
	}

	internal_init_multi(A1, A2, B1, B0, Q1, Q0, R1, R0, t) or_return

	/*
		`B1` = `b` / `beta`^`k`, `B0` = `b` % `beta`^`k`
	*/
	internal_shrmod(B1, B0, b, k * _DIGIT_BITS) or_return

	/*
		(Q1, R1) = RecursiveDivRem(A / beta^(2k), B1)
	*/
	internal_shrmod(A1, t, a, 2 * k * _DIGIT_BITS) or_return
	_private_div_recursion(Q1, R1, A1, B1) or_return

	/*
		A1 = (R1 * beta^(2k)) + (A % beta^(2k)) - (Q1 * B0 * beta^k)

		`t` still holds `A % beta^(2k)` for the addition, and is then reused
		for the product that has to be subtracted.
	*/
	_private_int_shl_leg(R1, 2 * k) or_return
	internal_add(A1, R1, t) or_return
	internal_mul(t, Q1, B0) or_return
	_private_int_shl_leg(t, k) or_return
	internal_sub(A1, A1, t) or_return

	/*
		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
	*/
	if internal_lt(A1, 0) {
		internal_shl(t, b, k * _DIGIT_BITS) or_return

		for {
			internal_decr(Q1) or_return
			internal_add(A1, A1, t) or_return
			if internal_gte(A1, 0) { break }
		}
	}

	/*
		(Q0, R0) = RecursiveDivRem(A1 / beta^(k), B1)
	*/
	internal_shrmod(A1, t, A1, k * _DIGIT_BITS) or_return
	_private_div_recursion(Q0, R0, A1, B1) or_return

	/*
		A2 = (R0*beta^k) + (A1 % beta^k) - (Q0*B0)
	*/
	_private_int_shl_leg(R0, k) or_return
	internal_add(A2, R0, t) or_return
	internal_mul(t, Q0, B0) or_return
	internal_sub(A2, A2, t) or_return

	/*
		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
	*/
	for internal_is_negative(A2) {
		internal_decr(Q0) or_return
		internal_add(A2, A2, b) or_return
	}

	/*
		Return q = (Q1*beta^k) + Q0, r = A2.
	*/
	_private_int_shl_leg(Q1, k) or_return
	internal_add(quotient, Q1, Q0) or_return

	return internal_copy(remainder, A2)
}
|
|
|
-
|
|
|
/*
	Driver for recursive division ("UnbalancedDivision").

	Normalizes `a` and `b` so that the most significant bit of the top digit of
	the divisor is set, feeds the dividend to `_private_div_recursion` in chunks
	while it is much longer than the divisor, and then de-normalizes the remainder.

	`quotient` and `remainder` are each optional (`nil` means "not wanted").
	A non-zero quotient is negative when the operand signs differ, and a non-zero
	remainder takes the sign of `a`. Zero results are always `.Zero_or_Positive`.
*/
_private_int_div_recursive :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	A, B, Q, Q1, R, A_div, A_mod := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(A, B, Q, Q1, R, A_div, A_mod)

	internal_init_multi(A, B, Q, Q1, R, A_div, A_mod) or_return

	/*
		Most significant bit of a limb.
		Assumes _DIGIT_MAX < (sizeof(DIGIT) * sizeof(u8)).
	*/
	msb := (_DIGIT_MAX + DIGIT(1)) >> 1
	sigma := 0
	msb_b := b.digit[b.used - 1]
	for msb_b < msb {
		sigma += 1
		msb_b <<= 1
	}

	/*
		Use that sigma to normalize B, and scale A by the same amount.
	*/
	internal_shl(B, b, sigma) or_return
	internal_shl(A, a, sigma) or_return

	/*
		Fix the sign. We work on magnitudes and re-apply the signs at the end.
	*/
	neg := a.sign != b.sign
	A.sign = .Zero_or_Positive; B.sign = .Zero_or_Positive

	/*
		If the magnitude of "A" is not more than twice that of "B" we can work
		on them directly, otherwise we need to work at "A" in chunks.
	*/
	n := B.used
	m := A.used - B.used

	/*
		Q = 0. We already ensured that when we called `internal_init_multi`.
	*/
	for m > n {
		/*
			(q, r) = RecursiveDivRem(A / (beta^(m-n)), B)
		*/
		j := (m - n) * _DIGIT_BITS
		internal_shrmod(A_div, A_mod, A, j) or_return
		_private_div_recursion(Q1, R, A_div, B) or_return

		/*
			Q = (Q*beta^n) + q
		*/
		internal_shl(Q, Q, n * _DIGIT_BITS) or_return
		internal_add(Q, Q, Q1) or_return

		/*
			A = (r * beta^(m-n)) + (A % beta^(m-n))
		*/
		internal_shl(R, R, (m - n) * _DIGIT_BITS) or_return
		internal_add(A, R, A_mod) or_return

		/*
			m = m - n
		*/
		m -= n
	}

	/*
		(q, r) = RecursiveDivRem(A, B)
	*/
	_private_div_recursion(Q1, R, A, B) or_return

	/*
		Q = (Q * beta^m) + q, R = r
	*/
	internal_shl(Q, Q, m * _DIGIT_BITS) or_return
	internal_add(Q, Q, Q1) or_return

	/*
		Get signs before writing to dest, as `quotient` or `remainder` may alias `a`.
		The remainder's sign depends on whether the *remainder* is zero.
	*/
	R.sign = .Zero_or_Positive if internal_is_zero(R) else a.sign

	if quotient != nil {
		swap(quotient, Q)
		/*
			Never produce a negative zero.
		*/
		quotient.sign = .Negative if neg && !internal_is_zero(quotient) else .Zero_or_Positive
	}
	if remainder != nil {
		/*
			De-normalize the remainder.
		*/
		internal_shrmod(R, nil, R, sigma) or_return
		swap(remainder, R)
	}
	return nil
}
|
|
|
-
|
|
|
/*
	Slower bit-bang division... also smaller.

	Shift-and-subtract division on the magnitudes of `numerator` and `denominator`.
	`quotient` and `remainder` are each optional (`nil` means "not wanted").

	A non-zero quotient is negative when the operand signs differ, and a non-zero
	remainder takes the sign of `numerator`.
*/
@(deprecated="Use `_int_div_school`, it's 3.5x faster.")
_private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) {

	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}

	defer internal_destroy(ta, tb, tq, q)

	/*
		Capture the signs up front: the outputs may alias the inputs,
		and the swaps below would then change what we read.
	*/
	numerator_sign := numerator.sign
	neg            := numerator.sign != denominator.sign

	internal_one(tq) or_return

	num_bits, _ := count_bits(numerator)
	den_bits, _ := count_bits(denominator)
	n := num_bits - den_bits

	/*
		Align the denominator, and the matching quotient bit `tq`, with the
		top bit of the numerator.
	*/
	abs(ta, numerator)   or_return
	abs(tb, denominator) or_return
	shl(tb, tb, n)       or_return
	shl(tq, tq, n)       or_return

	for n >= 0 {
		if internal_gte(ta, tb) {
			// ta -= tb
			sub(ta, ta, tb) or_return
			//  q += tq
			add( q, q, tq)  or_return
		}
		shr1(tb, tb) or_return
		shr1(tq, tq) or_return

		n -= 1
	}

	/*
		Now q == quotient and ta == remainder.
	*/
	if quotient != nil {
		swap(quotient, q)
		z, _ := is_zero(quotient)
		quotient.sign = .Negative if neg && !z else .Zero_or_Positive
	}
	if remainder != nil {
		swap(remainder, ta)
		/*
			Test the remainder itself, so an exact division doesn't
			yield a negative zero for a negative numerator.
		*/
		z, _ := is_zero(remainder)
		remainder.sign = .Zero_or_Positive if z else numerator_sign
	}

	return nil
}
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
/*
	Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html

	For each bit position `i` of `n`, from the highest down to 0, multiplies the
	odd numbers in `[start, stop)` into `inner` using `_private_int_recursive_product`,
	and accumulates `inner` into `outer`. The power of two that these odd products
	leave out is applied at the end as a left shift by `n - popcount(n)`.

	The result is written to `res`.
*/
_private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	inner, outer, temp := &Int{}, &Int{}, &Int{}
	defer internal_destroy(inner, outer, temp)

	internal_one(inner, false) or_return
	internal_one(outer, false) or_return

	bits_used := ilog2(n)

	for i := bits_used; i >= 0; i -= 1 {
		/*
			`| 1` rounds both bounds up to the next odd number.
		*/
		start := (n >> (uint(i) + 1)) + 1 | 1
		stop  := (n >> uint(i))       + 1 | 1
		_private_int_recursive_product(temp, start, stop, 0) or_return
		internal_mul(inner, inner, temp)                     or_return
		internal_mul(outer, outer, inner)                    or_return
	}
	shift := n - intrinsics.count_ones(n)

	return internal_shl(res, outer, int(shift))
}
|
|
|
-
|
|
|
/*
	Recursive product used by binary split factorial algorithm.

	Writes to `res` the product of `start`, `start + 2`, ... for the
	`(stop - start) / 2` factors in `[start, stop)`, splitting the range in two
	and recursing while more than two factors remain. With no factors, `res` is set to 1.

	`level` is the current recursion depth. Exceeding
	`FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS` returns `.Max_Iterations_Reached`.
*/
_private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0), allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	lhs, rhs := &Int{}, &Int{}
	defer internal_destroy(lhs, rhs)

	if level > FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS {
		return .Max_Iterations_Reached
	}

	num_factors := (stop - start) >> 1

	switch {
	case num_factors == 2:
		/*
			Exactly two factors: `start` and `start + 2`.
		*/
		internal_set(lhs, start, false)          or_return
		internal_grow(rhs, lhs.used + 1, false)  or_return
		internal_add(rhs, lhs, 2)                or_return
		return internal_mul(res, lhs, rhs)

	case num_factors > 1:
		/*
			Split at an odd midpoint and multiply the two halves.
		*/
		mid := (start + num_factors) | 1
		_private_int_recursive_product(lhs, start, mid, level + 1) or_return
		_private_int_recursive_product(rhs, mid,  stop, level + 1) or_return
		return internal_mul(res, lhs, rhs)

	case num_factors == 1:
		return #force_inline internal_set(res, start, true)
	}

	/*
		Empty range.
	*/
	return #force_inline internal_one(res, true)
}
|
|
|
-
|
|
|
/*
	Internal function computing both GCD using the binary method,
	and, if target isn't `nil`, also LCM.

	Expects the `a` and `b` to have been initialized
	and one or both of `res_gcd` or `res_lcm` not to be `nil`.

	If both `a` and `b` are zero, return zero.
	If either `a` or `b` is zero, GCD is the absolute value of the other one and LCM is zero.

	The `gcd` and `lcm` wrappers have already done this test,
	but `gcd_lcm` wouldn't have, so we still need to perform it.

	If neither result is wanted, we have nothing to do.

	Both results are built in temporaries and only swapped into `res_gcd` and
	`res_lcm` at the end, so either destination may alias `a` or `b`.
*/
_private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if res_gcd == nil && res_lcm == nil {
		return nil
	}

	if a.used == 0 && b.used == 0 {
		/*
			GCD(0, 0) and LCM(0, 0) are both 0.
		*/
		if res_gcd != nil {
			internal_zero(res_gcd) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	} else if a.used == 0 {
		/*
			We can early out with GCD = B and LCM = 0
		*/
		if res_gcd != nil {
			internal_abs(res_gcd, b) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	} else if b.used == 0 {
		/*
			We can early out with GCD = A and LCM = 0
		*/
		if res_gcd != nil {
			internal_abs(res_gcd, a) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	}

	/*
		We need a temporary because `res_gcd` is allowed to be `nil`,
		and we need the GCD to compute the LCM.
	*/
	temp_gcd_res := &Int{}
	defer internal_destroy(temp_gcd_res)

	/*
		If neither `a` or `b` was zero, we need to compute `gcd`.
		Get copies of `a` and `b` we can modify.
	*/
	u, v := &Int{}, &Int{}
	defer internal_destroy(u, v)
	internal_copy(u, a) or_return
	internal_copy(v, b) or_return

	/*
		Must be positive for the remainder of the algorithm.
	*/
	u.sign = .Zero_or_Positive; v.sign = .Zero_or_Positive

	/*
		B1. Find the common power of two for `u` and `v`.
	*/
	u_lsb, _ := internal_count_lsb(u)
	v_lsb, _ := internal_count_lsb(v)
	k        := min(u_lsb, v_lsb)

	if k > 0 {
		/*
			Divide the power of two out.
		*/
		internal_shr(u, u, k) or_return
		internal_shr(v, v, k) or_return
	}

	/*
		Divide any remaining factors of two out.
	*/
	if u_lsb != k {
		internal_shr(u, u, u_lsb - k) or_return
	}
	if v_lsb != k {
		internal_shr(v, v, v_lsb - k) or_return
	}

	for v.used != 0 {
		/*
			Make sure `v` is the largest.
		*/
		if internal_gt(u, v) {
			/*
				Swap `u` and `v` to make sure `v` is >= `u`.
			*/
			internal_swap(u, v)
		}

		/*
			Subtract smallest from largest.
		*/
		internal_sub(v, v, u) or_return

		/*
			Divide out all factors of two.
		*/
		lsb, _ := internal_count_lsb(v)
		internal_shr(v, v, lsb) or_return
	}

	/*
		Multiply by 2**k which we divided out at the beginning.
	*/
	internal_shl(temp_gcd_res, u, k) or_return
	temp_gcd_res.sign = .Zero_or_Positive

	/*
		We've computed `gcd`.
		If we don't want `lcm`, we're done.
	*/
	if res_lcm == nil {
		internal_swap(temp_gcd_res, res_gcd)
		return nil
	}

	/*
		Computes least common multiple as `|a*b|/gcd(a,b)`
		Divide the smallest by the GCD.

		The quotient and product go into a temporary: if `res_lcm` aliased the
		operand of the final multiplication, writing the quotient straight into
		`res_lcm` would clobber that operand before it is used.
	*/
	temp_lcm_res := &Int{}
	defer internal_destroy(temp_lcm_res)

	if internal_lt_abs(a, b) {
		/*
			Store quotient in the temporary such that `quotient * b` is the LCM.
		*/
		internal_div(temp_lcm_res, a, temp_gcd_res) or_return
		internal_mul(temp_lcm_res, temp_lcm_res, b) or_return
	} else {
		/*
			Store quotient in the temporary such that `quotient * a` is the LCM.
		*/
		internal_div(temp_lcm_res, b, temp_gcd_res) or_return
		internal_mul(temp_lcm_res, temp_lcm_res, a) or_return
	}

	/*
		Fix the sign to positive.
	*/
	temp_lcm_res.sign = .Zero_or_Positive

	/*
		Only now, with `a` and `b` no longer needed, write the results.
	*/
	if res_gcd != nil {
		internal_swap(temp_gcd_res, res_gcd)
	}
	internal_swap(temp_lcm_res, res_lcm)

	return nil
}
|
|
|
-
|
|
|
/*
	Internal implementation of log.
	Assumes `a` not to be `nil` and to have been initialized.

	Returns the integer logarithm of `a` to the given `base`:
	0 when `a < base`, 1 when `a == base`, and otherwise the result of
	bracketing `a` between powers of `base` followed by a binary search.

	On error, `res` is set to -1.
*/
_private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -> (res: int, err: Error) {
	/*
		Install the allocator so that the calls below which don't take it
		explicitly (`internal_copy`, `internal_sqr`, `internal_pow`, `internal_mul`)
		use it as well.
	*/
	context.allocator = allocator

	bracket_low, bracket_high, bracket_mid, t, bi_base := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(bracket_low, bracket_high, bracket_mid, t, bi_base)

	ic := #force_inline internal_cmp(a, base)
	if ic == -1 || ic == 0 {
		return 1 if ic == 0 else 0, nil
	}
	defer if err != nil {
		res = -1
	}

	internal_set(bi_base, base, true, allocator)       or_return
	internal_clear(bracket_mid, false, allocator)      or_return
	internal_clear(t, false, allocator)                or_return
	internal_one(bracket_low, false, allocator)        or_return
	internal_set(bracket_high, base, false, allocator) or_return

	/*
		Invariant: bracket_low == base**low and bracket_high == base**high.
	*/
	low := 0; high := 1

	/*
		A kind of Giant-step/baby-step algorithm.
		Idea shamelessly stolen from https://programmingpraxis.com/2010/05/07/integer-logarithms/2/
		The effect is asymptotic, hence needs benchmarks to test if the Giant-step should be skipped
		for small n.
	*/

	for {
		/*
			Iterate until `a` is bracketed between low + high.
			Each step squares the upper bracket, doubling its exponent.
		*/
		if #force_inline internal_gte(bracket_high, a) { break }

		low = high
		#force_inline internal_copy(bracket_low, bracket_high) or_return
		high <<= 1
		#force_inline internal_sqr(bracket_high, bracket_high) or_return
	}

	/*
		Binary search on the exponent between `low` and `high`.
	*/
	for (high - low) > 1 {
		mid := (high + low) >> 1

		/*
			bracket_mid = base**mid, computed as base**low * base**(mid - low).
		*/
		#force_inline internal_pow(t, bi_base, mid - low) or_return

		#force_inline internal_mul(bracket_mid, bracket_low, t) or_return

		mc := #force_inline internal_cmp(a, bracket_mid)
		switch mc {
		case -1:
			high = mid
			internal_swap(bracket_mid, bracket_high)
		case  0:
			return mid, nil
		case  1:
			low = mid
			internal_swap(bracket_mid, bracket_low)
		}
	}

	fc := #force_inline internal_cmp(bracket_high, a)
	res = high if fc == 0 else low

	return
}
|
|
|
-
|
|
|
/*
	Computes xR**-1 == x (mod N) via Montgomery Reduction.
	This is an optimized implementation of `internal_montgomery_reduce`
	which uses the comba method to quickly calculate the columns of the reduction.
	Based on Algorithm 14.32 on pp.601 of HAC.

	`x` is reduced in place. Returns `.Invalid_Argument` if `x` has more than
	`_WARRAY` used digits.
*/
_private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator
	W: [_WARRAY]_WORD = ---

	if x.used > _WARRAY { return .Invalid_Argument }

	/*
		Remember how many digits `x` used on entry, then make room for the result.
	*/
	old_used := x.used
	internal_grow(x, n.used + 1) or_return

	/*
		Load the digits of `x` into the double precision columns W[0..`x.used` - 1].
	*/
	for i := 0; i < x.used; i += 1 {
		W[i] = _WORD(x.digit[i])
	}

	/*
		W is declared uninitialized, so clear the remaining columns up to `2 * n.used`.
	*/
	zero_upper := (n.used * 2) + 1
	for i := x.used; i < zero_upper; i += 1 {
		W[i] = {}
	}

	/*
		Zero successive digits from the least significant upwards.
	*/
	for col := 0; col < n.used; col += 1 {
		/*
			`mu = ai * m' mod b`

			Only the low digit of the column is needed, which avoids a double
			precision multiplication. This relies on the carry of the previous
			column having been pushed forward already (see the end of this loop).
		*/
		mu := ((W[col] & _WORD(_MASK)) * _WORD(rho)) & _WORD(_MASK)

		/*
			`a = a + mu * m * b**i`

			Done in place; the multiplication by b**i is the column offset.
			Carries are not handled inside this inner loop.
		*/
		for k := 0; k < n.used; k += 1 {
			W[col + k] += mu * _WORD(n.digit[k])
		}

		/*
			Push this column's carry into the next one.
		*/
		W[col + 1] += (W[col] >> _DIGIT_BITS)
	}

	/*
		Propagate the carries through the upper half of the columns.
	*/
	for col := n.used; col < n.used * 2; col += 1 {
		W[col + 1] += (W[col] >> _DIGIT_BITS)
	}

	/*
		Copy out, A = A/b**n.
		Rather than converting all columns and shifting right afterwards,
		copy the upper columns straight to the low digits of `x`.
	*/
	out_digits := n.used + 1
	for i := 0; i < out_digits; i += 1 {
		x.digit[i] = DIGIT(W[n.used + i] & _WORD(_MASK))
	}
	x.used = out_digits

	/*
		Clear any digits left over from the larger input, then trim.
	*/
	internal_zero_unused(x, old_used)
	internal_clamp(x)

	/*
		if A >= m then A = A - m
	*/
	if internal_gte_abs(x, n) {
		return internal_sub(x, x, n)
	}
	return nil
}
|
|
|
-
|
|
|
/*
	Computes xR**-1 == x (mod N) via Montgomery Reduction.
	Assumes `x` and `n` not to be nil.

	`x` is reduced in place. `rho` must have been precalculated for `n`
	(see `_private_int_montgomery_setup`).
*/
_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	internal_clear_if_uninitialized(x, n) or_return

	/*
		Use the fast [comba] reduction when the operands fit its column array.
	*/
	needed := (n.used * 2) + 1
	if needed < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
		return _private_montgomery_reduce_comba(x, n, rho)
	}

	/*
		Grow the input as required.
	*/
	internal_grow(x, needed) or_return
	x.used = needed

	for pos := 0; pos < n.used; pos += 1 {
		/*
			`mu = ai * rho mod b`
			With rho equal to -1/n0 mod b, adding `mu * n * b**pos`
			clears digit `pos` of x, one digit per pass.
		*/
		mu := DIGIT((_WORD(x.digit[pos]) * _WORD(rho)) & _WORD(_MASK))

		/*
			a = a + mu * m * b**i
			Multiply and add in place, tracking the carry digit.
		*/
		carry := DIGIT(0)
		iy    := int(0)
		for ; iy < n.used; iy += 1 {
			acc := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(carry) + _WORD(x.digit[pos + iy]))

			carry               = DIGIT(acc >> _DIGIT_BITS)
			x.digit[pos + iy]   = DIGIT(acc & _WORD(_MASK))
		}

		/*
			At this point digit `pos` of x should be zero.
			Propagate the carry upwards for as long as there is one.
		*/
		for carry != 0 {
			x.digit[pos + iy] += carry
			carry              = x.digit[pos + iy] >> _DIGIT_BITS
			x.digit[pos + iy] &= _MASK
			iy += 1
		}
	}

	/*
		The `n.used` least significant digits of x are now all zero, so x can be
		shifted right by `n.used` digits without changing the residue.

		x = x/b**n.used.
	*/
	internal_clamp(x)
	_private_int_shr_leg(x, n.used)

	/*
		if x >= n then x = x - n
	*/
	if internal_gte_abs(x, n) {
		return internal_sub(x, x, n)
	}

	return nil
}
|
|
|
-
|
|
|
/*
	Shifts with subtractions when the result is greater than b.

	The method is slightly modified to shift B unconditionally up to just under
	the leading bit of b. This saves a lot of multiple precision shifting.

	The result is written to `a`.

	Assumes `a` and `b` not to be `nil`.
*/
_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	internal_clear_if_uninitialized(a, b) or_return

	/*
		How many bits of the last digit does b use?
	*/
	top_bits := internal_count_bits(b) % _DIGIT_BITS

	if b.used <= 1 {
		/*
			Single digit modulus: start from 1 and do all the doublings below.
		*/
		internal_one(a) or_return
		top_bits = 1
	} else {
		/*
			Start from the power of two just under the leading bit of b.
		*/
		power := ((b.used - 1) * _DIGIT_BITS) + top_bits - 1
		internal_int_power_of_two(a, power) or_return
	}

	/*
		Now compute C = A * B mod b, by doubling and conditionally subtracting b.
	*/
	for step := top_bits - 1; step < _DIGIT_BITS; step += 1 {
		internal_int_shl1(a, a) or_return
		if internal_gte_abs(a, b) {
			internal_sub(a, a, b) or_return
		}
	}
	return nil
}
|
|
|
-
|
|
|
/*
	Sets up the Montgomery reduction stuff.

	Computes `rho = -1/n0 mod b`, where `n0` is the least significant digit of `n`
	and `b = 2**_DIGIT_BITS`.

	Returns `.Invalid_Argument` if `n` is even, as no such inverse exists then.
*/
_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
	/*
		Fast inversion mod 2**k
		Based on the fact that:

		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
		                  =>  2*X*A - X*X*A*A = 1
		                  =>  2*(1) - (1)     = 1
	*/
	internal_clear_if_uninitialized(n, allocator) or_return

	b := n.digit[0]
	if b & 1 == 0 { return 0, .Invalid_Argument }

	x := (((b + 2) & 4) << 1) + b /* here x*a==1 mod 2**4 */
	x *= 2 - (b * x)              /* here x*a==1 mod 2**8 */
	x *= 2 - (b * x)              /* here x*a==1 mod 2**16 */

	/*
		Each further step doubles the number of correct bits. Take as many as the
		digit type needs; stopping at 2**16 is not enough once digits are wider
		than 16 bits. An extra step on an already exact inverse changes nothing.
	*/
	when _DIGIT_TYPE_BITS >= 32 {
		x *= 2 - (b * x)              /* here x*a==1 mod 2**32 */
	}
	when _DIGIT_TYPE_BITS >= 64 {
		x *= 2 - (b * x)              /* here x*a==1 mod 2**64 */
	}

	/*
		rho = -1/m mod b
	*/
	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK))
	return rho, nil
}
|
|
|
-
|
|
|
/*
	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
	From HAC pp.604 Algorithm 14.42

	`x` is reduced in place.

	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
*/
_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	k := m.used

	q := &Int{}
	defer internal_destroy(q)

	/*
		q1 = x / b**(k-1)
	*/
	internal_copy(q, x) or_return
	_private_int_shr_leg(q, k - 1)

	/*
		q2 = q1 * mu.
		According to HAC, computing only the high digits is ok as an optimization.
	*/
	if DIGIT(k) <= DIGIT(1) << (_DIGIT_BITS - 1) {
		_private_int_mul_high(q, q, mu, k) or_return
	} else {
		internal_mul(q, q, mu) or_return
	}

	/*
		q3 = q2 / b**(k+1)
	*/
	_private_int_shr_leg(q, k + 1)

	/*
		x = x mod b**(k+1), quick (no division)
	*/
	internal_int_mod_bits(x, x, _DIGIT_BITS * (k + 1)) or_return

	/*
		q = q * m mod b**(k+1), quick (no division)
	*/
	_private_int_mul(q, q, m, k + 1) or_return

	/*
		x = x - q
	*/
	internal_sub(x, x, q) or_return

	/*
		If x < 0, add b**(k+1) to it.
	*/
	if internal_is_negative(x) {
		internal_set(q, 1)                   or_return
		_private_int_shl_leg(q, k + 1)       or_return
		internal_add(x, x, q)                or_return
	}

	/*
		Back off if it's too big.
	*/
	for {
		if !internal_gte(x, m) { break }
		internal_sub(x, x, m) or_return
	}

	return nil
}
|
|
|
-
|
|
|
/*
	Reduces `a` modulo `n` in place, where `n` is of the form 2**p - d
	and `d` fits in a single DIGIT.
*/
_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	high := &Int{}
	defer internal_destroy(high)
	internal_zero(high) or_return

	/*
		p = bit length of the modulus.
	*/
	bits := internal_count_bits(n)

	for {
		/*
			Split `a` at bit p: high = a / 2**p, a = a mod 2**p.
		*/
		internal_shrmod(high, a, a, bits) or_return

		/*
			Scale the high part by `d`; multiplying by 1 is skipped.
		*/
		if d != 1 {
			internal_mul(high, high, d) or_return
		}

		/*
			Fold it back in: a = a + high.
		*/
		internal_add(a, a, high) or_return

		/*
			Stop once |a| < |n|, otherwise take one `n` off and go around again.
		*/
		if internal_lt_abs(a, n) {
			break
		}
		internal_sub(a, a, n) or_return
	}

	return nil
}
|
|
|
-
|
|
|
/*
	Reduces `a` modulo `n` in place, where `n` is of the form 2**p - d.
	Unlike `_private_int_reduce_2k`, `d` is an `Int` here and so may be larger than a single digit.
*/
_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	high := &Int{}
	defer internal_destroy(high)
	internal_zero(high) or_return

	/*
		p = bit length of the modulus.
	*/
	bits := internal_count_bits(n)

	for {
		/*
			Split `a` at bit p: high = a / 2**p, a = a mod 2**p.
		*/
		internal_shrmod(high, a, a, bits) or_return

		/*
			high = high * d, then a = a + high.
		*/
		internal_mul(high, high, d) or_return
		internal_add(a, a, high) or_return

		/*
			Stop once |a| < |n|, otherwise take one `n` off and go around again.
		*/
		if internal_lt_abs(a, n) {
			break
		}
		internal_sub(a, a, n) or_return
	}

	return nil
}
|
|
|
-
|
|
|
/*
	Determines if `_private_int_reduce_2k` can be used for modulus `a`.
	Assumes `a` not to be `nil` and to have been initialized.

	Zero is rejected, a single-digit value is accepted, and a longer value is accepted
	only if every bit from the second digit up to its top bit is set.
*/
_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
	assert_if_nil(a)

	if internal_is_zero(a) {
		return false, nil
	}
	if a.used <= 1 {
		return true, nil
	}

	total_bits := internal_count_bits(a)
	word       := 1
	mask       := DIGIT(1)

	/*
		Walk bit by bit, starting at the lowest bit of the second digit.
	*/
	for bit := _DIGIT_BITS; bit < total_bits; bit += 1 {
		if a.digit[word] & mask == 0 {
			return false, nil
		}

		/*
			Advance the mask; once it leaves the digit, move on to the next word.
		*/
		mask <<= 1
		if mask > _DIGIT_MAX {
			word += 1
			mask  = 1
		}
	}
	return true, nil
}
|
|
|
-
|
|
|
/*
	Determines if `_private_int_reduce_2k_l` can be used for modulus `a`.
	Assumes `a` not to be `nil` and to have been initialized.

	Zero is rejected, a single-digit value is accepted, and a longer value is accepted
	when at least half (integer division) of its used digits equal `_DIGIT_MAX`.
*/
_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
	assert_if_nil(a)

	if internal_int_is_zero(a) {
		return false, nil
	}
	if a.used == 1 {
		return true, nil
	}
	if a.used < 1 {
		return false, nil
	}

	/*
		Count the digits that are all ones.
	*/
	saturated := 0
	for digit in a.digit[:a.used] {
		if digit == _DIGIT_MAX {
			saturated += 1
		}
	}
	return saturated >= (a.used / 2), nil
}
|
|
|
-
|
|
|
/*
	Determines the setup value `d` for `_private_int_reduce_2k`:
	the lowest digit of 2**bits(a) - a.
	Assumes `a` is not `nil`.
*/
_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
	context.allocator = allocator

	diff := &Int{}
	defer internal_destroy(diff)
	internal_zero(diff) or_return

	/*
		diff = 2**bits(a) - a
	*/
	bits := internal_count_bits(a)
	internal_int_power_of_two(diff, bits) or_return
	internal_sub(diff, diff, a) or_return

	d = diff.digit[0]
	return d, nil
}
|
|
|
-
|
|
|
/*
	Determines the multi-digit setup value for `_private_int_reduce_2k_l`.
	Assumes `mu` and `P` are not `nil`.

	mu = (1 << bits(P)) - P
*/
_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	pow := &Int{}
	defer internal_destroy(pow)
	internal_zero(pow) or_return

	/*
		pow = 2**bits(P), then mu = pow - P.
	*/
	bits := internal_count_bits(P)
	internal_int_power_of_two(pow, bits) or_return
	internal_sub(mu, pow, P) or_return

	return nil
}
|
|
|
-
|
|
|
/*
	Pre-calculates the value required for Barrett reduction.
	For a given modulus `P` it calculates `mu` = 2**(2 * P.used * _DIGIT_BITS) / P.
	Assumes `mu` and `P` are not `nil`.
*/
_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		mu = b**(2k), where k is the number of digits in `P`.
	*/
	exponent := P.used * 2 * _DIGIT_BITS
	internal_int_power_of_two(mu, exponent) or_return

	/*
		mu = mu / P
	*/
	internal_int_div(mu, mu, P) or_return
	return nil
}
|
|
|
-
|
|
|
/*
	Determines the Diminished Radix setup value: 2**_DIGIT_BITS minus the lowest digit of `a`.
	Assumes `a` to not be `nil` and to have been initialized.
*/
_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
	/*
		Do the subtraction in DIGIT width; this matters if _DIGIT_BITS is one less than
		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
	*/
	lowest := a.digit[0]
	d = DIGIT((1 << _DIGIT_BITS) - lowest)
	return d
}
|
|
|
-
|
|
|
/*
	Determines if a number is a valid DR modulus.
	Assumes `a` to not be `nil` and to have been initialized.

	A valid modulus has at least two digits and is of the form b**k - a [a <= b],
	so every digit except the lowest must equal -1 (mod b), i.e. `_MASK`.
*/
_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
	if a.used < 2 {
		return false
	}

	for digit in a.digit[1:a.used] {
		if digit != _MASK {
			return false
		}
	}
	return true
}
|
|
|
-
|
|
|
/*
	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
	Based on algorithm from the paper

		"Generating Efficient Primes for Discrete Log Cryptosystems"
				Chae Hoon Lim, Pil Joong Lee,
			POSTECH Information Research Laboratories

	The modulus must be of a special format [see manual].
	Has been modified to use algorithm 7.10 from the LTM book instead

	Input x must be in the range 0 <= x <= (n-1)**2
	`k` is the setup value produced by `_private_int_dr_setup`.
	Assumes `x` and `n` to not be `nil` and to have been initialized.
*/
_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
	/*
		Install the caller's allocator so the grow/sub below honour it,
		as every other procedure in this file does.
	*/
	context.allocator = allocator

	/*
		m = digits in modulus.
	*/
	m := n.used

	/*
		Ensure that "x" has at least 2m digits.
	*/
	internal_grow(x, m + m) or_return

	/*
		Top of loop, this is where the code resumes if another reduction pass is required.
	*/
	for {
		i: int
		mu := DIGIT(0)

		/*
			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
		*/
		for i = 0; i < m; i += 1 {
			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu)
			x.digit[i] = DIGIT(r & _WORD(_MASK))
			mu         = DIGIT(r >> _WORD(_DIGIT_BITS))
		}

		/*
			Set final carry.
		*/
		x.digit[i] = mu

		/*
			Zero words above m, i.e. digits m+1 ..< x.used.
			When `x` has at most m+1 used digits there is nothing to clear; the guard also
			keeps the slice length from going negative, which would trip the bounds check.
		*/
		if stale := x.used - m - 1; stale > 0 {
			mem.zero_slice(x.digit[m + 1:][:stale])
		}

		/*
			Clamp, sub and return.
		*/
		internal_clamp(x) or_return

		/*
			If x >= n then subtract and reduce again.
			Each successive "recursion" makes the input smaller and smaller.
		*/
		if internal_lt_abs(x, n) { break }

		internal_sub(x, x, n) or_return
	}
	return nil
}
|
|
|
-
|
|
|
/*
	Computes res == G**X mod P using a left-to-right sliding window.
	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.

	`redmode` == 0 selects Barrett reduction (`_private_int_reduce`),
	any other value selects `_private_int_reduce_2k_l`.

	The running product is built in a private temporary and only swapped into `res` on success,
	so `res` may alias `G`, `X` or `P`, and is left untouched when an error is returned.
*/
_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	M := [_TAB_SIZE]Int{}
	winsize: uint

	/*
		Use a pointer to the reduction algorithm.
		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
	*/
	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error)

	/*
		Runs at procedure exit, by which time `winsize` has its final value.
		Only M[1] and the upper half of the table are ever initialized.
	*/
	defer {
		internal_destroy(&M[1])
		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
			internal_destroy(&M[x])
		}
	}

	/*
		Find window size.
	*/
	x := internal_count_bits(X)
	switch {
	case x <= 7:
		winsize = 2
	case x <= 36:
		winsize = 3
	case x <= 140:
		winsize = 4
	case x <= 450:
		winsize = 5
	case x <= 1303:
		winsize = 6
	case x <= 3529:
		winsize = 7
	case:
		winsize = 8
	}

	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize

	/*
		Init M array.
		Init first cell.
	*/
	internal_zero(&M[1]) or_return

	/*
		Now init the second half of the array.
	*/
	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
		internal_zero(&M[x]) or_return
	}

	/*
		Create `mu`, used for Barrett reduction.
	*/
	mu := &Int{}
	defer internal_destroy(mu)
	internal_zero(mu) or_return

	if redmode == 0 {
		_private_int_reduce_setup(mu, P) or_return
		redux = _private_int_reduce
	} else {
		_private_int_reduce_2k_setup_l(mu, P) or_return
		redux = _private_int_reduce_2k_l
	}

	/*
		Create M table.

		The M table contains powers of the base, e.g. M[x] = G**x mod P.
		The first half of the table is not computed, though, except for M[0] and M[1].
	*/
	internal_int_mod(&M[1], G, P) or_return

	/*
		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.

		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
		instead of repeated squaring.
	*/
	slot := 1 << (winsize - 1)
	internal_copy(&M[slot], &M[1]) or_return

	for x = 0; x < int(winsize - 1); x += 1 {
		/*
			Square it.
		*/
		internal_sqr(&M[slot], &M[slot]) or_return

		/*
			Reduce modulo P
		*/
		redux(&M[slot], P, mu) or_return
	}

	/*
		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
	*/
	for x = slot + 1; x < (1 << winsize); x += 1 {
		internal_mul(&M[x], &M[x - 1], &M[1]) or_return
		redux(&M[x], P, mu) or_return
	}

	/*
		Setup the accumulator that holds the running result.
		It is a temporary so that `X` and `P` stay intact even if `res` aliases one of them.
	*/
	acc := &Int{}
	defer internal_destroy(acc)
	internal_one(acc) or_return

	/*
		Set initial mode and bit cnt.
		mode 0: still skipping leading zero bits, 1: between windows, 2: filling a window.
	*/
	mode   := 0
	bitcnt := 1
	buf    := DIGIT(0)
	digidx := X.used - 1
	bitcpy := uint(0)
	bitbuf := DIGIT(0)

	for {
		/*
			Grab next digit as required.
		*/
		bitcnt -= 1
		if bitcnt == 0 {
			/*
				If digidx == -1 we are out of digits.
			*/
			if digidx == -1 { break }

			/*
				Read next digit and reset the bitcnt.
			*/
			buf     = X.digit[digidx]
			digidx -= 1
			bitcnt  = _DIGIT_BITS
		}

		/*
			Grab the next msb from the exponent.
		*/
		y := (buf >> (_DIGIT_BITS - 1)) & 1
		buf <<= 1

		/*
			If the bit is zero and mode == 0 then we ignore it.
			These represent the leading zero bits before the first 1 bit
			in the exponent. Technically this opt is not required but it
			does lower the # of trivial squaring/reductions used.
		*/
		if mode == 0 && y == 0 {
			continue
		}

		/*
			If the bit is zero and mode == 1 then we square.
		*/
		if mode == 1 && y == 0 {
			internal_sqr(acc, acc) or_return
			redux(acc, P, mu) or_return
			continue
		}

		/*
			Else we add it to the window.
		*/
		bitcpy += 1
		bitbuf |= (y << (winsize - bitcpy))
		mode    = 2

		if bitcpy == winsize {
			/*
				Window is filled so square as required and multiply.
				Square first.
			*/
			for x = 0; x < int(winsize); x += 1 {
				internal_sqr(acc, acc) or_return
				redux(acc, P, mu) or_return
			}

			/*
				Then multiply.
			*/
			internal_mul(acc, acc, &M[bitbuf]) or_return
			redux(acc, P, mu) or_return

			/*
				Empty window and reset.
			*/
			bitcpy = 0
			bitbuf = 0
			mode   = 1
		}
	}

	/*
		If bits remain then square/multiply.
	*/
	if mode == 2 && bitcpy > 0 {
		/*
			Square then multiply if the bit is set.
		*/
		for x = 0; x < int(bitcpy); x += 1 {
			internal_sqr(acc, acc) or_return
			redux(acc, P, mu) or_return

			bitbuf <<= 1
			if (bitbuf & (1 << winsize)) != 0 {
				/*
					Then multiply.
				*/
				internal_mul(acc, acc, &M[1]) or_return
				redux(acc, P, mu) or_return
			}
		}
	}

	/*
		Publish the result; the previous contents of `res` are released by the deferred destroy of `acc`.
	*/
	swap(res, acc)
	return nil
}
|
|
|
-
|
|
|
/*
	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85

	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
	The value of `k` changes based on the size of the exponent.

	Uses Montgomery or Diminished Radix reduction [whichever appropriate]:
	`redmode` == 0 is Montgomery, 1 is DR for moduli of the form B**k - b, anything else is DR for 2**k - b.

	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.

	The running product is built in a private temporary and only swapped into `res` on success,
	so `res` may alias `G`, `X` or `P`, and is left untouched when an error is returned.
*/
_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	M := [_TAB_SIZE]Int{}
	winsize: uint

	/*
		Use a pointer to the reduction algorithm.
		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
	*/
	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error)

	/*
		Runs at procedure exit, by which time `winsize` has its final value.
		Only M[1] and the upper half of the table are ever initialized.
	*/
	defer {
		internal_destroy(&M[1])
		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
			internal_destroy(&M[x])
		}
	}

	/*
		Find window size.
	*/
	x := internal_count_bits(X)
	switch {
	case x <= 7:
		winsize = 2
	case x <= 36:
		winsize = 3
	case x <= 140:
		winsize = 4
	case x <= 450:
		winsize = 5
	case x <= 1303:
		winsize = 6
	case x <= 3529:
		winsize = 7
	case:
		winsize = 8
	}

	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize

	/*
		Init M array
		Init first cell.
	*/
	digit_cap := internal_int_allocated_cap(P)
	internal_grow(&M[1], digit_cap) or_return

	/*
		Now init the second half of the array.
	*/
	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
		internal_grow(&M[x], digit_cap) or_return
	}

	/*
		Determine and setup reduction code.
	*/
	rho: DIGIT

	if redmode == 0 {
		/*
			Now setup Montgomery.
		*/
		rho = _private_int_montgomery_setup(P) or_return

		/*
			Automatically pick the comba one if available (saves quite a few calls/ifs).
		*/
		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
			redux = _private_montgomery_reduce_comba
		} else {
			/*
				Use slower baseline Montgomery method.
			*/
			redux = _private_int_montgomery_reduce
		}
	} else if redmode == 1 {
		/*
			Setup DR reduction for moduli of the form B**k - b.
		*/
		rho   = _private_int_dr_setup(P)
		redux = _private_int_dr_reduce
	} else {
		/*
			Setup DR reduction for moduli of the form 2**k - b.
		*/
		rho   = _private_int_reduce_2k_setup(P) or_return
		redux = _private_int_reduce_2k
	}

	/*
		Setup the accumulator that holds the running result.
		It is a temporary so that `G`, `X` and `P` stay intact even if `res` aliases one of them.
	*/
	acc := &Int{}
	defer internal_destroy(acc)
	internal_grow(acc, digit_cap) or_return

	/*
		Create M table
		The first half of the table is not computed, though, except for M[0] and M[1]
	*/
	if redmode == 0 {
		/*
			Now we need R mod m.
		*/
		_private_int_montgomery_calc_normalization(acc, P) or_return

		/*
			Now set M[1] to G * R mod m.
		*/
		internal_mulmod(&M[1], G, acc, P) or_return
	} else {
		internal_one(acc) or_return
		internal_mod(&M[1], G, P) or_return
	}

	/*
		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
	*/
	slot := 1 << (winsize - 1)
	internal_copy(&M[slot], &M[1]) or_return

	for x = 0; x < int(winsize - 1); x += 1 {
		internal_sqr(&M[slot], &M[slot]) or_return
		redux(&M[slot], P, rho) or_return
	}

	/*
		Create upper table.
	*/
	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
		internal_mul(&M[x], &M[x - 1], &M[1]) or_return
		redux(&M[x], P, rho) or_return
	}

	/*
		Set initial mode and bit cnt.
		mode 0: still skipping leading zero bits, 1: between windows, 2: filling a window.
	*/
	mode   := 0
	bitcnt := 1
	buf    := DIGIT(0)
	digidx := X.used - 1
	bitcpy := 0
	bitbuf := DIGIT(0)

	for {
		/*
			Grab next digit as required.
		*/
		bitcnt -= 1
		if bitcnt == 0 {
			/*
				If digidx == -1 we are out of digits so break.
			*/
			if digidx == -1 { break }

			/*
				Read next digit and reset the bitcnt.
			*/
			buf     = X.digit[digidx]
			digidx -= 1
			bitcnt  = _DIGIT_BITS
		}

		/*
			Grab the next msb from the exponent.
		*/
		y := (buf >> (_DIGIT_BITS - 1)) & 1
		buf <<= 1

		/*
			If the bit is zero and mode == 0 then we ignore it.
			These represent the leading zero bits before the first 1 bit in the exponent.
			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
		*/
		if mode == 0 && y == 0 { continue }

		/*
			If the bit is zero and mode == 1 then we square.
		*/
		if mode == 1 && y == 0 {
			internal_sqr(acc, acc) or_return
			redux(acc, P, rho) or_return
			continue
		}

		/*
			Else we add it to the window.
		*/
		bitcpy += 1
		bitbuf |= (y << (winsize - uint(bitcpy)))
		mode    = 2

		if bitcpy == int(winsize) {
			/*
				Window is filled so square as required and multiply
				Square first.
			*/
			for x = 0; x < int(winsize); x += 1 {
				internal_sqr(acc, acc) or_return
				redux(acc, P, rho) or_return
			}

			/*
				Then multiply.
			*/
			internal_mul(acc, acc, &M[bitbuf]) or_return
			redux(acc, P, rho) or_return

			/*
				Empty window and reset.
			*/
			bitcpy = 0
			bitbuf = 0
			mode   = 1
		}
	}

	/*
		If bits remain then square/multiply.
	*/
	if mode == 2 && bitcpy > 0 {
		/*
			Square then multiply if the bit is set.
		*/
		for x = 0; x < bitcpy; x += 1 {
			internal_sqr(acc, acc) or_return
			redux(acc, P, rho) or_return

			/*
				Get next bit of the window.
			*/
			bitbuf <<= 1
			if bitbuf & (1 << winsize) != 0 {
				/*
					Then multiply.
				*/
				internal_mul(acc, acc, &M[1]) or_return
				redux(acc, P, rho) or_return
			}
		}
	}

	if redmode == 0 {
		/*
			Fixup result if Montgomery reduction is used.
			Recall that any value in a Montgomery system is actually multiplied by R mod n.
			So we have to reduce one more time to cancel out the factor of R.
		*/
		redux(acc, P, rho) or_return
	}

	/*
		Publish the result; the previous contents of `res` are released by the deferred destroy of `acc`.
	*/
	swap(res, acc)
	return nil
}
|
|
|
-
|
|
|
/*
	Computes the modular inverse `dest` = 1 / `a` mod `b` via the binary extended Euclidean algorithm.
	hac 14.61, pp608

	Throughout the main loop the invariants `A`*`x` + `B`*`y` == `u` and `C`*`x` + `D`*`y` == `v` hold,
	so once `v` == 1, `C` is an inverse of `x` modulo `y`.

	Returns `.Invalid_Argument` if `b` is negative or zero, if `a` mod `b` and `b` are both even,
	or if no inverse exists. On success `dest` is in the range 0 <= `dest` < `b`.
*/
_private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator
	x, y, u, v, A, B, C, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(x, y, u, v, A, B, C, D)

	/*
		`b` cannot be negative or zero.
	*/
	if b.sign == .Negative || internal_is_zero(b) {
		return .Invalid_Argument
	}

	/*
		init temps.
	*/
	internal_init_multi(x, y, u, v, A, B, C, D) or_return

	/*
		`x` = `a` % `b`, `y` = `b`
	*/
	internal_mod(x, a, b) or_return
	internal_copy(y, b) or_return

	/*
		2. [modified] if x,y are both even then return an error!
	*/
	if internal_is_even(x) && internal_is_even(y) {
		return .Invalid_Argument
	}

	/*
		3. u=x, v=y, A=1, B=0, C=0, D=1
	*/
	internal_copy(u, x) or_return
	internal_copy(v, y) or_return
	internal_one(A) or_return
	internal_one(D) or_return

	for {
		/*
			4. while `u` is even do:
		*/
		for internal_is_even(u) {
			/*
				4.1 `u` = `u` / 2
			*/
			internal_int_shr1(u, u) or_return

			/*
				4.2 if `A` or `B` is odd then:
			*/
			if internal_is_odd(A) || internal_is_odd(B) {
				/*
					`A` = (`A`+`y`) / 2, `B` = (`B`-`x`) / 2
					`B` must be decremented by `x`: (A+y)*x + (B-x)*y == A*x + B*y keeps the invariant.
				*/
				internal_add(A, A, y) or_return
				internal_sub(B, B, x) or_return
			}
			/*
				`A` = `A` / 2, `B` = `B` / 2
			*/
			internal_int_shr1(A, A) or_return
			internal_int_shr1(B, B) or_return
		}

		/*
			5. while `v` is even do:
		*/
		for internal_is_even(v) {
			/*
				5.1 `v` = `v` / 2
			*/
			internal_int_shr1(v, v) or_return

			/*
				5.2 if `C` or `D` is odd then:
			*/
			if internal_is_odd(C) || internal_is_odd(D) {
				/*
					`C` = (`C`+`y`) / 2, `D` = (`D`-`x`) / 2
					Same invariant as in step 4.2, so `D` is decremented by `x`.
				*/
				internal_add(C, C, y) or_return
				internal_sub(D, D, x) or_return
			}
			/*
				`C` = `C` / 2, `D` = `D` / 2
			*/
			internal_int_shr1(C, C) or_return
			internal_int_shr1(D, D) or_return
		}

		/*
			6. if `u` >= `v` then:
		*/
		if internal_cmp(u, v) != -1 {
			/*
				`u` = `u` - `v`, `A` = `A` - `C`, `B` = `B` - `D`
			*/
			internal_sub(u, u, v) or_return
			internal_sub(A, A, C) or_return
			internal_sub(B, B, D) or_return
		} else {
			/*
				`v` = `v` - `u`, `C` = `C` - `A`, `D` = `D` - `B`
			*/
			internal_sub(v, v, u) or_return
			internal_sub(C, C, A) or_return
			internal_sub(D, D, B) or_return
		}

		/*
			If not zero goto step 4
		*/
		if internal_is_zero(u) {
			break
		}
	}

	/*
		Now `a` = `C`, `b` = `D`, `gcd` == `g`*`v`
	*/

	/*
		If `v` != `1` then there is no inverse.
	*/
	if !internal_eq(v, 1) {
		return .Invalid_Argument
	}

	/*
		If it's too low, add `b` until it is no longer negative.
	*/
	for internal_is_negative(C) {
		internal_add(C, C, b) or_return
	}

	/*
		Too big: subtract `b` while |`C`| >= `b`.
	*/
	for internal_gte_abs(C, b) {
		internal_sub(C, C, b) or_return
	}

	/*
		`C` is now the inverse.
	*/
	swap(dest, C)

	return nil
}
|
|
|
-
|
|
|
/*
	Computes the modular inverse via binary extended Euclidean algorithm, that is `dest` = 1 / `a` mod `b`.

	Based on slow invmod except this is optimized for the case where `b` is odd,
	as per HAC Note 14.64 on pp. 610. Only the `B` and `D` cofactors are tracked.

	Returns `.Invalid_Argument` if `b` is even, if `b` or `a` mod `b` is zero, or if no inverse exists.
*/
_private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator
	x, y, u, v, B, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(x, y, u, v, B, D)

	/*
		2. [modified] `b` must be odd.
	*/
	if internal_is_even(b) {
		return .Invalid_Argument
	}

	/*
		Init all our temps.
	*/
	internal_init_multi(x, y, u, v, B, D) or_return

	/*
		`x` == modulus, `y` == value to invert, reduced modulo `b`.
	*/
	internal_copy(x, b) or_return
	internal_mod(y, a, b) or_return

	/*
		If one of `x`, `y` is zero return an error!
	*/
	if internal_is_zero(x) || internal_is_zero(y) {
		return .Invalid_Argument
	}

	/*
		3. `u` = `x`, `v` = `y`, `A` = 1, `B` = 0, `C` = 0, `D` = 1
	*/
	internal_copy(u, x) or_return
	internal_copy(v, y) or_return
	internal_one(D) or_return

	for {
		/*
			4. while `u` is even do:
			   4.1 `u` = `u` / 2
			   4.2 if `B` is odd then `B` = `B` - `x`
			   then `B` = `B` / 2
		*/
		for internal_is_even(u) {
			internal_int_shr1(u, u) or_return

			if internal_is_odd(B) {
				internal_sub(B, B, x) or_return
			}
			internal_int_shr1(B, B) or_return
		}

		/*
			5. while `v` is even do:
			   5.1 `v` = `v` / 2
			   5.2 if `D` is odd then `D` = `D` - `x`
			   then `D` = `D` / 2
		*/
		for internal_is_even(v) {
			internal_int_shr1(v, v) or_return

			if internal_is_odd(D) {
				internal_sub(D, D, x) or_return
			}
			internal_int_shr1(D, D) or_return
		}

		/*
			6. if `u` >= `v` then `u` = `u` - `v`, `B` = `B` - `D`
			   else               `v` = `v` - `u`, `D` = `D` - `B`
		*/
		if internal_cmp(u, v) != -1 {
			internal_sub(u, u, v) or_return
			internal_sub(B, B, D) or_return
		} else {
			internal_sub(v, v, u) or_return
			internal_sub(D, D, B) or_return
		}

		/*
			If not zero goto step 4.
		*/
		if internal_is_zero(u) {
			break
		}
	}

	/*
		Now `a` = C, `b` = D, gcd == g*v
		If `v` != 1 then there is no inverse.
	*/
	if internal_cmp(v, 1) != 0 {
		return .Invalid_Argument
	}

	/*
		`D` is now the inverse, up to a multiple of `b`.
		Capture the sign of `a` before `dest` is written, in case `dest` aliases `a`.

		NOTE(review): the sign of `a` is copied onto the result below, whereas `_private_inverse_modulo` does not do this.
		If `internal_mod` already yields a non-negative `y`, a negative `a` would get a negated inverse - confirm intended.
	*/
	sign := a.sign

	/*
		Too low.
	*/
	for internal_int_is_negative(D) {
		internal_add(D, D, b) or_return
	}

	/*
		Too big.
	*/
	for internal_gte_abs(D, b) {
		internal_sub(D, D, b) or_return
	}

	swap(dest, D)
	dest.sign = sign
	return nil
}
|
|
|
-
|
|
|
-
|
|
|
/*
	Returns the logarithm of an `Int` to a power-of-two `base`, i.e. (bits(a) - 1) / log2(base).
	Assumes `a` not to be `nil` and to have been initialized.
	Also assumes `base` is a power of two.

	Returns `.Invalid_Argument` for `base` < 2: a base of 0 would never terminate the
	trailing-zero scan below, and a base of 1 would divide by zero.
*/
_private_log_power_of_two :: proc(a: ^Int, base: DIGIT) -> (log: int, err: Error) {
	if base < 2 {
		return 0, .Invalid_Argument
	}

	base := base

	/*
		y = number of trailing zero bits in `base`, which is log2(base) for a power of two.
	*/
	y: int
	for y = 0; base & 1 == 0; {
		y    += 1
		base >>= 1
	}

	log = internal_count_bits(a)
	return (log - 1) / y, nil
}
|
|
|
-
|
|
|
/*
	Copies up to `digits` DIGITs from `src`, starting at `offset`, to the start of `dest`.
	Assumes `src` and `dest` to not be `nil` and have been initialized.

	The count is clamped to what `src` holds past `offset` and to what `dest` can hold.
	Copying nothing is not an error. A negative `offset` returns `.Invalid_Argument`.
*/
_private_copy_digits :: proc(dest, src: ^Int, digits: int, offset := int(0)) -> (err: Error) {
	/*
		If dest == src, do nothing
	*/
	if dest == src {
		return nil
	}

	if offset < 0 {
		return .Invalid_Argument
	}

	/*
		Only the digits from `offset` onward are available in `src`.
	*/
	count := min(digits, len(src.digit) - offset, len(dest.digit))

	/*
		Nothing to copy; also avoids taking the address of an element that does not exist.
	*/
	if count <= 0 {
		return nil
	}

	mem.copy_non_overlapping(&dest.digit[0], &src.digit[offset], size_of(DIGIT) * count)
	return nil
}
|
|
|
-
|
|
|
-
|
|
|
/*
	Shift left by `digits` * _DIGIT_BITS bits, i.e. by whole digits ("legs").
	A non-positive `digits` or a zero `quotient` is a no-op.
*/
_private_int_shl_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if digits <= 0 {
		return nil
	}

	/*
		Shifting a zero changes nothing.
	*/
	if #force_inline internal_is_zero(quotient) {
		return nil
	}

	/*
		Make room for the extra digits.
	*/
	old_used := quotient.used
	#force_inline internal_grow(quotient, old_used + digits) or_return

	/*
		Move the existing digits up by `digits` places, from the top down so that no
		digit is overwritten before it has been moved.
		This mirrors `_private_int_shr_leg`, with the window sliding the other way.
	*/
	#no_bounds_check for src := old_used - 1; src >= 0; src -= 1 {
		quotient.digit[src + digits] = quotient.digit[src]
	}

	/*
		Account for the new digits and clear the vacated low ones.
	*/
	quotient.used += digits
	mem.zero_slice(quotient.digit[:digits])
	return nil
}
|
|
|
-
|
|
|
/*
	Shift right by `digits` * _DIGIT_BITS bits, i.e. by whole digits ("legs").
	A non-positive `digits` is a no-op.
*/
_private_int_shr_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if digits <= 0 {
		return nil
	}

	/*
		Shifting out more digits than are in use leaves zero.
	*/
	if digits > quotient.used {
		return internal_zero(quotient)
	}

	/*
		Slide every surviving digit down by `digits` places, lowest first:

		b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
		            /\                   |      ---->
		             \-------------------/      ---->
	*/
	remaining := quotient.used - digits
	#no_bounds_check for dst := 0; dst < remaining; dst += 1 {
		quotient.digit[dst] = quotient.digit[dst + digits]
	}

	/*
		Shrink, clear what is no longer in use, and normalize.
	*/
	quotient.used = remaining
	internal_zero_unused(quotient)
	return internal_clamp(quotient)
}
|
|
|
-
|
|
|
-/*
|
|
|
- ======================== End of private procedures =======================
|
|
|
-
|
|
|
- =============================== Private tables ===============================
|
|
|
-
|
|
|
- Tables used by `internal_*` and `_*`.
|
|
|
-*/
|
|
|
-
|
|
|
/*
	Lookup table indexed by a residue modulo 128.
	An entry is 0 where the index is a square modulo 128 (0, 1, 4, 9, 16, 17, 25, ...) and 1 otherwise.
	NOTE(review): presumably used as a quick "cannot be a perfect square" filter - confirm against the caller.
*/
_private_int_rem_128 := [128]DIGIT{
	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
}
#assert(128 * size_of(DIGIT) == size_of(_private_int_rem_128))
|
|
|
-
|
|
|
-_private_int_rem_105 := [?]DIGIT{
|
|
|
- 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
|
|
|
- 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
|
|
|
- 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
|
|
|
- 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
|
- 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
|
|
|
- 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
|
|
|
- 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
|
|
|
-}
|
|
|
-#assert(105 * size_of(DIGIT) == size_of(_private_int_rem_105))
|
|
|
-
|
|
|
-_PRIME_TAB_SIZE :: 256
|
|
|
-_private_prime_table := [_PRIME_TAB_SIZE]DIGIT{
|
|
|
- 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
|
|
|
- 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
|
|
|
- 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
|
|
|
- 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
|
|
|
- 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
|
|
|
- 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
|
|
|
- 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
|
|
|
- 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
|
|
|
-
|
|
|
- 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
|
|
|
- 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
|
|
|
- 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
|
|
|
- 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
|
|
|
- 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
|
|
|
- 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
|
|
|
- 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
|
|
|
- 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
|
|
|
-
|
|
|
- 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
|
|
|
- 0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
|
|
|
- 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
|
|
|
- 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
|
|
|
- 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
|
|
|
- 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
|
|
|
- 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
|
|
|
- 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
|
|
|
-
|
|
|
- 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
|
|
|
- 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
|
|
|
- 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
|
|
|
- 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
|
|
|
- 0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
|
|
|
- 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
|
|
|
- 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
|
|
|
- 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653,
|
|
|
-}
|
|
|
-#assert(_PRIME_TAB_SIZE * size_of(DIGIT) == size_of(_private_prime_table))
|
|
|
-
|
|
|
-when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
|
|
|
- _factorial_table := [35]_WORD{
|
|
|
-/* f(00): */ 1,
|
|
|
-/* f(01): */ 1,
|
|
|
-/* f(02): */ 2,
|
|
|
-/* f(03): */ 6,
|
|
|
-/* f(04): */ 24,
|
|
|
-/* f(05): */ 120,
|
|
|
-/* f(06): */ 720,
|
|
|
-/* f(07): */ 5_040,
|
|
|
-/* f(08): */ 40_320,
|
|
|
-/* f(09): */ 362_880,
|
|
|
-/* f(10): */ 3_628_800,
|
|
|
-/* f(11): */ 39_916_800,
|
|
|
-/* f(12): */ 479_001_600,
|
|
|
-/* f(13): */ 6_227_020_800,
|
|
|
-/* f(14): */ 87_178_291_200,
|
|
|
-/* f(15): */ 1_307_674_368_000,
|
|
|
-/* f(16): */ 20_922_789_888_000,
|
|
|
-/* f(17): */ 355_687_428_096_000,
|
|
|
-/* f(18): */ 6_402_373_705_728_000,
|
|
|
-/* f(19): */ 121_645_100_408_832_000,
|
|
|
-/* f(20): */ 2_432_902_008_176_640_000,
|
|
|
-/* f(21): */ 51_090_942_171_709_440_000,
|
|
|
-/* f(22): */ 1_124_000_727_777_607_680_000,
|
|
|
-/* f(23): */ 25_852_016_738_884_976_640_000,
|
|
|
-/* f(24): */ 620_448_401_733_239_439_360_000,
|
|
|
-/* f(25): */ 15_511_210_043_330_985_984_000_000,
|
|
|
-/* f(26): */ 403_291_461_126_605_635_584_000_000,
|
|
|
-/* f(27): */ 10_888_869_450_418_352_160_768_000_000,
|
|
|
-/* f(28): */ 304_888_344_611_713_860_501_504_000_000,
|
|
|
-/* f(29): */ 8_841_761_993_739_701_954_543_616_000_000,
|
|
|
-/* f(30): */ 265_252_859_812_191_058_636_308_480_000_000,
|
|
|
-/* f(31): */ 8_222_838_654_177_922_817_725_562_880_000_000,
|
|
|
-/* f(32): */ 263_130_836_933_693_530_167_218_012_160_000_000,
|
|
|
-/* f(33): */ 8_683_317_618_811_886_495_518_194_401_280_000_000,
|
|
|
-/* f(34): */ 295_232_799_039_604_140_847_618_609_643_520_000_000,
|
|
|
- }
|
|
|
-} else {
|
|
|
- _factorial_table := [21]_WORD{
|
|
|
-/* f(00): */ 1,
|
|
|
-/* f(01): */ 1,
|
|
|
-/* f(02): */ 2,
|
|
|
-/* f(03): */ 6,
|
|
|
-/* f(04): */ 24,
|
|
|
-/* f(05): */ 120,
|
|
|
-/* f(06): */ 720,
|
|
|
-/* f(07): */ 5_040,
|
|
|
-/* f(08): */ 40_320,
|
|
|
-/* f(09): */ 362_880,
|
|
|
-/* f(10): */ 3_628_800,
|
|
|
-/* f(11): */ 39_916_800,
|
|
|
-/* f(12): */ 479_001_600,
|
|
|
-/* f(13): */ 6_227_020_800,
|
|
|
-/* f(14): */ 87_178_291_200,
|
|
|
-/* f(15): */ 1_307_674_368_000,
|
|
|
-/* f(16): */ 20_922_789_888_000,
|
|
|
-/* f(17): */ 355_687_428_096_000,
|
|
|
-/* f(18): */ 6_402_373_705_728_000,
|
|
|
-/* f(19): */ 121_645_100_408_832_000,
|
|
|
-/* f(20): */ 2_432_902_008_176_640_000,
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
- ========================= End of private tables ========================
|
|
|
+/*
|
|
|
+ Copyright 2021 Jeroen van Rijn <[email protected]>.
|
|
|
+ Made available under Odin's BSD-3 license.
|
|
|
+
|
|
|
+ An arbitrary precision mathematics implementation in Odin.
|
|
|
+ For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
|
|
|
+ The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
|
|
|
+
|
|
|
+ ============================= Private procedures =============================
|
|
|
+
|
|
|
+ Private procedures used by the above low-level routines follow.
|
|
|
+
|
|
|
+ Don't call these yourself unless you really know what you're doing.
|
|
|
+ They include implementations that are optimal for certain ranges of input only.
|
|
|
+
|
|
|
+ These aren't exported for the same reasons.
|
|
|
+*/
|
|
|
+
|
|
|
+
|
|
|
+package math_big
|
|
|
+
|
|
|
+import "base:intrinsics"
|
|
|
+import "core:mem"
|
|
|
+
|
|
|
+/*
|
|
|
+ Multiplies |a| * |b| and only computes up to `digits` digits of the result.
|
|
|
+ HAC pp. 595, Algorithm 14.12 Modified so you can control how
|
|
|
+ many digits of output are created.
|
|
|
+*/
|
|
|
+_private_int_mul :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ /*
|
|
|
+ Can we use the fast multiplier?
|
|
|
+ */
|
|
|
+ if digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
|
|
|
+ return #force_inline _private_int_mul_comba(dest, a, b, digits)
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Set up temporary output `Int`, which we'll swap for `dest` when done.
|
|
|
+ */
|
|
|
+
|
|
|
+ t := &Int{}
|
|
|
+
|
|
|
+ internal_grow(t, max(digits, _DEFAULT_DIGIT_COUNT)) or_return
|
|
|
+ t.used = digits
|
|
|
+
|
|
|
+ /*
|
|
|
+ Compute the digits of the product directly.
|
|
|
+ */
|
|
|
+ pa := a.used
|
|
|
+ for ix := 0; ix < pa; ix += 1 {
|
|
|
+ /*
|
|
|
+ Limit ourselves to `digits` DIGITs of output.
|
|
|
+ */
|
|
|
+ pb := min(b.used, digits - ix)
|
|
|
+ carry := _WORD(0)
|
|
|
+ iy := 0
|
|
|
+
|
|
|
+ /*
|
|
|
+ Compute the column of the output and propagate the carry.
|
|
|
+ */
|
|
|
+ #no_bounds_check for iy = 0; iy < pb; iy += 1 {
|
|
|
+ /*
|
|
|
+ Compute the column as a _WORD.
|
|
|
+ */
|
|
|
+ column := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + carry
|
|
|
+
|
|
|
+ /*
|
|
|
+ The new column is the lower part of the result.
|
|
|
+ */
|
|
|
+ t.digit[ix + iy] = DIGIT(column & _WORD(_MASK))
|
|
|
+
|
|
|
+ /*
|
|
|
+ Get the carry word from the result.
|
|
|
+ */
|
|
|
+ carry = column >> _DIGIT_BITS
|
|
|
+ }
|
|
|
+ /*
|
|
|
+ Set carry if it is placed below digits
|
|
|
+ */
|
|
|
+ if ix + iy < digits {
|
|
|
+ t.digit[ix + pb] = DIGIT(carry)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ internal_swap(dest, t)
|
|
|
+ internal_destroy(t)
|
|
|
+ return internal_clamp(dest)
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+/*
|
|
|
+ Multiplication using the Toom-Cook 3-way algorithm.
|
|
|
+
|
|
|
+ Much more complicated than Karatsuba but has a lower asymptotic running time of O(N**1.464).
|
|
|
+ This algorithm is only particularly useful on VERY large inputs.
|
|
|
+ (We're talking 1000s of digits here...).
|
|
|
+
|
|
|
+ This file contains code from J. Arndt's book "Matters Computational"
|
|
|
+ and the accompanying FXT-library with permission of the author.
|
|
|
+
|
|
|
+ Setup from:
|
|
|
+ Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
|
|
|
+ 18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.
|
|
|
+
|
|
|
+ The interpolation from above needed one temporary variable more than the interpolation here:
|
|
|
+
|
|
|
+ Bodrato, Marco, and Alberto Zanoni. "What about Toom-Cook matrices optimality."
|
|
|
+ Centro Vito Volterra Universita di Roma Tor Vergata (2006)
|
|
|
+*/
|
|
|
+_private_int_mul_toom :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ S1, S2, T1, a0, a1, a2, b0, b1, b2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
|
|
|
+ defer internal_destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Init temps.
|
|
|
+ */
|
|
|
+ internal_init_multi(S1, S2, T1) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ B
|
|
|
+ */
|
|
|
+ B := min(a.used, b.used) / 3
|
|
|
+
|
|
|
+ /*
|
|
|
+ a = a2 * x^2 + a1 * x + a0;
|
|
|
+ */
|
|
|
+ internal_grow(a0, B) or_return
|
|
|
+ internal_grow(a1, B) or_return
|
|
|
+ internal_grow(a2, a.used - 2 * B) or_return
|
|
|
+
|
|
|
+ a0.used, a1.used = B, B
|
|
|
+ a2.used = a.used - 2 * B
|
|
|
+
|
|
|
+ internal_copy_digits(a0, a, a0.used) or_return
|
|
|
+ internal_copy_digits(a1, a, a1.used, B) or_return
|
|
|
+ internal_copy_digits(a2, a, a2.used, 2 * B) or_return
|
|
|
+
|
|
|
+ internal_clamp(a0)
|
|
|
+ internal_clamp(a1)
|
|
|
+ internal_clamp(a2)
|
|
|
+
|
|
|
+ /*
|
|
|
+ b = b2 * x^2 + b1 * x + b0;
|
|
|
+ */
|
|
|
+ internal_grow(b0, B) or_return
|
|
|
+ internal_grow(b1, B) or_return
|
|
|
+ internal_grow(b2, b.used - 2 * B) or_return
|
|
|
+
|
|
|
+ b0.used, b1.used = B, B
|
|
|
+ b2.used = b.used - 2 * B
|
|
|
+
|
|
|
+ internal_copy_digits(b0, b, b0.used) or_return
|
|
|
+ internal_copy_digits(b1, b, b1.used, B) or_return
|
|
|
+ internal_copy_digits(b2, b, b2.used, 2 * B) or_return
|
|
|
+
|
|
|
+ internal_clamp(b0)
|
|
|
+ internal_clamp(b1)
|
|
|
+ internal_clamp(b2)
|
|
|
+
|
|
|
+
|
|
|
+ /*
|
|
|
+ \\ S1 = (a2+a1+a0) * (b2+b1+b0);
|
|
|
+ */
|
|
|
+ internal_add(T1, a2, a1) or_return /* T1 = a2 + a1; */
|
|
|
+ internal_add(S2, T1, a0) or_return /* S2 = T1 + a0; */
|
|
|
+ internal_add(dest, b2, b1) or_return /* dest = b2 + b1; */
|
|
|
+ internal_add(S1, dest, b0) or_return /* S1 = c + b0; */
|
|
|
+ internal_mul(S1, S1, S2) or_return /* S1 = S1 * S2; */
|
|
|
+
|
|
|
+ /*
|
|
|
+ \\S2 = (4*a2+2*a1+a0) * (4*b2+2*b1+b0);
|
|
|
+ */
|
|
|
+ internal_add(T1, T1, a2) or_return /* T1 = T1 + a2; */
|
|
|
+ internal_int_shl1(T1, T1) or_return /* T1 = T1 << 1; */
|
|
|
+ internal_add(T1, T1, a0) or_return /* T1 = T1 + a0; */
|
|
|
+ internal_add(dest, dest, b2) or_return /* c = c + b2; */
|
|
|
+ internal_int_shl1(dest, dest) or_return /* c = c << 1; */
|
|
|
+ internal_add(dest, dest, b0) or_return /* c = c + b0; */
|
|
|
+ internal_mul(S2, T1, dest) or_return /* S2 = T1 * c; */
|
|
|
+
|
|
|
+ /*
|
|
|
+ \\S3 = (a2-a1+a0) * (b2-b1+b0);
|
|
|
+ */
|
|
|
+ internal_sub(a1, a2, a1) or_return /* a1 = a2 - a1; */
|
|
|
+ internal_add(a1, a1, a0) or_return /* a1 = a1 + a0; */
|
|
|
+ internal_sub(b1, b2, b1) or_return /* b1 = b2 - b1; */
|
|
|
+ internal_add(b1, b1, b0) or_return /* b1 = b1 + b0; */
|
|
|
+ internal_mul(a1, a1, b1) or_return /* a1 = a1 * b1; */
|
|
|
+ internal_mul(b1, a2, b2) or_return /* b1 = a2 * b2; */
|
|
|
+
|
|
|
+ /*
|
|
|
+ \\S2 = (S2 - S3) / 3;
|
|
|
+ */
|
|
|
+ internal_sub(S2, S2, a1) or_return /* S2 = S2 - a1; */
|
|
|
+ _private_int_div_3(S2, S2) or_return /* S2 = S2 / 3; \\ this is an exact division */
|
|
|
+ internal_sub(a1, S1, a1) or_return /* a1 = S1 - a1; */
|
|
|
+ internal_int_shr1(a1, a1) or_return /* a1 = a1 >> 1; */
|
|
|
+ internal_mul(a0, a0, b0) or_return /* a0 = a0 * b0; */
|
|
|
+ internal_sub(S1, S1, a0) or_return /* S1 = S1 - a0; */
|
|
|
+ internal_sub(S2, S2, S1) or_return /* S2 = S2 - S1; */
|
|
|
+ internal_int_shr1(S2, S2) or_return /* S2 = S2 >> 1; */
|
|
|
+ internal_sub(S1, S1, a1) or_return /* S1 = S1 - a1; */
|
|
|
+ internal_sub(S1, S1, b1) or_return /* S1 = S1 - b1; */
|
|
|
+ internal_int_shl1(T1, b1) or_return /* T1 = b1 << 1; */
|
|
|
+ internal_sub(S2, S2, T1) or_return /* S2 = S2 - T1; */
|
|
|
+ internal_sub(a1, a1, S2) or_return /* a1 = a1 - S2; */
|
|
|
+
|
|
|
+ /*
|
|
|
+ P = b1*x^4+ S2*x^3+ S1*x^2+ a1*x + a0;
|
|
|
+ */
|
|
|
+ _private_int_shl_leg(b1, 4 * B) or_return
|
|
|
+ _private_int_shl_leg(S2, 3 * B) or_return
|
|
|
+ internal_add(b1, b1, S2) or_return
|
|
|
+ _private_int_shl_leg(S1, 2 * B) or_return
|
|
|
+ internal_add(b1, b1, S1) or_return
|
|
|
+ _private_int_shl_leg(a1, 1 * B) or_return
|
|
|
+ internal_add(b1, b1, a1) or_return
|
|
|
+ internal_add(dest, b1, a0) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ a * b - P
|
|
|
+ */
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ product = |a| * |b| using Karatsuba Multiplication using three half size multiplications.
|
|
|
+
|
|
|
+ Let `B` represent the radix [e.g. 2**_DIGIT_BITS] and let `n` represent
|
|
|
+ half of the number of digits in the min(a,b)
|
|
|
+
|
|
|
+ `a` = `a1` * `B`**`n` + `a0`
|
|
|
+ `b` = `b1` * `B`**`n` + `b0`
|
|
|
+
|
|
|
+ Then, a * b => a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B**n + a0b0
|
|
|
+
|
|
|
+ Note that a1b1 and a0b0 are used twice and only need to be computed once.
|
|
|
+ So in total three half size (half # of digit) multiplications are performed,
|
|
|
+ a0b0, a1b1 and (a1+a0)(b1+b0)
|
|
|
+
|
|
|
+ Note that a multiplication of half the digits requires 1/4th the number of
|
|
|
+ single precision multiplications, so in total after one call 25% of the
|
|
|
+ single precision multiplications are saved.
|
|
|
+
|
|
|
+ Note also that the call to `internal_mul` can end up back in this function
|
|
|
+ if the a0, a1, b0, or b1 are above the threshold.
|
|
|
+
|
|
|
+ This is known as divide-and-conquer and leads to the famous O(N**lg(3)) or O(N**1.584)
|
|
|
+ work which is asymptotically lower than the standard O(N**2) that the
|
|
|
+ baseline/comba methods use. Generally though, the overhead of this method doesn't pay off
|
|
|
+ until a certain size is reached, of around 80 used DIGITs.
|
|
|
+*/
|
|
|
+_private_int_mul_karatsuba :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ x0, x1, y0, y1, t1, x0y0, x1y1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
|
|
|
+ defer internal_destroy(x0, x1, y0, y1, t1, x0y0, x1y1)
|
|
|
+
|
|
|
+ /*
|
|
|
+ min # of digits, divided by two.
|
|
|
+ */
|
|
|
+ B := min(a.used, b.used) >> 1
|
|
|
+
|
|
|
+ /*
|
|
|
+ Init all the temps.
|
|
|
+ */
|
|
|
+ internal_grow(x0, B) or_return
|
|
|
+ internal_grow(x1, a.used - B) or_return
|
|
|
+ internal_grow(y0, B) or_return
|
|
|
+ internal_grow(y1, b.used - B) or_return
|
|
|
+ internal_grow(t1, B * 2) or_return
|
|
|
+ internal_grow(x0y0, B * 2) or_return
|
|
|
+ internal_grow(x1y1, B * 2) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now shift the digits.
|
|
|
+ */
|
|
|
+ x0.used, y0.used = B, B
|
|
|
+ x1.used = a.used - B
|
|
|
+ y1.used = b.used - B
|
|
|
+
|
|
|
+ /*
|
|
|
+ We copy the digits directly instead of using higher level functions
|
|
|
+ since we also need to shift the digits.
|
|
|
+ */
|
|
|
+ internal_copy_digits(x0, a, x0.used)
|
|
|
+ internal_copy_digits(y0, b, y0.used)
|
|
|
+ internal_copy_digits(x1, a, x1.used, B)
|
|
|
+ internal_copy_digits(y1, b, y1.used, B)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Only need to clamp the lower words since by definition the
|
|
|
+ upper words x1/y1 must have a known number of digits.
|
|
|
+ */
|
|
|
+ clamp(x0)
|
|
|
+ clamp(y0)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now calc the products x0y0 and x1y1,
|
|
|
+ after this x0 is no longer required, free temp [x0==t2]!
|
|
|
+ */
|
|
|
+ internal_mul(x0y0, x0, y0) or_return /* x0y0 = x0*y0 */
|
|
|
+ internal_mul(x1y1, x1, y1) or_return /* x1y1 = x1*y1 */
|
|
|
+ internal_add(t1, x1, x0) or_return /* now calc x1+x0 and */
|
|
|
+ internal_add(x0, y1, y0) or_return /* t2 = y1 + y0 */
|
|
|
+ internal_mul(t1, t1, x0) or_return /* t1 = (x1 + x0) * (y1 + y0) */
|
|
|
+
|
|
|
+ /*
|
|
|
+ Add x0y0.
|
|
|
+ */
|
|
|
+ internal_add(x0, x0y0, x1y1) or_return /* t2 = x0y0 + x1y1 */
|
|
|
+ internal_sub(t1, t1, x0) or_return /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
|
|
|
+
|
|
|
+ /*
|
|
|
+ shift by B.
|
|
|
+ */
|
|
|
+ _private_int_shl_leg(t1, B) or_return /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
|
|
|
+ _private_int_shl_leg(x1y1, B * 2) or_return /* x1y1 = x1y1 << 2*B */
|
|
|
+
|
|
|
+ internal_add(t1, x0y0, t1) or_return /* t1 = x0y0 + t1 */
|
|
|
+ internal_add(dest, t1, x1y1) or_return /* t1 = x0y0 + t1 + x1y1 */
|
|
|
+
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+/*
|
|
|
+ Fast (comba) multiplier
|
|
|
+
|
|
|
+ This is the fast column-array [comba] multiplier. It is
|
|
|
+ designed to compute the columns of the product first
|
|
|
+ then handle the carries afterwards. This has the effect
|
|
|
+ of making the nested loops that compute the columns very
|
|
|
+ simple and schedulable on super-scalar processors.
|
|
|
+
|
|
|
+ This has been modified to produce a variable number of
|
|
|
+ digits of output so if say only a half-product is required
|
|
|
+ you don't have to compute the upper half (a feature
|
|
|
+ required for fast Barrett reduction).
|
|
|
+
|
|
|
+ Based on Algorithm 14.12 on pp.595 of HAC.
|
|
|
+*/
|
|
|
+_private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ /*
|
|
|
+ Set up array.
|
|
|
+ */
|
|
|
+ W: [_WARRAY]DIGIT = ---
|
|
|
+
|
|
|
+ /*
|
|
|
+ Grow the destination as required.
|
|
|
+ */
|
|
|
+ internal_grow(dest, digits) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Number of output digits to produce.
|
|
|
+ */
|
|
|
+ pa := min(digits, a.used + b.used)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Clear the carry
|
|
|
+ */
|
|
|
+ _W := _WORD(0)
|
|
|
+
|
|
|
+ ix: int
|
|
|
+ for ix = 0; ix < pa; ix += 1 {
|
|
|
+ tx, ty, iy, iz: int
|
|
|
+
|
|
|
+ /*
|
|
|
+ Get offsets into the two bignums.
|
|
|
+ */
|
|
|
+ ty = min(b.used - 1, ix)
|
|
|
+ tx = ix - ty
|
|
|
+
|
|
|
+ /*
|
|
|
+ This is the number of times the loop will iterate, essentially.
|
|
|
+ while (tx++ < a->used && ty-- >= 0) { ... }
|
|
|
+ */
|
|
|
+
|
|
|
+ iy = min(a.used - tx, ty + 1)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Execute loop.
|
|
|
+ */
|
|
|
+ #no_bounds_check for iz = 0; iz < iy; iz += 1 {
|
|
|
+ _W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz])
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Store term.
|
|
|
+ */
|
|
|
+ W[ix] = DIGIT(_W) & _MASK
|
|
|
+
|
|
|
+ /*
|
|
|
+ Make next carry.
|
|
|
+ */
|
|
|
+ _W = _W >> _WORD(_DIGIT_BITS)
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Setup dest.
|
|
|
+ */
|
|
|
+ old_used := dest.used
|
|
|
+ dest.used = pa
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now extract the previous digit [below the carry].
|
|
|
+ */
|
|
|
+ copy_slice(dest.digit[0:], W[:pa])
|
|
|
+
|
|
|
+ /*
|
|
|
+ Clear unused digits [that existed in the old copy of dest].
|
|
|
+ */
|
|
|
+ internal_zero_unused(dest, old_used)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Adjust dest.used based on leading zeroes.
|
|
|
+ */
|
|
|
+
|
|
|
+ return internal_clamp(dest)
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Multiplies |a| * |b| and does not compute the lower `digits` digits
|
|
|
+ [meant to get the higher part of the product]
|
|
|
+*/
|
|
|
+_private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ /*
|
|
|
+ Can we use the fast multiplier?
|
|
|
+ */
|
|
|
+ if a.used + b.used + 1 < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
|
|
|
+ return _private_int_mul_high_comba(dest, a, b, digits)
|
|
|
+ }
|
|
|
+
|
|
|
+ internal_grow(dest, a.used + b.used + 1) or_return
|
|
|
+ dest.used = a.used + b.used + 1
|
|
|
+
|
|
|
+ pa := a.used
|
|
|
+ pb := b.used
|
|
|
+ for ix := 0; ix < pa; ix += 1 {
|
|
|
+ carry := DIGIT(0)
|
|
|
+
|
|
|
+ for iy := digits - ix; iy < pb; iy += 1 {
|
|
|
+ /*
|
|
|
+ Calculate the double precision result.
|
|
|
+ */
|
|
|
+ r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Get the lower part.
|
|
|
+ */
|
|
|
+ dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
|
|
|
+
|
|
|
+ /*
|
|
|
+ Carry the carry.
|
|
|
+ */
|
|
|
+ carry = DIGIT(r >> _WORD(_DIGIT_BITS))
|
|
|
+ }
|
|
|
+ dest.digit[ix + pb] = carry
|
|
|
+ }
|
|
|
+ return internal_clamp(dest)
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ This is a modified version of `_private_int_mul_comba` that only produces output digits *above* `digits`.
|
|
|
+ See the comments for `_private_int_mul_comba` to see how it works.
|
|
|
+
|
|
|
+ This is used in the Barrett reduction since for one of the multiplications
|
|
|
+ only the higher digits were needed. This essentially halves the work.
|
|
|
+
|
|
|
+ Based on Algorithm 14.12 on pp.595 of HAC.
|
|
|
+*/
|
|
|
+_private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ W: [_WARRAY]DIGIT = ---
|
|
|
+ _W: _WORD = 0
|
|
|
+
|
|
|
+ /*
|
|
|
+ Number of output digits to produce. Grow the destination as required.
|
|
|
+ */
|
|
|
+ pa := a.used + b.used
|
|
|
+ internal_grow(dest, pa) or_return
|
|
|
+
|
|
|
+ ix: int
|
|
|
+ for ix = digits; ix < pa; ix += 1 {
|
|
|
+ /*
|
|
|
+ Get offsets into the two bignums.
|
|
|
+ */
|
|
|
+ ty := min(b.used - 1, ix)
|
|
|
+ tx := ix - ty
|
|
|
+
|
|
|
+ /*
|
|
|
+ This is the number of times the loop will iterate, essentially it's
|
|
|
+ while (tx++ < a->used && ty-- >= 0) { ... }
|
|
|
+ */
|
|
|
+ iy := min(a.used - tx, ty + 1)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Execute loop.
|
|
|
+ */
|
|
|
+ for iz := 0; iz < iy; iz += 1 {
|
|
|
+ _W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz])
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Store term.
|
|
|
+ */
|
|
|
+ W[ix] = DIGIT(_W) & DIGIT(_MASK)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Make next carry.
|
|
|
+ */
|
|
|
+ _W = _W >> _WORD(_DIGIT_BITS)
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Setup dest
|
|
|
+ */
|
|
|
+ old_used := dest.used
|
|
|
+ dest.used = pa
|
|
|
+
|
|
|
+ for ix = digits; ix < pa; ix += 1 {
|
|
|
+ /*
|
|
|
+ Now extract the previous digit [below the carry].
|
|
|
+ */
|
|
|
+ dest.digit[ix] = W[ix]
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Zero remainder.
|
|
|
+ */
|
|
|
+ internal_zero_unused(dest, old_used)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Adjust dest.used based on leading zeroes.
|
|
|
+ */
|
|
|
+ return internal_clamp(dest)
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Single-digit multiplication with the smaller number as the single-digit.
|
|
|
+*/
|
|
|
+_private_int_mul_balance :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+ a, b := a, b
|
|
|
+
|
|
|
+ a0, tmp, r := &Int{}, &Int{}, &Int{}
|
|
|
+ defer internal_destroy(a0, tmp, r)
|
|
|
+
|
|
|
+ b_size := min(a.used, b.used)
|
|
|
+ n_blocks := max(a.used, b.used) / b_size
|
|
|
+
|
|
|
+ internal_grow(a0, b_size + 2) or_return
|
|
|
+ internal_init_multi(tmp, r) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Make sure that `a` is the larger one.
|
|
|
+ */
|
|
|
+ if a.used < b.used {
|
|
|
+ a, b = b, a
|
|
|
+ }
|
|
|
+ assert(a.used >= b.used)
|
|
|
+
|
|
|
+ i, j := 0, 0
|
|
|
+ for ; i < n_blocks; i += 1 {
|
|
|
+ /*
|
|
|
+ Cut a slice off of `a`.
|
|
|
+ */
|
|
|
+
|
|
|
+ a0.used = b_size
|
|
|
+ internal_copy_digits(a0, a, a0.used, j)
|
|
|
+ j += a0.used
|
|
|
+ internal_clamp(a0)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Multiply with `b`.
|
|
|
+ */
|
|
|
+ internal_mul(tmp, a0, b) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Shift `tmp` to the correct position.
|
|
|
+ */
|
|
|
+ _private_int_shl_leg(tmp, b_size * i) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Add to output. No carry needed.
|
|
|
+ */
|
|
|
+ internal_add(r, r, tmp) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ The left-overs; there are always left-overs.
|
|
|
+ */
|
|
|
+ if j < a.used {
|
|
|
+ a0.used = a.used - j
|
|
|
+ internal_copy_digits(a0, a, a0.used, j)
|
|
|
+ j += a0.used
|
|
|
+ internal_clamp(a0)
|
|
|
+
|
|
|
+ internal_mul(tmp, a0, b) or_return
|
|
|
+ _private_int_shl_leg(tmp, b_size * i) or_return
|
|
|
+ internal_add(r, r, tmp) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ internal_swap(dest, r)
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Low level squaring, dest = src * src, HAC pp.596-597, Algorithm 14.16
|
|
|
+ Assumes `dest` and `src` to not be `nil`, and `src` to have been initialized.
|
|
|
+*/
|
|
|
+_private_int_sqr :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+ pa := src.used
|
|
|
+
|
|
|
+ t := &Int{}; ix, iy: int
|
|
|
+ /*
|
|
|
+ Grow `t` to maximum needed size, or `_DEFAULT_DIGIT_COUNT`, whichever is bigger.
|
|
|
+ */
|
|
|
+ internal_grow(t, max((2 * pa) + 1, _DEFAULT_DIGIT_COUNT)) or_return
|
|
|
+ t.used = (2 * pa) + 1
|
|
|
+
|
|
|
+ #no_bounds_check for ix = 0; ix < pa; ix += 1 {
|
|
|
+ carry := DIGIT(0)
|
|
|
+ /*
|
|
|
+ First calculate the digit at 2*ix; calculate double precision result.
|
|
|
+ */
|
|
|
+ r := _WORD(t.digit[ix+ix]) + (_WORD(src.digit[ix]) * _WORD(src.digit[ix]))
|
|
|
+
|
|
|
+ /*
|
|
|
+ Store lower part in result.
|
|
|
+ */
|
|
|
+ t.digit[ix+ix] = DIGIT(r & _WORD(_MASK))
|
|
|
+ /*
|
|
|
+ Get the carry.
|
|
|
+ */
|
|
|
+ carry = DIGIT(r >> _DIGIT_BITS)
|
|
|
+
|
|
|
+ #no_bounds_check for iy = ix + 1; iy < pa; iy += 1 {
|
|
|
+ /*
|
|
|
+ First calculate the product.
|
|
|
+ */
|
|
|
+ r = _WORD(src.digit[ix]) * _WORD(src.digit[iy])
|
|
|
+
|
|
|
+ /* Now calculate the double precision result. Note we use
|
|
|
+ * addition instead of *2 since it's easier to optimize
|
|
|
+ */
|
|
|
+ r = _WORD(t.digit[ix+iy]) + r + r + _WORD(carry)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Store lower part.
|
|
|
+ */
|
|
|
+ t.digit[ix+iy] = DIGIT(r & _WORD(_MASK))
|
|
|
+
|
|
|
+ /*
|
|
|
+ Get carry.
|
|
|
+ */
|
|
|
+ carry = DIGIT(r >> _DIGIT_BITS)
|
|
|
+ }
|
|
|
+ /*
|
|
|
+ Propagate upwards.
|
|
|
+ */
|
|
|
+ #no_bounds_check for carry != 0 {
|
|
|
+ r = _WORD(t.digit[ix+iy]) + _WORD(carry)
|
|
|
+ t.digit[ix+iy] = DIGIT(r & _WORD(_MASK))
|
|
|
+ carry = DIGIT(r >> _WORD(_DIGIT_BITS))
|
|
|
+ iy += 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ err = internal_clamp(t)
|
|
|
+ internal_swap(dest, t)
|
|
|
+ internal_destroy(t)
|
|
|
+ return err
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ The gist of squaring...
|
|
|
+ You do like mult except the offset of the tmpx [one that starts closer to zero] can't equal the offset of tmpy.
|
|
|
+ So basically you set up iy like before then you min it with (ty-tx) so that it never happens.
|
|
|
+ You double all those you add in the inner loop. After that loop you do the squares and add them in.
|
|
|
+
|
|
|
+ Assumes `dest` and `src` not to be `nil` and `src` to have been initialized.
|
|
|
+*/
|
|
|
+_private_int_sqr_comba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ W: [_WARRAY]DIGIT = ---
|
|
|
+
|
|
|
+ /*
|
|
|
+ Grow the destination as required.
|
|
|
+ */
|
|
|
+ pa := uint(src.used) + uint(src.used)
|
|
|
+ internal_grow(dest, int(pa)) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Number of output digits to produce.
|
|
|
+ */
|
|
|
+ W1 := _WORD(0)
|
|
|
+ _W : _WORD = ---
|
|
|
+ ix := uint(0)
|
|
|
+
|
|
|
+ #no_bounds_check for ; ix < pa; ix += 1 {
|
|
|
+ /*
|
|
|
+ Clear counter.
|
|
|
+ */
|
|
|
+ _W = {}
|
|
|
+
|
|
|
+ /*
|
|
|
+ Get offsets into the two bignums.
|
|
|
+ */
|
|
|
+ ty := min(uint(src.used) - 1, ix)
|
|
|
+ tx := ix - ty
|
|
|
+
|
|
|
+ /*
|
|
|
+ This is the number of times the loop will iterate,
|
|
|
+ essentially while (tx++ < a->used && ty-- >= 0) { ... }
|
|
|
+ */
|
|
|
+ iy := min(uint(src.used) - tx, ty + 1)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now for squaring, tx can never equal ty.
|
|
|
+ We halve the distance since they approach at a rate of 2x,
|
|
|
+ and we have to round because odd cases need to be executed.
|
|
|
+ */
|
|
|
+ iy = min(iy, ((ty - tx) + 1) >> 1 )
|
|
|
+
|
|
|
+ /*
|
|
|
+ Execute loop.
|
|
|
+ */
|
|
|
+ #no_bounds_check for iz := uint(0); iz < iy; iz += 1 {
|
|
|
+ _W += _WORD(src.digit[tx + iz]) * _WORD(src.digit[ty - iz])
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Double the inner product and add carry.
|
|
|
+ */
|
|
|
+ _W = _W + _W + W1
|
|
|
+
|
|
|
+ /*
|
|
|
+ Even columns have the square term in them.
|
|
|
+ */
|
|
|
+ if ix & 1 == 0 {
|
|
|
+ _W += _WORD(src.digit[ix >> 1]) * _WORD(src.digit[ix >> 1])
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Store it.
|
|
|
+ */
|
|
|
+ W[ix] = DIGIT(_W & _WORD(_MASK))
|
|
|
+
|
|
|
+ /*
|
|
|
+ Make next carry.
|
|
|
+ */
|
|
|
+ W1 = _W >> _DIGIT_BITS
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Setup dest.
|
|
|
+ */
|
|
|
+ old_used := dest.used
|
|
|
+ dest.used = src.used + src.used
|
|
|
+
|
|
|
+ #no_bounds_check for ix = 0; ix < pa; ix += 1 {
|
|
|
+ dest.digit[ix] = W[ix] & _MASK
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Clear unused digits [that existed in the old copy of dest].
|
|
|
+ */
|
|
|
+ internal_zero_unused(dest, old_used)
|
|
|
+
|
|
|
+ return internal_clamp(dest)
|
|
|
+}
|
|
|
+
|
|
|
/*
	Karatsuba squaring: `dest` = `src` * `src`, built from three half-size squarings.

	With `src` split at digit `B` into a low part `x0` and a high part `x1`, the result is assembled as

		x1^2 shifted up by 2B digits
		+ ((x0 + x1)^2 - (x0^2 + x1^2)) shifted up by B digits
		+ x0^2

	See the comments of `_private_int_mul_karatsuba` for details.
	This is essentially the same algorithm, tuned to perform recursive squarings.
*/
_private_int_sqr_karatsuba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	lo, hi, cross, squares, lo_sq, hi_sq := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(lo, hi, cross, squares, lo_sq, hi_sq)

	/*
		Split point: number of digits divided by two (rounded down).
		The high half receives whatever digits remain.
	*/
	split    := src.used >> 1
	hi_count := src.used - split

	/*
		Reserve room for every temporary up front.
	*/
	internal_grow(lo,      split)        or_return
	internal_grow(hi,      hi_count)     or_return
	internal_grow(cross,   src.used * 2) or_return
	internal_grow(squares, src.used * 2) or_return
	internal_grow(lo_sq,   split * 2)    or_return
	internal_grow(hi_sq,   hi_count * 2) or_return

	/*
		Copy the low and high digits of `src` into their halves.
		Only the low half is clamped afterwards.
	*/
	lo.used = split
	hi.used = hi_count

	#force_inline internal_copy_digits(lo, src, lo.used)
	#force_inline mem.copy_non_overlapping(&hi.digit[0], &src.digit[split], size_of(DIGIT) * hi.used)
	#force_inline internal_clamp(lo)

	/*
		The two "pure" squares: x0*x0 and x1*x1.
	*/
	internal_sqr(lo_sq, lo) or_return
	internal_sqr(hi_sq, hi) or_return

	/*
		cross = (x1 + x0)^2
	*/
	internal_add(cross, lo, hi)  or_return
	internal_sqr(cross, cross)   or_return

	/*
		cross = (x1 + x0)^2 - (x0*x0 + x1*x1)
	*/
	internal_add(squares, lo_sq, hi_sq) or_return
	internal_sub(cross, cross, squares) or_return

	/*
		Shift the middle term up by `split` digits and the high square by twice that,
		then sum the three terms into `dest`.
	*/
	_private_int_shl_leg(cross, split)     or_return
	_private_int_shl_leg(hi_sq, split * 2) or_return
	internal_add(cross, cross, lo_sq) or_return
	internal_add(dest,  cross, hi_sq) or_return

	return #force_inline internal_clamp(dest)
}
|
|
|
+
|
|
|
/*
	Squaring using the Toom-Cook 3-way algorithm.

	Setup and interpolation from algorithm SQR_3 in Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
	18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.

	`src` is cut into three pieces `lo`, `mid`, `hi` of `third`, `third` and the remaining digits.
	The temporaries are reused for the intermediate values S0..S4 of the paper;
	`dest` doubles as the scratch variable the paper calls `b` / `tmp`.
*/
_private_int_sqr_toom :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	sq0, lo, mid, hi := &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(sq0, lo, mid, hi)

	/*
		Init temps.
	*/
	internal_zero(sq0) or_return

	/*
		Size of the two lower pieces; the top piece takes the remainder.
	*/
	third    := src.used / 3
	hi_count := src.used - (2 * third)

	/*
		src = hi * x^2 + mid * x + lo
	*/
	internal_grow(lo,  third)    or_return
	internal_grow(mid, third)    or_return
	internal_grow(hi,  hi_count) or_return

	lo.used  = third
	mid.used = third
	hi.used  = hi_count

	#force_inline mem.copy_non_overlapping(&lo.digit[0],  &src.digit[        0], size_of(DIGIT) * lo.used)
	#force_inline mem.copy_non_overlapping(&mid.digit[0], &src.digit[    third], size_of(DIGIT) * mid.used)
	#force_inline mem.copy_non_overlapping(&hi.digit[0],  &src.digit[2 * third], size_of(DIGIT) * hi.used)

	internal_clamp(lo)
	internal_clamp(mid)
	internal_clamp(hi)

	/*
		S0 = lo^2
	*/
	internal_sqr(sq0, lo) or_return

	/*
		Targets: S1 = (hi + mid + lo)^2 and S2 = (hi - mid + lo)^2.
		First lo = lo + hi ...
	*/
	internal_add(lo, lo, hi) or_return

	/*
		... then dest = lo - mid, i.e. (hi - mid + lo) ...
	*/
	internal_sub(dest, lo, mid) or_return

	/*
		... and lo = lo + mid, i.e. (hi + mid + lo).
	*/
	internal_add(lo, lo, mid) or_return

	/*
		S1: lo = lo^2
	*/
	internal_sqr(lo, lo) or_return

	/*
		S2: dest = dest^2
	*/
	internal_sqr(dest, dest) or_return

	/*
		S3 = 2 * mid * hi, computed as a product followed by a one-bit left shift.
	*/
	internal_mul(mid, mid, hi) or_return
	internal_shl(mid, mid, 1)  or_return

	/*
		S4: hi = hi^2
	*/
	internal_sqr(hi, hi) or_return

	/*
		tmp = (S1 + S2) / 2, kept in dest.
	*/
	internal_add(dest, lo, dest) or_return
	internal_shr(dest, dest, 1)  or_return

	/*
		S1 = S1 - tmp - S3, kept in lo.
	*/
	internal_sub(lo, lo, dest) or_return
	internal_sub(lo, lo, mid)  or_return

	/*
		S2 = tmp - S4 - S0, kept in dest.
	*/
	internal_sub(dest, dest, hi)  or_return
	internal_sub(dest, dest, sq0) or_return

	/*
		P = S4*x^4 + S3*x^3 + S2*x^2 + S1*x + S0
		  = hi*x^4 + mid*x^3 + dest*x^2 + lo*x + sq0
		where multiplying by x^k is a shift by k * `third` digits.
	*/
	_private_int_shl_leg(  hi, 4 * third) or_return
	_private_int_shl_leg( mid, 3 * third) or_return
	_private_int_shl_leg(dest, 2 * third) or_return
	_private_int_shl_leg(  lo, 1 * third) or_return

	internal_add(hi,   hi,   mid) or_return
	internal_add(dest, dest, hi)  or_return
	internal_add(dest, dest, lo)  or_return
	internal_add(dest, dest, sq0) or_return

	return #force_inline internal_clamp(dest)
}
|
|
|
+
|
|
|
/*
	Divide by three (based on routine from MPI and the GMP manual).

	Walks the digits of `numerator` from most to least significant, carrying the running
	remainder in `w`. Each step multiplies by a truncated reciprocal of three to estimate the
	quotient digit, then fixes the estimate up because that reciprocal is not exact.

	`quotient` is optional: when it is `nil` only the remainder is computed.
	Returns the remainder digit and any error raised while growing or clamping the quotient.
*/
_private_int_div_3 :: proc(quotient, numerator: ^Int, allocator := context.allocator) -> (remainder: DIGIT, err: Error) {
	context.allocator = allocator

	/*
		b = 2^_DIGIT_BITS / 3
	*/
	b := _WORD(1) << _WORD(_DIGIT_BITS) / _WORD(3)

	q := &Int{}
	/*
		Release the scratch quotient on every exit path, including early `or_return`s.
		After the swap below it holds the caller's previous `quotient` storage.
	*/
	defer internal_destroy(q)

	internal_grow(q, numerator.used) or_return
	q.used = numerator.used
	q.sign = numerator.sign

	w, t: _WORD
	/*
		Start at the most significant *used* digit, index `used - 1`.
		Starting at `used` would read and write one digit past the used range,
		and this loop runs with bounds checking disabled.
	*/
	#no_bounds_check for ix := numerator.used - 1; ix >= 0; ix -= 1 {
		w = (w << _WORD(_DIGIT_BITS)) | _WORD(numerator.digit[ix])
		if w >= 3 {
			/*
				Multiply w by [1/3].
			*/
			t = (w * b) >> _WORD(_DIGIT_BITS)

			/*
				Now subtract 3 * [w/3] from w, to get the remainder.
			*/
			w -= t+t+t

			/*
				Fixup the remainder as required since the optimization is not exact.
			*/
			for w >= 3 {
				t += 1
				w -= 3
			}
		} else {
			t = 0
		}
		q.digit[ix] = DIGIT(t)
	}
	remainder = DIGIT(w)

	/*
		[optional] store the quotient.
		A clamp failure is propagated rather than silently dropped.
	*/
	if quotient != nil {
		internal_clamp(q) or_return
		internal_swap(q, quotient)
	}
	return remainder, nil
}
|
|
|
+
|
|
|
/*
	Signed Integer Division

	c*b + d == a [i.e. a/b, c=quotient, d=remainder], HAC pp.598 Algorithm 14.20

	Note that the description in HAC is horribly incomplete.
	For example, it doesn't consider the case where digits are removed from 'x' in
	the inner loop.

	It also doesn't consider the case that y has fewer than three digits, etc.
	The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.

	`quotient` and `remainder` are both optional (`nil` skips that output).
	Works on private copies `x` (numerator) and `y` (denominator), so the inputs are never modified
	and the outputs are only written, by swap, once the computation has succeeded.
*/
_private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	error_if_immutable(quotient, remainder) or_return

	q, x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(q, x, y, t1, t2)

	internal_grow(q, numerator.used + 2) or_return
	q.used = numerator.used + 2

	internal_init_multi(t1, t2) or_return
	internal_copy(x, numerator) or_return
	internal_copy(y, denominator) or_return

	/*
		Fix the sign: work on magnitudes, remember whether the quotient is negative.
	*/
	neg := numerator.sign != denominator.sign
	x.sign = .Zero_or_Positive
	y.sign = .Zero_or_Positive

	/*
		Normalize both x and y, ensure that y >= b/2, [b == 2**MP_DIGIT_BIT]
		`norm` ends up holding the shift applied, so the remainder can be de-normalized later.
	*/
	norm := internal_count_bits(y) % _DIGIT_BITS

	if norm < _DIGIT_BITS - 1 {
		norm = (_DIGIT_BITS - 1) - norm
		internal_shl(x, x, norm) or_return
		internal_shl(y, y, norm) or_return
	} else {
		norm = 0
	}

	/*
		Note: HAC does 0 based, so if used==5 then it's 0,1,2,3,4, i.e. use 4
	*/
	n := x.used - 1
	t := y.used - 1

	/*
		while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} }
		y = y*b**{n-t}
	*/

	_private_int_shl_leg(y, n - t) or_return

	gte := internal_gte(x, y)
	for gte {
		q.digit[n - t] += 1
		internal_sub(x, x, y) or_return
		gte = internal_gte(x, y)
	}

	/*
		Reset y by shifting it back down.
	*/
	_private_int_shr_leg(y, n - t)

	/*
		Step 3. for i from n down to (t + 1).
	*/
	#no_bounds_check for i := n; i >= (t + 1); i -= 1 {
		if i > x.used { continue }

		/*
			Index of the quotient digit produced by this iteration: q{i-t-1}.
		*/
		qi := (i - t) - 1

		/*
			step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt

			`_MASK` is b-1. The estimate must never be below the true digit, because the
			refinement loop below only ever decrements it.
		*/
		if x.digit[i] == y.digit[t] {
			q.digit[qi] = _MASK
		} else {
			tmp := _WORD(x.digit[i]) << _DIGIT_BITS
			tmp |= _WORD(x.digit[i - 1])
			tmp /= _WORD(y.digit[t])
			if tmp > _WORD(_MASK) {
				tmp = _WORD(_MASK)
			}
			q.digit[qi] = DIGIT(tmp & _WORD(_MASK))
		}

		/* while (q{i-t-1} * (yt * b + y{t-1})) >
					xi * b**2 + xi-1 * b + xi-2

			do q{i-t-1} -= 1;
		*/

		iter := 0

		/*
			Pre-increment so the unconditional decrement at the top of the loop
			starts from the estimate itself.
		*/
		q.digit[qi] = (q.digit[qi] + 1) & _MASK
		#no_bounds_check for {
			q.digit[qi] = (q.digit[qi] - 1) & _MASK

			/*
				Find left hand.
			*/
			internal_zero(t1)
			t1.digit[0] = ((t - 1) < 0) ? 0 : y.digit[t - 1]
			t1.digit[1] = y.digit[t]
			t1.used = 2
			internal_mul(t1, t1, q.digit[qi]) or_return

			/*
				Find right hand.
			*/
			t2.digit[0] = ((i - 2) < 0) ? 0 : x.digit[i - 2]
			t2.digit[1] = x.digit[i - 1] /* i >= 1 always holds */
			t2.digit[2] = x.digit[i]
			t2.used = 3

			if internal_lte(t1, t2) {
				break
			}
			iter += 1; if iter > 100 {
				return .Max_Iterations_Reached
			}
		}

		/*
			Step 3.3 x = x - q{i-t-1} * y * b**{i-t-1}
		*/
		int_mul_digit(t1, y, q.digit[qi]) or_return
		_private_int_shl_leg(t1, qi) or_return
		internal_sub(x, x, t1) or_return

		/*
			if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; }
		*/
		if x.sign == .Negative {
			internal_copy(t1, y) or_return
			_private_int_shl_leg(t1, qi) or_return
			internal_add(x, x, t1) or_return

			q.digit[qi] = (q.digit[qi] - 1) & _MASK
		}
	}

	/*
		Now q is the quotient and x is the remainder, [which we have to normalize]
		Get sign before writing to c.
	*/
	z, _ := is_zero(x)
	x.sign = .Zero_or_Positive if z else numerator.sign

	if quotient != nil {
		internal_clamp(q)
		internal_swap(q, quotient)
		quotient.sign = .Negative if neg else .Zero_or_Positive
	}

	if remainder != nil {
		/*
			Undo the normalization shift before handing the remainder back.
		*/
		internal_shr(x, x, norm) or_return
		internal_swap(x, remainder)
	}

	return nil
}
|
|
|
+
|
|
|
/*
	Direct implementation of algorithms 1.8 "RecursiveDivRem" and 1.9 "UnbalancedDivision" from:

		Brent, Richard P., and Paul Zimmermann. "Modern computer arithmetic"
		Vol. 18. Cambridge University Press, 2010
		Available online at https://arxiv.org/pdf/1004.4710

	pages 19ff. in the above online document.

	This procedure is the recursive step: it splits the divisor `b` at `k` digits into `B1` (high) and `B0` (low),
	divides twice by `B1`, and corrects each partial quotient for the part of the divisor that was ignored.
	When the size difference `m` drops below `MUL_KARATSUBA_CUTOFF` it defers to `_private_int_div_school`.

	Writes to both `quotient` and `remainder` unconditionally on the recursive path.
*/
_private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	A1, A2, B1, B0, Q1, Q0, R1, R0, t := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(A1, A2, B1, B0, Q1, Q0, R1, R0, t)

	m := a.used - b.used
	k := m / 2

	if m < MUL_KARATSUBA_CUTOFF {
		return _private_int_div_school(quotient, remainder, a, b)
	}

	internal_init_multi(A1, A2, B1, B0, Q1, Q0, R1, R0, t) or_return

	/*
		`B1` = `b` / `beta`^`k`, `B0` = `b` % `beta`^`k`
	*/
	internal_shrmod(B1, B0, b, k * _DIGIT_BITS) or_return

	/*
		(Q1, R1) = RecursiveDivRem(A / beta^(2k), B1)
		`t` keeps A % beta^(2k) for the next step.
	*/
	internal_shrmod(A1, t, a, 2 * k * _DIGIT_BITS) or_return
	_private_div_recursion(Q1, R1, A1, B1) or_return

	/*
		A1 = (R1 * beta^(2k)) + (A % beta^(2k)) - (Q1 * B0 * beta^k)

		The product has to be shifted up by `k` digits and actually subtracted;
		without that the correction loop below can never trigger.
	*/
	_private_int_shl_leg(R1, 2 * k) or_return
	internal_add(A1, R1, t) or_return
	internal_mul(t, Q1, B0) or_return
	_private_int_shl_leg(t, k) or_return
	internal_sub(A1, A1, t) or_return

	/*
		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
	*/
	if internal_lt(A1, 0) {
		internal_shl(t, b, k * _DIGIT_BITS) or_return

		for {
			internal_decr(Q1) or_return
			internal_add(A1, A1, t) or_return
			if internal_gte(A1, 0) { break }
		}
	}

	/*
		(Q0, R0) = RecursiveDivRem(A1 / beta^(k), B1)
		`t` keeps A1 % beta^k for the next step.
	*/
	internal_shrmod(A1, t, A1, k * _DIGIT_BITS) or_return
	_private_div_recursion(Q0, R0, A1, B1) or_return

	/*
		A2 = (R0*beta^k) + (A1 % beta^k) - (Q0*B0)
	*/
	_private_int_shl_leg(R0, k) or_return
	internal_add(A2, R0, t) or_return
	internal_mul(t, Q0, B0) or_return
	internal_sub(A2, A2, t) or_return

	/*
		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
	*/
	for internal_is_negative(A2) { // internal_lt(A2, 0) {
		internal_decr(Q0) or_return
		internal_add(A2, A2, b) or_return
	}

	/*
		Return q = (Q1*beta^k) + Q0, r = A2.
	*/
	_private_int_shl_leg(Q1, k) or_return
	internal_add(quotient, Q1, Q0) or_return

	return internal_copy(remainder, A2)
}
|
|
|
+
|
|
|
/*
	Driver for the recursive division `_private_div_recursion`.

	Normalizes the operands so the divisor's top digit has its most significant bit set,
	feeds the dividend to the recursive step in chunks when it is much longer than the divisor,
	then fixes the signs and de-normalizes the remainder.

	`quotient` and `remainder` are both optional (`nil` skips that output).
	Expects `b` to have at least one used digit, since its top digit is read directly.
*/
_private_int_div_recursive :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	A, B, Q, Q1, R, A_div, A_mod := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(A, B, Q, Q1, R, A_div, A_mod)

	internal_init_multi(A, B, Q, Q1, R, A_div, A_mod) or_return

	/*
		`msb` is the value of the most significant bit of a digit: (_DIGIT_MAX + 1) / 2.
		`sigma` counts how many bits `b`'s top digit has to be shifted left to reach it.
	*/
	msb := (_DIGIT_MAX + DIGIT(1)) >> 1
	sigma := 0
	msb_b := b.digit[b.used - 1]
	for msb_b < msb {
		sigma += 1
		msb_b <<= 1
	}

	/*
		Use that sigma to normalize B, and scale A by the same amount.
	*/
	internal_shl(B, b, sigma) or_return
	internal_shl(A, a, sigma) or_return

	/*
		Fix the sign: work on magnitudes, remember whether the quotient is negative.
	*/
	neg := a.sign != b.sign
	A.sign = .Zero_or_Positive; B.sign = .Zero_or_Positive

	/*
		If the magnitude of "A" is not more than twice that of "B" we can work
		on them directly, otherwise we need to work at "A" in chunks.
	*/
	n := B.used
	m := A.used - B.used

	/*
		Q = 0. We already ensured that when we called `internal_init_multi`.
	*/
	for m > n {
		/*
			(q, r) = RecursiveDivRem(A / (beta^(m-n)), B)
		*/
		j := (m - n) * _DIGIT_BITS
		internal_shrmod(A_div, A_mod, A, j) or_return
		_private_div_recursion(Q1, R, A_div, B) or_return

		/*
			Q = (Q * beta^n) + q
		*/
		internal_shl(Q, Q, n * _DIGIT_BITS) or_return
		internal_add(Q, Q, Q1) or_return

		/*
			A = (r * beta^(m-n)) + (A % beta^(m-n))
		*/
		internal_shl(R, R, (m - n) * _DIGIT_BITS) or_return
		internal_add(A, R, A_mod) or_return

		/*
			m = m - n
		*/
		m -= n
	}

	/*
		(q, r) = RecursiveDivRem(A, B)
	*/
	_private_div_recursion(Q1, R, A, B) or_return

	/*
		Q = (Q * beta^m) + q, R = r
	*/
	internal_shl(Q, Q, m * _DIGIT_BITS) or_return
	internal_add(Q, Q, Q1) or_return

	/*
		Get sign before writing to dest.
		The remainder takes the dividend's sign unless the remainder itself is zero,
		matching `_private_int_div_school`.
	*/
	R.sign = .Zero_or_Positive if internal_is_zero(R) else a.sign

	if quotient != nil {
		swap(quotient, Q)
		quotient.sign = .Negative if neg else .Zero_or_Positive
	}
	if remainder != nil {
		/*
			De-normalize the remainder.
		*/
		internal_shrmod(R, nil, R, sigma) or_return
		swap(remainder, R)
	}
	return nil
}
|
|
|
+
|
|
|
/*
	Slower bit-bang division... also smaller.

	Shift-and-subtract long division: the denominator and a single-bit quotient marker are shifted
	up by the difference in bit counts, then walked back down one bit at a time.

	`quotient` and `remainder` are both optional (`nil` skips that output).
*/
@(deprecated="Use `_int_div_school`, it's 3.5x faster.")
_private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) {

	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}

	defer internal_destroy(ta, tb, tq, q)

	/*
		Capture the signs before anything is written: `quotient` or `remainder`
		may be the same `Int` as one of the inputs, and the swaps below would change them.
	*/
	num_sign := numerator.sign
	neg      := numerator.sign != denominator.sign

	internal_one(tq) or_return

	num_bits, _ := count_bits(numerator)
	den_bits, _ := count_bits(denominator)
	n := num_bits - den_bits

	abs(ta, numerator) or_return
	abs(tb, denominator) or_return
	shl(tb, tb, n) or_return
	shl(tq, tq, n) or_return

	for n >= 0 {
		if internal_gte(ta, tb) {
			// ta -= tb
			sub(ta, ta, tb) or_return
			//  q += tq
			add( q, q, tq) or_return
		}
		shr1(tb, tb) or_return
		shr1(tq, tq) or_return

		n -= 1
	}

	/*
		Now q == quotient and ta == remainder.
		Either result is forced non-negative when it is zero.
	*/
	if quotient != nil {
		swap(quotient, q)
		z, _ := is_zero(quotient)
		quotient.sign = .Negative if neg && !z else .Zero_or_Positive
	}
	if remainder != nil {
		swap(remainder, ta)
		/*
			Test the remainder itself, not the numerator: a non-zero numerator
			can still leave a zero remainder.
		*/
		z, _ := is_zero(remainder)
		remainder.sign = .Zero_or_Positive if z else num_sign
	}

	return nil
}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
/*
	Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html

	Accumulates the product in `outer` by multiplying together ranges of odd numbers
	(each range bound is forced odd with `| 1`), then shifts the result left by
	`n - count_ones(n)` bits and stores it in `res`.
*/
_private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	inner, outer, temp := &Int{}, &Int{}, &Int{}
	defer internal_destroy(inner, outer, temp)

	internal_one(inner, false) or_return
	internal_one(outer, false) or_return

	bits_used := ilog2(n)

	for i := bits_used; i >= 0; i -= 1 {
		/*
			Odd bounds of the range contributed at this bit position.
		*/
		start := (n >> (uint(i) + 1)) + 1 | 1
		stop  := (n >> uint(i)) + 1 | 1
		_private_int_recursive_product(temp, start, stop, 0) or_return

		/*
			`inner` is the running product of all ranges so far; `outer` multiplies in every prefix.
		*/
		internal_mul(inner, inner, temp) or_return
		internal_mul(outer, outer, inner) or_return
	}
	shift := n - intrinsics.count_ones(n)

	return internal_shl(res, outer, int(shift))
}
|
|
|
+
|
|
|
/*
	Recursive product used by the binary split factorial algorithm.

	`num_factors` is half the distance between `start` and `stop`. Larger ranges are halved
	and multiplied recursively; `level` tracks the recursion depth and is capped by
	`FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS`.
*/
_private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0), allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	lhs, rhs := &Int{}, &Int{}
	defer internal_destroy(lhs, rhs)

	if level > FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS {
		return .Max_Iterations_Reached
	}

	num_factors := (stop - start) >> 1

	switch {
	case num_factors == 2:
		/*
			Two factors: `start` and `start + 2`.
		*/
		internal_set(lhs, start, false) or_return
		internal_grow(rhs, lhs.used + 1, false) or_return
		internal_add(rhs, lhs, 2) or_return
		return internal_mul(res, lhs, rhs)

	case num_factors > 1:
		/*
			Split at an odd midpoint and multiply the two halves.
		*/
		mid := (start + num_factors) | 1
		_private_int_recursive_product(lhs, start, mid, level + 1) or_return
		_private_int_recursive_product(rhs, mid,  stop, level + 1) or_return
		return internal_mul(res, lhs, rhs)

	case num_factors == 1:
		/*
			A single factor: the result is `start` itself.
		*/
		return #force_inline internal_set(res, start, true)
	}

	/*
		Empty range: the result is one.
	*/
	return #force_inline internal_one(res, true)
}
|
|
|
+
|
|
|
/*
	Internal function computing both GCD using the binary method,
	and, if `res_lcm` isn't `nil`, also LCM.

	Expects the `a` and `b` to have been initialized
	and one or both of `res_gcd` or `res_lcm` not to be `nil`.

	If both `a` and `b` are zero, return zero.
	If either `a` or `b` is zero, GCD is the absolute value of the other one and LCM is zero.

	The `gcd` and `lcm` wrappers have already done this test,
	but `gcd_lcm` wouldn't have, so we still need to perform it.

	If neither result is wanted, we have nothing to do.

	The LCM is built from a private quotient temporary, so `res_lcm` may be the same `Int` as `a` or `b`.
*/
_private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if res_gcd == nil && res_lcm == nil {
		return nil
	}

	if a.used == 0 && b.used == 0 {
		/*
			GCD(0, 0) and LCM(0, 0) are both 0.
		*/
		if res_gcd != nil {
			internal_zero(res_gcd) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	} else if a.used == 0 {
		/*
			We can early out with GCD = B and LCM = 0
		*/
		if res_gcd != nil {
			internal_abs(res_gcd, b) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	} else if b.used == 0 {
		/*
			We can early out with GCD = A and LCM = 0
		*/
		if res_gcd != nil {
			internal_abs(res_gcd, a) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	}

	/*
		We need a temporary for the GCD because `res_gcd` is allowed to be `nil`,
		and one for the LCM quotient so `a` and `b` stay intact until they've been used.
	*/
	temp_gcd_res, lcm_quotient := &Int{}, &Int{}
	defer internal_destroy(temp_gcd_res, lcm_quotient)

	/*
		If neither `a` or `b` was zero, we need to compute `gcd`.
		Get copies of `a` and `b` we can modify.
	*/
	u, v := &Int{}, &Int{}
	defer internal_destroy(u, v)
	internal_copy(u, a) or_return
	internal_copy(v, b) or_return

	/*
		Must be positive for the remainder of the algorithm.
	*/
	u.sign = .Zero_or_Positive; v.sign = .Zero_or_Positive

	/*
		B1. Find the common power of two for `u` and `v`.
	*/
	u_lsb, _ := internal_count_lsb(u)
	v_lsb, _ := internal_count_lsb(v)
	k        := min(u_lsb, v_lsb)

	if k > 0 {
		/*
			Divide the power of two out.
		*/
		internal_shr(u, u, k) or_return
		internal_shr(v, v, k) or_return
	}

	/*
		Divide any remaining factors of two out.
	*/
	if u_lsb != k {
		internal_shr(u, u, u_lsb - k) or_return
	}
	if v_lsb != k {
		internal_shr(v, v, v_lsb - k) or_return
	}

	for v.used != 0 {
		/*
			Make sure `v` is the largest.
		*/
		if internal_gt(u, v) {
			/*
				Swap `u` and `v` to make sure `v` is >= `u`.
			*/
			internal_swap(u, v)
		}

		/*
			Subtract smallest from largest.
		*/
		internal_sub(v, v, u) or_return

		/*
			Divide out all factors of two.
		*/
		lsb, _ := internal_count_lsb(v)
		internal_shr(v, v, lsb) or_return
	}

	/*
		Multiply by 2**k which we divided out at the beginning.
	*/
	internal_shl(temp_gcd_res, u, k) or_return
	temp_gcd_res.sign = .Zero_or_Positive

	/*
		We've computed `gcd` the long way.
		If we don't want `lcm`, we're done.
	*/
	if res_lcm == nil {
		internal_swap(temp_gcd_res, res_gcd)
		return nil
	}

	/*
		Computes least common multiple as `|a*b|/gcd(a,b)`
		Divide the smallest by the GCD.
	*/
	if internal_lt_abs(a, b) {
		/*
			Store quotient in `lcm_quotient` such that `lcm_quotient * b` is the LCM.
		*/
		internal_div(lcm_quotient, a, temp_gcd_res) or_return
		err = internal_mul(res_lcm, lcm_quotient, b)
	} else {
		/*
			Store quotient in `lcm_quotient` such that `lcm_quotient * a` is the LCM.
		*/
		internal_div(lcm_quotient, b, temp_gcd_res) or_return
		err = internal_mul(res_lcm, lcm_quotient, a)
	}

	if res_gcd != nil {
		internal_swap(temp_gcd_res, res_gcd)
	}

	/*
		Fix the sign to positive and return.
	*/
	res_lcm.sign = .Zero_or_Positive
	return err
}
|
|
|
+
|
|
|
/*
	Internal implementation of log.
	Assumes `a` not to be `nil` and to have been initialized.

	Returns 0 when `a` compares below `base` and 1 when it equals `base`.
	Otherwise brackets `a` between two powers of `base` by repeated squaring,
	then bisects the exponent range. On error `res` is set to -1.
*/
_private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -> (res: int, err: Error) {
	/*
		Route every allocation, including the ones made by callees that aren't handed `allocator`
		explicitly and the deferred destroy, through the same allocator.
	*/
	context.allocator = allocator

	bracket_low, bracket_high, bracket_mid, t, bi_base := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(bracket_low, bracket_high, bracket_mid, t, bi_base)

	ic := #force_inline internal_cmp(a, base)
	if ic == -1 || ic == 0 {
		return 1 if ic == 0 else 0, nil
	}
	defer if err != nil {
		res = -1
	}

	internal_set(bi_base, base, true, allocator)       or_return
	internal_clear(bracket_mid, false, allocator)      or_return
	internal_clear(t, false, allocator)                or_return
	internal_one(bracket_low, false, allocator)        or_return
	internal_set(bracket_high, base, false, allocator) or_return

	/*
		Invariant: bracket_low == base**low and bracket_high == base**high.
	*/
	low := 0; high := 1

	/*
		A kind of Giant-step/baby-step algorithm.
		Idea shamelessly stolen from https://programmingpraxis.com/2010/05/07/integer-logarithms/2/
		The effect is asymptotic, hence needs benchmarks to test if the Giant-step should be skipped
		for small n.
	*/

	for {
		/*
			Iterate until `a` is bracketed between low + high.
			Each round doubles the exponent by squaring the upper bracket.
		*/
		if #force_inline internal_gte(bracket_high, a) { break }

		low = high
		#force_inline internal_copy(bracket_low, bracket_high) or_return
		high <<= 1
		#force_inline internal_sqr(bracket_high, bracket_high) or_return
	}

	for (high - low) > 1 {
		mid := (high + low) >> 1

		/*
			bracket_mid = base**mid, built from the lower bracket.
		*/
		#force_inline internal_pow(t, bi_base, mid - low) or_return

		#force_inline internal_mul(bracket_mid, bracket_low, t) or_return

		mc := #force_inline internal_cmp(a, bracket_mid)
		switch mc {
		case -1:
			high = mid
			internal_swap(bracket_mid, bracket_high)
		case  0:
			return mid, nil
		case  1:
			low = mid
			internal_swap(bracket_mid, bracket_low)
		}
	}

	fc := #force_inline internal_cmp(bracket_high, a)
	res = high if fc == 0 else low

	return
}
|
|
|
+
|
|
|
/*
	Computes xR**-1 == x (mod N) via Montgomery Reduction.
	This is an optimized implementation of `internal_montgomery_reduce`
	which uses the comba method to quickly calculate the columns of the reduction.
	Based on Algorithm 14.32 on pp.601 of HAC.

	Works in a fixed array of `_WARRAY` double-precision columns and returns
	`.Invalid_Argument` when `x` has more used digits than that.
*/
_private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator
	columns: [_WARRAY]_WORD = ---

	if x.used > _WARRAY { return .Invalid_Argument }

	/*
		Remember how many digits `x` used on entry, for the clean-up at the end.
	*/
	prev_used := x.used

	/*
		Grow `x` as required.
	*/
	internal_grow(x, n.used + 1) or_return

	/*
		Load the digits of `x` into columns[0 ..< x.used] as double precision words.
	*/
	col: int
	for col = 0; col < x.used; col += 1 {
		columns[col] = _WORD(x.digit[col])
	}

	/*
		`col` now equals `x.used`; zero the remaining columns up to, but excluding, 2 * n.used + 1.
	*/
	clear_end := (n.used * 2) + 1
	for ; col < clear_end; col += 1 {
		columns[col] = {}
	}

	/*
		Now we proceed to zero successive digits from the least significant upwards.
	*/
	for col = 0; col < n.used; col += 1 {
		/*
			`mu = ai * m' mod b`

			Only the low digit of the column takes part, which avoids a double precision
			multiplication. This relies on the previous column having had its carry
			moved up (see after the inner loop).
		*/
		mu := ((columns[col] & _WORD(_MASK)) * _WORD(rho)) & _WORD(_MASK)

		/*
			`a = a + mu * m * b**i`

			Done in place; the multiplication by b**i is the column offset `col`.
			Carries aren't handled inside the inner loop. One carry is fixed up right
			after it, in order, so that the next `mu` sees a corrected low digit.
		*/
		for row := 0; row < n.used; row += 1 {
			columns[col + row] += mu * _WORD(n.digit[row])
		}

		/*
			Now fix carry for next digit, columns[col + 1].
		*/
		columns[col + 1] += (columns[col] >> _DIGIT_BITS)
	}

	/*
		Propagate the carries through the upper columns.
		`col` continues from `n.used`, where the previous loop left it.
	*/
	for ; col < n.used * 2; col += 1 {
		columns[col + 1] += (columns[col] >> _DIGIT_BITS)
	}

	/*
		Copy out, A = A/b**n.

		Rather than converting every column and shifting down afterwards,
		the upper columns are copied straight into the low digits of `x`.
	*/
	for col = 0; col < (n.used + 1); col += 1 {
		x.digit[col] = DIGIT(columns[n.used + col] & _WORD(_MASK))
	}

	/*
		Set the max used.
	*/
	x.used = n.used + 1

	/*
		If the input used more digits than that, clear the leftovers, then trim.
	*/
	internal_zero_unused(x, prev_used)
	internal_clamp(x)

	/*
		if A >= m then A = A - m
	*/
	if !internal_gte_abs(x, n) {
		return nil
	}
	return internal_sub(x, x, n)
}
|
|
|
+
|
|
|
/*
	Computes xR**-1 == x (mod N) via Montgomery Reduction.
	Assumes `x` and `n` not to be nil.

	Dispatches to `_private_montgomery_reduce_comba` when the operands fit its column array,
	and otherwise reduces `x` in place one digit at a time.
*/
_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	internal_clear_if_uninitialized(x, n) or_return

	/*
		Can the fast reduction [comba] method be used?
		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
		since carries are fixed up in the inner loop.
	*/
	needed := (n.used * 2) + 1
	if needed < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
		return _private_montgomery_reduce_comba(x, n, rho)
	}

	/*
		Grow the input as required.
	*/
	internal_grow(x, needed) or_return
	x.used = needed

	for i := 0; i < n.used; i += 1 {
		/*
			`mu = ai * rho mod b`
			The value of rho must be precalculated via `int_montgomery_setup()`,
			such that it equals -1/n0 mod b. This allows the following inner loop
			to reduce the input one digit at a time.
		*/
		mu := DIGIT((_WORD(x.digit[i]) * _WORD(rho)) & _WORD(_MASK))

		/*
			a = a + mu * m * b**i
			Multiply and add in place, carrying between digits.
		*/
		carry := DIGIT(0)
		j     := int(0)
		for ; j < n.used; j += 1 {
			/*
				Product, incoming carry and current digit, in double precision.
			*/
			acc := (_WORD(mu) * _WORD(n.digit[j]) + _WORD(carry) + _WORD(x.digit[i + j]))

			/*
				High part becomes the next carry, low part the new digit.
			*/
			carry          = DIGIT(acc >> _DIGIT_BITS)
			x.digit[i + j] = DIGIT(acc & _WORD(_MASK))
		}

		/*
			At this point the i'th digit of x should be zero.
			Propagate carries upwards as required.
		*/
		for carry != 0 {
			x.digit[i + j] += carry
			carry = x.digit[i + j] >> _DIGIT_BITS
			x.digit[i + j] &= _MASK
			j += 1
		}
	}

	/*
		At this point the n.used'th least significant digits of x are all zero,
		which means we can shift x to the right by n.used digits and the
		residue is unchanged.

		x = x/b**n.used.
	*/
	internal_clamp(x)
	_private_int_shr_leg(x, n.used)

	/*
		if x >= n then x = x - n
	*/
	if !internal_gte_abs(x, n) {
		return nil
	}
	return internal_sub(x, x, n)
}
|
|
|
+
|
|
|
+/*
|
|
|
+ Shifts with subtractions when the result is greater than b.
|
|
|
+
|
|
|
+ The method is slightly modified to shift B unconditionally upto just under
|
|
|
+ the leading bit of b. This saves alot of multiple precision shifting.
|
|
|
+
|
|
|
+ Assumes `a` and `b` not to be `nil`.
|
|
|
+*/
|
|
|
+_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+ /*
|
|
|
+ How many bits of last digit does b use.
|
|
|
+ */
|
|
|
+ internal_clear_if_uninitialized(a, b) or_return
|
|
|
+
|
|
|
+ bits := internal_count_bits(b) % _DIGIT_BITS
|
|
|
+
|
|
|
+ if b.used > 1 {
|
|
|
+ power := ((b.used - 1) * _DIGIT_BITS) + bits - 1
|
|
|
+ internal_int_power_of_two(a, power) or_return
|
|
|
+ } else {
|
|
|
+ internal_one(a) or_return
|
|
|
+ bits = 1
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now compute C = A * B mod b.
|
|
|
+ */
|
|
|
+ for x := bits - 1; x < _DIGIT_BITS; x += 1 {
|
|
|
+ internal_int_shl1(a, a) or_return
|
|
|
+ if internal_gte_abs(a, b) {
|
|
|
+ internal_sub(a, a, b) or_return
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Sets up the Montgomery reduction stuff.
|
|
|
+*/
|
|
|
+_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
|
|
|
+ /*
|
|
|
+ Fast inversion mod 2**k
|
|
|
+ Based on the fact that:
|
|
|
+
|
|
|
+ XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
|
|
|
+ => 2*X*A - X*X*A*A = 1
|
|
|
+ => 2*(1) - (1) = 1
|
|
|
+ */
|
|
|
+ internal_clear_if_uninitialized(n, allocator) or_return
|
|
|
+
|
|
|
+ b := n.digit[0]
|
|
|
+ if b & 1 == 0 { return 0, .Invalid_Argument }
|
|
|
+
|
|
|
+ x := (((b + 2) & 4) << 1) + b /* here x*a==1 mod 2**4 */
|
|
|
+ x *= 2 - (b * x) /* here x*a==1 mod 2**8 */
|
|
|
+ x *= 2 - (b * x) /* here x*a==1 mod 2**16 */
|
|
|
+
|
|
|
+ when _DIGIT_TYPE_BITS == 64 {
|
|
|
+ x *= 2 - (b * x) /* here x*a==1 mod 2**32 */
|
|
|
+ x *= 2 - (b * x) /* here x*a==1 mod 2**64 */
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ rho = -1/m mod b
|
|
|
+ */
|
|
|
+ rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK))
|
|
|
+ return rho, nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
|
|
|
+ From HAC pp.604 Algorithm 14.42
|
|
|
+
|
|
|
+ Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
|
|
|
+*/
|
|
|
+_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ q := &Int{}
|
|
|
+ defer internal_destroy(q)
|
|
|
+ um := m.used
|
|
|
+
|
|
|
+ /*
|
|
|
+ q = x
|
|
|
+ */
|
|
|
+ internal_copy(q, x) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ q1 = x / b**(k-1)
|
|
|
+ */
|
|
|
+ _private_int_shr_leg(q, um - 1)
|
|
|
+
|
|
|
+ /*
|
|
|
+ According to HAC this optimization is ok.
|
|
|
+ */
|
|
|
+ if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
|
|
|
+ internal_mul(q, q, mu) or_return
|
|
|
+ } else {
|
|
|
+ _private_int_mul_high(q, q, mu, um) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ q3 = q2 / b**(k+1)
|
|
|
+ */
|
|
|
+ _private_int_shr_leg(q, um + 1)
|
|
|
+
|
|
|
+ /*
|
|
|
+ x = x mod b**(k+1), quick (no division)
|
|
|
+ */
|
|
|
+ internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1)) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ q = q * m mod b**(k+1), quick (no division)
|
|
|
+ */
|
|
|
+ _private_int_mul(q, q, m, um + 1) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ x = x - q
|
|
|
+ */
|
|
|
+ internal_sub(x, x, q) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ If x < 0, add b**(k+1) to it.
|
|
|
+ */
|
|
|
+ if internal_is_negative(x) {
|
|
|
+ internal_set(q, 1) or_return
|
|
|
+ _private_int_shl_leg(q, um + 1) or_return
|
|
|
+ internal_add(x, x, q) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Back off if it's too big.
|
|
|
+ */
|
|
|
+ for internal_gte(x, m) {
|
|
|
+ internal_sub(x, x, m) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
|
|
|
+*/
|
|
|
+_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ q := &Int{}
|
|
|
+ defer internal_destroy(q)
|
|
|
+
|
|
|
+ internal_zero(q) or_return
|
|
|
+
|
|
|
+ p := internal_count_bits(n)
|
|
|
+
|
|
|
+ for {
|
|
|
+ /*
|
|
|
+ q = a/2**p, a = a mod 2**p
|
|
|
+ */
|
|
|
+ internal_shrmod(q, a, a, p) or_return
|
|
|
+
|
|
|
+ if d != 1 {
|
|
|
+ /*
|
|
|
+ q = q * d
|
|
|
+ */
|
|
|
+ internal_mul(q, q, d) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ a = a + q
|
|
|
+ */
|
|
|
+ internal_add(a, a, q) or_return
|
|
|
+ if internal_lt_abs(a, n) { break }
|
|
|
+ internal_sub(a, a, n) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Reduces `a` modulo `n` where `n` is of the form 2**p - d
|
|
|
+ This differs from reduce_2k since "d" can be larger than a single digit.
|
|
|
+*/
|
|
|
+_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ q := &Int{}
|
|
|
+ defer internal_destroy(q)
|
|
|
+
|
|
|
+ internal_zero(q) or_return
|
|
|
+
|
|
|
+ p := internal_count_bits(n)
|
|
|
+
|
|
|
+ for {
|
|
|
+ /*
|
|
|
+ q = a/2**p, a = a mod 2**p
|
|
|
+ */
|
|
|
+ internal_shrmod(q, a, a, p) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ q = q * d
|
|
|
+ */
|
|
|
+ internal_mul(q, q, d) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ a = a + q
|
|
|
+ */
|
|
|
+ internal_add(a, a, q) or_return
|
|
|
+ if internal_lt_abs(a, n) { break }
|
|
|
+ internal_sub(a, a, n) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Determines if `internal_int_reduce_2k` can be used.
|
|
|
+ Asssumes `a` not to be `nil` and to have been initialized.
|
|
|
+*/
|
|
|
+_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
|
|
|
+ assert_if_nil(a)
|
|
|
+
|
|
|
+ if internal_is_zero(a) {
|
|
|
+ return false, nil
|
|
|
+ } else if a.used == 1 {
|
|
|
+ return true, nil
|
|
|
+ } else if a.used > 1 {
|
|
|
+ iy := internal_count_bits(a)
|
|
|
+ iw := 1
|
|
|
+ iz := DIGIT(1)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Test every bit from the second digit up, must be 1.
|
|
|
+ */
|
|
|
+ for ix := _DIGIT_BITS; ix < iy; ix += 1 {
|
|
|
+ if a.digit[iw] & iz == 0 {
|
|
|
+ return false, nil
|
|
|
+ }
|
|
|
+
|
|
|
+ iz <<= 1
|
|
|
+ if iz > _DIGIT_MAX {
|
|
|
+ iw += 1
|
|
|
+ iz = 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return true, nil
|
|
|
+ } else {
|
|
|
+ return true, nil
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Determines if `internal_int_reduce_2k_l` can be used.
|
|
|
+ Asssumes `a` not to be `nil` and to have been initialized.
|
|
|
+*/
|
|
|
+_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
|
|
|
+ assert_if_nil(a)
|
|
|
+
|
|
|
+ if internal_int_is_zero(a) {
|
|
|
+ return false, nil
|
|
|
+ } else if a.used == 1 {
|
|
|
+ return true, nil
|
|
|
+ } else if a.used > 1 {
|
|
|
+ /*
|
|
|
+ If more than half of the digits are -1 we're sold.
|
|
|
+ */
|
|
|
+ ix := 0
|
|
|
+ iy := 0
|
|
|
+
|
|
|
+ for ; ix < a.used; ix += 1 {
|
|
|
+ if a.digit[ix] == _DIGIT_MAX {
|
|
|
+ iy += 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return iy >= (a.used / 2), nil
|
|
|
+ } else {
|
|
|
+ return false, nil
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Determines the setup value.
|
|
|
+ Assumes `a` is not `nil`.
|
|
|
+*/
|
|
|
+_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ tmp := &Int{}
|
|
|
+ defer internal_destroy(tmp)
|
|
|
+ internal_zero(tmp) or_return
|
|
|
+
|
|
|
+ internal_int_power_of_two(tmp, internal_count_bits(a)) or_return
|
|
|
+ internal_sub(tmp, tmp, a) or_return
|
|
|
+
|
|
|
+ return tmp.digit[0], nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Determines the setup value.
|
|
|
+ Assumes `mu` and `P` are not `nil`.
|
|
|
+
|
|
|
+ d := (1 << a.bits) - a;
|
|
|
+*/
|
|
|
+_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ tmp := &Int{}
|
|
|
+ defer internal_destroy(tmp)
|
|
|
+ internal_zero(tmp) or_return
|
|
|
+
|
|
|
+ internal_int_power_of_two(tmp, internal_count_bits(P)) or_return
|
|
|
+ internal_sub(mu, tmp, P) or_return
|
|
|
+
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Pre-calculate the value required for Barrett reduction.
|
|
|
+ For a given modulus "P" it calulates the value required in "mu"
|
|
|
+ Assumes `mu` and `P` are not `nil`.
|
|
|
+*/
|
|
|
+_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS) or_return
|
|
|
+ return internal_int_div(mu, mu, P)
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Determines the setup value.
|
|
|
+ Assumes `a` to not be `nil` and to have been initialized.
|
|
|
+*/
|
|
|
+_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
|
|
|
+ /*
|
|
|
+ The casts are required if _DIGIT_BITS is one less than
|
|
|
+ the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
|
|
|
+ */
|
|
|
+ return DIGIT((1 << _DIGIT_BITS) - a.digit[0])
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Determines if a number is a valid DR modulus.
|
|
|
+ Assumes `a` to not be `nil` and to have been initialized.
|
|
|
+*/
|
|
|
+_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
|
|
|
+ /*
|
|
|
+ Must be at least two digits.
|
|
|
+ */
|
|
|
+ if a.used < 2 { return false }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
|
|
|
+ */
|
|
|
+ for ix := 1; ix < a.used; ix += 1 {
|
|
|
+ if a.digit[ix] != _MASK {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return true
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
|
|
|
+ Based on algorithm from the paper
|
|
|
+
|
|
|
+ "Generating Efficient Primes for Discrete Log Cryptosystems"
|
|
|
+ Chae Hoon Lim, Pil Joong Lee,
|
|
|
+ POSTECH Information Research Laboratories
|
|
|
+
|
|
|
+ The modulus must be of a special format [see manual].
|
|
|
+ Has been modified to use algorithm 7.10 from the LTM book instead
|
|
|
+
|
|
|
+ Input x must be in the range 0 <= x <= (n-1)**2
|
|
|
+ Assumes `x` and `n` to not be `nil` and to have been initialized.
|
|
|
+*/
|
|
|
+_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
|
|
|
+ /*
|
|
|
+ m = digits in modulus.
|
|
|
+ */
|
|
|
+ m := n.used
|
|
|
+
|
|
|
+ /*
|
|
|
+ Ensure that "x" has at least 2m digits.
|
|
|
+ */
|
|
|
+ internal_grow(x, m + m) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Top of loop, this is where the code resumes if another reduction pass is required.
|
|
|
+ */
|
|
|
+ for {
|
|
|
+ i: int
|
|
|
+ mu := DIGIT(0)
|
|
|
+
|
|
|
+ /*
|
|
|
+ Compute (x mod B**m) + k * [x/B**m] inline and inplace.
|
|
|
+ */
|
|
|
+ for i = 0; i < m; i += 1 {
|
|
|
+ r := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu)
|
|
|
+ x.digit[i] = DIGIT(r & _WORD(_MASK))
|
|
|
+ mu = DIGIT(r >> _WORD(_DIGIT_BITS))
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Set final carry.
|
|
|
+ */
|
|
|
+ x.digit[i] = mu
|
|
|
+
|
|
|
+ /*
|
|
|
+ Zero words above m.
|
|
|
+ */
|
|
|
+ mem.zero_slice(x.digit[m + 1:][:x.used - m])
|
|
|
+
|
|
|
+ /*
|
|
|
+ Clamp, sub and return.
|
|
|
+ */
|
|
|
+ internal_clamp(x) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ If x >= n then subtract and reduce again.
|
|
|
+ Each successive "recursion" makes the input smaller and smaller.
|
|
|
+ */
|
|
|
+ if internal_lt_abs(x, n) { break }
|
|
|
+
|
|
|
+ internal_sub(x, x, n) or_return
|
|
|
+ }
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Computes res == G**X mod P.
|
|
|
+ Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
|
|
|
+*/
|
|
|
+_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ M := [_TAB_SIZE]Int{}
|
|
|
+ winsize: uint
|
|
|
+
|
|
|
+ /*
|
|
|
+ Use a pointer to the reduction algorithm.
|
|
|
+ This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
|
|
|
+ */
|
|
|
+ redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error)
|
|
|
+
|
|
|
+ defer {
|
|
|
+ internal_destroy(&M[1])
|
|
|
+ for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
|
|
|
+ internal_destroy(&M[x])
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Find window size.
|
|
|
+ */
|
|
|
+ x := internal_count_bits(X)
|
|
|
+ switch {
|
|
|
+ case x <= 7:
|
|
|
+ winsize = 2
|
|
|
+ case x <= 36:
|
|
|
+ winsize = 3
|
|
|
+ case x <= 140:
|
|
|
+ winsize = 4
|
|
|
+ case x <= 450:
|
|
|
+ winsize = 5
|
|
|
+ case x <= 1303:
|
|
|
+ winsize = 6
|
|
|
+ case x <= 3529:
|
|
|
+ winsize = 7
|
|
|
+ case:
|
|
|
+ winsize = 8
|
|
|
+ }
|
|
|
+
|
|
|
+ winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
|
|
|
+
|
|
|
+ /*
|
|
|
+ Init M array.
|
|
|
+ Init first cell.
|
|
|
+ */
|
|
|
+ internal_zero(&M[1]) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now init the second half of the array.
|
|
|
+ */
|
|
|
+ for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
|
|
|
+ internal_zero(&M[x]) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Create `mu`, used for Barrett reduction.
|
|
|
+ */
|
|
|
+ mu := &Int{}
|
|
|
+ defer internal_destroy(mu)
|
|
|
+ internal_zero(mu) or_return
|
|
|
+
|
|
|
+ if redmode == 0 {
|
|
|
+ _private_int_reduce_setup(mu, P) or_return
|
|
|
+ redux = _private_int_reduce
|
|
|
+ } else {
|
|
|
+ _private_int_reduce_2k_setup_l(mu, P) or_return
|
|
|
+ redux = _private_int_reduce_2k_l
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Create M table.
|
|
|
+
|
|
|
+ The M table contains powers of the base, e.g. M[x] = G**x mod P.
|
|
|
+ The first half of the table is not computed, though, except for M[0] and M[1].
|
|
|
+ */
|
|
|
+ internal_int_mod(&M[1], G, P) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
|
|
|
+
|
|
|
+ TODO: This can probably be replaced by computing the power and using `pow` to raise to it
|
|
|
+ instead of repeated squaring.
|
|
|
+ */
|
|
|
+ slot := 1 << (winsize - 1)
|
|
|
+ internal_copy(&M[slot], &M[1]) or_return
|
|
|
+
|
|
|
+ for x = 0; x < int(winsize - 1); x += 1 {
|
|
|
+ /*
|
|
|
+ Square it.
|
|
|
+ */
|
|
|
+ internal_sqr(&M[slot], &M[slot]) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Reduce modulo P
|
|
|
+ */
|
|
|
+ redux(&M[slot], P, mu) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
|
|
|
+ for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
|
|
|
+ */
|
|
|
+ for x = slot + 1; x < (1 << winsize); x += 1 {
|
|
|
+ internal_mul(&M[x], &M[x - 1], &M[1]) or_return
|
|
|
+ redux(&M[x], P, mu) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Setup result.
|
|
|
+ */
|
|
|
+ internal_one(res) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Set initial mode and bit cnt.
|
|
|
+ */
|
|
|
+ mode := 0
|
|
|
+ bitcnt := 1
|
|
|
+ buf := DIGIT(0)
|
|
|
+ digidx := X.used - 1
|
|
|
+ bitcpy := uint(0)
|
|
|
+ bitbuf := DIGIT(0)
|
|
|
+
|
|
|
+ for {
|
|
|
+ /*
|
|
|
+ Grab next digit as required.
|
|
|
+ */
|
|
|
+ bitcnt -= 1
|
|
|
+ if bitcnt == 0 {
|
|
|
+ /*
|
|
|
+ If digidx == -1 we are out of digits.
|
|
|
+ */
|
|
|
+ if digidx == -1 { break }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Read next digit and reset the bitcnt.
|
|
|
+ */
|
|
|
+ buf = X.digit[digidx]
|
|
|
+ digidx -= 1
|
|
|
+ bitcnt = _DIGIT_BITS
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Grab the next msb from the exponent.
|
|
|
+ */
|
|
|
+ y := buf >> (_DIGIT_BITS - 1) & 1
|
|
|
+ buf <<= 1
|
|
|
+
|
|
|
+ /*
|
|
|
+ If the bit is zero and mode == 0 then we ignore it.
|
|
|
+ These represent the leading zero bits before the first 1 bit
|
|
|
+ in the exponent. Technically this opt is not required but it
|
|
|
+ does lower the # of trivial squaring/reductions used.
|
|
|
+ */
|
|
|
+ if mode == 0 && y == 0 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ If the bit is zero and mode == 1 then we square.
|
|
|
+ */
|
|
|
+ if mode == 1 && y == 0 {
|
|
|
+ internal_sqr(res, res) or_return
|
|
|
+ redux(res, P, mu) or_return
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Else we add it to the window.
|
|
|
+ */
|
|
|
+ bitcpy += 1
|
|
|
+ bitbuf |= (y << (winsize - bitcpy))
|
|
|
+ mode = 2
|
|
|
+
|
|
|
+ if (bitcpy == winsize) {
|
|
|
+ /*
|
|
|
+ Window is filled so square as required and multiply.
|
|
|
+ Square first.
|
|
|
+ */
|
|
|
+ for x = 0; x < int(winsize); x += 1 {
|
|
|
+ internal_sqr(res, res) or_return
|
|
|
+ redux(res, P, mu) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Then multiply.
|
|
|
+ */
|
|
|
+ internal_mul(res, res, &M[bitbuf]) or_return
|
|
|
+ redux(res, P, mu) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Empty window and reset.
|
|
|
+ */
|
|
|
+ bitcpy = 0
|
|
|
+ bitbuf = 0
|
|
|
+ mode = 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ If bits remain then square/multiply.
|
|
|
+ */
|
|
|
+ if mode == 2 && bitcpy > 0 {
|
|
|
+ /*
|
|
|
+ Square then multiply if the bit is set.
|
|
|
+ */
|
|
|
+ for x = 0; x < int(bitcpy); x += 1 {
|
|
|
+ internal_sqr(res, res) or_return
|
|
|
+ redux(res, P, mu) or_return
|
|
|
+
|
|
|
+ bitbuf <<= 1
|
|
|
+ if ((bitbuf & (1 << winsize)) != 0) {
|
|
|
+ /*
|
|
|
+ Then multiply.
|
|
|
+ */
|
|
|
+ internal_mul(res, res, &M[1]) or_return
|
|
|
+ redux(res, P, mu) or_return
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return err
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
|
|
|
+
|
|
|
+ Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
|
|
|
+ The value of `k` changes based on the size of the exponent.
|
|
|
+
|
|
|
+ Uses Montgomery or Diminished Radix reduction [whichever appropriate]
|
|
|
+
|
|
|
+ Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
|
|
|
+*/
|
|
|
+_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+
|
|
|
+ M := [_TAB_SIZE]Int{}
|
|
|
+ winsize: uint
|
|
|
+
|
|
|
+ /*
|
|
|
+ Use a pointer to the reduction algorithm.
|
|
|
+ This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
|
|
|
+ */
|
|
|
+ redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error)
|
|
|
+
|
|
|
+ defer {
|
|
|
+ internal_destroy(&M[1])
|
|
|
+ for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
|
|
|
+ internal_destroy(&M[x])
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Find window size.
|
|
|
+ */
|
|
|
+ x := internal_count_bits(X)
|
|
|
+ switch {
|
|
|
+ case x <= 7:
|
|
|
+ winsize = 2
|
|
|
+ case x <= 36:
|
|
|
+ winsize = 3
|
|
|
+ case x <= 140:
|
|
|
+ winsize = 4
|
|
|
+ case x <= 450:
|
|
|
+ winsize = 5
|
|
|
+ case x <= 1303:
|
|
|
+ winsize = 6
|
|
|
+ case x <= 3529:
|
|
|
+ winsize = 7
|
|
|
+ case:
|
|
|
+ winsize = 8
|
|
|
+ }
|
|
|
+
|
|
|
+ winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
|
|
|
+
|
|
|
+ /*
|
|
|
+ Init M array
|
|
|
+ Init first cell.
|
|
|
+ */
|
|
|
+ cap := internal_int_allocated_cap(P)
|
|
|
+ internal_grow(&M[1], cap) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now init the second half of the array.
|
|
|
+ */
|
|
|
+ for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
|
|
|
+ internal_grow(&M[x], cap) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Determine and setup reduction code.
|
|
|
+ */
|
|
|
+ rho: DIGIT
|
|
|
+
|
|
|
+ if redmode == 0 {
|
|
|
+ /*
|
|
|
+ Now setup Montgomery.
|
|
|
+ */
|
|
|
+ rho = _private_int_montgomery_setup(P) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Automatically pick the comba one if available (saves quite a few calls/ifs).
|
|
|
+ */
|
|
|
+ if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
|
|
|
+ redux = _private_montgomery_reduce_comba
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ Use slower baseline Montgomery method.
|
|
|
+ */
|
|
|
+ redux = _private_int_montgomery_reduce
|
|
|
+ }
|
|
|
+ } else if redmode == 1 {
|
|
|
+ /*
|
|
|
+ Setup DR reduction for moduli of the form B**k - b.
|
|
|
+ */
|
|
|
+ rho = _private_int_dr_setup(P)
|
|
|
+ redux = _private_int_dr_reduce
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ Setup DR reduction for moduli of the form 2**k - b.
|
|
|
+ */
|
|
|
+ rho = _private_int_reduce_2k_setup(P) or_return
|
|
|
+ redux = _private_int_reduce_2k
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Setup result.
|
|
|
+ */
|
|
|
+ internal_grow(res, cap) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Create M table
|
|
|
+ The first half of the table is not computed, though, except for M[0] and M[1]
|
|
|
+ */
|
|
|
+
|
|
|
+ if redmode == 0 {
|
|
|
+ /*
|
|
|
+ Now we need R mod m.
|
|
|
+ */
|
|
|
+ _private_int_montgomery_calc_normalization(res, P) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now set M[1] to G * R mod m.
|
|
|
+ */
|
|
|
+ internal_mulmod(&M[1], G, res, P) or_return
|
|
|
+ } else {
|
|
|
+ internal_one(res) or_return
|
|
|
+ internal_mod(&M[1], G, P) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
|
|
|
+ */
|
|
|
+ slot := 1 << (winsize - 1)
|
|
|
+ internal_copy(&M[slot], &M[1]) or_return
|
|
|
+
|
|
|
+ for x = 0; x < int(winsize - 1); x += 1 {
|
|
|
+ internal_sqr(&M[slot], &M[slot]) or_return
|
|
|
+ redux(&M[slot], P, rho) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Create upper table.
|
|
|
+ */
|
|
|
+ for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
|
|
|
+ internal_mul(&M[x], &M[x - 1], &M[1]) or_return
|
|
|
+ redux(&M[x], P, rho) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Set initial mode and bit cnt.
|
|
|
+ */
|
|
|
+ mode := 0
|
|
|
+ bitcnt := 1
|
|
|
+ buf := DIGIT(0)
|
|
|
+ digidx := X.used - 1
|
|
|
+ bitcpy := 0
|
|
|
+ bitbuf := DIGIT(0)
|
|
|
+
|
|
|
+ for {
|
|
|
+ /*
|
|
|
+ Grab next digit as required.
|
|
|
+ */
|
|
|
+ bitcnt -= 1
|
|
|
+ if bitcnt == 0 {
|
|
|
+ /*
|
|
|
+ If digidx == -1 we are out of digits so break.
|
|
|
+ */
|
|
|
+ if digidx == -1 { break }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Read next digit and reset the bitcnt.
|
|
|
+ */
|
|
|
+ buf = X.digit[digidx]
|
|
|
+ digidx -= 1
|
|
|
+ bitcnt = _DIGIT_BITS
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Grab the next msb from the exponent.
|
|
|
+ */
|
|
|
+ y := (buf >> (_DIGIT_BITS - 1)) & 1
|
|
|
+ buf <<= 1
|
|
|
+
|
|
|
+ /*
|
|
|
+ If the bit is zero and mode == 0 then we ignore it.
|
|
|
+ These represent the leading zero bits before the first 1 bit in the exponent.
|
|
|
+ Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
|
|
|
+ */
|
|
|
+ if mode == 0 && y == 0 { continue }
|
|
|
+
|
|
|
+ /*
|
|
|
+ If the bit is zero and mode == 1 then we square.
|
|
|
+ */
|
|
|
+ if mode == 1 && y == 0 {
|
|
|
+ internal_sqr(res, res) or_return
|
|
|
+ redux(res, P, rho) or_return
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Else we add it to the window.
|
|
|
+ */
|
|
|
+ bitcpy += 1
|
|
|
+ bitbuf |= (y << (winsize - uint(bitcpy)))
|
|
|
+ mode = 2
|
|
|
+
|
|
|
+ if bitcpy == int(winsize) {
|
|
|
+ /*
|
|
|
+ Window is filled so square as required and multiply
|
|
|
+ Square first.
|
|
|
+ */
|
|
|
+ for x = 0; x < int(winsize); x += 1 {
|
|
|
+ internal_sqr(res, res) or_return
|
|
|
+ redux(res, P, rho) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Then multiply.
|
|
|
+ */
|
|
|
+ internal_mul(res, res, &M[bitbuf]) or_return
|
|
|
+ redux(res, P, rho) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Empty window and reset.
|
|
|
+ */
|
|
|
+ bitcpy = 0
|
|
|
+ bitbuf = 0
|
|
|
+ mode = 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ If bits remain then square/multiply.
|
|
|
+ */
|
|
|
+ if mode == 2 && bitcpy > 0 {
|
|
|
+ /*
|
|
|
+ Square then multiply if the bit is set.
|
|
|
+ */
|
|
|
+ for x = 0; x < bitcpy; x += 1 {
|
|
|
+ internal_sqr(res, res) or_return
|
|
|
+ redux(res, P, rho) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ Get next bit of the window.
|
|
|
+ */
|
|
|
+ bitbuf <<= 1
|
|
|
+ if bitbuf & (1 << winsize) != 0 {
|
|
|
+ /*
|
|
|
+ Then multiply.
|
|
|
+ */
|
|
|
+ internal_mul(res, res, &M[1]) or_return
|
|
|
+ redux(res, P, rho) or_return
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if redmode == 0 {
|
|
|
+ /*
|
|
|
+ Fixup result if Montgomery reduction is used.
|
|
|
+ Recall that any value in a Montgomery system is actually multiplied by R mod n.
|
|
|
+ So we have to reduce one more time to cancel out the factor of R.
|
|
|
+ */
|
|
|
+ redux(res, P, rho) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ hac 14.61, pp608
|
|
|
+*/
|
|
|
+_private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+ x, y, u, v, A, B, C, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
|
|
|
+ defer internal_destroy(x, y, u, v, A, B, C, D)
|
|
|
+
|
|
|
+ // `b` cannot be negative.
|
|
|
+ if b.sign == .Negative || internal_is_zero(b) {
|
|
|
+ return .Invalid_Argument
|
|
|
+ }
|
|
|
+
|
|
|
+ // init temps.
|
|
|
+ internal_init_multi(x, y, u, v, A, B, C, D) or_return
|
|
|
+
|
|
|
+ // `x` = `a` % `b`, `y` = `b`
|
|
|
+ internal_mod(x, a, b) or_return
|
|
|
+ internal_copy(y, b) or_return
|
|
|
+
|
|
|
+ // 2. [modified] if x,y are both even then return an error!
|
|
|
+ if internal_is_even(x) && internal_is_even(y) {
|
|
|
+ return .Invalid_Argument
|
|
|
+ }
|
|
|
+
|
|
|
+ // 3. u=x, v=y, A=1, B=0, C=0, D=1
|
|
|
+ internal_copy(u, x) or_return
|
|
|
+ internal_copy(v, y) or_return
|
|
|
+ internal_one(A) or_return
|
|
|
+ internal_one(D) or_return
|
|
|
+
|
|
|
+ for {
|
|
|
+ // 4. while `u` is even do:
|
|
|
+ for internal_is_even(u) {
|
|
|
+ // 4.1 `u` = `u` / 2
|
|
|
+ internal_int_shr1(u, u) or_return
|
|
|
+
|
|
|
+ // 4.2 if `A` or `B` is odd then:
|
|
|
+ if internal_is_odd(A) || internal_is_odd(B) {
|
|
|
+ // `A` = (`A`+`y`) / 2, `B` = (`B`-`x`) / 2
|
|
|
+ internal_add(A, A, y) or_return
|
|
|
+ internal_sub(B, B, x) or_return
|
|
|
+ }
|
|
|
+ // `A` = `A` / 2, `B` = `B` / 2
|
|
|
+ internal_int_shr1(A, A) or_return
|
|
|
+ internal_int_shr1(B, B) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ // 5. while `v` is even do:
|
|
|
+ for internal_is_even(v) {
|
|
|
+ // 5.1 `v` = `v` / 2
|
|
|
+ internal_int_shr1(v, v) or_return
|
|
|
+
|
|
|
+ // 5.2 if `C` or `D` is odd then:
|
|
|
+ if internal_is_odd(C) || internal_is_odd(D) {
|
|
|
+ // `C` = (`C`+`y`) / 2, `D` = (`D`-`x`) / 2
|
|
|
+ internal_add(C, C, y) or_return
|
|
|
+ internal_sub(D, D, x) or_return
|
|
|
+ }
|
|
|
+ // `C` = `C` / 2, `D` = `D` / 2
|
|
|
+ internal_int_shr1(C, C) or_return
|
|
|
+ internal_int_shr1(D, D) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ // 6. if `u` >= `v` then:
|
|
|
+ if internal_cmp(u, v) != -1 {
|
|
|
+ // `u` = `u` - `v`, `A` = `A` - `C`, `B` = `B` - `D`
|
|
|
+ internal_sub(u, u, v) or_return
|
|
|
+ internal_sub(A, A, C) or_return
|
|
|
+ internal_sub(B, B, D) or_return
|
|
|
+ } else {
|
|
|
+ // v - v - u, C = C - A, D = D - B
|
|
|
+ internal_sub(v, v, u) or_return
|
|
|
+ internal_sub(C, C, A) or_return
|
|
|
+ internal_sub(D, D, B) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ // If not zero goto step 4
|
|
|
+ if internal_is_zero(u) {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Now `a` = `C`, `b` = `D`, `gcd` == `g`*`v`
|
|
|
+
|
|
|
+ // If `v` != `1` then there is no inverse.
|
|
|
+ if !internal_eq(v, 1) {
|
|
|
+ return .Invalid_Argument
|
|
|
+ }
|
|
|
+
|
|
|
+ // If its too low.
|
|
|
+ for internal_is_negative(C) {
|
|
|
+ internal_add(C, C, b) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ // Too big.
|
|
|
+ for internal_cmp_mag(C, b) > -1 {
|
|
|
+ internal_sub(C, C, b) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ // `C` is now the inverse.
|
|
|
+ swap(dest, C)
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ Computes the modular inverse via binary extended Euclidean algorithm, that is `dest` = 1 / `a` mod `b`.
|
|
|
+
|
|
|
+ Based on slow invmod except this is optimized for the case where `b` is odd,
|
|
|
+ as per HAC Note 14.64 on pp. 610.
|
|
|
+*/
|
|
|
+_private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
|
|
|
+ context.allocator = allocator
|
|
|
+ x, y, u, v, B, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
|
|
|
+ defer internal_destroy(x, y, u, v, B, D)
|
|
|
+
|
|
|
+ sign: Sign
|
|
|
+
|
|
|
+ /*
|
|
|
+ 2. [modified] `b` must be odd.
|
|
|
+ */
|
|
|
+ if internal_is_even(b) { return .Invalid_Argument }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Init all our temps.
|
|
|
+ */
|
|
|
+ internal_init_multi(x, y, u, v, B, D) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ `x` == modulus, `y` == value to invert.
|
|
|
+ */
|
|
|
+ internal_copy(x, b) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ We need `y` = `|a|`.
|
|
|
+ */
|
|
|
+ internal_mod(y, a, b) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ If one of `x`, `y` is zero return an error!
|
|
|
+ */
|
|
|
+ if internal_is_zero(x) || internal_is_zero(y) { return .Invalid_Argument }
|
|
|
+
|
|
|
+ /*
|
|
|
+ 3. `u` = `x`, `v` = `y`, `A` = 1, `B` = 0, `C` = 0, `D` = 1
|
|
|
+ */
|
|
|
+ internal_copy(u, x) or_return
|
|
|
+ internal_copy(v, y) or_return
|
|
|
+
|
|
|
+ internal_one(D) or_return
|
|
|
+
|
|
|
+ for {
|
|
|
+ /*
|
|
|
+ 4. while `u` is even do.
|
|
|
+ */
|
|
|
+ for internal_is_even(u) {
|
|
|
+ /*
|
|
|
+ 4.1 `u` = `u` / 2
|
|
|
+ */
|
|
|
+ internal_int_shr1(u, u) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ 4.2 if `B` is odd then:
|
|
|
+ */
|
|
|
+ if internal_is_odd(B) {
|
|
|
+ /*
|
|
|
+ `B` = (`B` - `x`) / 2
|
|
|
+ */
|
|
|
+ internal_sub(B, B, x) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ `B` = `B` / 2
|
|
|
+ */
|
|
|
+ internal_int_shr1(B, B) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ 5. while `v` is even do:
|
|
|
+ */
|
|
|
+ for internal_is_even(v) {
|
|
|
+ /*
|
|
|
+ 5.1 `v` = `v` / 2
|
|
|
+ */
|
|
|
+ internal_int_shr1(v, v) or_return
|
|
|
+
|
|
|
+ /*
|
|
|
+ 5.2 if `D` is odd then:
|
|
|
+ */
|
|
|
+ if internal_is_odd(D) {
|
|
|
+ /*
|
|
|
+ `D` = (`D` - `x`) / 2
|
|
|
+ */
|
|
|
+ internal_sub(D, D, x) or_return
|
|
|
+ }
|
|
|
+ /*
|
|
|
+ `D` = `D` / 2
|
|
|
+ */
|
|
|
+ internal_int_shr1(D, D) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ 6. if `u` >= `v` then:
|
|
|
+ */
|
|
|
+ if internal_cmp(u, v) != -1 {
|
|
|
+ /*
|
|
|
+ `u` = `u` - `v`, `B` = `B` - `D`
|
|
|
+ */
|
|
|
+ internal_sub(u, u, v) or_return
|
|
|
+ internal_sub(B, B, D) or_return
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ `v` - `v` - `u`, `D` = `D` - `B`
|
|
|
+ */
|
|
|
+ internal_sub(v, v, u) or_return
|
|
|
+ internal_sub(D, D, B) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ If not zero goto step 4.
|
|
|
+ */
|
|
|
+ if internal_is_zero(u) { break }
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Now `a` = C, `b` = D, gcd == g*v
|
|
|
+ */
|
|
|
+
|
|
|
+ /*
|
|
|
+ if `v` != 1 then there is no inverse
|
|
|
+ */
|
|
|
+ if internal_cmp(v, 1) != 0 {
|
|
|
+ return .Invalid_Argument
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ `b` is now the inverse.
|
|
|
+ */
|
|
|
+ sign = a.sign
|
|
|
+ for internal_int_is_negative(D) {
|
|
|
+ internal_add(D, D, b) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ Too big.
|
|
|
+ */
|
|
|
+ for internal_gte_abs(D, b) {
|
|
|
+ internal_sub(D, D, b) or_return
|
|
|
+ }
|
|
|
+
|
|
|
+ swap(dest, D)
|
|
|
+ dest.sign = sign
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
/*
	Returns the logarithm of an `Int` for a `base` that is a power of two.
	Assumes `a` not to be `nil` and to have been initialized.
	Also assumes `base` is a power of two.

	The result is `(internal_count_bits(a) - 1) / y`, where `y` is the number of trailing zero bits
	of `base`, which equals log2(`base`) when `base` is a power of two.

	Returns `.Invalid_Argument` for `base` < 2:
	- `base` == 0 has no set bit, so counting its trailing zeroes would never terminate.
	- `base` == 1 has no trailing zeroes, so the division below would be a division by zero.
*/
_private_log_power_of_two :: proc(a: ^Int, base: DIGIT) -> (log: int, err: Error) {
	base := base

	if base < 2 { return 0, .Invalid_Argument }

	/*
		Count the trailing zero bits of `base`. `base` >= 2 guarantees termination,
		and because only powers of two are expected, `y` ends up >= 1.
	*/
	y := 0
	for base & 1 == 0 {
		y += 1
		base >>= 1
	}

	/*
		Guards the division for an odd `base` > 1, which is not a power of two either.
	*/
	if y == 0 { return 0, .Invalid_Argument }

	log = internal_count_bits(a)
	return (log - 1) / y, err
}
|
|
|
+
|
|
|
/*
	Copies DIGITs from `src` to `dest`.
	Assumes `src` and `dest` to not be `nil` and have been initialized.

	Up to `digits` DIGITs are read starting at `src.digit[offset]` and written starting at `dest.digit[0]`.
	The count is clamped to what is available in `src` past `offset` and to what fits in `dest`,
	so the copy never reads or writes outside of either backing array.

	`dest.used` and `dest.sign` are not touched; that is up to the caller.

	Returns `.Invalid_Argument` for a negative `offset`, `nil` otherwise.
	Copying nothing (`dest` == `src`, or a count that clamps to zero or less) is not an error.
*/
_private_copy_digits :: proc(dest, src: ^Int, digits: int, offset := int(0)) -> (err: Error) {
	/*
		If dest == src, do nothing
	*/
	if dest == src {
		return nil
	}

	if offset < 0 {
		return .Invalid_Argument
	}

	/*
		Only `len(src.digit) - offset` DIGITs are available behind `offset`.
	*/
	count := min(digits, len(src.digit) - offset, len(dest.digit))

	/*
		Nothing to copy. Returning here also avoids indexing element 0 of an empty array
		and handing a negative byte count to the copy.
	*/
	if count <= 0 {
		return nil
	}

	mem.copy_non_overlapping(&dest.digit[0], &src.digit[offset], size_of(DIGIT) * count)
	return nil
}
|
|
|
+
|
|
|
+
|
|
|
/*
	Shift left by `digits` * _DIGIT_BITS bits, i.e. by whole DIGITs ("legs").

	`quotient` is modified in place. A non-positive `digits` or a zero `quotient` is a no-op.
	Errors from growing the backing array are returned to the caller.
*/
_private_int_shl_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if digits <= 0 { return nil }

	/*
		Zero shifted by any amount is still zero.
	*/
	if #force_inline internal_is_zero(quotient) {
		return nil
	}

	/*
		Make room for the extra low-order DIGITs.
	*/
	#force_inline internal_grow(quotient, quotient.used + digits) or_return

	/*
		Move the used DIGITs up by `digits` positions.
		Source and destination overlap, so walk from the most significant DIGIT downwards
		to avoid overwriting a DIGIT before it has been moved.
	*/
	old_used := quotient.used
	#no_bounds_check for from := old_used - 1; from >= 0; from -= 1 {
		quotient.digit[from + digits] = quotient.digit[from]
	}

	/*
		Account for the new DIGITs and clear the ones vacated at the bottom.
	*/
	quotient.used = old_used + digits
	mem.zero_slice(quotient.digit[:digits])
	return nil
}
|
|
|
+
|
|
|
/*
	Shift right by `digits` * _DIGIT_BITS bits, i.e. by whole DIGITs ("legs").

	`quotient` is modified in place. A non-positive `digits` is a no-op.
*/
_private_int_shr_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if digits <= 0 { return nil }

	/*
		Shifting out more DIGITs than are in use leaves nothing behind.
	*/
	if digits > quotient.used { return internal_zero(quotient) }

	/*
		Much like `_private_int_shl_leg`, this slides the DIGITs over one by one,
		except here they move towards the least significant end, so we walk upwards:

		b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
		            /\                   |      ---->
		             \-------------------/      ---->
	*/
	remaining := quotient.used - digits
	#no_bounds_check for to in 0..<remaining {
		quotient.digit[to] = quotient.digit[to + digits]
	}

	/*
		Drop the DIGITs that were shifted out, clear what is now unused and trim leading zeroes.
	*/
	quotient.used = remaining
	internal_zero_unused(quotient)
	return internal_clamp(quotient)
}
|
|
|
+
|
|
|
+/*
|
|
|
+ ======================== End of private procedures =======================
|
|
|
+
|
|
|
+ =============================== Private tables ===============================
|
|
|
+
|
|
|
+ Tables used by `internal_*` and `_*`.
|
|
|
+*/
|
|
|
+
|
|
|
/*
	Lookup table with 128 entries, indexed by a residue modulo 128, 16 entries per row.

	An entry is 0 when its index is a square modulo 128
	(0, 1, 4, 9, 16, 17, 25, 33, 36, 41, 49, 57, 64, 65, 68, 73, 81, 89, 97, 100, 105, 113, 121)
	and 1 for every other index.

	NOTE(review): presumably used as a cheap "this cannot be a perfect square" pre-filter; confirm against the caller.
*/
_private_int_rem_128 := [?]DIGIT{
	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
}
// Compile-time check that the table really has 128 entries.
#assert(128 * size_of(DIGIT) == size_of(_private_int_rem_128))
|
|
|
+
|
|
|
/*
	Lookup table with 105 (= 3 * 5 * 7) entries, indexed by a residue modulo 105, 15 entries per row.

	An entry is 0 when its index is a square modulo 105 (e.g. 0, 1, 4, 9, 15, 16, 21, 25, ...)
	and 1 for every other index.

	NOTE(review): presumably the second stage of the same perfect-square pre-filter
	that uses `_private_int_rem_128`; confirm against the caller.
*/
_private_int_rem_105 := [?]DIGIT{
	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
	0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
	0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
	0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
	1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
}
// Compile-time check that the table really has 105 entries.
#assert(105 * size_of(DIGIT) == size_of(_private_int_rem_105))
|
|
|
+
|
|
|
/*
	Number of entries in `_private_prime_table`.
*/
_PRIME_TAB_SIZE :: 256

/*
	Table of small primes in ascending order, starting at 2 (0x0002) and ending at 1619 (0x0653),
	laid out as four groups of eight rows with eight primes each.

	NOTE(review): presumably used for trial division ahead of the more expensive primality tests; confirm against the callers.
*/
_private_prime_table := [_PRIME_TAB_SIZE]DIGIT{
	0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
	0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
	0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
	0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
	0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
	0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
	0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
	0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,

	0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
	0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
	0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
	0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
	0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
	0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
	0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
	0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,

	0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
	0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
	0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
	0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
	0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
	0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
	0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
	0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,

	0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
	0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
	0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
	0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
	0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
	0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
	0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
	0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653,
}
// Compile-time check that the table size matches `_PRIME_TAB_SIZE`.
#assert(_PRIME_TAB_SIZE * size_of(DIGIT) == size_of(_private_prime_table))
|
|
|
+
|
|
|
/*
	Precomputed factorials, indexed by `n`, so that `_factorial_table[n]` == n!.

	The size of the table depends on the build configuration:
	- 64-bit (forced, or pointer size 8 without 32-bit being forced): 35 entries, 0! through 34!.
	- otherwise: 21 entries, 0! through 20!.

	NOTE(review): the cut-offs presumably are the largest factorials that still fit in a `_WORD`
	for the respective configuration; confirm against the definition of `_WORD`.
*/
when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
	_factorial_table := [35]_WORD{
/* f(00): */ 1,
/* f(01): */ 1,
/* f(02): */ 2,
/* f(03): */ 6,
/* f(04): */ 24,
/* f(05): */ 120,
/* f(06): */ 720,
/* f(07): */ 5_040,
/* f(08): */ 40_320,
/* f(09): */ 362_880,
/* f(10): */ 3_628_800,
/* f(11): */ 39_916_800,
/* f(12): */ 479_001_600,
/* f(13): */ 6_227_020_800,
/* f(14): */ 87_178_291_200,
/* f(15): */ 1_307_674_368_000,
/* f(16): */ 20_922_789_888_000,
/* f(17): */ 355_687_428_096_000,
/* f(18): */ 6_402_373_705_728_000,
/* f(19): */ 121_645_100_408_832_000,
/* f(20): */ 2_432_902_008_176_640_000,
/* f(21): */ 51_090_942_171_709_440_000,
/* f(22): */ 1_124_000_727_777_607_680_000,
/* f(23): */ 25_852_016_738_884_976_640_000,
/* f(24): */ 620_448_401_733_239_439_360_000,
/* f(25): */ 15_511_210_043_330_985_984_000_000,
/* f(26): */ 403_291_461_126_605_635_584_000_000,
/* f(27): */ 10_888_869_450_418_352_160_768_000_000,
/* f(28): */ 304_888_344_611_713_860_501_504_000_000,
/* f(29): */ 8_841_761_993_739_701_954_543_616_000_000,
/* f(30): */ 265_252_859_812_191_058_636_308_480_000_000,
/* f(31): */ 8_222_838_654_177_922_817_725_562_880_000_000,
/* f(32): */ 263_130_836_933_693_530_167_218_012_160_000_000,
/* f(33): */ 8_683_317_618_811_886_495_518_194_401_280_000_000,
/* f(34): */ 295_232_799_039_604_140_847_618_609_643_520_000_000,
	}
} else {
	_factorial_table := [21]_WORD{
/* f(00): */ 1,
/* f(01): */ 1,
/* f(02): */ 2,
/* f(03): */ 6,
/* f(04): */ 24,
/* f(05): */ 120,
/* f(06): */ 720,
/* f(07): */ 5_040,
/* f(08): */ 40_320,
/* f(09): */ 362_880,
/* f(10): */ 3_628_800,
/* f(11): */ 39_916_800,
/* f(12): */ 479_001_600,
/* f(13): */ 6_227_020_800,
/* f(14): */ 87_178_291_200,
/* f(15): */ 1_307_674_368_000,
/* f(16): */ 20_922_789_888_000,
/* f(17): */ 355_687_428_096_000,
/* f(18): */ 6_402_373_705_728_000,
/* f(19): */ 121_645_100_408_832_000,
/* f(20): */ 2_432_902_008_176_640_000,
	}
}
|
|
|
+
|
|
|
+/*
|
|
|
+ ========================= End of private tables ========================
|
|
|
*/
|