Browse Source

big: Improve tunables.

Jeroen van Rijn 4 years ago
parent
commit
777e17d80f

+ 2 - 4
core/math/big/basic.odin

@@ -11,8 +11,6 @@ package big
 	This file contains basic arithmetic operations like `add`, `sub`, `mul`, `div`, ...
 */
 
-import "core:mem"
-
 /*
 	===========================
 		User-level routines    
@@ -244,7 +242,7 @@ sqrmod :: proc { int_sqrmod, };
 
 
 int_factorial :: proc(res: ^Int, n: int) -> (err: Error) {
-	if n < 0 || n > _FACTORIAL_MAX_N { return .Invalid_Argument; }
+	if n < 0 || n > FACTORIAL_MAX_N { return .Invalid_Argument; }
 	if res == nil { return .Invalid_Pointer; }
 
 	return #force_inline internal_int_factorial(res, n);
@@ -269,7 +267,7 @@ factorial :: proc { int_factorial, };
 */
 int_choose_digit :: proc(res: ^Int, n, k: int) -> (err: Error) {
 	if res == nil  { return .Invalid_Pointer; }
-	if n < 0 || n > _FACTORIAL_MAX_N { return .Invalid_Argument; }
+	if n < 0 || n > FACTORIAL_MAX_N { return .Invalid_Argument; }
 
 	if k > n { return zero(res); }
 

+ 3 - 3
core/math/big/build.bat

@@ -1,10 +1,10 @@
 @echo off
-:odin run . -vet
+odin run . -vet
 : -o:size
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check
 :odin build . -build-mode:shared -show-timings -o:size -no-bounds-check
 :odin build . -build-mode:shared -show-timings -o:size
-odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check
+:odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check
 :odin build . -build-mode:shared -show-timings -o:speed
 
-python test.py
+:python test.py

+ 48 - 27
core/math/big/common.odin

@@ -13,56 +13,77 @@ import "core:intrinsics"
 
 /*
 	TODO: Make the tunables runtime adjustable where practical.
+
 	This allows benchmarking and/or setting optimized values for a certain CPU without recompiling.
 */
 
 /*
-	Tunables
-*/
-
-MATH_BIG_FORCE_64_BIT :: false;
-MATH_BIG_FORCE_32_BIT :: false;
-when (MATH_BIG_FORCE_32_BIT && MATH_BIG_FORCE_64_BIT) { #panic("Cannot force 32-bit and 64-bit big backend simultaneously."); };
-
+	==========================    TUNABLES     ==========================
 
-_LOW_MEMORY          :: #config(BIGINT_SMALL_MEMORY, false);
-when _LOW_MEMORY {
-	_DEFAULT_DIGIT_COUNT :: 8;
-} else {
-	_DEFAULT_DIGIT_COUNT :: 32;
-}
-
-/*
 	`initialize_constants` returns `#config(MUL_KARATSUBA_CUTOFF, _DEFAULT_MUL_KARATSUBA_CUTOFF)`
 	and we initialize this cutoff that way so that the procedure is used and called,
 	because it handles initializing the constants ONE, ZERO, MINUS_ONE, NAN and INF.
+
+	`initialize_constants` also replaces the other `_DEFAULT_*` cutoffs with custom compile-time values if so `#config`ured.
+
 */
-_MUL_KARATSUBA_CUTOFF := initialize_constants();
-_SQR_KARATSUBA_CUTOFF := #config(SQR_KARATSUBA_CUTOFF, _DEFAULT_SQR_KARATSUBA_CUTOFF);
-_MUL_TOOM_CUTOFF      := #config(MUL_TOOM_CUTOFF,      _DEFAULT_MUL_TOOM_CUTOFF);
-_SQR_TOOM_CUTOFF      := #config(SQR_TOOM_CUTOFF,      _DEFAULT_SQR_TOOM_CUTOFF);
+MUL_KARATSUBA_CUTOFF := initialize_constants();
+SQR_KARATSUBA_CUTOFF := _DEFAULT_SQR_KARATSUBA_CUTOFF;
+MUL_TOOM_CUTOFF      := _DEFAULT_MUL_TOOM_CUTOFF;
+SQR_TOOM_CUTOFF      := _DEFAULT_SQR_TOOM_CUTOFF;
 
 /*
 	These defaults were tuned on an AMD A8-6600K (64-bit) using libTomMath's `make tune`.
+
 	TODO(Jeroen): Port this tuning algorithm and tune them for more modern processors.
+
+	It would also be cool if we collected some data across various processor families.
+	This would let us set reasonable defaults at runtime as this library initializes
+	itself by using `cpuid` or the ARM equivalent.
 */
-_DEFAULT_MUL_KARATSUBA_CUTOFF ::  80;
-_DEFAULT_SQR_KARATSUBA_CUTOFF :: 120;
-_DEFAULT_MUL_TOOM_CUTOFF      :: 350;
-_DEFAULT_SQR_TOOM_CUTOFF      :: 400;
 
-_MAX_ITERATIONS_ROOT_N        :: 500;
+_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MUL_KARATSUBA_CUTOFF,  80);
+_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(SQR_KARATSUBA_CUTOFF, 120);
+_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MUL_TOOM_CUTOFF,      350);
+_DEFAULT_SQR_TOOM_CUTOFF      :: #config(SQR_TOOM_CUTOFF,      400);
+
+
+MAX_ITERATIONS_ROOT_N := 500;
 
 /*
 	Largest `N` for which we'll compute `N!`
 */
-_FACTORIAL_MAX_N              :: 1_000_000;
+FACTORIAL_MAX_N       := 1_000_000;
 
 /*
 	Cutoff to switch to int_factorial_binary_split, and its max recursion level.
 */
-_FACTORIAL_BINARY_SPLIT_CUTOFF         :: 6100;
-_FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS :: 100;
+FACTORIAL_BINARY_SPLIT_CUTOFF         := 6100;
+FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS := 100;
+
+
+/*
+	We don't allow these to be switched at runtime for two reasons:
+
+	1) 32-bit and 64-bit versions of procedures use different types for their storage,
+		so we'd have to double the number of procedures, and they couldn't interact.
+
+	2) Optimizations thanks to precomputed masks wouldn't work.
+*/
+MATH_BIG_FORCE_64_BIT :: #config(MATH_BIG_FORCE_64_BIT, false);
+MATH_BIG_FORCE_32_BIT :: #config(MATH_BIG_FORCE_32_BIT, false);
+when (MATH_BIG_FORCE_32_BIT && MATH_BIG_FORCE_64_BIT) { #panic("Cannot force 32-bit and 64-bit big backend simultaneously."); };
+
+_LOW_MEMORY           :: #config(BIGINT_SMALL_MEMORY, false);
+when _LOW_MEMORY {
+	_DEFAULT_DIGIT_COUNT :: 8;
+} else {
+	_DEFAULT_DIGIT_COUNT :: 32;
+}
+
+/*
+	=======================    END OF TUNABLES     =======================
+*/
 
 Sign :: enum u8 {
 	Zero_or_Positive = 0,

+ 27 - 15
core/math/big/example.odin

@@ -15,17 +15,23 @@ import "core:mem"
 
 print_configation :: proc() {
 	fmt.printf(
-`Configuration:
-	DIGIT_BITS           %v
-	MIN_DIGIT_COUNT      %v
-	MAX_DIGIT_COUNT      %v
-	DEFAULT_DIGIT_COUNT  %v
-	MAX_COMBA            %v
-	WARRAY               %v
-	MUL_KARATSUBA_CUTOFF %v
-	SQR_KARATSUBA_CUTOFF %v
-	MUL_TOOM_CUTOFF      %v
-	SQR_TOOM_CUTOFF      %v
+`
+Configuration:
+	_DIGIT_BITS                           %v
+	_MIN_DIGIT_COUNT                      %v
+	_MAX_DIGIT_COUNT                      %v
+	_DEFAULT_DIGIT_COUNT                  %v
+	_MAX_COMBA                            %v
+	_WARRAY                               %v
+Runtime tunable:
+	MUL_KARATSUBA_CUTOFF                  %v
+	SQR_KARATSUBA_CUTOFF                  %v
+	MUL_TOOM_CUTOFF                       %v
+	SQR_TOOM_CUTOFF                       %v
+	MAX_ITERATIONS_ROOT_N                 %v
+	FACTORIAL_MAX_N                       %v
+	FACTORIAL_BINARY_SPLIT_CUTOFF         %v
+	FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS %v
 
 `, _DIGIT_BITS,
 _MIN_DIGIT_COUNT,
@@ -33,10 +39,14 @@ _MAX_DIGIT_COUNT,
 _DEFAULT_DIGIT_COUNT,
 _MAX_COMBA,
 _WARRAY,
-_MUL_KARATSUBA_CUTOFF,
-_SQR_KARATSUBA_CUTOFF,
-_MUL_TOOM_CUTOFF,
-_SQR_TOOM_CUTOFF,
+MUL_KARATSUBA_CUTOFF,
+SQR_KARATSUBA_CUTOFF,
+MUL_TOOM_CUTOFF,
+SQR_TOOM_CUTOFF,
+MAX_ITERATIONS_ROOT_N,
+FACTORIAL_MAX_N,
+FACTORIAL_BINARY_SPLIT_CUTOFF,
+FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS,
 );
 
 }
@@ -84,6 +94,8 @@ main :: proc() {
 
 	demo();
 
+	print_configation();
+
 	print_timings();
 
 	if len(ta.allocation_map) > 0 {

+ 3 - 3
core/math/big/exp_log.odin

@@ -360,7 +360,7 @@ int_root_n :: proc(dest, src: ^Int, n: int) -> (err: Error) {
 		}
 		if c, err = cmp(t1, t2); c == 0 { break; }
 		iterations += 1;
-		if iterations == _MAX_ITERATIONS_ROOT_N {
+		if iterations == MAX_ITERATIONS_ROOT_N {
 			return .Max_Iterations_Reached;
 		}
 	}
@@ -383,7 +383,7 @@ int_root_n :: proc(dest, src: ^Int, n: int) -> (err: Error) {
 		}
 
 		iterations += 1;
-		if iterations == _MAX_ITERATIONS_ROOT_N {
+		if iterations == MAX_ITERATIONS_ROOT_N {
 			return .Max_Iterations_Reached;
 		}
 	}
@@ -401,7 +401,7 @@ int_root_n :: proc(dest, src: ^Int, n: int) -> (err: Error) {
 		}
 
 		iterations += 1;
-		if iterations == _MAX_ITERATIONS_ROOT_N {
+		if iterations == MAX_ITERATIONS_ROOT_N {
 			return .Max_Iterations_Reached;
 		}
 	}

+ 1 - 1
core/math/big/helpers.odin

@@ -691,7 +691,7 @@ initialize_constants :: proc() -> (res: int) {
 	set(      INF,  1);       INF.flags = {.Immutable, .Inf};
 	set(      INF, -1); MINUS_INF.flags = {.Immutable, .Inf};
 
-	return #config(MUL_KARATSUBA_CUTOFF, _DEFAULT_MUL_KARATSUBA_CUTOFF);
+	return _DEFAULT_MUL_KARATSUBA_CUTOFF;
 }
 
 destroy_constants :: proc() {

+ 9 - 9
core/math/big/internal.odin

@@ -597,10 +597,10 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc
 		/*
 			Do we need to square?
 		*/
-		if        false && src.used >= _SQR_TOOM_CUTOFF {
+		if        false && src.used >= SQR_TOOM_CUTOFF {
 			/* Use Toom-Cook? */
 			// err = s_mp_sqr_toom(a, c);
-		} else if false && src.used >= _SQR_KARATSUBA_CUTOFF {
+		} else if false && src.used >= SQR_KARATSUBA_CUTOFF {
 			/* Karatsuba? */
 			// err = s_mp_sqr_karatsuba(a, c);
 		} else if false && ((src.used * 2) + 1) < _WARRAY &&
@@ -625,16 +625,16 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc
 		max_used := max(src.used, multiplier.used);
 		digits   := src.used + multiplier.used + 1;
 
-		if        false &&  min_used     >= _MUL_KARATSUBA_CUTOFF &&
-						    max_used / 2 >= _MUL_KARATSUBA_CUTOFF &&
+		if        false &&  min_used     >= MUL_KARATSUBA_CUTOFF &&
+						    max_used / 2 >= MUL_KARATSUBA_CUTOFF &&
 			/*
 				Not much effect was observed below a ratio of 1:2, but again: YMMV.
 			*/
 							max_used     >= 2 * min_used {
 			// err = s_mp_mul_balance(a,b,c);
-		} else if false && min_used >= _MUL_TOOM_CUTOFF {
+		} else if false && min_used >= MUL_TOOM_CUTOFF {
 			// err = s_mp_mul_toom(a, b, c);
-		} else if false && min_used >= _MUL_KARATSUBA_CUTOFF {
+		} else if false && min_used >= MUL_KARATSUBA_CUTOFF {
 			// err = s_mp_mul_karatsuba(a, b, c);
 		} else if digits < _WARRAY && min_used <= _MAX_COMBA {
 			/*
@@ -676,7 +676,7 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 		return nil;
 	}
 
-	if false && (denominator.used > 2 * _MUL_KARATSUBA_CUTOFF) && (denominator.used <= (numerator.used/3) * 2) {
+	if false && (denominator.used > 2 * MUL_KARATSUBA_CUTOFF) && (denominator.used <= (numerator.used/3) * 2) {
 		// err = _int_div_recursive(quotient, remainder, numerator, denominator);
 	} else {
 		when true {
@@ -846,7 +846,7 @@ internal_sqrmod :: proc { internal_int_sqrmod, };
 	This way we'll have to reallocate less, possibly not at all.
 */
 internal_int_factorial :: proc(res: ^Int, n: int) -> (err: Error) {
-	if n >= _FACTORIAL_BINARY_SPLIT_CUTOFF {
+	if n >= FACTORIAL_BINARY_SPLIT_CUTOFF {
 		return #force_inline _private_int_factorial_binary_split(res, n);
 	}
 
@@ -1490,7 +1490,7 @@ _private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int
 	t1, t2 := &Int{}, &Int{};
 	defer destroy(t1, t2);
 
-	if level > _FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS { return .Max_Iterations_Reached; }
+	if level > FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS { return .Max_Iterations_Reached; }
 
 	num_factors := (stop - start) >> 1;
 	if num_factors == 2 {