Browse Source

`strconv.parse_f64` - accurately parse floats

gingerBill 2 years ago
parent
commit
b9ec2de4db
3 changed files with 421 additions and 98 deletions
  1. 83 0
      core/strconv/decimal/decimal.odin
  2. 86 0
      core/strconv/generic_float.odin
  3. 252 98
      core/strconv/strconv.odin

+ 83 - 0
core/strconv/decimal/decimal.odin

@@ -9,6 +9,89 @@ Decimal :: struct {
 	neg, trunc:    bool,
 	neg, trunc:    bool,
 }
 }
 
 
+set :: proc(d: ^Decimal, s: string) -> (ok: bool) {
+	d^ = {}
+
+	if len(s) == 0 {
+		return
+	}
+
+	i := 0
+	switch s[i] {
+	case '+': i += 1
+	case '-': i += 1; d.neg = true
+	}
+
+	// digits
+	saw_dot := false
+	saw_digits := false
+	for ; i < len(s); i += 1 {
+		switch {
+		case s[i] == '_':
+			// ignore underscores
+			continue
+		case s[i] == '.':
+			if saw_dot {
+				return
+			}
+			saw_dot = true
+			d.decimal_point = d.count
+			continue
+
+		case '0' <= s[i] && s[i] <= '9':
+			saw_digits = true
+			if s[i] == '0' && d.count == 0 {
+				d.decimal_point -= 1
+				continue
+			}
+			if d.count < len(d.digits) {
+				d.digits[d.count] = s[i]
+				d.count += 1
+			} else if s[i] != '0' {
+				d.trunc = true
+			}
+			continue
+		}
+		break
+	}
+	if !saw_digits {
+		return
+	}
+	if !saw_dot {
+		d.decimal_point = d.count
+	}
+
+	lower :: #force_inline proc "contextless" (ch: byte) -> byte { return ('a' - 'A') | ch }
+
+	if i < len(s) && lower(s[i]) == 'e' {
+		i += 1
+		if i >= len(s) {
+			return
+		}
+		exp_sign := 1
+		switch s[i] {
+		case '+': i += 1
+		case '-': i += 1; exp_sign = -1
+		}
+		if i >= len(s) || s[i] < '0' || s[i] > '9' {
+			return
+		}
+		e := 0
+		for ; i < len(s) && ('0' <= s[i] && s[i] <= '9' || s[i] == '_'); i += 1 {
+			if s[i] == '_' {
+				// ignore underscores
+				continue
+			}
+			if e < 1e4 {
+				e = e*10 + int(s[i]) - '0'
+			}
+		}
+		d.decimal_point += e * exp_sign
+	}
+
+	return i == len(s)
+}
+
 decimal_to_string :: proc(buf: []byte, a: ^Decimal) -> string {
 decimal_to_string :: proc(buf: []byte, a: ^Decimal) -> string {
 	digit_zero :: proc(buf: []byte) -> int {
 	digit_zero :: proc(buf: []byte) -> int {
 		for _, i in buf {
 		for _, i in buf {

+ 86 - 0
core/strconv/generic_float.odin

@@ -284,3 +284,89 @@ round_shortest :: proc(d: ^decimal.Decimal, mant: u64, exp: int, flt: ^Float_Inf
 	}
 	}
 
 
 }
 }
+
+@(private)
+decimal_to_float_bits :: proc(d: ^decimal.Decimal, info: ^Float_Info) -> (b: u64, overflow: bool) {
+	end :: proc "contextless" (d: ^decimal.Decimal, mant: u64, exp: int, info: ^Float_Info) -> (b: u64) {
+		bits := mant & (u64(1)<<info.mantbits - 1)
+		bits |= u64((exp-info.bias) & (1<<info.expbits - 1)) << info.mantbits
+		if d.neg {
+			bits |= 1<< info.mantbits << info.expbits
+		}
+		return bits
+	}
+	set_overflow :: proc "contextless" (mant: ^u64, exp: ^int, info: ^Float_Info) -> bool {
+		mant^ = 0
+		exp^ = 1<<info.expbits - 1 + info.bias
+		return true
+	}
+
+	mant: u64
+	exp: int
+	if d.decimal_point == 0 {
+		mant = 0
+		exp = info.bias
+		b = end(d, mant, exp, info)
+		return
+	}
+
+	if d.decimal_point > 310 {
+		set_overflow(&mant, &exp, info)
+		b = end(d, mant, exp, info)
+		return
+	} else if d.decimal_point < -330 {
+		mant = 0
+		exp = info.bias
+		b = end(d, mant, exp, info)
+		return
+	}
+
+	@static power_table := [?]int{1, 3, 6, 9, 13, 16, 19, 23, 26}
+
+	exp = 0
+	for d.decimal_point > 0 {
+		n := 27 if d.decimal_point >= len(power_table) else power_table[d.decimal_point]
+		decimal.shift(d, n)
+		exp += n
+	}
+	for d.decimal_point < 0 || d.decimal_point == 0 && d.digits[0] < '5' {
+		n := 27 if -d.decimal_point >= len(power_table) else power_table[-d.decimal_point]
+		decimal.shift(d, n)
+		exp -= n
+	}
+
+	// go from [0.5, 1) to [1, 2)
+	exp -= 1
+
+	if exp < info.bias + 1 {
+		n := info.bias + 1 - exp
+		decimal.shift(d, n)
+		exp += n
+	}
+
+	if (exp-info.bias) >= (1<<info.expbits - 1) {
+		set_overflow(&mant, &exp, info)
+		b = end(d, mant, exp, info)
+		return
+	}
+
+	decimal.shift(d, int(1 + info.mantbits))
+	mant = decimal.rounded_integer(d)
+
+	if mant == 2<<info.mantbits {
+		mant >>= 1
+		exp += 1
+		if (exp-info.bias) >= (1<<info.expbits - 1) {
+			set_overflow(&mant, &exp, info)
+			b = end(d, mant, exp, info)
+			return
+		}
+	}
+
+	if mant & (1<<info.mantbits) == 0 {
+		exp = info.bias
+	}
+
+	b = end(d, mant, exp, info)
+	return
+}

+ 252 - 98
core/strconv/strconv.odin

@@ -1,6 +1,7 @@
 package strconv
 package strconv
 
 
 import "core:unicode/utf8"
 import "core:unicode/utf8"
+import "decimal"
 
 
 parse_bool :: proc(s: string, n: ^int = nil) -> (result: bool = false, ok: bool) {
 parse_bool :: proc(s: string, n: ^int = nil) -> (result: bool = false, ok: bool) {
 	switch s {
 	switch s {
@@ -532,6 +533,8 @@ parse_u128_maybe_prefixed :: proc(str: string, n: ^int = nil) -> (value: u128, o
 parse_u128 :: proc{parse_u128_maybe_prefixed, parse_u128_of_base}
 parse_u128 :: proc{parse_u128_maybe_prefixed, parse_u128_of_base}
 
 
 
 
+@(private)
+lower :: #force_inline proc "contextless" (ch: byte) -> byte { return ('a' - 'A') | ch }
 
 
 
 
 
 
@@ -566,133 +569,284 @@ parse_f32 :: proc(s: string, n: ^int = nil) -> (value: f32, ok: bool) {
 // assert(n == 12.34 && ok);
 // assert(n == 12.34 && ok);
 // ```
 // ```
 parse_f64 :: proc(str: string, n: ^int = nil) -> (value: f64, ok: bool) {
 parse_f64 :: proc(str: string, n: ^int = nil) -> (value: f64, ok: bool) {
-	s := str
-	defer if n != nil { n^ = len(str) - len(s) }
-	if s == "" {
-		return
+	common_prefix_len_ignore_case :: proc "contextless" (s, prefix: string) -> int {
+		n := len(prefix)
+		if n > len(s) {
+			n = len(s)
+		}
+		for i in 0..<n {
+			c := s[i]
+			if 'A' <= c && c <= 'Z' {
+				c += 'a' - 'A'
+			}
+			if c != prefix[i] {
+				return i
+			}
+		}
+		return n
 	}
 	}
-
-	i := 0
-
-	sign: f64 = 1
-	seen_sign := true
-	switch s[i] {
-	case '-': i += 1; sign = -1
-	case '+': i += 1
-	case: seen_sign = false
+	check_special :: proc "contextless" (s: string) -> (f: f64, n: int, ok: bool) {
+		s := s
+		if len(s) > 0 {
+			sign := 1
+			nsign := 0
+			switch s[0] {
+			case '+', '-':
+				if s[0] == '-' {
+					sign = -1
+				}
+				nsign = 1
+				s = s[1:]
+				fallthrough
+			case 'i', 'I':
+				n := common_prefix_len_ignore_case(s, "infinity")
+				if 3 < n && n < 8 { // "inf" or "infinity"
+					n = 3
+				}
+				if n == 3 || n == 8 {
+					f = 0h7ff00000_00000000 if sign == 1 else 0hfff00000_00000000
+					n = nsign + 3
+					ok = true
+					return
+				}
+			case 'n', 'N':
+				if common_prefix_len_ignore_case(s, "nan") == 3 {
+					f = 0h7ff80000_00000001
+					n = nsign + 3
+					ok = true
+					return
+				}
+			}
+		}
+		return
 	}
 	}
+	parse_components :: proc "contextless" (s: string) -> (mantissa: u64, exp: int, neg, trunc, hex: bool, i: int, ok: bool) {
+		if len(s) == 0 {
+			return
+		}
+		switch s[i] {
+		case '+': i += 1
+		case '-': i += 1; neg = true
+		}
 
 
-	for ; i < len(s); i += 1 {
-		r := rune(s[i])
-		if r == '_' {
-			continue
+		base := u64(10)
+		MAX_MANT_DIGITS := 19
+		exp_char := byte('e')
+		// support stupid 0x1.ABp100 hex floats even if Odin doesn't
+		if i+2 < len(s) && s[i] == '0' && lower(s[i+1]) == 'x' {
+			base = 16
+			MAX_MANT_DIGITS = 16
+			i += 2
+			exp_char = 'p'
+			hex = true
 		}
 		}
 
 
-		v := _digit_value(r)
-		if v >= 10 {
-			if r == '.' || r == 'e' || r == 'E' { // Skip parsing NaN and Inf if it's probably a regular float
-				break
-			}
-			if len(s) >= 3 + i {
-				buf: [4]u8
-				copy(buf[:], s[i:][:3])
-
-				v2 := transmute(u32)buf
-				v2 &= 0xDFDFDFDF // Knock out lower-case bits
-
-				buf = transmute([4]u8)v2
-
-				when ODIN_ENDIAN == .Little {
-					if v2 == 0x464e49 { // "INF"
-						s = s[3+i:]
-						value = 0h7ff00000_00000000 if sign == 1 else 0hfff00000_00000000
-						return value, len(s) == 0
-					} else if v2 == 0x4e414e { // "NAN"
-						s = s[3+i:]
-						return 0h7ff80000_00000001, len(s) == 0
-					}
+		underscores := false
+		saw_dot, saw_digits := false, false
+		nd := 0
+		nd_mant := 0
+		decimal_point := 0
+		loop: for ; i < len(s); i += 1 {
+			switch c := s[i]; true {
+			case c == '_':
+				underscores = true
+				continue loop
+			case c == '.':
+				if saw_dot {
+					break loop
+				}
+				saw_dot = true
+				decimal_point = nd
+				continue loop
+
+			case '0' <= c && c <= '9':
+				saw_digits = true
+				if c == '0' && nd == 0 {
+					decimal_point -= 1
+					continue loop
+				}
+				nd += 1
+				if nd_mant < MAX_MANT_DIGITS {
+					mantissa *= base
+					mantissa += u64(c - '0')
+					nd_mant += 1
+				} else if c != '0' {
+					trunc = true
+				}
+				continue loop
+			case base == 16 && 'a' <= lower(c) && lower(c) <= 'f':
+				saw_digits = true
+				nd += 1
+				if nd_mant < MAX_MANT_DIGITS {
+					MAX_MANT_DIGITS *= 16
+					MAX_MANT_DIGITS += int(lower(c) - 'a' + 10)
+					nd_mant += 1
 				} else {
 				} else {
-					if v2 == 0x494e4600 { // "\0FNI"
-						s = s[3+i:]
-						value = 0h7ff00000_00000000 if sign == 1 else 0hfff00000_00000000
-						return value, len(s) == 0
-					} else if v2 == 0x4e414e00 { // "\0NAN"
-						s = s[3+i:]
-						return 0h7ff80000_00000001, len(s) == 0
-					}
+					trunc = true
 				}
 				}
+				continue loop
 			}
 			}
-			break
+			break loop
 		}
 		}
-		value *= 10
-		value += f64(v)
-	}
 
 
-	if i < len(s) && s[i] == '.' {
-		pow10: f64 = 10
-		i += 1
+		if !saw_digits {
+			return
+		}
+		if !saw_dot {
+			decimal_point = nd
+		}
+		if base == 16 {
+			decimal_point *= 4
+			nd_mant *= 4
+		}
 
 
-		for ; i < len(s); i += 1 {
-			r := rune(s[i])
-			if r == '_' {
-				continue
+		if i < len(s) && lower(s[i]) == exp_char {
+			i += 1
+			if i >= len(s) { return }
+			exp_sign := 1
+			switch s[i] {
+			case '+': i += 1
+			case '-': i += 1; exp_sign = -1
 			}
 			}
-
-			v := _digit_value(r)
-			if v >= 10 {
-				break
+			if i >= len(s) || s[i] < '0' || s[i] > '9' {
+				return
+			}
+			e := 0
+			for ; i < len(s) && ('0' <= s[i] && s[i] <= '9' || s[i] == '_'); i += 1 {
+				if s[i] == '_' {
+					underscores = true
+					continue
+				}
+				if e < 1e5 {
+					e = e*10 + int(s[i]) - '0'
+				}
 			}
 			}
-			value += f64(v)/pow10
-			pow10 *= 10
+			decimal_point += e * exp_sign
+		} else if base == 16 {
+			return
+		}
+
+		if mantissa != 0 {
+			exp = decimal_point - nd_mant
 		}
 		}
+		// TODO(bill): check underscore correctness
+		ok = true
+		return
 	}
 	}
 
 
-	frac := false
-	scale: f64 = 1
+	parse_hex :: proc(s: string, mantissa: u64, exp: int, neg, trunc: bool) -> (f64, bool) {
+		info := &_f64_info
 
 
-	if i < len(s) && (s[i] == 'e' || s[i] == 'E') {
-		i += 1
+		mantissa, exp := mantissa, exp
 
 
-		if i < len(s) {
-			switch s[i] {
-			case '-': i += 1; frac = true
-			case '+': i += 1
-			}
+		MAX_EXP := 1<<info.expbits + info.bias - 2
+		MIN_EXP := info.bias + 1
+		exp += int(info.mantbits)
 
 
-			exp: u32 = 0
-			for ; i < len(s); i += 1 {
-				r := rune(s[i])
-				if r == '_' {
-					continue
-				}
+		for mantissa != 0 && mantissa >> (info.mantbits+2) == 0 {
+			mantissa <<= 1
+			exp -= 1
+		}
+		if trunc {
+			mantissa |= 1
+		}
 
 
-				d := u32(_digit_value(r))
-				if d >= 10 {
-					break
-				}
-				exp = exp * 10 + d
+		for mantissa >> (info.mantbits+2) == 0 {
+			mantissa = mantissa>>1 | mantissa&1
+			exp += 1
+		}
+
+		// denormalize
+		if mantissa > 1 && exp < MIN_EXP-2 {
+			mantissa = mantissa>>1 | mantissa&1
+			exp += 1
+		}
+
+		round := mantissa & 3
+		mantissa >>= 2
+		round |= mantissa & 1 // round to even
+		exp += 2
+		if round == 3 {
+			mantissa += 1
+			if mantissa == 1 << (1 + info.mantbits) {
+				mantissa >>= 1
+				exp += 1
 			}
 			}
-			if exp > 308 { exp = 308 }
+		}
+		if mantissa>>info.mantbits == 0 {
+			// zero or denormal
+			exp = info.bias
+		}
+
+		ok := true
+		if exp > MAX_EXP {
+			// infinity or invalid
+			mantissa = 1<<info.mantbits
+			exp = MAX_EXP + 1
+			ok = false
+		}
 
 
-			for exp >= 50 { scale *= 1e50; exp -= 50 }
-			for exp >=  8 { scale *=  1e8; exp -=  8 }
-			for exp >   0 { scale *=   10; exp -=  1 }
+		bits := mantissa & (1<<info.mantbits - 1)
+		bits |= u64((exp-info.bias) & (1<<info.expbits - 1)) << info.mantbits
+		if neg {
+			bits |= 1 << info.mantbits << info.expbits
 		}
 		}
+		return transmute(f64)bits, ok
 	}
 	}
 
 
-	// If we only consumed a sign, return false
-	if i == 1 && seen_sign {
-		return 0, false
+
+	nr: int
+	defer if n != nil { n^ = nr }
+
+	if value, nr, ok = check_special(str); ok {
+		return
 	}
 	}
 
 
-	s = s[i:]
-	if frac {
-		value = sign * (value/scale)
-	} else {
-		value = sign * (value*scale)
+	mantissa: u64
+	exp:      int
+	neg, trunc, hex: bool
+	mantissa, exp, neg, trunc, hex, nr = parse_components(str) or_return
+
+	if hex {
+		return parse_hex(str, mantissa, exp, neg, trunc)
 	}
 	}
 
 
-	ok = len(s) == 0
+	trunc_block: if !trunc {
+		@static pow10 := [?]f64{
+			1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,
+			1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+			1e20, 1e21, 1e22,
+		}
+
+		if mantissa>>_f64_info.mantbits != 0 {
+			return
+		}
+		f := f64(mantissa)
+		if neg {
+			f = -f
+		}
+		switch {
+		case exp == 0:
+			return f, true
+		case exp > 0 && exp <= 15+22:
+			if exp > 22 {
+				f *= pow10[exp-22]
+				exp = 22
+			}
+			if f > 1e15 || f < 1e-15 {
+				break trunc_block
+			}
+			return f * pow10[exp], true
+		case -22 <= exp && exp < 0:
+			return f / pow10[-exp], true
+		}
+	}
+
+	d: decimal.Decimal
+	decimal.set(&d, str[:nr])
+	b, overflow := decimal_to_float_bits(&d, &_f64_info)
+	value = transmute(f64)b
+	ok = !overflow
 	return
 	return
 }
 }