#+vet !cast
package runtime

import "base:intrinsics"

// True when compiling for any WebAssembly target.
@(private="file")
IS_WASM :: ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32

// Linkage used for the compiler-rt style runtime symbols below; "strong" is
// required whenever the symbols may be referenced from another module.
@(private)
RUNTIME_LINKAGE :: "strong" when (
	ODIN_USE_SEPARATE_MODULES ||
	ODIN_BUILD_MODE == .Dynamic ||
	!ODIN_NO_CRT) else "internal"
RUNTIME_REQUIRE :: false // !ODIN_TILDE

// Storage type for the f16 compiler-rt shims: a real `f16` when the LLVM
// backend supports half floats, otherwise a raw `u16` bit pattern.
@(private)
__float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16

// True only when the target guarantees the baseline SIMD feature set that the
// hand-written SIMD paths below rely on (SSE2 / NEON / simd128 / RVV "v").
HAS_HARDWARE_SIMD ::
	false when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386)       && !intrinsics.has_target_feature("sse2")    else
	false when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32)      && !intrinsics.has_target_feature("neon")    else
	false when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
	false when (ODIN_ARCH == .riscv64)                           && !intrinsics.has_target_feature("v")       else
	true

// Views `len` bytes at `data` as a byte slice; a negative `len` is clamped to
// zero so no invalid slice can be produced.
@(private)
byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
	return ([^]byte)(data)[:max(len, 0)]
}

// Reports whether `x` is a positive power of two.
is_power_of_two_int :: #force_inline proc "contextless" (x: int) -> bool {
	if x <= 0 {
		return false
	}
	return (x & (x-1)) == 0
}

// Rounds `ptr` up to the next multiple of `align` (which must be a power of two).
align_forward_int :: #force_inline proc "odin" (ptr, align: int) -> int {
	assert(is_power_of_two_int(align))
	p := ptr
	modulo := p & (align-1)
	if modulo != 0 {
		p += align - modulo
	}
	return p
}

is_power_of_two_uint :: #force_inline proc "contextless" (x: uint) -> bool {
	if x <= 0 {
		return false
	}
	return (x & (x-1)) == 0
}

align_forward_uint :: #force_inline proc "odin" (ptr, align: uint) -> uint {
	assert(is_power_of_two_uint(align))
	p := ptr
	modulo := p & (align-1)
	if modulo != 0 {
		p += align - modulo
	}
	return p
}

is_power_of_two_uintptr :: #force_inline proc "contextless" (x: uintptr) -> bool {
	if x <= 0 {
		return false
	}
	return (x & (x-1)) == 0
}

align_forward_uintptr :: #force_inline proc "odin" (ptr, align: uintptr) -> uintptr {
	assert(is_power_of_two_uintptr(align))
	p := ptr
	modulo := p & (align-1)
	if modulo != 0 {
		p += align - modulo
	}
	return p
}

is_power_of_two :: proc {
	is_power_of_two_int,
	is_power_of_two_uint,
	is_power_of_two_uintptr,
}
align_forward :: proc {
	align_forward_int,
	align_forward_uint,
	align_forward_uintptr,
}

// Zeroes `len` bytes at `data`. Nil-safe; a no-op for len <= 0. Returns `data`.
mem_zero :: proc "contextless" (data: rawptr, len: int) -> rawptr {
	if data == nil {
		return nil
	}
	if len <= 0 {
		return data
	}
	intrinsics.mem_zero(data, len)
	return data
}

// Copies `len` bytes from `src` to `dst`, handling overlapping ranges.
mem_copy :: proc "contextless" (dst, src: rawptr, len: int) -> rawptr {
	if src != nil && dst != src && len > 0 {
		// NOTE(bill): This _must_ be implemented like C's memmove
		intrinsics.mem_copy(dst, src, len)
	}
	return dst
}

// Copies `len` bytes from `src` to `dst`; the ranges must not overlap.
mem_copy_non_overlapping :: proc "contextless" (dst, src: rawptr, len: int) -> rawptr {
	if src != nil && dst != src && len > 0 {
		// NOTE(bill): This _must_ be implemented like C's memcpy
		intrinsics.mem_copy_non_overlapping(dst, src, len)
	}
	return dst
}

// Default allocation alignment: twice the pointer size.
DEFAULT_ALIGNMENT :: 2*align_of(rawptr)

// Allocates `size` zeroed bytes with the given alignment from `allocator`.
// Returns nil (with no error) for a zero size or a nil allocator.
mem_alloc_bytes :: #force_no_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
	if size == 0 || allocator.procedure == nil {
		return nil, nil
	}
	return allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0, loc)
}

// Allocates `size` zeroed bytes; behaves identically to `mem_alloc_bytes`.
mem_alloc :: #force_no_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
	if size == 0 || allocator.procedure == nil {
		return nil, nil
	}
	return allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0, loc)
}

// Allocates `size` bytes WITHOUT zeroing them.
mem_alloc_non_zeroed :: #force_no_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
	if size == 0 || allocator.procedure == nil {
		return nil, nil
	}
	return allocator.procedure(allocator.data,
		.Alloc_Non_Zeroed, size, alignment, nil, 0, loc)
}

// Frees `ptr`; a nil pointer or nil allocator is a no-op.
mem_free :: #force_no_inline proc(ptr: rawptr, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
	if ptr == nil || allocator.procedure == nil {
		return nil
	}
	_, err := allocator.procedure(allocator.data, .Free, 0, 0, ptr, 0, loc)
	return err
}

// Frees `ptr`, passing the original allocation size through to the allocator
// (some allocators need it to free correctly).
mem_free_with_size :: #force_no_inline proc(ptr: rawptr, byte_count: int, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
	if ptr == nil || allocator.procedure == nil {
		return nil
	}
	_, err := allocator.procedure(allocator.data, .Free, 0, 0, ptr, byte_count, loc)
	return err
}

// Frees the allocation backing `bytes`.
mem_free_bytes :: #force_no_inline proc(bytes: []byte, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
	if bytes == nil || allocator.procedure == nil {
		return nil
	}
	_, err := allocator.procedure(allocator.data, .Free, 0, 0, raw_data(bytes), len(bytes), loc)
	return err
}

// Frees everything owned by `allocator` (if it supports .Free_All).
mem_free_all :: #force_no_inline proc(allocator := context.allocator, loc := #caller_location) -> (err: Allocator_Error) {
	if allocator.procedure != nil {
		_, err = allocator.procedure(allocator.data, .Free_All, 0, 0, nil, 0, loc)
	}
	return
}

// Shared implementation of mem_resize/non_zero_mem_resize.
//
// Behavior:
//   new_size == 0          -> frees `ptr` (if non-nil) and returns nil.
//   ptr == nil             -> plain allocation (.Alloc or .Alloc_Non_Zeroed).
//   sizes equal & aligned  -> returns the existing allocation unchanged.
//   otherwise              -> .Resize / .Resize_Non_Zeroed; if the allocator
//                             does not implement resize, falls back to
//                             alloc + copy + free.
_mem_resize :: #force_no_inline proc(ptr: rawptr, old_size, new_size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, should_zero: bool, loc := #caller_location) -> (data: []byte, err: Allocator_Error) {
	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
	if allocator.procedure == nil {
		return nil, nil
	}
	if new_size == 0 {
		if ptr != nil {
			_, err = allocator.procedure(allocator.data, .Free, 0, 0, ptr, old_size, loc)
			return
		}
		return
	} else if ptr == nil {
		if should_zero {
			return allocator.procedure(allocator.data, .Alloc, new_size, alignment, nil, 0, loc)
		} else {
			return allocator.procedure(allocator.data, .Alloc_Non_Zeroed, new_size, alignment, nil, 0, loc)
		}
	} else if old_size == new_size && uintptr(ptr) % uintptr(alignment) == 0 {
		// Same size and already suitably aligned: nothing to do.
		data = ([^]byte)(ptr)[:old_size]
		return
	}

	if should_zero {
		data, err = allocator.procedure(allocator.data, .Resize, new_size, alignment, ptr, old_size, loc)
	} else {
		data, err = allocator.procedure(allocator.data, .Resize_Non_Zeroed, new_size, alignment, ptr, old_size, loc)
	}
	if err == .Mode_Not_Implemented {
		// Allocator cannot resize in place: emulate with alloc + copy + free.
		if should_zero {
			data, err = allocator.procedure(allocator.data, .Alloc, new_size, alignment, nil, 0, loc)
		} else {
			data, err = allocator.procedure(allocator.data, .Alloc_Non_Zeroed, new_size, alignment, nil, 0, loc)
		}
		if err != nil {
			return
		}
		copy(data, ([^]byte)(ptr)[:old_size])
		_, err = allocator.procedure(allocator.data, .Free, 0, 0, ptr, old_size, loc)
	}
	return
}

// Resizes an allocation, zeroing any newly grown bytes.
mem_resize :: proc(ptr: rawptr, old_size, new_size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> (data: []byte, err: Allocator_Error) {
	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
	return _mem_resize(ptr, old_size, new_size, alignment, allocator, true, loc)
}

// Resizes an allocation without zeroing newly grown bytes.
non_zero_mem_resize :: proc(ptr: rawptr, old_size, new_size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> (data: []byte, err: Allocator_Error) {
	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
	return _mem_resize(ptr, old_size, new_size, alignment, allocator, false, loc)
}

// Zeroes `n_` bytes at `data`, but only writes to bytes that are not already
// zero (see the rationale below).
conditional_mem_zero :: proc "contextless" (data: rawptr, n_: int) #no_bounds_check {
	// When acquiring memory from the OS for the first time it's likely that the
	// OS already gives the zero page mapped multiple times for the request. The
	// actual allocation does not have physical pages allocated to it until those
	// pages are written to which causes a page-fault. This is often called COW
	// (Copy on Write)
	//
	// You do not want to actually zero out memory in this case because it would
	// cause a bunch of page faults decreasing the speed of allocations and
	// increase the amount of actual resident physical memory used.
	//
	// Instead a better technique is to check if memory is zeroed before zeroing
	// it. This turns out to be an important optimization in practice, saving
	// nearly half (or more) the amount of physical memory used by an application.
	// This is why every implementation of calloc in libc does this optimization.
	//
	// It may seem counter-intuitive but most allocations in an application are
	// wasted and never used. When you consider something like a [dynamic]T which
	// always doubles in capacity on resize but you rarely ever actually use the
	// full capacity of a dynamic array it means you have a lot of resident waste
	// if you actually zeroed the remainder of the memory.
	//
	// Keep in mind the OS is already guaranteed to give you zeroed memory by
	// mapping in this zero page multiple times so in the best case there is no
	// need to actually zero anything. As for testing all this memory for a zero
	// value, it costs nothing because the same zero page is used for the
	// whole allocation and will exist in L1 cache for the entire zero checking
	// process.
	if n_ <= 0 {
		return
	}
	n := uint(n_)
	// Process whole uintptr-sized words first, then the remaining tail bytes.
	n_words := n / size_of(uintptr)
	p_words := ([^]uintptr)(data)[:n_words]
	p_bytes := ([^]byte)(data)[size_of(uintptr) * n_words:n]
	for &p_word in p_words {
		if p_word != 0 {
			p_word = 0
		}
	}
	for &p_byte in p_bytes {
		if p_byte != 0 {
			p_byte = 0
		}
	}
}

// Reports whether the `n` bytes at `x` and `y` are identical.
memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
	switch {
	case n == 0: return true
	case x == y: return true
	}
	a, b := cast([^]byte)x, cast([^]byte)y
	n := uint(n)
	i := uint(0)
	m := uint(0)

	if n >= 8 {
		when HAS_HARDWARE_SIMD {
			// Avoid using 256-bit SIMD on platforms where its emulation is
			// likely to be less than ideal.
			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
				m = n / 32 * 32
				for /**/; i < m; i += 32 {
					load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
					load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
					ne := intrinsics.simd_lanes_ne(load_a, load_b)
					if intrinsics.simd_reduce_or(ne) != 0 {
						return false
					}
				}
			}
		}

		// NOTE(review): `m` here is a length relative to the remaining bytes
		// (n-i) but is compared against the absolute index `i`; after the AVX2
		// loop has advanced `i`, these passes are skipped and the tail falls
		// through to the byte loop. Correct, but possibly slower than
		// intended — confirm against upstream.
		m = (n-i) / 16 * 16
		for /**/; i < m; i += 16 {
			load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
			load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
			ne := intrinsics.simd_lanes_ne(load_a, load_b)
			if intrinsics.simd_reduce_or(ne) != 0 {
				return false
			}
		}

		m = (n-i) / size_of(uintptr) * size_of(uintptr)
		for /**/; i < m; i += size_of(uintptr) {
			if intrinsics.unaligned_load(cast(^uintptr)&a[i]) != intrinsics.unaligned_load(cast(^uintptr)&b[i]) {
				return false
			}
		}
	}

	for /**/; i < n; i += 1 {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

// Lexicographically compares `n` bytes at `x` and `y`:
// -1 if x < y, 0 if equal, +1 if x > y. A nil pointer sorts first.
memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
	switch {
	case x == y:   return 0
	case x == nil: return -1
	case y == nil: return +1
	}
	a, b := cast([^]byte)x, cast([^]byte)y
	n := uint(n)
	i := uint(0)
	m := uint(0)

	when HAS_HARDWARE_SIMD {
		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
			m = n / 32 * 32
			for /**/; i < m; i += 32 {
				load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
				load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
				comparison := intrinsics.simd_lanes_ne(load_a, load_b)
				if intrinsics.simd_reduce_or(comparison) != 0 {
					// Locate the first differing lane: differing lanes keep
					// their index, equal lanes get the 0xFF sentinel, then the
					// minimum is the first mismatch position.
					sentinel: #simd[32]u8 = u8(0xFF)
					indices := intrinsics.simd_indices(#simd[32]u8)
					index_select := intrinsics.simd_select(comparison, indices, sentinel)
					index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
					return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
				}
			}
		}
	}

	m = (n-i) / 16 * 16
	for /**/; i < m; i += 16 {
		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
		if intrinsics.simd_reduce_or(comparison) != 0 {
			sentinel: #simd[16]u8 = u8(0xFF)
			indices := intrinsics.simd_indices(#simd[16]u8)
			index_select := intrinsics.simd_select(comparison, indices, sentinel)
			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
		}
	}

	// 64-bit SIMD is faster than using a `uintptr` to detect a difference then
	// re-iterating with the byte-by-byte loop, at least on AMD64.
	m = (n-i) / 8 * 8
	for /**/; i < m; i += 8 {
		load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
		load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
		if intrinsics.simd_reduce_or(comparison) != 0 {
			sentinel: #simd[8]u8 = u8(0xFF)
			indices := intrinsics.simd_indices(#simd[8]u8)
			index_select := intrinsics.simd_select(comparison, indices, sentinel)
			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
		}
	}

	for /**/; i < n; i += 1 {
		if a[i] ~ b[i] != 0 {
			return -1 if int(a[i]) - int(b[i]) < 0 else +1
		}
	}
	return 0
}

// Compares `n` bytes at `a` against zero.
memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
	n := uint(n)
	i := uint(0)
	m := uint(0)

	// Because we're comparing against zero, we never return -1, as that would
	// indicate the compared value is less than zero.
	//
	// Note that a zero return value here means equality.
	bytes := ([^]u8)(a)

	if n >= 8 {
		when HAS_HARDWARE_SIMD {
			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
				scanner32: #simd[32]u8
				m = n / 32 * 32
				for /**/; i < m; i += 32 {
					load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
					ne := intrinsics.simd_lanes_ne(scanner32, load)
					if intrinsics.simd_reduce_or(ne) > 0 {
						return 1
					}
				}
			}
		}

		scanner16: #simd[16]u8
		m = (n-i) / 16 * 16
		for /**/; i < m; i += 16 {
			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
			ne := intrinsics.simd_lanes_ne(scanner16, load)
			if intrinsics.simd_reduce_or(ne) != 0 {
				return 1
			}
		}

		m = (n-i) / size_of(uintptr) * size_of(uintptr)
		for /**/; i < m; i += size_of(uintptr) {
			if intrinsics.unaligned_load(cast(^uintptr)&bytes[i]) != 0 {
				return 1
			}
		}
	}

	for /**/; i < n; i += 1 {
		if bytes[i] != 0 {
			return 1
		}
	}
	return 0
}

// Returns the length of the common byte prefix of `x` and `y`, at most `n`.
memory_prefix_length :: proc "contextless" (x, y: rawptr, n: int) -> (idx: int) #no_bounds_check {
	switch {
	case x == y:   return n
	case x == nil: return 0
	case y == nil: return 0
	}
	a, b := cast([^]byte)x, cast([^]byte)y
	n := uint(n)
	i := uint(0)
	m := uint(0)

	when HAS_HARDWARE_SIMD {
		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
			m = n / 32 * 32
			for /**/; i < m; i += 32 {
				load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
				load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
				comparison := intrinsics.simd_lanes_ne(load_a, load_b)
				if intrinsics.simd_reduce_or(comparison) != 0 {
					// First differing lane index = end of the common prefix.
					sentinel: #simd[32]u8 = u8(0xFF)
					indices := intrinsics.simd_indices(#simd[32]u8)
					index_select := intrinsics.simd_select(comparison, indices, sentinel)
					index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
					return int(i + index_reduce)
				}
			}
		}
	}

	m = (n-i) / 16 * 16
	for /**/; i < m; i += 16 {
		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
		if intrinsics.simd_reduce_or(comparison) != 0 {
			sentinel: #simd[16]u8 = u8(0xFF)
			indices := intrinsics.simd_indices(#simd[16]u8)
			index_select := intrinsics.simd_select(comparison, indices, sentinel)
			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
			return int(i + index_reduce)
		}
	}

	// 64-bit SIMD is faster than using a `uintptr` to detect a difference then
	// re-iterating with the byte-by-byte loop, at least on AMD64.
	m = (n-i) / 8 * 8
	for /**/; i < m; i += 8 {
		load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
		load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
		if intrinsics.simd_reduce_or(comparison) != 0 {
			sentinel: #simd[8]u8 = u8(0xFF)
			indices := intrinsics.simd_indices(#simd[8]u8)
			index_select := intrinsics.simd_select(comparison, indices, sentinel)
			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
			return int(i + index_reduce)
		}
	}

	for /**/; i < n; i += 1 {
		if a[i] ~ b[i] != 0 {
			return int(i)
		}
	}
	return int(n)
}

// String equality: length check first, then raw byte comparison.
string_eq :: proc "contextless" (lhs, rhs: string) -> bool {
	x := transmute(Raw_String)lhs
	y := transmute(Raw_String)rhs
	if x.len != y.len {
		return false
	}
	return #force_inline memory_equal(x.data, y.data, x.len)
}

// Lexicographic string comparison; a shorter string that is a prefix of the
// longer one sorts first.
string_cmp :: proc "contextless" (a, b: string) -> int {
	x := transmute(Raw_String)a
	y := transmute(Raw_String)b
	ret := memory_compare(x.data, y.data, min(x.len, y.len))
	if ret == 0 && x.len != y.len {
		return -1 if x.len < y.len else +1
	}
	return ret
}

// string16 equality; lengths are element counts, so bytes = len * size_of(u16).
string16_eq :: proc "contextless" (lhs, rhs: string16) -> bool {
	x := transmute(Raw_String16)lhs
	y := transmute(Raw_String16)rhs
	if x.len != y.len {
		return false
	}
	return #force_inline memory_equal(x.data, y.data, x.len*size_of(u16))
}

string16_cmp :: proc "contextless" (a, b: string16) -> int {
	x := transmute(Raw_String16)a
	y := transmute(Raw_String16)b
	ret := memory_compare(x.data, y.data, min(x.len, y.len)*size_of(u16))
	if ret == 0 && x.len != y.len {
		return -1 if x.len < y.len else +1
	}
	return ret
}

string_ne :: #force_inline proc "contextless" (a, b:
string) -> bool {
	return !string_eq(a, b)
}

// Relational wrappers over string_cmp/string16_cmp.
string_lt :: #force_inline proc "contextless" (a, b: string) -> bool { return string_cmp(a, b) < 0 }
string_gt :: #force_inline proc "contextless" (a, b: string) -> bool { return string_cmp(a, b) > 0 }
string_le :: #force_inline proc "contextless" (a, b: string) -> bool { return string_cmp(a, b) <= 0 }
string_ge :: #force_inline proc "contextless" (a, b: string) -> bool { return string_cmp(a, b) >= 0 }

string16_ne :: #force_inline proc "contextless" (a, b: string16) -> bool { return !string16_eq(a, b) }
string16_lt :: #force_inline proc "contextless" (a, b: string16) -> bool { return string16_cmp(a, b) < 0 }
string16_gt :: #force_inline proc "contextless" (a, b: string16) -> bool { return string16_cmp(a, b) > 0 }
string16_le :: #force_inline proc "contextless" (a, b: string16) -> bool { return string16_cmp(a, b) <= 0 }
string16_ge :: #force_inline proc "contextless" (a, b: string16) -> bool { return string16_cmp(a, b) >= 0 }

// Length of a NUL-terminated byte string (0 for a nil cstring).
cstring_len :: proc "contextless" (s: cstring) -> int {
	p0 := uintptr((^byte)(s))
	p := p0
	for p != 0 && (^byte)(p)^ != 0 {
		p += 1
	}
	return int(p - p0)
}

// Length in u16 elements of a NUL-terminated UTF-16 string.
cstring16_len :: proc "contextless" (s: cstring16) -> int {
	p := ([^]u16)(s)
	n := 0
	for p != nil && p[0] != 0 {
		p = p[1:]
		n += 1
	}
	return n
}

// Views a cstring as a string (no copy); nil becomes the empty string.
cstring_to_string :: proc "contextless" (s: cstring) -> string {
	if s == nil {
		return ""
	}
	ptr := (^byte)(s)
	n := cstring_len(s)
	return transmute(string)Raw_String{ptr, n}
}

// Views a cstring16 as a string16 (no copy); nil becomes the empty string.
cstring16_to_string16 :: proc "contextless" (s: cstring16) -> string16 {
	if s == nil {
		return ""
	}
	ptr := (^u16)(s)
	n := cstring16_len(s)
	return transmute(string16)Raw_String16{ptr, n}
}

// cstring equality; `~` on the nil checks is a boolean XOR (exactly one nil
// means unequal).
cstring_eq :: proc "contextless" (lhs, rhs: cstring) -> bool {
	x := ([^]byte)(lhs)
	y := ([^]byte)(rhs)
	if x == y {
		return true
	}
	if (x == nil) ~ (y == nil) {
		return false
	}
	xn := cstring_len(lhs)
	yn := cstring_len(rhs)
	if xn != yn {
		return false
	}
	return #force_inline memory_equal(x, y, xn)
}

// Lexicographic cstring comparison; nil sorts before non-nil.
cstring_cmp :: proc "contextless" (lhs, rhs: cstring) -> int {
	x := ([^]byte)(lhs)
	y := ([^]byte)(rhs)
	if x == y {
		return 0
	}
	if (x == nil) ~ (y == nil) {
		return -1 if x == nil else +1
	}
	xn := cstring_len(lhs)
	yn := cstring_len(rhs)
	ret := memory_compare(x, y, min(xn, yn))
	if ret == 0 && xn != yn {
		return -1 if xn < yn else +1
	}
	return ret
}

cstring_ne :: #force_inline proc "contextless" (a, b: cstring) -> bool { return !cstring_eq(a, b) }
cstring_lt :: #force_inline proc "contextless" (a, b: cstring) -> bool { return cstring_cmp(a, b) < 0 }
cstring_gt :: #force_inline proc "contextless" (a, b: cstring) -> bool { return cstring_cmp(a, b) > 0 }
cstring_le :: #force_inline proc "contextless" (a, b: cstring) -> bool { return cstring_cmp(a, b) <= 0 }
cstring_ge :: #force_inline proc "contextless" (a, b: cstring) -> bool { return cstring_cmp(a, b) >= 0 }

cstring16_eq :: proc "contextless" (lhs, rhs: cstring16) -> bool {
	x := ([^]u16)(lhs)
	y := ([^]u16)(rhs)
	if x == y {
		return true
	}
	if (x == nil) ~ (y == nil) {
		return false
	}
	xn := cstring16_len(lhs)
	yn := cstring16_len(rhs)
	if xn != yn {
		return false
	}
	return #force_inline memory_equal(x, y, xn*size_of(u16))
}

cstring16_cmp :: proc "contextless" (lhs, rhs: cstring16) -> int {
	x := ([^]u16)(lhs)
	y := ([^]u16)(rhs)
	if x == y {
		return 0
	}
	if (x == nil) ~ (y == nil) {
		return -1 if x == nil else +1
	}
	xn := cstring16_len(lhs)
	yn := cstring16_len(rhs)
	ret := memory_compare(x, y, min(xn, yn)*size_of(u16))
	if ret == 0 && xn != yn {
		return -1 if xn < yn else +1
	}
	return ret
}

cstring16_ne :: #force_inline proc "contextless" (a, b: cstring16) -> bool { return !cstring16_eq(a, b) }
cstring16_lt :: #force_inline proc "contextless" (a, b: cstring16) -> bool { return cstring16_cmp(a, b) < 0 }
cstring16_gt :: #force_inline proc "contextless" (a, b: cstring16) -> bool { return cstring16_cmp(a, b) > 0 }
cstring16_le :: #force_inline proc "contextless" (a, b: cstring16) -> bool { return cstring16_cmp(a, b) <= 0 }
cstring16_ge :: #force_inline proc "contextless" (a, b: cstring16) -> bool { return cstring16_cmp(a, b) >= 0 }

// Component-wise complex equality/inequality.
complex32_eq :: #force_inline proc "contextless" (a, b: complex32) -> bool { return real(a) == real(b) && imag(a) == imag(b) }
complex32_ne :: #force_inline proc "contextless" (a, b: complex32) -> bool { return real(a) != real(b) || imag(a) != imag(b) }

complex64_eq :: #force_inline proc "contextless" (a, b: complex64) -> bool { return real(a) == real(b) && imag(a) == imag(b) }
complex64_ne :: #force_inline proc "contextless" (a, b: complex64) -> bool { return real(a) != real(b) || imag(a) != imag(b) }

complex128_eq :: #force_inline proc "contextless" (a, b: complex128) -> bool { return real(a) == real(b) && imag(a) == imag(b) }
complex128_ne :: #force_inline proc "contextless" (a, b: complex128) -> bool { return real(a) != real(b) || imag(a) != imag(b) }

// Component-wise quaternion equality/inequality.
quaternion64_eq :: #force_inline proc "contextless" (a, b: quaternion64) -> bool { return real(a) == real(b) && imag(a) == imag(b) && jmag(a) == jmag(b) && kmag(a) == kmag(b) }
quaternion64_ne :: #force_inline proc "contextless" (a, b: quaternion64) -> bool { return real(a) != real(b) || imag(a) != imag(b) || jmag(a) != jmag(b) || kmag(a) != kmag(b) }

quaternion128_eq :: #force_inline proc "contextless" (a, b: quaternion128) -> bool { return real(a) == real(b) && imag(a) == imag(b) && jmag(a) == jmag(b) && kmag(a) == kmag(b) }
quaternion128_ne :: #force_inline proc "contextless" (a, b: quaternion128) -> bool { return real(a) != real(b) || imag(a) != imag(b) || jmag(a) != jmag(b) || kmag(a) != kmag(b) }

quaternion256_eq :: #force_inline proc "contextless" (a, b: quaternion256) -> bool { return real(a) == real(b) && imag(a) == imag(b) && jmag(a) == jmag(b) && kmag(a) == kmag(b) }
quaternion256_ne :: #force_inline proc "contextless" (a, b: quaternion256) -> bool { return real(a) != real(b) || imag(a) != imag(b) || jmag(a) != jmag(b) || kmag(a) != kmag(b) }

// Decodes the first UTF-8 rune of `s`, returning the rune and its byte width.
// Invalid sequences yield U+FFFD with a width of 1 (0 for an empty string).
string_decode_rune :: proc "contextless" (s: string) -> (rune, int) {
	// NOTE(bill): Duplicated here to remove dependency on package unicode/utf8

	// Per-leading-byte table: low nibble = sequence size, high nibble = index
	// into accept_ranges; 0xf0 marks ASCII, 0xf1 marks an invalid leading byte.
	@(static, rodata) accept_sizes := [256]u8{
		0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0f
		0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1f
		0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2f
		0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3f
		0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4f
		0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5f
		0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6f
		0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7f
		0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8f
		0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9f
		0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xa0-0xaf
		0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xb0-0xbf
		0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xc0-0xcf
		0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xd0-0xdf
		0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xe0-0xef
		0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xf0-0xff
	}
	// Valid ranges for the second byte, selected by the table's high nibble.
	Accept_Range :: struct {lo, hi: u8}
	@(static, rodata) accept_ranges := [5]Accept_Range{
		{0x80, 0xbf},
		{0xa0, 0xbf},
		{0x80, 0x9f},
		{0x90, 0xbf},
		{0x80, 0x8f},
	}
	MASKX :: 0b0011_1111
	MASK2 :: 0b0001_1111
	MASK3 :: 0b0000_1111
	MASK4 :: 0b0000_0111

	LOCB :: 0b1000_0000
	HICB :: 0b1011_1111

	RUNE_ERROR :: '\ufffd'

	n := len(s)
	if n < 1 {
		return RUNE_ERROR, 0
	}
	s0 := s[0]
	x := accept_sizes[s0]
	if x >= 0xF0 {
		// 0xf0 (even) -> mask 0, return the ASCII byte; 0xf1 (odd) -> mask of
		// all ones, return RUNE_ERROR.
		mask := rune(x) << 31 >> 31 // NOTE(bill): Create 0x0000 or 0xffff.
		return rune(s[0])&~mask | RUNE_ERROR&mask, 1
	}
	sz := x & 7
	accept := accept_ranges[x>>4]
	if n < int(sz) {
		return RUNE_ERROR, 1
	}
	b1 := s[1]
	if b1 < accept.lo || accept.hi < b1 {
		return RUNE_ERROR, 1
	}
	if sz == 2 {
		return rune(s0&MASK2)<<6 | rune(b1&MASKX), 2
	}
	b2 := s[2]
	if b2 < LOCB || HICB < b2 {
		return RUNE_ERROR, 1
	}
	if sz == 3 {
		return rune(s0&MASK3)<<12 | rune(b1&MASKX)<<6 | rune(b2&MASKX), 3
	}
	b3 := s[3]
	if b3 < LOCB || HICB < b3 {
		return RUNE_ERROR, 1
	}
	return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4
}

// Decodes the last UTF-8 rune of `s` by scanning backwards (at most UTF_MAX
// bytes) for a non-continuation byte, then re-decoding forwards.
string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) {
	RUNE_ERROR :: '\ufffd'
	RUNE_SELF  :: 0x80
	UTF_MAX    :: 4

	r: rune
	size: int
	start, end, limit: int

	end = len(s)
	if end == 0 {
		return RUNE_ERROR, 0
	}
	start = end-1
	r = rune(s[start])
	if r < RUNE_SELF {
		return r, 1
	}

	limit = max(end - UTF_MAX, 0)
	for start-=1; start >= limit; start-=1 {
		if (s[start] & 0xc0) != RUNE_SELF {
			break
		}
	}

	start = max(start, 0)
	r, size = string_decode_rune(s[start:end])
	if start+size != end {
		return RUNE_ERROR, 1
	}
	return r, size
}

// Decodes the first UTF-16 code point of `s`: a BMP unit directly, or a
// surrogate pair combined into one rune. Unpaired surrogates yield U+FFFD.
string16_decode_rune :: proc "contextless" (s: string16) -> (rune, int) {
	REPLACEMENT_CHAR :: '\ufffd'
	_surr1     :: 0xd800
	_surr2     :: 0xdc00
	_surr3     :: 0xe000
	_surr_self :: 0x10000

	r := rune(REPLACEMENT_CHAR)
	if len(s) < 1 {
		return r, 0
	}
	w := 1
	switch c := s[0]; {
	case c < _surr1, _surr3 <= c:
		r = rune(c)
	case _surr1 <= c && c < _surr2 && 1 < len(s) && _surr2 <= s[1] && s[1] < _surr3:
		r1, r2 := rune(c), rune(s[1])
		if _surr1 <= r1 && r1 < _surr2 && _surr2 <= r2 && r2 < _surr3 {
			r = (r1-_surr1)<<10 | (r2 - _surr2) + _surr_self
		}
		w += 1
	}
	return r, w
}

// Decodes the last UTF-16 code point of `s` (handling a trailing surrogate
// pair); unpaired surrogates yield U+FFFD.
string16_decode_last_rune :: proc "contextless" (s: string16) -> (rune, int) {
REPLACEMENT_CHAR :: '\ufffd'
	_surr1     :: 0xd800
	_surr2     :: 0xdc00
	_surr3     :: 0xe000
	_surr_self :: 0x10000

	r := rune(REPLACEMENT_CHAR)
	if len(s) < 1 {
		return r, 0
	}
	n := len(s)-1
	c := s[n]
	w := 1
	if _surr2 <= c && c < _surr3 {
		// Trailing low surrogate: combine with a preceding lead surrogate.
		if n >= 1 {
			r1 := rune(s[n-1])
			r2 := rune(c)
			if _surr1 <= r1 && r1 < _surr2 {
				r = (r1-_surr1)<<10 | (r2 - _surr2) + _surr_self
			}
			w = 2
		}
	} else if c < _surr1 || _surr3 <= c {
		r = rune(c)
	}
	return r, w
}

// |x| for complex32, computed with scaling to avoid overflow/underflow;
// the sqrt is done in f32 as there is no f16 sqrt.
abs_complex32 :: #force_inline proc "contextless" (x: complex32) -> f16 {
	p, q := abs(real(x)), abs(imag(x))
	if p < q {
		p, q = q, p
	}
	if p == 0 {
		return 0
	}
	q = q / p
	return p * f16(intrinsics.sqrt(f32(1 + q*q)))
}

// |x| for complex64, with the same overflow-avoiding scaling.
abs_complex64 :: #force_inline proc "contextless" (x: complex64) -> f32 {
	p, q := abs(real(x)), abs(imag(x))
	if p < q {
		p, q = q, p
	}
	if p == 0 {
		return 0
	}
	q = q / p
	return p * intrinsics.sqrt(1 + q*q)
}

// |x| for complex128, with the same overflow-avoiding scaling.
abs_complex128 :: #force_inline proc "contextless" (x: complex128) -> f64 {
	p, q := abs(real(x)), abs(imag(x))
	if p < q {
		p, q = q, p
	}
	if p == 0 {
		return 0
	}
	q = q / p
	return p * intrinsics.sqrt(1 + q*q)
}

// Quaternion magnitudes: sqrt of the sum of squared components
// (f16 variant computes in f32).
abs_quaternion64 :: #force_inline proc "contextless" (x: quaternion64) -> f16 {
	r, i, j, k := real(x), imag(x), jmag(x), kmag(x)
	return f16(intrinsics.sqrt(f32(r*r + i*i + j*j + k*k)))
}

abs_quaternion128 :: #force_inline proc "contextless" (x: quaternion128) -> f32 {
	r, i, j, k := real(x), imag(x), jmag(x), kmag(x)
	return intrinsics.sqrt(r*r + i*i + j*j + k*k)
}

abs_quaternion256 :: #force_inline proc "contextless" (x: quaternion256) -> f64 {
	r, i, j, k := real(x), imag(x), jmag(x), kmag(x)
	return intrinsics.sqrt(r*r + i*i + j*j + k*k)
}

// Complex division n/m using Smith's algorithm (branch on the larger of
// |Re(m)|, |Im(m)|) to avoid overflow; f16 variant computes in f32.
quo_complex32 :: proc "contextless" (n, m: complex32) -> complex32 {
	nr, ni := f32(real(n)), f32(imag(n))
	mr, mi := f32(real(m)), f32(imag(m))
	e, f: f32
	if abs(mr) >= abs(mi) {
		ratio := mi / mr
		denom := mr + ratio*mi
		e = (nr + ni*ratio) / denom
		f = (ni - nr*ratio) / denom
	} else {
		ratio := mr / mi
		denom := mi + ratio*mr
		e = (nr*ratio + ni) / denom
		f = (ni*ratio - nr) / denom
	}
	return complex(f16(e), f16(f))
}

quo_complex64 :: proc "contextless" (n, m: complex64) -> complex64 {
	e, f: f32
	if abs(real(m)) >= abs(imag(m)) {
		ratio := imag(m) / real(m)
		denom := real(m) + ratio*imag(m)
		e = (real(n) + imag(n)*ratio) / denom
		f = (imag(n) - real(n)*ratio) / denom
	} else {
		ratio := real(m) / imag(m)
		denom := imag(m) + ratio*real(m)
		e = (real(n)*ratio + imag(n)) / denom
		f = (imag(n)*ratio - real(n)) / denom
	}
	return complex(e, f)
}

quo_complex128 :: proc "contextless" (n, m: complex128) -> complex128 {
	e, f: f64
	if abs(real(m)) >= abs(imag(m)) {
		ratio := imag(m) / real(m)
		denom := real(m) + ratio*imag(m)
		e = (real(n) + imag(n)*ratio) / denom
		f = (imag(n) - real(n)*ratio) / denom
	} else {
		ratio := real(m) / imag(m)
		denom := imag(m) + ratio*real(m)
		e = (real(n)*ratio + imag(n)) / denom
		f = (imag(n)*ratio - real(n)) / denom
	}
	return complex(e, f)
}

// Hamilton product q*r; the f16 variant computes in f32 for precision.
mul_quaternion64 :: proc "contextless" (q, r: quaternion64) -> quaternion64 {
	q0, q1, q2, q3 := f32(real(q)), f32(imag(q)), f32(jmag(q)), f32(kmag(q))
	r0, r1, r2, r3 := f32(real(r)), f32(imag(r)), f32(jmag(r)), f32(kmag(r))

	t0 := r0*q0 - r1*q1 - r2*q2 - r3*q3
	t1 := r0*q1 + r1*q0 - r2*q3 + r3*q2
	t2 := r0*q2 + r1*q3 + r2*q0 - r3*q1
	t3 := r0*q3 - r1*q2 + r2*q1 + r3*q0

	return quaternion(w=f16(t0), x=f16(t1), y=f16(t2), z=f16(t3))
}

mul_quaternion128 :: proc "contextless" (q, r: quaternion128) -> quaternion128 {
	q0, q1, q2, q3 := real(q), imag(q), jmag(q), kmag(q)
	r0, r1, r2, r3 := real(r), imag(r), jmag(r), kmag(r)

	t0 := r0*q0 - r1*q1 - r2*q2 - r3*q3
	t1 := r0*q1 + r1*q0 - r2*q3 + r3*q2
	t2 := r0*q2 + r1*q3 + r2*q0 - r3*q1
	t3 := r0*q3 - r1*q2 + r2*q1 + r3*q0

	return quaternion(w=t0, x=t1, y=t2, z=t3)
}

mul_quaternion256 :: proc "contextless" (q, r: quaternion256) -> quaternion256 {
	q0, q1, q2, q3 := real(q), imag(q), jmag(q), kmag(q)
	r0, r1, r2, r3 := real(r), imag(r), jmag(r), kmag(r)

	t0 := r0*q0 - r1*q1 - r2*q2 - r3*q3
	t1 := r0*q1 + r1*q0 - r2*q3 + r3*q2
	t2 := r0*q2 + r1*q3 + r2*q0 - r3*q1
	t3 := r0*q3 - r1*q2 + r2*q1 + r3*q0

	return quaternion(w=t0, x=t1, y=t2, z=t3)
}

// Quaternion division q/r via multiplication by the conjugate of r divided by
// its squared magnitude; f16 variant computes in f32.
quo_quaternion64 :: proc "contextless" (q, r: quaternion64) -> quaternion64 {
	q0, q1, q2, q3 := f32(real(q)), f32(imag(q)), f32(jmag(q)), f32(kmag(q))
	r0, r1, r2, r3 := f32(real(r)), f32(imag(r)), f32(jmag(r)), f32(kmag(r))

	invmag2 := 1.0 / (r0*r0 + r1*r1 + r2*r2 + r3*r3)

	t0 := (r0*q0 + r1*q1 + r2*q2 + r3*q3) * invmag2
	t1 := (r0*q1 - r1*q0 - r2*q3 - r3*q2) * invmag2
	t2 := (r0*q2 - r1*q3 - r2*q0 + r3*q1) * invmag2
	t3 := (r0*q3 + r1*q2 + r2*q1 - r3*q0) * invmag2

	return quaternion(w=f16(t0), x=f16(t1), y=f16(t2), z=f16(t3))
}

quo_quaternion128 :: proc "contextless" (q, r: quaternion128) -> quaternion128 {
	q0, q1, q2, q3 := real(q), imag(q), jmag(q), kmag(q)
	r0, r1, r2, r3 := real(r), imag(r), jmag(r), kmag(r)

	invmag2 := 1.0 / (r0*r0 + r1*r1 + r2*r2 + r3*r3)

	t0 := (r0*q0 + r1*q1 + r2*q2 + r3*q3) * invmag2
	t1 := (r0*q1 - r1*q0 - r2*q3 - r3*q2) * invmag2
	t2 := (r0*q2 - r1*q3 - r2*q0 + r3*q1) * invmag2
	t3 := (r0*q3 + r1*q2 + r2*q1 - r3*q0) * invmag2

	return quaternion(w=t0, x=t1, y=t2, z=t3)
}

quo_quaternion256 :: proc "contextless" (q, r: quaternion256) -> quaternion256 {
	q0, q1, q2, q3 := real(q), imag(q), jmag(q), kmag(q)
	r0, r1, r2, r3 := real(r), imag(r), jmag(r), kmag(r)

	invmag2 := 1.0 / (r0*r0 + r1*r1 + r2*r2 + r3*r3)

	t0 := (r0*q0 + r1*q1 + r2*q2 + r3*q3) * invmag2
	t1 := (r0*q1 - r1*q0 - r2*q3 - r3*q2) * invmag2
	t2 := (r0*q2 - r1*q3 - r2*q0 + r3*q1) * invmag2
	t3 := (r0*q3 + r1*q2 + r2*q1 - r3*q0) * invmag2

	return quaternion(w=t0, x=t1, y=t2, z=t3)
}

// compiler-rt: truncate f32 to IEEE-754 binary16, with round-to-nearest-even,
// denormal, infinity and NaN handling done via bit manipulation.
@(link_name="__truncsfhf2", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
truncsfhf2 :: proc "c" (value: f32) -> __float16 {
	v: struct #raw_union { i: u32, f: f32 }
	i, s, e, m: i32

	v.f = value
	i = i32(v.i)

	s =  (i >> 16) & 0x00008000               // sign bit, moved to f16 position
	e = ((i >> 23) & 0x000000ff) - (127 - 16) + 1 - 1 + (16 - 16) // placeholder, see below
	e = ((i >> 23) & 0x000000ff) - (127 - 15) // re-biased exponent
	m =   i        & 0x007fffff               // f32 mantissa

	if e <= 0 {
		// Result is a denormal (or rounds to zero).
		if e < -10 {
			return transmute(__float16)u16(s)
		}
		m = (m | 0x00800000) >> u32(1 - e)
		if m & 0x00001000 != 0 {
			m += 0x00002000
		}
		return transmute(__float16)u16(s | (m >> 13))
	} else
if e == 0xff - (127 - 15) {
	if m == 0 {
		/* NOTE(bill): infinity */
		return transmute(__float16)u16(s | 0x7c00)
	} else {
		/* NOTE(bill): NAN */
		m >>= 13
		// `| i32(m == 0)` keeps the payload non-zero so a NaN does not
		// collapse into infinity after the shift.
		return transmute(__float16)u16(s | 0x7c00 | m | i32(m == 0))
	}
} else {
	if m & 0x00001000 != 0 {
		// Round to nearest; the carry may overflow into the exponent.
		m += 0x00002000
		if (m & 0x00800000) != 0 {
			m = 0
			e += 1
		}
	}

	if e > 30 {
		// NOTE(bill): Cause a floating-point overflow as a side effect.
		// Repeated squaring of a large f32 must overflow to raise the FP
		// overflow condition; an integer here would only wrap silently.
		f := f32(1e10)
		for j := 0; j < 10; j += 1 {
			g := intrinsics.volatile_load(&f)
			g *= g
			intrinsics.volatile_store(&f, g)
		}
		return transmute(__float16)u16(s | 0x7c00)
	}

	return transmute(__float16)u16(s | (e << 10) | (m >> 13))
}
}

// ARM EABI alias: f64 -> f16 via the f32 truncation path.
@(link_name="__aeabi_d2h", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
aeabi_d2h :: proc "c" (value: f64) -> __float16 {
	return truncsfhf2(f32(value))
}

// f64 -> f16, via the f32 truncation path.
@(link_name="__truncdfhf2", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
truncdfhf2 :: proc "c" (value: f64) -> __float16 {
	return truncsfhf2(f32(value))
}

// Extend an IEEE-754 binary16 to f32 using the "magic multiply" rebiasing
// trick: shift the f16 payload into f32 position, scale by 2^(127-15), and
// patch up Inf/NaN and the sign bit.
@(link_name="__gnu_h2f_ieee", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
gnu_h2f_ieee :: proc "c" (value_: __float16) -> f32 {
	fp32 :: struct #raw_union { u: u32, f: f32 }

	value := transmute(u16)value_

	v: fp32
	magic, inf_or_nan: fp32

	magic.u      = u32((254 - 15) << 23) // 2^(127-15), rebias multiplier
	inf_or_nan.u = u32((127 + 16) << 23) // threshold where the f16 was Inf/NaN

	v.u = u32(value & 0x7fff) << 13 // exponent+mantissa into f32 position
	v.f *= magic.f                  // rebias the exponent
	if v.f >= inf_or_nan.f {
		v.u |= 255 << 23 // was Inf/NaN: force the f32 exponent to all ones
	}
	v.u |= u32(value & 0x8000) << 16 // sign bit
	return v.f
}

// f32 -> f16.
@(link_name="__gnu_f2h_ieee", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
gnu_f2h_ieee :: proc "c" (value: f32) -> __float16 {
	return truncsfhf2(value)
}

// f16 -> f32.
@(link_name="__extendhfsf2", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
extendhfsf2 :: proc "c" (value: __float16) -> f32 {
	return gnu_h2f_ieee(value)
}

// Convert a signed 128-bit integer to f64 with round-to-nearest-even,
// following the compiler-rt __floattidf algorithm.
@(link_name="__floattidf", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
floattidf :: proc "c" (a: i128) -> f64 {
	DBL_MANT_DIG :: 53
	if a == 0 {
		return 0.0
	}
	a := a
	N :: size_of(i128) * 8
	s := a >> (N-1)  // -1 if negative, 0 otherwise
	a = (a ~ s) - s  // absolute value
	sd := N - intrinsics.count_leading_zeros(a) // number of significant digits
	e := i32(sd - 1)                            // exponent
	if sd > DBL_MANT_DIG {
		// Too many significant digits: keep DBL_MANT_DIG+2 bits (round and
		// sticky) and fold all shifted-out bits into the sticky bit.
		switch sd {
		case DBL_MANT_DIG + 1:
			a <<= 1
		case DBL_MANT_DIG + 2:
			// okay
		case:
			a = i128(u128(a) >> u128(sd - (DBL_MANT_DIG+2))) | i128(u128(a) & (~u128(0) >> u128(N + DBL_MANT_DIG+2 - sd)) != 0)
		}

		a |= i128((a & 4) != 0) // round bit sticks if any lower bit is set
		a += 1                  // round to nearest
		a >>= 2                 // drop round/sticky bits

		// Rounding may have produced a carry into bit DBL_MANT_DIG.
		if a & (i128(1) << DBL_MANT_DIG) != 0 {
			a >>= 1
			e += 1
		}
	} else {
		a <<= u128(DBL_MANT_DIG - sd) & 127
	}
	fb: [2]u32
	fb[1] = (u32(s) & 0x80000000) |          // sign
	        (u32(e + 1023) << 20) |          // exponent
	        u32((u64(a) >> 32) & 0x000FFFFF) // mantissa-high
	fb[0] = u32(a)                           // mantissa-low
	return transmute(f64)fb
}

// Convert an unsigned 128-bit integer to f64 with round-to-nearest-even.
@(link_name="__floattidf_unsigned", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
floattidf_unsigned :: proc "c" (a: u128) -> f64 {
	DBL_MANT_DIG :: 53
	if a == 0 {
		return 0.0
	}
	a := a
	N :: size_of(u128) * 8
	sd := N - intrinsics.count_leading_zeros(a) // number of significant digits
	e := i32(sd - 1)                            // exponent
	if sd > DBL_MANT_DIG {
		switch sd {
		case DBL_MANT_DIG + 1:
			a <<= 1
		case DBL_MANT_DIG + 2:
			// okay
		case:
			a = u128(u128(a) >> u128(sd - (DBL_MANT_DIG+2))) | u128(u128(a) & (~u128(0) >> u128(N + DBL_MANT_DIG+2 - sd)) != 0)
		}

		a |= u128((a & 4) != 0) // sticky bit
		a += 1                  // round to nearest
		a >>= 2                 // drop round/sticky bits

		if a & (1 << DBL_MANT_DIG) != 0 {
			a >>= 1
			e += 1
		}
	} else {
		a <<= u128(DBL_MANT_DIG - sd)
	}
	fb: [2]u32
	fb[1] = (0) |                            // sign (always positive)
	        u32((e + 1023) << 20) |          // exponent
	        u32((u64(a) >> 32) & 0x000FFFFF) // mantissa-high
	fb[0] = u32(a)                           // mantissa-low
	return transmute(f64)fb
}

@(link_name="__fixunsdfti", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
fixunsdfti :: #force_no_inline proc "c" (a: f64) -> u128 {
	// TODO(bill): implement `fixunsdfti` correctly
	x := u64(a)
	return u128(x)
}

@(link_name="__fixunsdfdi", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
fixunsdfdi :: #force_no_inline proc "c" (a: f64) -> i128 {
	// TODO(bill): implement `fixunsdfdi` correctly
	x := i64(a)
	return i128(x)
}

// Unsigned 128-bit modulo: only the remainder of the combined divmod is kept.
@(link_name="__umodti3", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
umodti3 :: proc "c" (a, b: u128) -> u128 {
	r: u128 = ---
	_ = udivmod128(a, b, &r)
	return r
}
// Combined unsigned 128-bit division/modulo; remainder written through `rem`
// when non-nil.
@(link_name="__udivmodti4", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
udivmodti4 :: proc "c" (a, b: u128, rem: ^u128) -> u128 {
	return udivmod128(a, b, rem)
}

when !IS_WASM {
	// Unsigned 128-bit division (quotient only).
	@(link_name="__udivti3", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
	udivti3 :: proc "c" (a, b: u128) -> u128 {
		return udivmodti4(a, b, nil)
	}
}

// Signed 128-bit modulo; the remainder takes the sign of the dividend.
@(link_name="__modti3", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
modti3 :: proc "c" (a, b: i128) -> i128 {
	s_a := a >> (128 - 1) // -1 if negative or 0
	s_b := b >> (128 - 1)
	an := (a ~ s_a) - s_a // absolute
	bn := (b ~ s_b) - s_b
	r: u128 = ---
	_ = udivmod128(u128(an), u128(bn), &r)
	return (i128(r) ~ s_a) - s_a // negate if dividend was negative
}

// Signed 128-bit division with remainder.
@(link_name="__divmodti4", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
divmodti4 :: proc "c" (a, b: i128, rem: ^i128) -> i128 {
	s_a := a >> (128 - 1) // -1 if negative or 0
	s_b := b >> (128 - 1)
	an := (a ~ s_a) - s_a // absolute
	bn := (b ~ s_b) - s_b

	s_b ~= s_a // quotient sign
	u_s_b := u128(s_b)
	u_s_a := u128(s_a)

	r: u128 = ---
	u := i128((udivmodti4(u128(an), u128(bn), &r) ~ u_s_b) - u_s_b) // negate if negative
	rem^ = i128((r ~ u_s_a) - u_s_a)                                // remainder takes dividend's sign
	return u
}

// Signed 128-bit division (quotient only).
@(link_name="__divti3", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
divti3 :: proc "c" (a, b: i128) -> i128 {
	s_a := a >> (128 - 1) // -1 if negative or 0
	s_b := b >> (128 - 1)
	an := (a ~ s_a) - s_a // absolute
	bn := (b ~ s_b) - s_b
	s_a ~= s_b // quotient sign
	u_s_a := u128(s_a)
	return i128((udivmodti4(u128(an), u128(bn), nil) ~ u_s_a) - u_s_a) // negate if negative
}

// Convert an f64 (passed as its raw bit pattern) to i128, truncating toward
// zero and saturating on overflow. Follows compiler-rt's __fixdfti.
@(link_name="__fixdfti", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
fixdfti :: proc "c" (a: u64) -> i128 {
	significandBits :: 52
	typeWidth       :: (size_of(u64)*8)
	exponentBits    :: (typeWidth - significandBits - 1)
	maxExponent     :: ((1 << exponentBits) - 1)
	exponentBias    :: (maxExponent >> 1)

	implicitBit     :: (u64(1) << significandBits)
	significandMask :: (implicitBit - 1)
	signBit         :: (u64(1) << (significandBits + exponentBits))
	absMask         :: (signBit - 1)
	exponentMask    :: (absMask ~ significandMask)

	// Break a into sign, exponent, significand
	aRep := a
	aAbs := aRep & absMask
	sign := i128(-1 if aRep & signBit != 0 else 1)
	// NOTE: the unbiased exponent must be kept signed. With an unsigned type
	// the `exponent < 0` check below is dead and any |a| < 1 (including 0.0)
	// wraps around to a huge value and wrongly saturates.
	exponent := i64(aAbs >> significandBits) - exponentBias
	significand := u64((aAbs & significandMask) | implicitBit)

	// If exponent is negative, the result is zero.
	if exponent < 0 {
		return 0
	}

	// If the value is too large for the integer type, saturate.
	if exponent >= size_of(i128) * 8 {
		return max(i128) if sign == 1 else min(i128)
	}

	// If 0 <= exponent < significandBits, right shift to get the result.
	// Otherwise, shift left.
	if exponent < significandBits {
		return sign * i128(significand >> u64(significandBits - exponent))
	} else {
		return sign * (i128(significand) << u64(exponent - significandBits))
	}
}

// Copy `size` bits from the start of `src` into `dst` beginning at bit
// `offset` (both bit indices are little-endian within each byte).
// NOTE(review): reconstructed from a garbled span — verify against upstream.
__write_bits :: proc "contextless" (dst, src: [^]byte, offset: uintptr, size: uintptr) {
	for i in 0..<size {
		j := offset+i
		the_bit := byte((src[i>>3]) & (1<<(i&7)) != 0)
		dst[j>>3] &~= 1<<(j&7)          // clear destination bit
		dst[j>>3] |= the_bit<<(j&7)     // set it from the source
	}
}

// Copy `size` bits starting at bit `offset` of `src` into the start of `dst`
// (the inverse addressing of __write_bits).
// NOTE(review): reconstructed from a garbled span — verify against upstream.
__read_bits :: proc "contextless" (dst, src: [^]byte, offset: uintptr, size: uintptr) {
	for j in 0..<size {
		i := offset+j
		the_bit := byte((src[i>>3]) & (1<<(i&7)) != 0)
		dst[j>>3] &~= 1<<(j&7)          // clear destination bit
		dst[j>>3] |= the_bit<<(j&7)     // set it from the source
	}
}

when .Address in ODIN_SANITIZER_FLAGS {
	foreign {
		@(require)
		__asan_unpoison_memory_region :: proc "system" (address: rawptr, size: uint) ---
	}
}