
Merge pull request #1807 from odin-lang/simd-dev

Generic #simd type and intrinsics
gingerBill 3 years ago
parent
commit
a1f15c2c69

+ 91 - 8
core/intrinsics/intrinsics.odin

@@ -6,12 +6,14 @@ package intrinsics
 is_package_imported :: proc(package_name: string) -> bool ---
 
 // Types
-simd_vector :: proc($N: int, $T: typeid) -> type/#simd[N]T
 soa_struct :: proc($N: int, $T: typeid) -> type/#soa[N]T
 
 // Volatile
 volatile_load  :: proc(dst: ^$T) -> T ---
-volatile_store :: proc(dst: ^$T, val: T) -> T ---
+volatile_store :: proc(dst: ^$T, val: T) ---
+
+non_temporal_load  :: proc(dst: ^$T) -> T ---
+non_temporal_store :: proc(dst: ^$T, val: T) ---
 
 // Trapping
 debug_trap :: proc() ---
@@ -23,18 +25,20 @@ alloca             :: proc(size, align: int) -> [^]u8 ---
 cpu_relax          :: proc() ---
 read_cycle_counter :: proc() -> i64 ---
 
-count_ones           :: proc(x: $T) -> T where type_is_integer(T) ---
-count_zeros          :: proc(x: $T) -> T where type_is_integer(T) ---
-count_trailing_zeros :: proc(x: $T) -> T where type_is_integer(T) ---
-count_leading_zeros  :: proc(x: $T) -> T where type_is_integer(T) ---
-reverse_bits         :: proc(x: $T) -> T where type_is_integer(T) ---
+count_ones           :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
+count_zeros          :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
+count_trailing_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
+count_leading_zeros  :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
+reverse_bits         :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
 byte_swap            :: proc(x: $T) -> T where type_is_integer(T) || type_is_float(T) ---
 
 overflow_add :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---
 overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---
 overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---
 
-sqrt :: proc(x: $T) -> T where type_is_float(T) ---
+sqrt :: proc(x: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---
+
+fused_mul_add :: proc(a, b, c: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---
 
 mem_copy                 :: proc(dst, src: rawptr, len: int) ---
 mem_copy_non_overlapping :: proc(dst, src: rawptr, len: int) ---
@@ -186,6 +190,81 @@ type_hasher_proc :: proc($T: typeid) -> (hasher: proc "contextless" (data: rawpt
 
 constant_utf16_cstring :: proc($literal: string) -> [^]u16 ---
 
+// SIMD related
+simd_add  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_sub  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_mul  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_div  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_rem  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+
+// Keeps Odin's Behaviour
+// (x << y) if y <= mask else 0
+simd_shl :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
+simd_shr :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
+
+// Similar to C's Behaviour
+// x << (y & mask)
+simd_shl_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
+simd_shr_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
+
+simd_add_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_sub_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+
+simd_and     :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_or      :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_xor     :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_and_not :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+
+simd_neg  :: proc(a: #simd[N]T) -> #simd[N]T ---
+
+simd_abs :: proc(a: #simd[N]T) -> #simd[N]T ---
+
+simd_min   :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_max   :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_clamp :: proc(v, min, max: #simd[N]T) -> #simd[N]T ---
+
+// Return an unsigned integer of the same size as the input type
+// NOT A BOOLEAN
+// element-wise:
+//     false => 0x00...00
+//     true  => 0xff...ff
+simd_lanes_eq :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_ne :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_lt :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_le :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_gt :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+
+simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
+simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---
+
+simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T ---
+simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T ---
+simd_reduce_min         :: proc(a: #simd[N]T) -> T ---
+simd_reduce_max         :: proc(a: #simd[N]T) -> T ---
+simd_reduce_and         :: proc(a: #simd[N]T) -> T ---
+simd_reduce_or          :: proc(a: #simd[N]T) -> T ---
+simd_reduce_xor         :: proc(a: #simd[N]T) -> T ---
+
+simd_shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T ---
+simd_select  :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T ---
+
+// Lane-wise operations
+simd_ceil    :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
+simd_floor   :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
+simd_trunc   :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
+// rounding to the nearest integral value; if two values are equally near, rounds to the even one
+simd_nearest :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
+
+simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---
+
+// equivalent to a swizzle with descending indices, e.g. swizzle(a, 3, 2, 1, 0)
+simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---
+
+simd_lanes_rotate_left  :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
+simd_lanes_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
+
+
 // WASM targets only
 wasm_memory_grow :: proc(index, delta: uintptr) -> int ---
 wasm_memory_size :: proc(index: uintptr)        -> int ---
@@ -199,6 +278,10 @@ wasm_memory_size :: proc(index: uintptr)        -> int ---
 wasm_memory_atomic_wait32   :: proc(ptr: ^u32, expected: u32, timeout_ns: i64) -> u32 ---
 wasm_memory_atomic_notify32 :: proc(ptr: ^u32, waiters: u32) -> (waiters_woken_up: u32) ---
 
+// x86 Targets (i386, amd64)
+x86_cpuid  :: proc(ax, cx: u32) -> (eax, ebx, ecx, edx: u32) ---
+x86_xgetbv :: proc(cx: u32) -> (eax, edx: u32) ---
+
 
 // Darwin targets only
 objc_object   :: struct{}
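
To illustrate how the new generic intrinsics compose, a minimal hypothetical sketch (not part of this diff; the package name and values are made up):

	package example

	import "core:intrinsics"

	main :: proc() {
		a := #simd[4]f32{1, 2, 3, 4}
		b := #simd[4]f32{5, 6, 7, 8}

		c := intrinsics.simd_add(a, b)             // lane-wise addition: {6, 8, 10, 12}
		s := intrinsics.simd_reduce_add_ordered(c) // horizontal sum: 36

		// lanes_* comparisons yield an all-zeros/all-ones integer mask per lane, not a boolean
		mask := intrinsics.simd_lanes_gt(a, b)     // {0, 0, 0, 0}
		_, _ = s, mask
	}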

+ 1 - 0
core/mem/raw.odin

@@ -21,6 +21,7 @@ make_any :: proc "contextless" (data: rawptr, id: typeid) -> any {
 }
 
 raw_array_data         :: runtime.raw_array_data
+raw_simd_data          :: runtime.raw_simd_data
 raw_string_data        :: runtime.raw_string_data
 raw_slice_data         :: runtime.raw_slice_data
 raw_dynamic_array_data :: runtime.raw_dynamic_array_data

+ 5 - 1
core/runtime/core_builtin.odin

@@ -604,6 +604,10 @@ raw_array_data :: proc "contextless" (a: $P/^($T/[$N]$E)) -> [^]E {
 	return ([^]E)(a)
 }
 @builtin
+raw_simd_data :: proc "contextless" (a: $P/^($T/#simd[$N]$E)) -> [^]E {
+	return ([^]E)(a)
+}
+@builtin
 raw_slice_data :: proc "contextless" (s: $S/[]$E) -> [^]E {
 	ptr := (transmute(Raw_Slice)s).data
 	return ([^]E)(ptr)
@@ -619,7 +623,7 @@ raw_string_data :: proc "contextless" (s: $S/string) -> [^]u8 {
 }
 
 @builtin
-raw_data :: proc{raw_array_data, raw_slice_data, raw_dynamic_array_data, raw_string_data}
+raw_data :: proc{raw_array_data, raw_slice_data, raw_dynamic_array_data, raw_string_data, raw_simd_data}
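
The new raw_simd_data overload lets raw_data accept a pointer to a #simd vector; a small hypothetical example (package name made up):

	package example

	import "core:fmt"

	main :: proc() {
		v := #simd[4]f32{1, 2, 3, 4}
		p := raw_data(&v)       // ^#simd[4]f32 -> [^]f32 via the new raw_simd_data overload
		fmt.println(p[0], p[3]) // 1 4
	}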
 
 
 

+ 188 - 0
core/simd/simd.odin

@@ -0,0 +1,188 @@
+package simd
+
+import "core:builtin"
+import "core:intrinsics"
+
+// 128-bit vector aliases
+u8x16 :: #simd[16]u8
+i8x16 :: #simd[16]i8
+u16x8 :: #simd[8]u16
+i16x8 :: #simd[8]i16
+u32x4 :: #simd[4]u32
+i32x4 :: #simd[4]i32
+u64x2 :: #simd[2]u64
+i64x2 :: #simd[2]i64
+f32x4 :: #simd[4]f32
+f64x2 :: #simd[2]f64
+
+boolx16 :: #simd[16]bool
+b8x16   :: #simd[16]b8
+b16x8   :: #simd[8]b16
+b32x4   :: #simd[4]b32
+b64x2   :: #simd[2]b64
+
+// 256-bit vector aliases
+u8x32  :: #simd[32]u8
+i8x32  :: #simd[32]i8
+u16x16 :: #simd[16]u16
+i16x16 :: #simd[16]i16
+u32x8  :: #simd[8]u32
+i32x8  :: #simd[8]i32
+u64x4  :: #simd[4]u64
+i64x4  :: #simd[4]i64
+f32x8  :: #simd[8]f32
+f64x4  :: #simd[4]f64
+
+boolx32 :: #simd[32]bool
+b8x32   :: #simd[32]b8
+b16x16  :: #simd[16]b16
+b32x8   :: #simd[8]b32
+b64x4   :: #simd[4]b64
+
+// 512-bit vector aliases
+u8x64  :: #simd[64]u8
+i8x64  :: #simd[64]i8
+u16x32 :: #simd[32]u16
+i16x32 :: #simd[32]i16
+u32x16 :: #simd[16]u32
+i32x16 :: #simd[16]i32
+u64x8  :: #simd[8]u64
+i64x8  :: #simd[8]i64
+f32x16 :: #simd[16]f32
+f64x8  :: #simd[8]f64
+
+boolx64 :: #simd[64]bool
+b8x64   :: #simd[64]b8
+b16x32  :: #simd[32]b16
+b32x16  :: #simd[16]b32
+b64x8   :: #simd[8]b64
+
+
+add :: intrinsics.simd_add
+sub :: intrinsics.simd_sub
+mul :: intrinsics.simd_mul
+div :: intrinsics.simd_div
+rem :: intrinsics.simd_rem // integers only
+
+// Keeps Odin's Behaviour
+// (x << y) if y <= mask else 0
+shl :: intrinsics.simd_shl
+shr :: intrinsics.simd_shr
+
+// Similar to C's Behaviour
+// x << (y & mask)
+shl_masked :: intrinsics.simd_shl_masked
+shr_masked :: intrinsics.simd_shr_masked
+
+// Saturation Arithmetic
+add_sat :: intrinsics.simd_add_sat
+sub_sat :: intrinsics.simd_sub_sat
+
+and     :: intrinsics.simd_and
+or      :: intrinsics.simd_or
+xor     :: intrinsics.simd_xor
+and_not :: intrinsics.simd_and_not
+
+neg :: intrinsics.simd_neg
+
+abs   :: intrinsics.simd_abs
+
+min   :: intrinsics.simd_min
+max   :: intrinsics.simd_max
+clamp :: intrinsics.simd_clamp
+
+// Return an unsigned integer of the same size as the input type
+// NOT A BOOLEAN
+// element-wise:
+//     false => 0x00...00
+//     true  => 0xff...ff
+lanes_eq :: intrinsics.simd_lanes_eq
+lanes_ne :: intrinsics.simd_lanes_ne
+lanes_lt :: intrinsics.simd_lanes_lt
+lanes_le :: intrinsics.simd_lanes_le
+lanes_gt :: intrinsics.simd_lanes_gt
+lanes_ge :: intrinsics.simd_lanes_ge
+
+// extract :: proc(a: #simd[N]T, idx: uint) -> T
+extract :: intrinsics.simd_extract
+// replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T
+replace :: intrinsics.simd_replace
+
+reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
+reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
+reduce_min         :: intrinsics.simd_reduce_min
+reduce_max         :: intrinsics.simd_reduce_max
+reduce_and         :: intrinsics.simd_reduce_and
+reduce_or          :: intrinsics.simd_reduce_or
+reduce_xor         :: intrinsics.simd_reduce_xor
+
+// swizzle :: proc(a: #simd[N]T, indices: ..int) -> #simd[len(indices)]T
+swizzle :: builtin.swizzle
+
+// shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T, with each constant index in the range 0..<2*N
+shuffle :: intrinsics.simd_shuffle
+
+// select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T
+select :: intrinsics.simd_select
+
+
+sqrt    :: intrinsics.sqrt
+ceil    :: intrinsics.simd_ceil
+floor   :: intrinsics.simd_floor
+trunc   :: intrinsics.simd_trunc
+nearest :: intrinsics.simd_nearest
+
+to_bits :: intrinsics.simd_to_bits
+
+lanes_reverse :: intrinsics.simd_lanes_reverse
+
+lanes_rotate_left  :: intrinsics.simd_lanes_rotate_left
+lanes_rotate_right :: intrinsics.simd_lanes_rotate_right
+
+count_ones           :: intrinsics.count_ones
+count_zeros          :: intrinsics.count_zeros
+count_trailing_zeros :: intrinsics.count_trailing_zeros
+count_leading_zeros  :: intrinsics.count_leading_zeros
+reverse_bits         :: intrinsics.reverse_bits
+
+fused_mul_add :: intrinsics.fused_mul_add
+fma           :: intrinsics.fused_mul_add
+
+to_array_ptr :: #force_inline proc "contextless" (v: ^#simd[$LANES]$E) -> ^[LANES]E {
+	return (^[LANES]E)(v)
+}
+to_array :: #force_inline proc "contextless" (v: #simd[$LANES]$E) -> [LANES]E {
+	return transmute([LANES]E)(v)
+}
+from_array :: #force_inline proc "contextless" (v: $A/[$LANES]$E) -> #simd[LANES]E {
+	return transmute(#simd[LANES]E)v
+}
+
+from_slice :: proc($T: typeid/#simd[$LANES]$E, slice: []E) -> T {
+	assert(len(slice) >= LANES, "slice length must be at least the number of lanes")
+	array: [LANES]E
+	#no_bounds_check for i in 0..<LANES {
+		array[i] = slice[i]
+	}
+	return transmute(T)array
+}
+
+bit_not :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_integer(E) {
+	return xor(v, T(~E(0)))
+}
+
+copysign :: #force_inline proc "contextless" (v, sign: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
+	neg_zero := to_bits(T(-0.0))
+	sign_bit := to_bits(sign) & neg_zero
+	magnitude := to_bits(v) &~ neg_zero
+	return transmute(T)(sign_bit|magnitude)
+}
+
+signum :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
+	is_nan := lanes_ne(v, v)
+	return select(is_nan, v, copysign(T(1), v))
+}
+
+recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
+	return T(1) / v
+}
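
As a hedged sketch of how the core:simd aliases and helpers are meant to be used together (hypothetical example, not part of the commit):

	package example

	import "core:fmt"
	import "core:simd"

	main :: proc() {
		data := []f32{1, 2, 3, 4, 5}
		v := simd.from_slice(simd.f32x4, data) // loads the first LANES elements
		v  = simd.mul(v, simd.f32x4(2))        // the scalar 2 is broadcast to every lane
		fmt.println(simd.to_array(v))          // [2, 4, 6, 8]
	}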

+ 24 - 0
core/simd/x86/abm.odin

@@ -0,0 +1,24 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+
+@(require_results, enable_target_feature="lzcnt")
+_lzcnt_u32 :: #force_inline proc "c" (x: u32) -> u32 {
+	return intrinsics.count_leading_zeros(x)
+}
+@(require_results, enable_target_feature="popcnt")
+_popcnt32 :: #force_inline proc "c" (x: u32) -> i32 {
+	return i32(intrinsics.count_ones(x))
+}
+
+when ODIN_ARCH == .amd64 {
+	@(require_results, enable_target_feature="lzcnt")
+	_lzcnt_u64 :: #force_inline proc "c" (x: u64) -> u64 {
+		return intrinsics.count_leading_zeros(x)
+	}
+	@(require_results, enable_target_feature="popcnt")
+	_popcnt64 :: #force_inline proc "c" (x: u64) -> i32 {
+		return i32(intrinsics.count_ones(x))
+	}
+}

+ 56 - 0
core/simd/x86/adx.odin

@@ -0,0 +1,56 @@
+//+build i386, amd64
+package simd_x86
+
+@(require_results)
+_addcarry_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 {
+	x, y := llvm_addcarry_u32(c_in, a, b)
+	out^ = y
+	return x
+}
+@(require_results)
+_addcarryx_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 {
+	return llvm_addcarryx_u32(c_in, a, b, out)
+}
+@(require_results)
+_subborrow_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 {
+	x, y := llvm_subborrow_u32(c_in, a, b)
+	out^ = y
+	return x
+}
+
+when ODIN_ARCH == .amd64 {
+	@(require_results)
+	_addcarry_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 {
+		x, y := llvm_addcarry_u64(c_in, a, b)
+		out^ = y
+		return x
+	}
+	@(require_results)
+	_addcarryx_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 {
+		return llvm_addcarryx_u64(c_in, a, b, out)
+	}
+	@(require_results)
+	_subborrow_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 {
+		x, y := llvm_subborrow_u64(c_in, a, b)
+		out^ = y
+		return x
+	}
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name="llvm.x86.addcarry.32")
+	llvm_addcarry_u32  :: proc(a: u8, b: u32, c: u32) -> (u8, u32) ---
+	@(link_name="llvm.x86.addcarryx.u32")
+	llvm_addcarryx_u32 :: proc(a: u8, b: u32, c: u32, d: rawptr) -> u8 ---
+	@(link_name="llvm.x86.subborrow.32")
+	llvm_subborrow_u32 :: proc(a: u8, b: u32, c: u32) -> (u8, u32) ---
+
+	// amd64 only
+	@(link_name="llvm.x86.addcarry.64")
+	llvm_addcarry_u64  :: proc(a: u8, b: u64, c: u64) -> (u8, u64) ---
+	@(link_name="llvm.x86.addcarryx.u64")
+	llvm_addcarryx_u64 :: proc(a: u8, b: u64, c: u64, d: rawptr) -> u8 ---
+	@(link_name="llvm.x86.subborrow.64")
+	llvm_subborrow_u64 :: proc(a: u8, b: u64, c: u64) -> (u8, u64) ---
+}
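
The carry in/out values are designed to chain across limbs. A hypothetical 96-bit addition built on the wrappers above (add_u96 and the x86 import alias are made up for illustration):

	package example

	import x86 "core:simd/x86"

	add_u96 :: proc(a, b: [3]u32) -> (sum: [3]u32) {
		c := x86._addcarry_u32(0, a[0], b[0], &sum[0]) // carry out of limb 0...
		c  = x86._addcarry_u32(c, a[1], b[1], &sum[1]) // ...feeds into limb 1
		_  = x86._addcarry_u32(c, a[2], b[2], &sum[2])
		return
	}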

+ 8 - 0
core/simd/x86/cmpxchg16b.odin

@@ -0,0 +1,8 @@
+//+build amd64
+package simd_x86
+
+import "core:intrinsics"
+
+cmpxchg16b :: #force_inline proc "c" (dst: ^u128, old, new: u128, $success, $failure: intrinsics.Atomic_Memory_Order) -> (val: u128) {
+	return intrinsics.atomic_compare_exchange_strong_explicit(dst, old, new, success, failure)
+}
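
A hypothetical call site; the success/failure orderings are compile-time parameters forwarded to the atomic intrinsic:

	package example

	import x86 "core:simd/x86"

	main :: proc() {
		dst: u128 = 1
		// the exchange succeeds only if dst still holds the expected old value
		observed := x86.cmpxchg16b(&dst, 1, 42, .Seq_Cst, .Seq_Cst)
		assert(observed == 1 && dst == 42)
	}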

+ 94 - 0
core/simd/x86/cpu.odin

@@ -0,0 +1,94 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+
+// cpuid :: proc(ax, cx: u32) -> (eax, ebx, ecx, edx: u32) ---
+cpuid :: intrinsics.x86_cpuid
+
+// xgetbv :: proc(cx: u32) -> (eax, edx: u32) ---
+xgetbv :: intrinsics.x86_xgetbv
+
+
+CPU_Feature :: enum u64 {
+	aes,       // AES hardware implementation (AES NI)
+	adx,       // Multi-precision add-carry instruction extensions
+	avx,       // Advanced vector extension
+	avx2,      // Advanced vector extension 2
+	bmi1,      // Bit manipulation instruction set 1
+	bmi2,      // Bit manipulation instruction set 2
+	erms,      // Enhanced REP for MOVSB and STOSB
+	fma,       // Fused-multiply-add instructions
+	os_xsave,  // OS supports XSAVE/XRSTOR for saving/restoring XMM registers.
+	pclmulqdq, // PCLMULQDQ instruction - most often used for AES-GCM
+	popcnt,    // Hamming weight instruction POPCNT.
+	rdrand,    // RDRAND instruction (on-chip random number generator)
+	rdseed,    // RDSEED instruction (on-chip random number generator)
+	sse2,      // Streaming SIMD extension 2 (always available on amd64)
+	sse3,      // Streaming SIMD extension 3
+	ssse3,     // Supplemental streaming SIMD extension 3
+	sse41,     // Streaming SIMD extension 4 and 4.1
+	sse42,     // Streaming SIMD extension 4 and 4.2
+}
+
+CPU_Features :: distinct bit_set[CPU_Feature; u64]
+
+cpu_features: Maybe(CPU_Features)
+
+@(init, private)
+init_cpu_features :: proc "c" () {
+	is_set :: #force_inline proc "c" (bit: u32, value: u32) -> bool {
+		return value & (1<<bit) != 0
+	}
+	try_set :: #force_inline proc "c" (set: ^CPU_Features, feature: CPU_Feature, bit: u32, value: u32) {
+		if is_set(bit, value) {
+			set^ += {feature}
+		}
+	}
+
+	max_id, _, _, _ := cpuid(0, 0)
+	if max_id < 1 {
+		return
+	}
+
+	set: CPU_Features
+
+	_, _, ecx1, edx1 := cpuid(1, 0)
+
+	try_set(&set, .sse2,      26, edx1)
+	try_set(&set, .sse3,       0, ecx1)
+	try_set(&set, .pclmulqdq,  1, ecx1)
+	try_set(&set, .ssse3,      9, ecx1)
+	try_set(&set, .fma,       12, ecx1)
+	try_set(&set, .sse41,     19, ecx1)
+	try_set(&set, .sse42,     20, ecx1)
+	try_set(&set, .popcnt,    23, ecx1)
+	try_set(&set, .aes,       25, ecx1)
+	try_set(&set, .os_xsave,  27, ecx1)
+	try_set(&set, .rdrand,    30, ecx1)
+
+	os_supports_avx := false
+	if .os_xsave in set {
+		eax, _ := xgetbv(0)
+		os_supports_avx = is_set(1, eax) && is_set(2, eax)
+	}
+	if os_supports_avx {
+		try_set(&set, .avx, 28, ecx1)
+	}
+
+	if max_id < 7 {
+		cpu_features = set
+		return
+	}
+
+	_, ebx7, _, _ := cpuid(7, 0)
+	try_set(&set, .bmi1, 3, ebx7)
+	if os_supports_avx {
+		try_set(&set, .avx2, 5, ebx7)
+	}
+	try_set(&set, .bmi2,    8, ebx7)
+	try_set(&set, .erms,    9, ebx7)
+	try_set(&set, .rdseed, 18, ebx7)
+	try_set(&set, .adx,    19, ebx7)
+
+	cpu_features = set
+}
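
With the @(init) detection above, callers can branch on features at run time; a minimal hypothetical query (cpu_features stays nil if CPUID leaf 1 is unavailable):

	package example

	import "core:fmt"
	import x86 "core:simd/x86"

	main :: proc() {
		if features, ok := x86.cpu_features.?; ok {
			if .sse42 in features { fmt.println("SSE4.2 available") }
			if .avx2  in features { fmt.println("AVX2 available (OS saves YMM state)") }
		}
	}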

+ 36 - 0
core/simd/x86/fxsr.odin

@@ -0,0 +1,36 @@
+//+build i386, amd64
+package simd_x86
+
+@(enable_target_feature="fxsr")
+_fxsave :: #force_inline proc "c" (mem_addr: rawptr) {
+	fxsave(mem_addr)
+}
+@(enable_target_feature="fxsr")
+_fxrstor :: #force_inline proc "c" (mem_addr: rawptr) {
+	fxrstor(mem_addr)
+}
+
+when ODIN_ARCH == .amd64 {
+	@(enable_target_feature="fxsr")
+	_fxsave64 :: #force_inline proc "c" (mem_addr: rawptr) {
+		fxsave64(mem_addr)
+	}
+	@(enable_target_feature="fxsr")
+	_fxrstor64 :: #force_inline proc "c" (mem_addr: rawptr) {
+		fxrstor64(mem_addr)
+	}
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name="llvm.x86.fxsave")
+	fxsave    :: proc(p: rawptr) ---
+	@(link_name="llvm.x86.fxrstor")
+	fxrstor   :: proc(p: rawptr) ---
+
+	// amd64 only
+	@(link_name="llvm.x86.fxsave64")
+	fxsave64  :: proc(p: rawptr) ---
+	@(link_name="llvm.x86.fxrstor64")
+	fxrstor64 :: proc(p: rawptr) ---
+}

+ 13 - 0
core/simd/x86/pclmulqdq.odin

@@ -0,0 +1,13 @@
+//+build i386, amd64
+package simd_x86
+
+@(require_results, enable_target_feature="pclmulqdq")
+_mm_clmulepi64_si128 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i {
+	return pclmulqdq(a, b, u8(IMM8))
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name="llvm.x86.pclmulqdq")
+	pclmulqdq :: proc(a, round_key: __m128i, #const imm8: u8) -> __m128i ---
+}

+ 20 - 0
core/simd/x86/rdtsc.odin

@@ -0,0 +1,20 @@
+//+build i386, amd64
+package simd_x86
+
+@(require_results)
+_rdtsc :: #force_inline proc "c" () -> u64 {
+	return rdtsc()
+}
+
+@(require_results)
+__rdtscp :: #force_inline proc "c" (aux: ^u32) -> u64 {
+	return rdtscp(aux)
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name="llvm.x86.rdtsc")
+	rdtsc  :: proc() -> u64 ---
+	@(link_name="llvm.x86.rdtscp")
+	rdtscp :: proc(aux: rawptr) -> u64 ---
+}
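
A rough, hypothetical timing sketch using the wrappers above; RDTSC is not serializing, which is why real measurements often prefer __rdtscp or pair _rdtsc with a fence:

	package example

	import "core:fmt"
	import x86 "core:simd/x86"

	main :: proc() {
		start := x86._rdtsc()
		// ... work being measured ...
		aux: u32
		stop := x86.__rdtscp(&aux) // waits for prior instructions to retire before reading
		fmt.println("elapsed cycles:", stop - start)
	}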

+ 49 - 0
core/simd/x86/sha.odin

@@ -0,0 +1,49 @@
+//+build i386, amd64
+package simd_x86
+
+@(require_results, enable_target_feature="sha")
+_mm_sha1msg1_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)sha1msg1(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha1msg2_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)sha1msg2(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha1nexte_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)sha1nexte(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha1rnds4_epu32 :: #force_inline proc "c" (a, b: __m128i, $FUNC: u32) -> __m128i where 0 <= FUNC, FUNC <= 3 {
+	return transmute(__m128i)sha1rnds4(transmute(i32x4)a, transmute(i32x4)b, u8(FUNC & 0xff))
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha256msg1_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)sha256msg1(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha256msg2_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)sha256msg2(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha256rnds2_epu32 :: #force_inline proc "c" (a, b, k: __m128i) -> __m128i {
+	return transmute(__m128i)sha256rnds2(transmute(i32x4)a, transmute(i32x4)b, transmute(i32x4)k)
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name="llvm.x86.sha1msg1")
+	sha1msg1    :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name="llvm.x86.sha1msg2")
+	sha1msg2    :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name="llvm.x86.sha1nexte")
+	sha1nexte   :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name="llvm.x86.sha1rnds4")
+	sha1rnds4   :: proc(a, b: i32x4, #const c: u8) -> i32x4 ---
+	@(link_name="llvm.x86.sha256msg1")
+	sha256msg1  :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name="llvm.x86.sha256msg2")
+	sha256msg2  :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name="llvm.x86.sha256rnds2")
+	sha256rnds2 :: proc(a, b, k: i32x4) -> i32x4 ---
+}

+ 618 - 0
core/simd/x86/sse.odin

@@ -0,0 +1,618 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+import "core:simd"
+
+// _MM_SHUFFLE(z, y, x, w) -> (z<<6 | y<<4 | x<<2 | w)
+_MM_SHUFFLE :: intrinsics.simd_x86__MM_SHUFFLE
+
+_MM_HINT_T0  :: 3
+_MM_HINT_T1  :: 2
+_MM_HINT_T2  :: 1
+_MM_HINT_NTA :: 0
+_MM_HINT_ET0 :: 7
+_MM_HINT_ET1 :: 6
+
+
+_MM_EXCEPT_INVALID    :: 0x0001
+_MM_EXCEPT_DENORM     :: 0x0002
+_MM_EXCEPT_DIV_ZERO   :: 0x0004
+_MM_EXCEPT_OVERFLOW   :: 0x0008
+_MM_EXCEPT_UNDERFLOW  :: 0x0010
+_MM_EXCEPT_INEXACT    :: 0x0020
+_MM_EXCEPT_MASK       :: 0x003f
+
+_MM_MASK_INVALID      :: 0x0080
+_MM_MASK_DENORM       :: 0x0100
+_MM_MASK_DIV_ZERO     :: 0x0200
+_MM_MASK_OVERFLOW     :: 0x0400
+_MM_MASK_UNDERFLOW    :: 0x0800
+_MM_MASK_INEXACT      :: 0x1000
+_MM_MASK_MASK         :: 0x1f80
+
+_MM_ROUND_NEAREST     :: 0x0000
+_MM_ROUND_DOWN        :: 0x2000
+_MM_ROUND_UP          :: 0x4000
+_MM_ROUND_TOWARD_ZERO :: 0x6000
+
+_MM_ROUND_MASK        :: 0x6000
+
+_MM_FLUSH_ZERO_MASK   :: 0x8000
+_MM_FLUSH_ZERO_ON     :: 0x8000
+_MM_FLUSH_ZERO_OFF    :: 0x0000
+
+
+@(require_results, enable_target_feature="sse")
+_mm_add_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return addss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_add_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.add(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_sub_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return subss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_sub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.sub(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_mul_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return mulss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_mul_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.mul(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_div_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return divss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_div_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.div(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_sqrt_ss :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return sqrtss(a)
+}
+@(require_results, enable_target_feature="sse")
+_mm_sqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return sqrtps(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_rcp_ss :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return rcpss(a)
+}
+@(require_results, enable_target_feature="sse")
+_mm_rcp_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return rcpps(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_rsqrt_ss :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return rsqrtss(a)
+}
+@(require_results, enable_target_feature="sse")
+_mm_rsqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return rsqrtps(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_min_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return minss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_min_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return minps(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_max_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return maxss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_max_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return maxps(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_and_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return transmute(__m128)simd.and(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_andnot_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return transmute(__m128)simd.and_not(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_or_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return transmute(__m128)simd.or(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_xor_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return transmute(__m128)simd.xor(transmute(__m128i)a, transmute(__m128i)b)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_cmpeq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpss(a, b, 0)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmplt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpss(a, b, 1)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmple_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpss(a, b, 2)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpgt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, cmpss(b, a, 1), 4, 1, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, cmpss(b, a, 2), 4, 1, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpneq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpss(a, b, 4)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnlt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpss(a, b, 5)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnle_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpss(a, b, 6)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpngt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, cmpss(b, a, 5), 4, 1, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, cmpss(b, a, 6), 4, 1, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpss(a, b, 7)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpunord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpss(a, b, 3)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_cmpeq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(a, b, 0)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmplt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(a, b, 1)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmple_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(a, b, 2)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpgt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(b, a, 1)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(b, a, 2)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpneq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(a, b, 4)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnlt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(a, b, 5)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnle_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(a, b, 6)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpngt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(b, a, 5)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(b, a, 6)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(b, a, 7)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpunord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return cmpps(b, a, 3)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_comieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return comieq_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return comilt_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return comile_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return comigt_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return comige_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return comineq_ss(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_ucomieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return ucomieq_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return ucomilt_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return ucomile_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return ucomigt_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return ucomige_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+	return ucomineq_ss(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_cvtss_si32 :: #force_inline proc "c" (a: __m128) -> i32 {
+	return cvtss2si(a)
+}
+_mm_cvt_ss2si :: _mm_cvtss_si32
+@(require_results, enable_target_feature="sse")
+_mm_cvttss_si32 :: #force_inline proc "c" (a: __m128) -> i32 {
+	return cvttss2si(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_cvtss_f32 :: #force_inline proc "c" (a: __m128) -> f32 {
+	return simd.extract(a, 0)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_cvtsi32_ss :: #force_inline proc "c" (a: __m128, b: i32) -> __m128 {
+	return cvtsi2ss(a, b)
+}
+_mm_cvt_si2ss :: _mm_cvtsi32_ss
+
+
+@(require_results, enable_target_feature="sse")
+_mm_set_ss :: #force_inline proc "c" (a: f32) -> __m128 {
+	return __m128{a, 0, 0, 0}
+}
+@(require_results, enable_target_feature="sse")
+_mm_set1_ps :: #force_inline proc "c" (a: f32) -> __m128 {
+	return __m128(a)
+}
+_mm_set_ps1 :: _mm_set1_ps
+
+@(require_results, enable_target_feature="sse")
+_mm_set_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 {
+	return __m128{d, c, b, a}
+}
+@(require_results, enable_target_feature="sse")
+_mm_setr_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 {
+	return __m128{a, b, c, d}
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_setzero_ps :: #force_inline proc "c" () -> __m128 {
+	return __m128{0, 0, 0, 0}
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_shuffle_ps :: #force_inline proc "c" (a, b: __m128, $MASK: u32) -> __m128 {
+	return simd.shuffle(
+		a, b,
+		u32(MASK) & 0b11,
+		(u32(MASK)>>2) & 0b11,
+		((u32(MASK)>>4) & 0b11)+4,
+		((u32(MASK)>>6) & 0b11)+4)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_unpackhi_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, b, 2, 6, 3, 7)
+}
+@(require_results, enable_target_feature="sse")
+_mm_unpacklo_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, b, 0, 4, 1, 5)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_movehl_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, b, 6, 7, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_movelh_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, b, 0, 1, 4, 5)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_movemask_ps :: #force_inline proc "c" (a: __m128) -> u32 {
+	return movmskps(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_load_ss :: #force_inline proc "c" (p: ^f32) -> __m128 {
+	return __m128{p^, 0, 0, 0}
+}
+@(require_results, enable_target_feature="sse")
+_mm_load1_ps :: #force_inline proc "c" (p: ^f32) -> __m128 {
+	a := p^
+	return __m128(a)
+}
+_mm_load_ps1 :: _mm_load1_ps
+
+@(require_results, enable_target_feature="sse")
+_mm_load_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
+	return (^__m128)(p)^
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_loadu_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
+	dst := _mm_undefined_ps()
+	intrinsics.mem_copy_non_overlapping(&dst, p, size_of(__m128))
+	return dst
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_loadr_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
+	return simd.lanes_reverse(_mm_load_ps(p))
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_loadu_si64 :: #force_inline proc "c" (mem_addr: rawptr) -> __m128i {
+	a := intrinsics.unaligned_load((^i64)(mem_addr))
+	return __m128i{a, 0}
+}
+
+@(enable_target_feature="sse")
+_mm_store_ss :: #force_inline proc "c" (p: ^f32, a: __m128) {
+	p^ = simd.extract(a, 0)
+}
+
+@(enable_target_feature="sse")
+_mm_store1_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
+	b := simd.swizzle(a, 0, 0, 0, 0)
+	(^__m128)(p)^ = b
+}
+_mm_store_ps1 :: _mm_store1_ps
+
+
+@(enable_target_feature="sse")
+_mm_store_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
+	(^__m128)(p)^ = a
+}
+@(enable_target_feature="sse")
+_mm_storeu_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
+	b := a
+	intrinsics.mem_copy_non_overlapping(p, &b, size_of(__m128))
+}
+@(enable_target_feature="sse")
+_mm_storer_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
+	(^__m128)(p)^ = simd.lanes_reverse(a)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_move_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return simd.shuffle(a, b, 4, 1, 2, 3)
+}
+
+@(enable_target_feature="sse")
+_mm_sfence :: #force_inline proc "c" () {
+	sfence()
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_getcsr :: #force_inline proc "c" () -> (result: u32) {
+	stmxcsr(&result)
+	return result
+}
+
+@(enable_target_feature="sse")
+_mm_setcsr :: #force_inline proc "c" (val: u32) {
+	val := val
+	ldmxcsr(&val)
+}
+
+@(require_results, enable_target_feature="sse")
+_MM_GET_EXCEPTION_MASK :: #force_inline proc "c" () -> u32 {
+	return _mm_getcsr() & _MM_MASK_MASK
+}
+@(require_results, enable_target_feature="sse")
+_MM_GET_EXCEPTION_STATE :: #force_inline proc "c" () -> u32 {
+	return _mm_getcsr() & _MM_EXCEPT_MASK
+}
+@(require_results, enable_target_feature="sse")
+_MM_GET_FLUSH_ZERO_MODE :: #force_inline proc "c" () -> u32 {
+	return _mm_getcsr() & _MM_FLUSH_ZERO_MASK
+}
+@(require_results, enable_target_feature="sse")
+_MM_GET_ROUNDING_MODE :: #force_inline proc "c" () -> u32 {
+	return _mm_getcsr() & _MM_ROUND_MASK
+}
+
+@(enable_target_feature="sse")
+_MM_SET_EXCEPTION_MASK :: #force_inline proc "c" (x: u32) {
+	_mm_setcsr((_mm_getcsr() &~ _MM_MASK_MASK) | x)
+}
+@(enable_target_feature="sse")
+_MM_SET_EXCEPTION_STATE :: #force_inline proc "c" (x: u32) {
+	_mm_setcsr((_mm_getcsr() &~ _MM_EXCEPT_MASK) | x)
+}
+@(enable_target_feature="sse")
+_MM_SET_FLUSH_ZERO_MODE :: #force_inline proc "c" (x: u32) {
+	_mm_setcsr((_mm_getcsr() &~ _MM_FLUSH_ZERO_MASK) | x)
+}
+@(enable_target_feature="sse")
+_MM_SET_ROUNDING_MODE :: #force_inline proc "c" (x: u32) {
+	_mm_setcsr((_mm_getcsr() &~ _MM_ROUND_MASK) | x)
+}
+
+@(enable_target_feature="sse")
+_mm_prefetch :: #force_inline proc "c" (p: rawptr, $STRATEGY: u32) {
+	prefetch(p, (STRATEGY>>2)&1, STRATEGY&3, 1)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_undefined_ps :: #force_inline proc "c" () -> __m128 {
+	return _mm_set1_ps(0)
+}
+
+@(enable_target_feature="sse")
+_MM_TRANSPOSE4_PS :: #force_inline proc "c" (row0, row1, row2, row3: ^__m128) {
+	tmp0 := _mm_unpacklo_ps(row0^, row1^)
+	tmp1 := _mm_unpacklo_ps(row2^, row3^)
+	tmp2 := _mm_unpackhi_ps(row0^, row1^)
+	tmp3 := _mm_unpackhi_ps(row2^, row3^)
+
+	row0^ = _mm_movelh_ps(tmp0, tmp1)
+	row1^ = _mm_movehl_ps(tmp1, tmp0)
+	row2^ = _mm_movelh_ps(tmp2, tmp3)
+	row3^ = _mm_movehl_ps(tmp3, tmp2)
+}
+
+@(enable_target_feature="sse")
+_mm_stream_ps :: #force_inline proc "c" (addr: [^]f32, a: __m128) {
+	intrinsics.non_temporal_store((^__m128)(addr), a)
+}
+
+when ODIN_ARCH == .amd64 {
+	@(require_results, enable_target_feature="sse")
+	_mm_cvtss_si64 :: #force_inline proc "c"(a: __m128) -> i64 {
+		return cvtss2si64(a)
+	}
+	@(require_results, enable_target_feature="sse")
+	_mm_cvttss_si64 :: #force_inline proc "c"(a: __m128) -> i64 {
+		return cvttss2si64(a)
+	}
+	@(require_results, enable_target_feature="sse")
+	_mm_cvtsi64_ss :: #force_inline proc "c"(a: __m128, b: i64) -> __m128 {
+		return cvtsi642ss(a, b)
+	}
+}
+
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name="llvm.x86.sse.add.ss")
+	addss       :: proc(a, b: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.sub.ss")
+	subss       :: proc(a, b: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.mul.ss")
+	mulss       :: proc(a, b: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.div.ss")
+	divss       :: proc(a, b: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.sqrt.ss")
+	sqrtss      :: proc(a: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.sqrt.ps")
+	sqrtps      :: proc(a: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.rcp.ss")
+	rcpss       :: proc(a: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.rcp.ps")
+	rcpps       :: proc(a: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.rsqrt.ss")
+	rsqrtss     :: proc(a: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.rsqrt.ps")
+	rsqrtps     :: proc(a: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.min.ss")
+	minss       :: proc(a, b: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.min.ps")
+	minps       :: proc(a, b: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.max.ss")
+	maxss       :: proc(a, b: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.max.ps")
+	maxps       :: proc(a, b: __m128) -> __m128 ---
+	@(link_name="llvm.x86.sse.movmsk.ps")
+	movmskps    :: proc(a: __m128) -> u32 ---
+	@(link_name="llvm.x86.sse.cmp.ps")
+	cmpps       :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
+	@(link_name="llvm.x86.sse.comieq.ss")
+	comieq_ss   :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.comilt.ss")
+	comilt_ss   :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.comile.ss")
+	comile_ss   :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.comigt.ss")
+	comigt_ss   :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.comige.ss")
+	comige_ss   :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.comineq.ss")
+	comineq_ss  :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.ucomieq.ss")
+	ucomieq_ss  :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.ucomilt.ss")
+	ucomilt_ss  :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.ucomile.ss")
+	ucomile_ss  :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.ucomigt.ss")
+	ucomigt_ss  :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.ucomige.ss")
+	ucomige_ss  :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.ucomineq.ss")
+	ucomineq_ss :: proc(a, b: __m128) -> b32 ---
+	@(link_name="llvm.x86.sse.cvtss2si")
+	cvtss2si    :: proc(a: __m128) -> i32 ---
+	@(link_name="llvm.x86.sse.cvttss2si")
+	cvttss2si   :: proc(a: __m128) -> i32 ---
+	@(link_name="llvm.x86.sse.cvtsi2ss")
+	cvtsi2ss    :: proc(a: __m128, b: i32) -> __m128 ---
+	@(link_name="llvm.x86.sse.sfence")
+	sfence      :: proc() ---
+	@(link_name="llvm.x86.sse.stmxcsr")
+	stmxcsr     :: proc(p: rawptr) ---
+	@(link_name="llvm.x86.sse.ldmxcsr")
+	ldmxcsr     :: proc(p: rawptr) ---
+	@(link_name="llvm.prefetch")
+	prefetch    :: proc(p: rawptr, #const rw, loc, ty: u32) ---
+	@(link_name="llvm.x86.sse.cmp.ss")
+	cmpss       :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
+
+
+	// amd64 only
+	@(link_name="llvm.x86.sse.cvtss2si64")
+	cvtss2si64  :: proc(a: __m128) -> i64 ---
+	@(link_name="llvm.x86.sse.cvttss2si64")
+	cvttss2si64 :: proc(a: __m128) -> i64 ---
+	@(link_name="llvm.x86.sse.cvtsi642ss")
+	cvtsi642ss  :: proc(a: __m128, b: i64) -> __m128 ---
+}
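
To make the _MM_SHUFFLE immediate encoding concrete, a hypothetical sketch (selector w picks result lane 0 from a, x lane 1 from a, y lane 2 from b, z lane 3 from b):

	package example

	import x86 "core:simd/x86"

	main :: proc() {
		a := x86._mm_setr_ps(1, 2, 3, 4) // lanes: {1, 2, 3, 4}

		// _MM_SHUFFLE(0, 1, 2, 3) == 0b00_01_10_11; with b == a this reverses the lanes: {4, 3, 2, 1}
		rev := x86._mm_shuffle_ps(a, a, x86._MM_SHUFFLE(0, 1, 2, 3))

		// _MM_SHUFFLE(0, 0, 0, 0) broadcasts lane 0: {1, 1, 1, 1}
		splat := x86._mm_shuffle_ps(a, a, x86._MM_SHUFFLE(0, 0, 0, 0))
		_, _ = rev, splat
	}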

+ 1191 - 0
core/simd/x86/sse2.odin

@@ -0,0 +1,1191 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+import "core:simd"
+
+@(enable_target_feature="sse2")
+_mm_pause :: #force_inline proc "c" () {
+	pause()
+}
+@(enable_target_feature="sse2")
+_mm_clflush :: #force_inline proc "c" (p: rawptr) {
+	clflush(p)
+}
+@(enable_target_feature="sse2")
+_mm_lfence :: #force_inline proc "c" () {
+	lfence()
+}
+@(enable_target_feature="sse2")
+_mm_mfence :: #force_inline proc "c" () {
+	mfence()
+}
+
+@(require_results, enable_target_feature="sse2")
+_mm_add_epi8 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)simd.add(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_add_epi16 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)simd.add(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_add_epi32 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)simd.add(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_add_epi64 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)simd.add(transmute(i64x2)a, transmute(i64x2)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_adds_epi8 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)simd.add_sat(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_adds_epi16 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)simd.add_sat(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_adds_epu8 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)simd.add_sat(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_adds_epu16 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)simd.add_sat(transmute(u16x8)a, transmute(u16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_avg_epu8 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)pavgb(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_avg_epu16 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)pavgw(transmute(u16x8)a, transmute(u16x8)b)
+}
+
+@(require_results, enable_target_feature="sse2")
+_mm_madd_epi16 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)pmaddwd(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_max_epi16 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)pmaxsw(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_max_epu8 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)pmaxub(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_min_epi16 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)pminsw(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_min_epu8 :: #force_inline proc "c" (a, b: __m128i)  -> __m128i {
+	return transmute(__m128i)pminub(transmute(u8x16)a, transmute(u8x16)b)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_mulhi_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmulhw(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mulhi_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmulhuw(transmute(u16x8)a, transmute(u16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mullo_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.mul(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mul_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmuludq(transmute(u32x4)a, transmute(u32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sad_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)psadbw(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.sub(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.sub(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.sub(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.sub(transmute(i64x2)a, transmute(i64x2)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_subs_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.sub_sat(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_subs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.sub_sat(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_subs_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.sub_sat(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.sub_sat(transmute(u16x8)a, transmute(u16x8)b)
+}
+
+
+
+@(private)
+@(require_results, enable_target_feature="sse2")
+_mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	shift :: IMM8 & 0xff
+
+	return transmute(__m128i)simd.shuffle(
+		transmute(i8x16)a,
+		i8x16(0),
+		0  when shift > 15 else (16 - shift + 0),
+		1  when shift > 15 else (16 - shift + 1),
+		2  when shift > 15 else (16 - shift + 2),
+		3  when shift > 15 else (16 - shift + 3),
+		4  when shift > 15 else (16 - shift + 4),
+		5  when shift > 15 else (16 - shift + 5),
+		6  when shift > 15 else (16 - shift + 6),
+		7  when shift > 15 else (16 - shift + 7),
+		8  when shift > 15 else (16 - shift + 8),
+		9  when shift > 15 else (16 - shift + 9),
+		10 when shift > 15 else (16 - shift + 10),
+		11 when shift > 15 else (16 - shift + 11),
+		12 when shift > 15 else (16 - shift + 12),
+		13 when shift > 15 else (16 - shift + 13),
+		14 when shift > 15 else (16 - shift + 14),
+		15 when shift > 15 else (16 - shift + 15),
+	)
+}
+
+@(private)
+@(require_results, enable_target_feature="sse2")
+_mm_srli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	shift :: IMM8
+	return transmute(__m128i)simd.shuffle(
+		transmute(i8x16)a,
+		i8x16(0),
+		0  + 16 when shift > 15 else (shift + 0),
+		1  + 16 when shift > 15 else (shift + 1),
+		2  + 16 when shift > 15 else (shift + 2),
+		3  + 16 when shift > 15 else (shift + 3),
+		4  + 16 when shift > 15 else (shift + 4),
+		5  + 16 when shift > 15 else (shift + 5),
+		6  + 16 when shift > 15 else (shift + 6),
+		7  + 16 when shift > 15 else (shift + 7),
+		8  + 16 when shift > 15 else (shift + 8),
+		9  + 16 when shift > 15 else (shift + 9),
+		10 + 16 when shift > 15 else (shift + 10),
+		11 + 16 when shift > 15 else (shift + 11),
+		12 + 16 when shift > 15 else (shift + 12),
+		13 + 16 when shift > 15 else (shift + 13),
+		14 + 16 when shift > 15 else (shift + 14),
+		15 + 16 when shift > 15 else (shift + 15),
+	)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_slli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return _mm_slli_si128_impl(a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_bslli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return _mm_slli_si128_impl(a, IMM8)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_bsrli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return _mm_srli_si128_impl(a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_slli_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)pslliw(transmute(i16x8)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sll_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+	return transmute(__m128i)psllw(transmute(i16x8)a, transmute(i16x8)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_slli_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)psllid(transmute(i32x4)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sll_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+	return transmute(__m128i)pslld(transmute(i32x4)a, transmute(i32x4)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_slli_epi64 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)pslliq(transmute(i64x2)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sll_epi64 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+	return transmute(__m128i)psllq(transmute(i64x2)a, transmute(i64x2)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srai_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)psraiw(transmute(i16x8)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sra_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+	return transmute(__m128i)psraw(transmute(i16x8)a, transmute(i16x8)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srai_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)psraid(transmute(i32x4)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sra_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+	return transmute(__m128i)psrad(transmute(i32x4)a, transmute(i32x4)count)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_srli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return _mm_srli_si128_impl(a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srli_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)psrliw(transmute(i16x8)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srl_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+	return transmute(__m128i)psrlw(transmute(i16x8)a, transmute(i16x8)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srli_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)psrlid(transmute(i32x4)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srl_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+	return transmute(__m128i)psrld(transmute(i32x4)a, transmute(i32x4)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srli_epi64 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)psrliq(transmute(i64x2)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srl_epi64 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+	return transmute(__m128i)psrlq(transmute(i64x2)a, transmute(i64x2)count)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_and_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return simd.and(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_andnot_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return simd.and_not(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_or_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return simd.or(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_xor_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return simd.xor(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_eq(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_eq(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_eq(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_gt(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_gt(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_gt(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_lt(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_lt(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_lt(transmute(i32x4)a, transmute(i32x4)b)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_cvtepi32_pd :: #force_inline proc "c" (a: __m128i) -> __m128d {
+	v := transmute(i32x4)a
+	return cast(__m128d)simd.shuffle(v, v, 0, 1)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsi32_sd :: #force_inline proc "c" (a: __m128d, b: i32) -> __m128d {
+	return simd.replace(a, 0, f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtepi32_ps :: #force_inline proc "c" (a: __m128i) -> __m128 {
+	return cvtdq2ps(transmute(i32x4)a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtps_epi32 :: #force_inline proc "c" (a: __m128) -> __m128i {
+	return transmute(__m128i)cvtps2dq(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsi32_si128 :: #force_inline proc "c" (a: i32) -> __m128i {
+	return transmute(__m128i)i32x4{a, 0, 0, 0}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsi128_si32 :: #force_inline proc "c" (a: __m128i) -> i32 {
+	return simd.extract(transmute(i32x4)a, 0)
+}
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_set_epi64x :: #force_inline proc "c" (e1, e0: i64) -> __m128i {
+	return transmute(__m128i)i64x2{e0, e1}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i {
+	return transmute(__m128i)i32x4{e0, e1, e2, e3}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i {
+	return transmute(__m128i)i16x8{e0, e1, e2, e3, e4, e5, e6, e7}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i {
+	return transmute(__m128i)i8x16{e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m128i {
+	return _mm_set_epi64x(a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m128i {
+	return _mm_set_epi32(a, a, a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m128i {
+	return _mm_set_epi16(a, a, a, a, a, a, a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m128i {
+	return _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setr_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i {
+	return _mm_set_epi32(e0, e1, e2, e3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setr_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i {
+	return _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setr_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i {
+	return _mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setzero_si128 :: #force_inline proc "c" () -> __m128i {
+	return _mm_set1_epi64x(0)
+}
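+
+// Note on lane order: the _mm_set_* constructors take arguments from the
+// highest lane down, while _mm_setr_* takes them in memory (lowest-first)
+// order. Illustrative:
+//   _mm_set_epi32(3, 2, 1, 0) == _mm_setr_epi32(0, 1, 2, 3) == transmute(__m128i)i32x4{0, 1, 2, 3}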
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_loadl_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i {
+	return _mm_set_epi64x(0, intrinsics.unaligned_load((^i64)(mem_addr)))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i {
+	return mem_addr^
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i {
+	dst := _mm_undefined_si128()
+	intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128i))
+	return dst
+}
+@(enable_target_feature="sse2")
+_mm_maskmoveu_si128 :: #force_inline proc "c" (a, mask: __m128i, mem_addr: rawptr) {
+	maskmovdqu(transmute(i8x16)a, transmute(i8x16)mask, mem_addr)
+}
+@(enable_target_feature="sse2")
+_mm_store_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
+	mem_addr^ = a
+}
+@(enable_target_feature="sse2")
+_mm_storeu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
+	storeudq(mem_addr, a)
+}
+@(enable_target_feature="sse2")
+_mm_storel_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
+	a := a
+	intrinsics.mem_copy_non_overlapping(mem_addr, &a, 8)
+}
+@(enable_target_feature="sse2")
+_mm_stream_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
+	intrinsics.non_temporal_store(mem_addr, a)
+}
+@(enable_target_feature="sse2")
+_mm_stream_si32 :: #force_inline proc "c" (mem_addr: ^i32, a: i32) {
+	intrinsics.non_temporal_store(mem_addr, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_move_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	zero := _mm_setzero_si128()
+	return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)zero, 0, 2)
+}
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_packs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)packsswb(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_packs_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)packssdw(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_packus_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)packuswb(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_extract_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 {
+	return i32(simd.extract(transmute(u16x8)a, IMM8))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_insert_epi16 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)simd.replace(transmute(u16x8)a, IMM8, u16(i))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_movemask_epi8 :: #force_inline proc "c" (a: __m128i) -> i32 {
+	return pmovmskb(transmute(i8x16)a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_shuffle_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	v := transmute(i32x4)a
+	return transmute(__m128i)simd.shuffle(
+		v,
+		v,
+		IMM8 & 0b11,
+		(IMM8 >> 2) & 0b11,
+		(IMM8 >> 4) & 0b11,
+		(IMM8 >> 6) & 0b11,
+	)
+}
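+// IMM8 packs four 2-bit source-lane indices, the low pair selecting result
+// lane 0. Illustrative: 0x1B (0b00_01_10_11) reverses the lanes, so
+// _mm_shuffle_epi32(_mm_setr_epi32(0, 1, 2, 3), 0x1B) == _mm_setr_epi32(3, 2, 1, 0).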
+@(require_results, enable_target_feature="sse2")
+_mm_shufflehi_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	v := transmute(i16x8)a
+	return transmute(__m128i)simd.shuffle(
+		v,
+		v,
+		0,
+		1,
+		2,
+		3,
+		(IMM8 & 0b11) + 4,
+		((IMM8 >> 2) & 0b11) + 4,
+		((IMM8 >> 4) & 0b11) + 4,
+		((IMM8 >> 6) & 0b11) + 4,
+	)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_shufflelo_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	v := transmute(i16x8)a
+	return transmute(__m128i)simd.shuffle(
+		v,
+		v,
+		IMM8 & 0b11,
+		(IMM8 >> 2) & 0b11,
+		(IMM8 >> 4) & 0b11,
+		(IMM8 >> 6) & 0b11,
+		4,
+		5,
+		6,
+		7,
+	)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.shuffle(
+		transmute(i8x16)a,
+		transmute(i8x16)b,
+		8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+	)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.shuffle(transmute(i16x8)a, transmute(i16x8)b, 4, 12, 5, 13, 6, 14, 7, 15)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.shuffle(transmute(i32x4)a, transmute(i32x4)b, 2, 6, 3, 7)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)b, 1, 3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.shuffle(
+		transmute(i8x16)a,
+		transmute(i8x16)b,
+		0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
+	)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.shuffle(transmute(i16x8)a, transmute(i16x8)b, 0, 8, 1, 9, 2, 10, 3, 11)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.shuffle(transmute(i32x4)a, transmute(i32x4)b, 0, 4, 1, 5)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)b, 0, 2)
+}
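+
+// The unpack family interleaves the low (unpacklo) or high (unpackhi) halves
+// of its operands. Illustrative, per 32-bit lane:
+//   _mm_unpacklo_epi32(a, b) == {a[0], b[0], a[1], b[1]}
+//   _mm_unpackhi_epi32(a, b) == {a[2], b[2], a[3], b[3]}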
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_add_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_add_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.add(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_div_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_div_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.div(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_max_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return maxsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_max_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return maxpd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_min_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return minsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_min_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return minpd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mul_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mul_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.mul(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sqrt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(a, 0, _mm_cvtsd_f64(sqrtsd(b)))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sqrt_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
+	return simd.sqrt(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.sub(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_and_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return transmute(__m128d)_mm_and_si128(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_andnot_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return transmute(__m128d)_mm_andnot_si128(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_or_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return transmute(__m128d)_mm_or_si128(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_xor_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return transmute(__m128d)_mm_xor_si128(transmute(__m128i)a, transmute(__m128i)b)
+}
+
+
+
+
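+// The imm8 passed to cmpsd/cmppd below selects the SSE2 comparison predicate:
+// 0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD.
+// Each lane of the result is all ones when the predicate holds, all zeros otherwise.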
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmpsd(a, b, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmpsd(a, b, 1)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmple_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmpsd(a, b, 2)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(_mm_cmplt_sd(b, a), 1, simd.extract(a, 1))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpge_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(_mm_cmple_sd(b, a), 1, simd.extract(a, 1))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpord_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmpsd(a, b, 7)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpunord_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmpsd(a, b, 3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpneq_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmpsd(a, b, 4)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnlt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmpsd(a, b, 5)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnle_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmpsd(a, b, 6)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpngt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(_mm_cmpnlt_sd(b, a), 1, simd.extract(a, 1))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnge_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.replace(_mm_cmpnle_sd(b, a), 1, simd.extract(a, 1))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmppd(a, b, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmppd(a, b, 1)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmple_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmppd(a, b, 2)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return _mm_cmplt_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpge_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return _mm_cmple_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpord_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmppd(a, b, 7)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpunord_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmppd(a, b, 3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpneq_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmppd(a, b, 4)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnlt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmppd(a, b, 5)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnle_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return cmppd(a, b, 6)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpngt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return _mm_cmpnlt_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnge_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return _mm_cmpnle_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comieq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return comieqsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comilt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return comiltsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comile_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return comilesd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comigt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return comigtsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comige_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return comigesd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return comineqsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomieq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return ucomieqsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomilt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return ucomiltsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomile_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return ucomilesd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomigt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return ucomigtsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomige_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return ucomigesd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return ucomineqsd(a, b)
+}
+
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_cvtpd_ps :: #force_inline proc "c" (a: __m128d) -> __m128 {
+	return cvtpd2ps(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtps_pd :: #force_inline proc "c" (a: __m128) -> __m128d {
+	return cvtps2pd(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtpd_epi32 :: #force_inline proc "c" (a: __m128d) -> __m128i {
+	return transmute(__m128i)cvtpd2dq(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsd_si32 :: #force_inline proc "c" (a: __m128d) -> i32 {
+	return cvtsd2si(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsd_ss :: #force_inline proc "c" (a: __m128, b: __m128d) -> __m128 {
+	return cvtsd2ss(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsd_f64 :: #force_inline proc "c" (a: __m128d) -> f64 {
+	return simd.extract(a, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtss_sd :: #force_inline proc "c" (a: __m128d, b: __m128) -> __m128d {
+	return cvtss2sd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvttpd_epi32 :: #force_inline proc "c" (a: __m128d) -> __m128i {
+	return transmute(__m128i)cvttpd2dq(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvttsd_si32 :: #force_inline proc "c" (a: __m128d) -> i32 {
+	return cvttsd2si(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvttps_epi32 :: #force_inline proc "c" (a: __m128) -> __m128i {
+	return transmute(__m128i)cvttps2dq(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_sd :: #force_inline proc "c" (a: f64) -> __m128d {
+	return _mm_set_pd(0.0, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_pd :: #force_inline proc "c" (a: f64) -> __m128d {
+	return _mm_set_pd(a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_pd1 :: #force_inline proc "c" (a: f64) -> __m128d {
+	return _mm_set_pd(a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_pd :: #force_inline proc "c" (a: f64, b: f64) -> __m128d {
+	return __m128d{b, a}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setr_pd :: #force_inline proc "c" (a: f64, b: f64) -> __m128d {
+	return _mm_set_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setzero_pd :: #force_inline proc "c" () -> __m128d {
+	return _mm_set_pd(0.0, 0.0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_movemask_pd :: #force_inline proc "c" (a: __m128d) -> i32 {
+	return movmskpd(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+	return (^__m128d)(mem_addr)^
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load_sd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+	return _mm_setr_pd(mem_addr^, 0.)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadh_pd :: #force_inline proc "c" (a: __m128d, mem_addr: ^f64) -> __m128d {
+	return _mm_setr_pd(simd.extract(a, 0), mem_addr^)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadl_pd :: #force_inline proc "c" (a: __m128d, mem_addr: ^f64) -> __m128d {
+	return _mm_setr_pd(mem_addr^, simd.extract(a, 1))
+}
+@(enable_target_feature="sse2")
+_mm_stream_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	intrinsics.non_temporal_store((^__m128d)(mem_addr), a)
+}
+@(enable_target_feature="sse2")
+_mm_store_sd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	mem_addr^ = simd.extract(a, 0)
+}
+@(enable_target_feature="sse2")
+_mm_store_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	(^__m128d)(mem_addr)^ = a
+}
+@(enable_target_feature="sse2")
+_mm_storeu_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	storeupd(mem_addr, a)
+}
+@(enable_target_feature="sse2")
+_mm_store1_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	(^__m128d)(mem_addr)^ = simd.shuffle(a, a, 0, 0)
+}
+@(enable_target_feature="sse2")
+_mm_store_pd1 :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	(^__m128d)(mem_addr)^ = simd.shuffle(a, a, 0, 0)
+}
+@(enable_target_feature="sse2")
+_mm_storer_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	(^__m128d)(mem_addr)^ = simd.shuffle(a, a, 1, 0)
+}
+@(enable_target_feature="sse2")
+_mm_storeh_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	mem_addr^ = simd.extract(a, 1)
+}
+@(enable_target_feature="sse2")
+_mm_storel_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+	mem_addr^ = simd.extract(a, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load1_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+	d := mem_addr^
+	return _mm_setr_pd(d, d)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load_pd1 :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+	return _mm_load1_pd(mem_addr)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadr_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+	a := _mm_load_pd(mem_addr)
+	return simd.shuffle(a, a, 1, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadu_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+	dst := _mm_undefined_pd()
+	intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128d))
+	return dst
+}
+@(require_results, enable_target_feature="sse2")
+_mm_shuffle_pd :: #force_inline proc "c" (a, b: __m128d, $MASK: u32) -> __m128d {
+	return simd.shuffle(a, b, MASK&0b1, ((MASK>>1)&0b1) + 2)
+}
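+// Bit 0 of MASK selects result lane 0 from a, bit 1 selects lane 1 from b.
+// Illustrative: _mm_shuffle_pd(a, b, 0b01) == {a[1], b[0]}.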
+@(require_results, enable_target_feature="sse2")
+_mm_move_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return _mm_setr_pd(simd.extract(b, 0), simd.extract(a, 1))
+}
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_castpd_ps :: #force_inline proc "c" (a: __m128d) -> __m128 {
+	return transmute(__m128)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castpd_si128 :: #force_inline proc "c" (a: __m128d) -> __m128i {
+	return transmute(__m128i)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castps_pd :: #force_inline proc "c" (a: __m128) -> __m128d {
+	return transmute(__m128d)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castps_si128 :: #force_inline proc "c" (a: __m128) -> __m128i {
+	return transmute(__m128i)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castsi128_pd :: #force_inline proc "c" (a: __m128i) -> __m128d {
+	return transmute(__m128d)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castsi128_ps :: #force_inline proc "c" (a: __m128i) -> __m128 {
+	return transmute(__m128)a
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_undefined_pd :: #force_inline proc "c" () -> __m128d {
+	return __m128d{0, 0}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_undefined_si128 :: #force_inline proc "c" () -> __m128i {
+	return __m128i{0, 0}
+}
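+// Unlike the C intrinsics, the _mm_undefined_* procedures above return zeroed
+// vectors, trading a micro-optimisation for deterministic contents.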
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.shuffle(a, b, 1, 3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return simd.shuffle(a, b, 0, 2)
+}
+
+
+when ODIN_ARCH == .amd64 {
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvtsd_si64 :: #force_inline proc "c" (a: __m128d) -> i64 {
+		return cvtsd2si64(a)
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvtsd_si64x :: #force_inline proc "c" (a: __m128d) -> i64 {
+		return _mm_cvtsd_si64(a)
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvttsd_si64 :: #force_inline proc "c" (a: __m128d) -> i64 {
+		return cvttsd2si64(a)
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvttsd_si64x :: #force_inline proc "c" (a: __m128d) -> i64 {
+		return _mm_cvttsd_si64(a)
+	}
+	@(enable_target_feature="sse2")
+	_mm_stream_si64 :: #force_inline proc "c" (mem_addr: ^i64, a: i64) {
+		intrinsics.non_temporal_store(mem_addr, a)
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvtsi64_si128 :: #force_inline proc "c" (a: i64) -> __m128i {
+		return _mm_set_epi64x(0, a)
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvtsi64x_si128 :: #force_inline proc "c" (a: i64) -> __m128i {
+		return _mm_cvtsi64_si128(a)
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvtsi128_si64 :: #force_inline proc "c" (a: __m128i) -> i64 {
+		return simd.extract(transmute(i64x2)a, 0)
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvtsi128_si64x :: #force_inline proc "c" (a: __m128i) -> i64 {
+		return _mm_cvtsi128_si64(a)
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvtsi64_sd :: #force_inline proc "c" (a: __m128d, b: i64) -> __m128d {
+		return simd.replace(a, 0, f64(b))
+	}
+	@(require_results, enable_target_feature="sse2")
+	_mm_cvtsi64x_sd :: #force_inline proc "c" (a: __m128d, b: i64) -> __m128d {
+		return _mm_cvtsi64_sd(a, b)
+	}
+}
+
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name="llvm.x86.sse2.pause")
+	pause      :: proc() ---
+	@(link_name="llvm.x86.sse2.clflush")
+	clflush    :: proc(p: rawptr) ---
+	@(link_name="llvm.x86.sse2.lfence")
+	lfence     :: proc() ---
+	@(link_name="llvm.x86.sse2.mfence")
+	mfence     :: proc() ---
+	@(link_name="llvm.x86.sse2.pavg.b")
+	pavgb      :: proc(a, b: u8x16) -> u8x16 ---
+	@(link_name="llvm.x86.sse2.pavg.w")
+	pavgw      :: proc(a, b: u16x8) -> u16x8 ---
+	@(link_name="llvm.x86.sse2.pmadd.wd")
+	pmaddwd    :: proc(a, b: i16x8) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.pmaxs.w")
+	pmaxsw     :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.pmaxu.b")
+	pmaxub     :: proc(a, b: u8x16) -> u8x16 ---
+	@(link_name="llvm.x86.sse2.pmins.w")
+	pminsw     :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.pminu.b")
+	pminub     :: proc(a, b: u8x16) -> u8x16 ---
+	@(link_name="llvm.x86.sse2.pmulh.w")
+	pmulhw     :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.pmulhu.w")
+	pmulhuw    :: proc(a, b: u16x8) -> u16x8 ---
+	@(link_name="llvm.x86.sse2.pmulu.dq")
+	pmuludq    :: proc(a, b: u32x4) -> u64x2 ---
+	@(link_name="llvm.x86.sse2.psad.bw")
+	psadbw     :: proc(a, b: u8x16) -> u64x2 ---
+	@(link_name="llvm.x86.sse2.pslli.w")
+	pslliw     :: proc(a: i16x8, #const imm8: u32) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.psll.w")
+	psllw      :: proc(a: i16x8, count: i16x8) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.pslli.d")
+	psllid     :: proc(a: i32x4, #const imm8: u32) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.psll.d")
+	pslld      :: proc(a: i32x4, count: i32x4) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.pslli.q")
+	pslliq     :: proc(a: i64x2, #const imm8: u32) -> i64x2 ---
+	@(link_name="llvm.x86.sse2.psll.q")
+	psllq      :: proc(a: i64x2, count: i64x2) -> i64x2 ---
+	@(link_name="llvm.x86.sse2.psrai.w")
+	psraiw     :: proc(a: i16x8, #const imm8: u32) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.psra.w")
+	psraw      :: proc(a: i16x8, count: i16x8) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.psrai.d")
+	psraid     :: proc(a: i32x4, #const imm8: u32) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.psra.d")
+	psrad      :: proc(a: i32x4, count: i32x4) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.psrli.w")
+	psrliw     :: proc(a: i16x8, #const imm8: u32) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.psrl.w")
+	psrlw      :: proc(a: i16x8, count: i16x8) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.psrli.d")
+	psrlid     :: proc(a: i32x4, #const imm8: u32) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.psrl.d")
+	psrld      :: proc(a: i32x4, count: i32x4) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.psrli.q")
+	psrliq     :: proc(a: i64x2, #const imm8: u32) -> i64x2 ---
+	@(link_name="llvm.x86.sse2.psrl.q")
+	psrlq      :: proc(a: i64x2, count: i64x2) -> i64x2 ---
+	@(link_name="llvm.x86.sse2.cvtdq2ps")
+	cvtdq2ps   :: proc(a: i32x4) -> __m128 ---
+	@(link_name="llvm.x86.sse2.cvtps2dq")
+	cvtps2dq   :: proc(a: __m128) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.maskmov.dqu")
+	maskmovdqu :: proc(a: i8x16, mask: i8x16, mem_addr: rawptr) ---
+	@(link_name="llvm.x86.sse2.packsswb.128")
+	packsswb   :: proc(a, b: i16x8) -> i8x16 ---
+	@(link_name="llvm.x86.sse2.packssdw.128")
+	packssdw   :: proc(a, b: i32x4) -> i16x8 ---
+	@(link_name="llvm.x86.sse2.packuswb.128")
+	packuswb   :: proc(a, b: i16x8) -> u8x16 ---
+	@(link_name="llvm.x86.sse2.pmovmskb.128")
+	pmovmskb   :: proc(a: i8x16) -> i32 ---
+	@(link_name="llvm.x86.sse2.max.sd")
+	maxsd      :: proc(a, b: __m128d) -> __m128d ---
+	@(link_name="llvm.x86.sse2.max.pd")
+	maxpd      :: proc(a, b: __m128d) -> __m128d ---
+	@(link_name="llvm.x86.sse2.min.sd")
+	minsd      :: proc(a, b: __m128d) -> __m128d ---
+	@(link_name="llvm.x86.sse2.min.pd")
+	minpd      :: proc(a, b: __m128d) -> __m128d ---
+	@(link_name="llvm.x86.sse2.sqrt.sd")
+	sqrtsd     :: proc(a: __m128d) -> __m128d ---
+	@(link_name="llvm.x86.sse2.sqrt.pd")
+	sqrtpd     :: proc(a: __m128d) -> __m128d ---
+	@(link_name="llvm.x86.sse2.cmp.sd")
+	cmpsd      :: proc(a, b: __m128d, imm8: i8) -> __m128d ---
+	@(link_name="llvm.x86.sse2.cmp.pd")
+	cmppd      :: proc(a, b: __m128d, imm8: i8) -> __m128d ---
+	@(link_name="llvm.x86.sse2.comieq.sd")
+	comieqsd   :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.comilt.sd")
+	comiltsd   :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.comile.sd")
+	comilesd   :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.comigt.sd")
+	comigtsd   :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.comige.sd")
+	comigesd   :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.comineq.sd")
+	comineqsd  :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.ucomieq.sd")
+	ucomieqsd  :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.ucomilt.sd")
+	ucomiltsd  :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.ucomile.sd")
+	ucomilesd  :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.ucomigt.sd")
+	ucomigtsd  :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.ucomige.sd")
+	ucomigesd  :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.ucomineq.sd")
+	ucomineqsd :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.movmsk.pd")
+	movmskpd   :: proc(a: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.cvtpd2ps")
+	cvtpd2ps   :: proc(a: __m128d) -> __m128 ---
+	@(link_name="llvm.x86.sse2.cvtps2pd")
+	cvtps2pd   :: proc(a: __m128) -> __m128d ---
+	@(link_name="llvm.x86.sse2.cvtpd2dq")
+	cvtpd2dq   :: proc(a: __m128d) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.cvtsd2si")
+	cvtsd2si   :: proc(a: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.cvtsd2ss")
+	cvtsd2ss   :: proc(a: __m128, b: __m128d) -> __m128 ---
+	@(link_name="llvm.x86.sse2.cvtss2sd")
+	cvtss2sd   :: proc(a: __m128d, b: __m128) -> __m128d ---
+	@(link_name="llvm.x86.sse2.cvttpd2dq")
+	cvttpd2dq  :: proc(a: __m128d) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.cvttsd2si")
+	cvttsd2si  :: proc(a: __m128d) -> i32 ---
+	@(link_name="llvm.x86.sse2.cvttps2dq")
+	cvttps2dq  :: proc(a: __m128) -> i32x4 ---
+	@(link_name="llvm.x86.sse2.storeu.dq")
+	storeudq   :: proc(mem_addr: rawptr, a: __m128i) ---
+	@(link_name="llvm.x86.sse2.storeu.pd")
+	storeupd   :: proc(mem_addr: rawptr, a: __m128d) ---
+
+	// amd64 only
+	@(link_name="llvm.x86.sse2.cvtsd2si64")
+	cvtsd2si64  :: proc(a: __m128d) -> i64 ---
+	@(link_name="llvm.x86.sse2.cvttsd2si64")
+	cvttsd2si64 :: proc(a: __m128d) -> i64 ---
+}

+ 68 - 0
core/simd/x86/sse3.odin

@@ -0,0 +1,68 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+import "core:simd"
+
+@(require_results, enable_target_feature="sse3")
+_mm_addsub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return addsubps(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_addsub_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d {
+	return addsubpd(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_hadd_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d {
+	return haddpd(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_hadd_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return haddps(a, b)
+}
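+// Horizontal add: _mm_hadd_ps(a, b) == {a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3]};
+// the pd variant pairs the two f64 lanes: {a[0]+a[1], b[0]+b[1]}.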
+@(require_results, enable_target_feature="sse3")
+_mm_hsub_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d {
+	return hsubpd(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_hsub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return hsubps(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_lddqu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i {
+	return transmute(__m128i)lddqu(mem_addr)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_movedup_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
+	return simd.shuffle(a, a, 0, 0)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_loaddup_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+	return _mm_load1_pd(mem_addr)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_movehdup_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return simd.shuffle(a, a, 1, 1, 3, 3)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_moveldup_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return simd.shuffle(a, a, 0, 0, 2, 2)
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name = "llvm.x86.sse3.addsub.ps")
+	addsubps :: proc(a, b: __m128) -> __m128 ---
+	@(link_name = "llvm.x86.sse3.addsub.pd")
+	addsubpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+	@(link_name = "llvm.x86.sse3.hadd.pd")
+	haddpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+	@(link_name = "llvm.x86.sse3.hadd.ps")
+	haddps :: proc(a, b: __m128) -> __m128 ---
+	@(link_name = "llvm.x86.sse3.hsub.pd")
+	hsubpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+	@(link_name = "llvm.x86.sse3.hsub.ps")
+	hsubps :: proc(a, b: __m128) -> __m128 ---
+	@(link_name = "llvm.x86.sse3.ldu.dq")
+	lddqu :: proc(mem_addr: rawptr) -> i8x16 ---
+}

+ 352 - 0
core/simd/x86/sse41.odin

@@ -0,0 +1,352 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:simd"
+
+// SSE4 rounding constants
+_MM_FROUND_TO_NEAREST_INT :: 0x00
+_MM_FROUND_TO_NEG_INF     :: 0x01
+_MM_FROUND_TO_POS_INF     :: 0x02
+_MM_FROUND_TO_ZERO        :: 0x03
+_MM_FROUND_CUR_DIRECTION  :: 0x04
+_MM_FROUND_RAISE_EXC      :: 0x00
+_MM_FROUND_NO_EXC         :: 0x08
+_MM_FROUND_NINT           :: 0x00
+_MM_FROUND_FLOOR          :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF
+_MM_FROUND_CEIL           :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF
+_MM_FROUND_TRUNC          :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO
+_MM_FROUND_RINT           :: _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION
+_MM_FROUND_NEARBYINT      :: _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION
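+
+// Illustrative composition with _mm_round_pd (defined below):
+//   _mm_round_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
+// rounds toward negative infinity without raising a precision exception;
+// _MM_FROUND_FLOOR requests the same rounding but allows the exception.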
+
+
+
+@(require_results, enable_target_feature="sse4.1")
+_mm_blendv_epi8 :: #force_inline proc "c" (a, b, mask: __m128i) -> __m128i {
+	return transmute(__m128i)pblendvb(transmute(i8x16)a, transmute(i8x16)b, transmute(i8x16)mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blend_epi16 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i {
+	return transmute(__m128i)pblendw(transmute(i16x8)a, transmute(i16x8)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blendv_pd :: #force_inline proc "c" (a, b, mask: __m128d) -> __m128d {
+	return blendvpd(a, b, mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blendv_ps :: #force_inline proc "c" (a, b, mask: __m128) -> __m128 {
+	return blendvps(a, b, mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blend_pd :: #force_inline proc "c" (a, b: __m128d, $IMM2: u8) -> __m128d {
+	return blendpd(a, b, IMM2)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blend_ps :: #force_inline proc "c" (a, b: __m128, $IMM4: u8) -> __m128 {
+	return blendps(a, b, IMM4)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_extract_ps :: #force_inline proc "c" (a: __m128, $IMM8: u32) -> i32 {
+	return transmute(i32)simd.extract(a, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_extract_epi8 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 {
+	return i32(simd.extract(transmute(u8x16)a, IMM8))
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_extract_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 {
+	return simd.extract(transmute(i32x4)a, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_insert_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 {
+	return insertps(a, b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_insert_epi8 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)simd.replace(transmute(i8x16)a, IMM8, i8(i))
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_insert_epi32 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)simd.replace(transmute(i32x4)a, IMM8, i)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_max_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmaxsb(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_max_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmaxuw(transmute(u16x8)a, transmute(u16x8)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_max_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmaxsd(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_max_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmaxud(transmute(u32x4)a, transmute(u32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_min_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pminsb(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_min_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pminuw(transmute(u16x8)a, transmute(u16x8)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_min_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pminsd(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_min_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pminud(transmute(u32x4)a, transmute(u32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_packus_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)packusdw(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cmpeq_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_eq(transmute(i64x2)a, transmute(i64x2)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(i8x16)a
+	y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7)
+	return transmute(__m128i)i16x8(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(i8x16)a
+	y := simd.shuffle(x, x, 0, 1, 2, 3)
+	return transmute(__m128i)i32x4(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(i8x16)a
+	y := simd.shuffle(x, x, 0, 1)
+	return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(i16x8)a
+	y := simd.shuffle(x, x, 0, 1, 2, 3)
+	return transmute(__m128i)i32x4(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(i16x8)a
+	y := simd.shuffle(x, x, 0, 1)
+	return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(i32x4)a
+	y := simd.shuffle(x, x, 0, 1)
+	return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(u8x16)a
+	y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7)
+	return transmute(__m128i)i16x8(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(u8x16)a
+	y := simd.shuffle(x, x, 0, 1, 2, 3)
+	return transmute(__m128i)i32x4(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(u8x16)a
+	y := simd.shuffle(x, x, 0, 1)
+	return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(u16x8)a
+	y := simd.shuffle(x, x, 0, 1, 2, 3)
+	return transmute(__m128i)i32x4(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(u16x8)a
+	y := simd.shuffle(x, x, 0, 1)
+	return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	x := transmute(u32x4)a
+	y := simd.shuffle(x, x, 0, 1)
+	return transmute(__m128i)i64x2(y)
+}
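+// The cvtep{i,u}*_epi* widenings above share one pattern: simd.shuffle keeps
+// the low lanes, then the vector cast sign-extends (i* sources) or
+// zero-extends (u* sources) each lane to the wider element type.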
+@(require_results, enable_target_feature="sse4.1")
+_mm_dp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM8: u8) -> __m128d {
+	return dppd(a, b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_dp_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 {
+	return dpps(a, b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_floor_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
+	return simd.floor(a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_floor_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return simd.floor(a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_floor_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return roundsd(a, b, _MM_FROUND_FLOOR)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_floor_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return roundss(a, b, _MM_FROUND_FLOOR)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_ceil_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
+	return simd.ceil(a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_ceil_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return simd.ceil(a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_ceil_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+	return roundsd(a, b, _MM_FROUND_CEIL)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_ceil_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+	return roundss(a, b, _MM_FROUND_CEIL)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_round_pd :: #force_inline proc "c" (a: __m128d, $ROUNDING: i32) -> __m128d {
+	return roundpd(a, ROUNDING)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_round_ps :: #force_inline proc "c" (a: __m128, $ROUNDING: i32) -> __m128 {
+	return roundps(a, ROUNDING)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_round_sd :: #force_inline proc "c" (a, b: __m128d, $ROUNDING: i32) -> __m128d {
+	return roundsd(a, b, ROUNDING)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_round_ss :: #force_inline proc "c" (a, b: __m128, $ROUNDING: i32) -> __m128 {
+	return roundss(a, b, ROUNDING)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_minpos_epu16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	return transmute(__m128i)phminposuw(transmute(u16x8)a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_mul_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmuldq(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_mullo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.mul(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_mpsadbw_epu8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i {
+	return transmute(__m128i)mpsadbw(transmute(u8x16)a, transmute(u8x16)b, IMM8)
+}
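+
+// PTEST semantics: _mm_testz_si128 returns 1 when (a & mask) is all zeros
+// (the ZF result), _mm_testc_si128 returns 1 when (~a & mask) is all zeros
+// (the CF result), and _mm_testnzc_si128 returns 1 when both would be 0.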
+@(require_results, enable_target_feature="sse4.1")
+_mm_testz_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+	return ptestz(transmute(i64x2)a, transmute(i64x2)mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_testc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+	return ptestc(transmute(i64x2)a, transmute(i64x2)mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_testnzc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+	return ptestnzc(transmute(i64x2)a, transmute(i64x2)mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_test_all_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+	return _mm_testz_si128(a, mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_test_all_ones :: #force_inline proc "c" (a: __m128i) -> i32 {
+	return _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_test_mix_ones_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+	return _mm_testnzc_si128(a, mask)
+}
+
+
+when ODIN_ARCH == .amd64 {
+	@(require_results, enable_target_feature="sse4.1")
+	_mm_extract_epi64 :: #force_inline proc "c" (a: __m128i, $IMM1: u32) -> i64 {
+		return simd.extract(transmute(i64x2)a, IMM1)
+	}
+
+	@(require_results, enable_target_feature="sse4.1")
+	_mm_insert_epi64 :: #force_inline proc "c" (a: __m128i, i: i64, $IMM1: u32) -> __m128i {
+		return transmute(__m128i)simd.replace(transmute(i64x2)a, IMM1, i)
+	}
+}
+
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name = "llvm.x86.sse41.pblendvb")
+	pblendvb   :: proc(a, b: i8x16, mask: i8x16) -> i8x16 ---
+	@(link_name = "llvm.x86.sse41.blendvpd")
+	blendvpd   :: proc(a, b, mask: __m128d) -> __m128d ---
+	@(link_name = "llvm.x86.sse41.blendvps")
+	blendvps   :: proc(a, b, mask: __m128) -> __m128 ---
+	@(link_name = "llvm.x86.sse41.blendpd")
+	blendpd    :: proc(a, b: __m128d, #const imm2: u8) -> __m128d ---
+	@(link_name = "llvm.x86.sse41.blendps")
+	blendps    :: proc(a, b: __m128, #const imm4: u8) -> __m128 ---
+	@(link_name = "llvm.x86.sse41.pblendw")
+	pblendw    :: proc(a: i16x8, b: i16x8, #const imm8: u8) -> i16x8 ---
+	@(link_name = "llvm.x86.sse41.insertps")
+	insertps   :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
+	@(link_name = "llvm.x86.sse41.pmaxsb")
+	pmaxsb     :: proc(a, b: i8x16) -> i8x16 ---
+	@(link_name = "llvm.x86.sse41.pmaxuw")
+	pmaxuw     :: proc(a, b: u16x8) -> u16x8 ---
+	@(link_name = "llvm.x86.sse41.pmaxsd")
+	pmaxsd     :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name = "llvm.x86.sse41.pmaxud")
+	pmaxud     :: proc(a, b: u32x4) -> u32x4 ---
+	@(link_name = "llvm.x86.sse41.pminsb")
+	pminsb     :: proc(a, b: i8x16) -> i8x16 ---
+	@(link_name = "llvm.x86.sse41.pminuw")
+	pminuw     :: proc(a, b: u16x8) -> u16x8 ---
+	@(link_name = "llvm.x86.sse41.pminsd")
+	pminsd     :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name = "llvm.x86.sse41.pminud")
+	pminud     :: proc(a, b: u32x4) -> u32x4 ---
+	@(link_name = "llvm.x86.sse41.packusdw")
+	packusdw   :: proc(a, b: i32x4) -> u16x8 ---
+	@(link_name = "llvm.x86.sse41.dppd")
+	dppd       :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
+	@(link_name = "llvm.x86.sse41.dpps")
+	dpps       :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
+	@(link_name = "llvm.x86.sse41.round.pd")
+	roundpd    :: proc(a: __m128d, rounding: i32) -> __m128d ---
+	@(link_name = "llvm.x86.sse41.round.ps")
+	roundps    :: proc(a: __m128, rounding: i32) -> __m128 ---
+	@(link_name = "llvm.x86.sse41.round.sd")
+	roundsd    :: proc(a, b: __m128d, rounding: i32) -> __m128d ---
+	@(link_name = "llvm.x86.sse41.round.ss")
+	roundss    :: proc(a, b: __m128, rounding: i32) -> __m128 ---
+	@(link_name = "llvm.x86.sse41.phminposuw")
+	phminposuw :: proc(a: u16x8) -> u16x8 ---
+	@(link_name = "llvm.x86.sse41.pmuldq")
+	pmuldq     :: proc(a, b: i32x4) -> i64x2 ---
+	@(link_name = "llvm.x86.sse41.mpsadbw")
+	mpsadbw    :: proc(a, b: u8x16, #const imm8: u8) -> u16x8 ---
+	@(link_name = "llvm.x86.sse41.ptestz")
+	ptestz     :: proc(a, mask: i64x2) -> i32 ---
+	@(link_name = "llvm.x86.sse41.ptestc")
+	ptestc     :: proc(a, mask: i64x2) -> i32 ---
+	@(link_name = "llvm.x86.sse41.ptestnzc")
+	ptestnzc   :: proc(a, mask: i64x2) -> i32 ---
+}

+ 149 - 0
core/simd/x86/sse42.odin

@@ -0,0 +1,149 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:simd"
+
+_SIDD_UBYTE_OPS                :: 0b0000_0000
+_SIDD_UWORD_OPS                :: 0b0000_0001
+_SIDD_SBYTE_OPS                :: 0b0000_0010
+_SIDD_SWORD_OPS                :: 0b0000_0011
+
+_SIDD_CMP_EQUAL_ANY            :: 0b0000_0000
+_SIDD_CMP_RANGES               :: 0b0000_0100
+_SIDD_CMP_EQUAL_EACH           :: 0b0000_1000
+_SIDD_CMP_EQUAL_ORDERED        :: 0b0000_1100
+
+_SIDD_POSITIVE_POLARITY        :: 0b0000_0000
+_SIDD_NEGATIVE_POLARITY        :: 0b0001_0000
+_SIDD_MASKED_POSITIVE_POLARITY :: 0b0010_0000
+_SIDD_MASKED_NEGATIVE_POLARITY :: 0b0011_0000
+
+_SIDD_LEAST_SIGNIFICANT        :: 0b0000_0000
+_SIDD_MOST_SIGNIFICANT         :: 0b0100_0000
+
+_SIDD_BIT_MASK                 :: 0b0000_0000
+_SIDD_UNIT_MASK                :: 0b0100_0000
+
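+// Illustrative use of the flags above: find the index of the first byte of b
+// that matches any byte of a (16 when there is no match), assuming NUL-free
+// 16-byte blocks:
+//   idx := _mm_cmpistri(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY)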
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistrm :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> __m128i {
+	return transmute(__m128i)pcmpistrm128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistri :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+	return pcmpistri128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistrz :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+	return pcmpistriz128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistrc :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+	return pcmpistric128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistrs :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+	return pcmpistris128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistro :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+	return pcmpistrio128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistra :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+	return pcmpistria128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestrm :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> __m128i {
+	return transmute(__m128i)pcmpestrm128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestri :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+	return pcmpestri128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestrz :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+	return pcmpestriz128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestrc :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+	return pcmpestric128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestrs :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+	return pcmpestris128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestro :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+	return pcmpestrio128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestra :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+	return pcmpestria128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_crc32_u8 :: #force_inline proc "c" (crc: u32, v: u8) -> u32 {
+	return crc32_32_8(crc, v)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_crc32_u16 :: #force_inline proc "c" (crc: u32, v: u16) -> u32 {
+	return crc32_32_16(crc, v)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_crc32_u32 :: #force_inline proc "c" (crc: u32, v: u32) -> u32 {
+	return crc32_32_32(crc, v)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpgt_epi64 :: #force_inline proc "c" (a: __m128i, b: __m128i) -> __m128i {
+	return transmute(__m128i)simd.lanes_gt(transmute(i64x2)a, transmute(i64x2)b)
+}
+
+when ODIN_ARCH == .amd64 {
+	@(require_results, enable_target_feature="sse4.2")
+	_mm_crc32_u64 :: #force_inline proc "c" (crc: u64, v: u64) -> u64 {
+		return crc32_64_64(crc, v)
+	}
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+	// SSE 4.2 string and text comparison ops
+	@(link_name="llvm.x86.sse42.pcmpestrm128")
+	pcmpestrm128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> u8x16 ---
+	@(link_name="llvm.x86.sse42.pcmpestri128")
+	pcmpestri128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpestriz128")
+	pcmpestriz128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpestric128")
+	pcmpestric128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpestris128")
+	pcmpestris128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpestrio128")
+	pcmpestrio128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpestria128")
+	pcmpestria128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpistrm128")
+	pcmpistrm128 :: proc(a, b: i8x16, #const imm8: i8) -> i8x16 ---
+	@(link_name="llvm.x86.sse42.pcmpistri128")
+	pcmpistri128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpistriz128")
+	pcmpistriz128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpistric128")
+	pcmpistric128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpistris128")
+	pcmpistris128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpistrio128")
+	pcmpistrio128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+	@(link_name="llvm.x86.sse42.pcmpistria128")
+	pcmpistria128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+	// SSE 4.2 CRC instructions
+	@(link_name="llvm.x86.sse42.crc32.32.8")
+	crc32_32_8 :: proc(crc: u32, v: u8) -> u32 ---
+	@(link_name="llvm.x86.sse42.crc32.32.16")
+	crc32_32_16 :: proc(crc: u32, v: u16) -> u32 ---
+	@(link_name="llvm.x86.sse42.crc32.32.32")
+	crc32_32_32 :: proc(crc: u32, v: u32) -> u32 ---
+
+	// AMD64 Only
+	@(link_name="llvm.x86.sse42.crc32.64.64")
+	crc32_64_64 :: proc(crc: u64, v: u64) -> u64 ---
+}
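
A hedged sketch of the implicit-length compares above: with imm8 = 0 (unsigned-byte lanes, "equal any" aggregation, least-significant index), PCMPISTRI yields the index of the first byte of the second operand that matches any byte of the first, or 16 when nothing matches. The helper is illustrative and assumes `_mm_cmpistri` is defined earlier in this file:

	@(enable_target_feature="sse4.2")
	index_of_any :: proc "contextless" (needles, haystack: __m128i) -> i32 {
		return _mm_cmpistri(needles, haystack, 0) // 0..15 on a hit, 16 on no match
	}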

+ 140 - 0
core/simd/x86/ssse3.odin

@@ -0,0 +1,140 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+import "core:simd"
+_ :: simd
+
+@(require_results, enable_target_feature="ssse3")
+_mm_abs_epi8 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	return transmute(__m128i)pabsb128(transmute(i8x16)a)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_abs_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	return transmute(__m128i)pabsw128(transmute(i16x8)a)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_abs_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+	return transmute(__m128i)pabsd128(transmute(i32x4)a)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_shuffle_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pshufb128(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_alignr_epi8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u32) -> __m128i {
+	shift :: IMM8
+
+	// If palignr would shift the concatenated pair of vectors by more than
+	// their combined size (32 bytes), the result is all zeros.
+	if shift > 32 {
+		return _mm_set1_epi8(0)
+	}
+	a, b := a, b
+	if shift > 16 {
+		a, b = _mm_set1_epi8(0), a
+	}
+
+	return transmute(__m128i)simd.shuffle(
+		transmute(i8x16)b,
+		transmute(i8x16)a,
+		0  when shift > 32 else shift - 16 + 0  when shift > 16 else shift + 0,
+		1  when shift > 32 else shift - 16 + 1  when shift > 16 else shift + 1,
+		2  when shift > 32 else shift - 16 + 2  when shift > 16 else shift + 2,
+		3  when shift > 32 else shift - 16 + 3  when shift > 16 else shift + 3,
+		4  when shift > 32 else shift - 16 + 4  when shift > 16 else shift + 4,
+		5  when shift > 32 else shift - 16 + 5  when shift > 16 else shift + 5,
+		6  when shift > 32 else shift - 16 + 6  when shift > 16 else shift + 6,
+		7  when shift > 32 else shift - 16 + 7  when shift > 16 else shift + 7,
+		8  when shift > 32 else shift - 16 + 8  when shift > 16 else shift + 8,
+		9  when shift > 32 else shift - 16 + 9  when shift > 16 else shift + 9,
+		10 when shift > 32 else shift - 16 + 10 when shift > 16 else shift + 10,
+		11 when shift > 32 else shift - 16 + 11 when shift > 16 else shift + 11,
+		12 when shift > 32 else shift - 16 + 12 when shift > 16 else shift + 12,
+		13 when shift > 32 else shift - 16 + 13 when shift > 16 else shift + 13,
+		14 when shift > 32 else shift - 16 + 14 when shift > 16 else shift + 14,
+		15 when shift > 32 else shift - 16 + 15 when shift > 16 else shift + 15,
+	)
+}
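
What the shuffle above computes, as an illustrative sketch: `_mm_alignr_epi8(a, b, IMM8)` concatenates `a:b` (with `a` in the upper 16 bytes) and extracts 16 bytes starting at byte offset IMM8:

	@(enable_target_feature="ssse3")
	alignr_demo :: proc "contextless" () -> __m128i {
		a := transmute(__m128i)simd.u8x16{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
		b := transmute(__m128i)simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
		// bytes 4..15 of b followed by bytes 0..3 of a: {4, 5, ..., 15, 16, 17, 18, 19}
		return _mm_alignr_epi8(a, b, 4)
	}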
+
+
+@(require_results, enable_target_feature="ssse3")
+_mm_hadd_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)phaddw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hadds_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)phaddsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hadd_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)phaddd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hsub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)phsubw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hsubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)phsubsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hsub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)phsubd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_maddubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmaddubsw128(transmute(u8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_mulhrs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)pmulhrsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_sign_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)psignb128(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_sign_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)psignw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_sign_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+	return transmute(__m128i)psignd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+
+
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name = "llvm.x86.ssse3.pabs.b.128")
+	pabsb128     :: proc(a: i8x16) -> u8x16 ---
+	@(link_name = "llvm.x86.ssse3.pabs.w.128")
+	pabsw128     :: proc(a: i16x8) -> u16x8 ---
+	@(link_name = "llvm.x86.ssse3.pabs.d.128")
+	pabsd128     :: proc(a: i32x4) -> u32x4 ---
+	@(link_name = "llvm.x86.ssse3.pshuf.b.128")
+	pshufb128    :: proc(a, b: u8x16) -> u8x16 ---
+	@(link_name = "llvm.x86.ssse3.phadd.w.128")
+	phaddw128    :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.phadd.sw.128")
+	phaddsw128   :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.phadd.d.128")
+	phaddd128    :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name = "llvm.x86.ssse3.phsub.w.128")
+	phsubw128    :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.phsub.sw.128")
+	phsubsw128   :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.phsub.d.128")
+	phsubd128    :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name = "llvm.x86.ssse3.pmadd.ub.sw.128")
+	pmaddubsw128 :: proc(a: u8x16, b: i8x16) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.pmul.hr.sw.128")
+	pmulhrsw128  :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.psign.b.128")
+	psignb128    :: proc(a, b: i8x16) -> i8x16 ---
+	@(link_name = "llvm.x86.ssse3.psign.w.128")
+	psignw128    :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.psign.d.128")
+	psignd128    :: proc(a, b: i32x4) -> i32x4 ---
+}
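
Illustrative use of `_mm_shuffle_epi8` above: each destination byte is selected by the low 4 bits of the corresponding control byte, and a set high bit zeroes the lane. A minimal sketch reversing the 16 bytes of a vector:

	@(enable_target_feature="ssse3")
	reverse_bytes :: proc "contextless" (v: __m128i) -> __m128i {
		control := transmute(__m128i)simd.u8x16{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
		return _mm_shuffle_epi8(v, control)
	}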

+ 57 - 0
core/simd/x86/types.odin

@@ -0,0 +1,57 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:simd"
+
+bf16 :: u16 // bfloat16 values are stored and passed around as their raw 16-bit pattern
+
+__m128i :: #simd[2]i64
+__m128  :: #simd[4]f32
+__m128d :: #simd[2]f64
+
+__m256i :: #simd[4]i64
+__m256  :: #simd[8]f32
+__m256d :: #simd[4]f64
+
+__m512i :: #simd[8]i64
+__m512  :: #simd[16]f32
+__m512d :: #simd[8]f64
+
+__m128bh :: #simd[8]bf16
+__m256bh :: #simd[16]bf16
+__m512bh :: #simd[32]bf16
+
+
+/// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer
+__mmask64 :: u64
+
+/// The `__mmask32` type used in AVX-512 intrinsics, a 32-bit integer
+__mmask32 :: u32
+
+/// The `__mmask16` type used in AVX-512 intrinsics, a 16-bit integer
+__mmask16 :: u16
+
+/// The `__mmask8` type used in AVX-512 intrinsics, an 8-bit integer
+__mmask8 :: u8
+
+/// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics.
+_MM_CMPINT_ENUM :: i32
+
+/// The `_MM_MANTISSA_NORM_ENUM` type used to specify mantissa normalization operations in AVX-512 intrinsics.
+_MM_MANTISSA_NORM_ENUM :: i32
+
+/// The `_MM_MANTISSA_SIGN_ENUM` type used to specify mantissa sign operations in AVX-512 intrinsics.
+_MM_MANTISSA_SIGN_ENUM :: i32
+
+_MM_PERM_ENUM :: i32
+
+@(private) u8x16 :: simd.u8x16
+@(private) i8x16 :: simd.i8x16
+@(private) u16x8 :: simd.u16x8
+@(private) i16x8 :: simd.i16x8
+@(private) u32x4 :: simd.u32x4
+@(private) i32x4 :: simd.i32x4
+@(private) u64x2 :: simd.u64x2
+@(private) i64x2 :: simd.i64x2
+@(private) f32x4 :: simd.f32x4
+@(private) f64x2 :: simd.f64x2
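
Since the `__m128*` types are ordinary `#simd` vectors, converting between the opaque Intel view and a typed lane view is a plain `transmute`, exactly as the wrappers above do. A minimal sketch (assuming `core:simd` provides `reduce_add_ordered`, mirroring the intrinsic of that name):

	lane_sum :: proc "contextless" (v: __m128i) -> i32 {
		lanes := transmute(simd.i32x4)v       // view the 128 bits as 4 x i32
		return simd.reduce_add_ordered(lanes) // horizontal sum of the lanes
	}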

+ 0 - 33
core/sys/cpu/cpu.odin

@@ -1,33 +0,0 @@
-package sys_cpu
-
-Cache_Line_Pad :: struct {_: [_cache_line_size]byte};
-
-initialized: bool;
-
-x86: struct {
-	_: Cache_Line_Pad,
-	has_aes:       bool, // AES hardware implementation (AES NI)
-	has_adx:       bool, // Multi-precision add-carry instruction extensions
-	has_avx:       bool, // Advanced vector extension
-	has_avx2:      bool, // Advanced vector extension 2
-	has_bmi1:      bool, // Bit manipulation instruction set 1
-	has_bmi2:      bool, // Bit manipulation instruction set 2
-	has_erms:      bool, // Enhanced REP for MOVSB and STOSB
-	has_fma:       bool, // Fused-multiply-add instructions
-	has_os_xsave:  bool, // OS supports XSAVE/XRESTOR for saving/restoring XMM registers.
-	has_pclmulqdq: bool, // PCLMULQDQ instruction - most often used for AES-GCM
-	has_popcnt:    bool, // Hamming weight instruction POPCNT.
-	has_rdrand:    bool, // RDRAND instruction (on-chip random number generator)
-	has_rdseed:    bool, // RDSEED instruction (on-chip random number generator)
-	has_sse2:      bool, // Streaming SIMD extension 2 (always available on amd64)
-	has_sse3:      bool, // Streaming SIMD extension 3
-	has_ssse3:     bool, // Supplemental streaming SIMD extension 3
-	has_sse41:     bool, // Streaming SIMD extension 4 and 4.1
-	has_sse42:     bool, // Streaming SIMD extension 4 and 4.2
-	_: Cache_Line_Pad,
-};
-
-
-init :: proc() {
-	_init();
-}

+ 0 - 67
core/sys/cpu/cpu_x86.odin

@@ -1,67 +0,0 @@
-//+build i386, amd64
-package sys_cpu
-
-_cache_line_size :: 64;
-
-cpuid :: proc(ax, cx: u32) -> (eax, ebc, ecx, edx: u32) {
-	return expand_to_tuple(asm(u32, u32) -> struct{eax, ebc, ecx, edx: u32} {
-		"cpuid",
-		"={ax},={bx},={cx},={dx},{ax},{cx}",
-	}(ax, cx));
-}
-
-xgetbv :: proc() -> (eax, edx: u32) {
-	return expand_to_tuple(asm(u32) -> struct{eax, edx: u32} {
-		"xgetbv",
-		"={ax},={dx},{cx}",
-	}(0));
-}
-
-_init :: proc() {
-	is_set :: proc(hwc: u32, value: u32) -> bool {
-		return hwc&value != 0;
-	}
-
-	initialized = true;
-
-	max_id, _, _, _ := cpuid(0, 0);
-
-	if max_id < 1 {
-		return;
-	}
-
-	_, _, ecx1, edx1 := cpuid(1, 0);
-
-	x86.has_sse2 = is_set(26, edx1);
-
-	x86.has_sse3      = is_set(0, ecx1);
-	x86.has_pclmulqdq = is_set(1, ecx1);
-	x86.has_ssse3     = is_set(9, ecx1);
-	x86.has_fma       = is_set(12, ecx1);
-	x86.has_sse41     = is_set(19, ecx1);
-	x86.has_sse42     = is_set(20, ecx1);
-	x86.has_popcnt    = is_set(23, ecx1);
-	x86.has_aes       = is_set(25, ecx1);
-	x86.has_os_xsave  = is_set(27, ecx1);
-	x86.has_rdrand    = is_set(30, ecx1);
-
-	os_supports_avx := false;
-	if x86.has_os_xsave {
-		eax, _ := xgetbv();
-		os_supports_avx = is_set(1, eax) && is_set(2, eax);
-	}
-
-	x86.has_avx = is_set(28, ecx1) && os_supports_avx;
-
-	if max_id < 7 {
-		return;
-	}
-
-	_, ebx7, _, _ := cpuid(7, 0);
-	x86.has_bmi1   = is_set(3, ebx7);
-	x86.has_avx2   = is_set(5, ebx7) && os_supports_avx;
-	x86.has_bmi2   = is_set(8, ebx7);
-	x86.has_erms   = is_set(9, ebx7);
-	x86.has_rdseed = is_set(18, ebx7);
-	x86.has_adx    = is_set(19, ebx7);
-}

+ 2 - 0
examples/all/all_main.odin

@@ -96,6 +96,7 @@ import filepath       "core:path/filepath"
 
 import reflect        "core:reflect"
 import runtime        "core:runtime"
+import simd           "core:simd"
 import slice          "core:slice"
 import sort           "core:sort"
 import strconv        "core:strconv"
@@ -192,6 +193,7 @@ _ :: slashpath
 _ :: filepath
 _ :: reflect
 _ :: runtime
+_ :: simd
 _ :: slice
 _ :: sort
 _ :: strconv

+ 116 - 2
src/build_settings.cpp

@@ -256,7 +256,6 @@ struct BuildContext {
 	String extra_linker_flags;
 	String extra_assembler_flags;
 	String microarch;
-	String target_features;
 	BuildModeKind build_mode;
 	bool   generate_docs;
 	i32    optimization_level;
@@ -320,6 +319,10 @@ struct BuildContext {
 
 	PtrMap<char const *, ExactValue> defined_values;
 
+	BlockingMutex target_features_mutex;
+	StringSet target_features_set;
+	String target_features_string;
+
 };
 
 gb_global BuildContext build_context = {0};
@@ -629,6 +632,15 @@ bool is_arch_wasm(void) {
 	return false;
 }
 
+bool is_arch_x86(void) {
+	switch (build_context.metrics.arch) {
+	case TargetArch_i386:
+	case TargetArch_amd64:
+		return true;
+	}
+	return false;
+}
+
 bool allow_check_foreign_filepath(void) {
 	switch (build_context.metrics.arch) {
 	case TargetArch_wasm32:
@@ -1188,6 +1200,100 @@ void init_build_context(TargetMetrics *cross_target) {
 #include "microsoft_craziness.h"
 #endif
 
+
+Array<String> split_by_comma(String const &list) {
+	isize n = 1;
+	for (isize i = 0; i < list.len; i++) {
+		if (list.text[i] == ',') {
+			n++;
+		}
+	}
+	auto res = array_make<String>(heap_allocator(), n);
+
+	String s = list;
+	for (isize i = 0; i < n; i++) {
+		isize m = string_index_byte(s, ',');
+		if (m < 0) {
+			res[i] = s;
+			break;
+		}
+		res[i] = substring(s, 0, m);
+		s = substring(s, m+1, s.len);
+	}
+	return res;
+}
+
+bool check_target_feature_is_valid(TokenPos pos, String const &feature) {
+	// TODO(bill): check_target_feature_is_valid
+	return true;
+}
+
+bool check_target_feature_is_enabled(TokenPos pos, String const &target_feature_list) {
+	BuildContext *bc = &build_context;
+	mutex_lock(&bc->target_features_mutex);
+	defer (mutex_unlock(&bc->target_features_mutex));
+
+	auto items = split_by_comma(target_feature_list);
+	defer (array_free(&items)); // free after the loop below, not before it
+	for_array(i, items) {
+		String const &item = items.data[i];
+		if (!check_target_feature_is_valid(pos, item)) {
+			error(pos, "Target feature '%.*s' is not valid", LIT(item));
+			return false;
+		}
+		if (!string_set_exists(&bc->target_features_set, item)) {
+			error(pos, "Target feature '%.*s' is not enabled", LIT(item));
+			return false;
+		}
+	}
+
+	return true;
+}
+
+void enable_target_feature(TokenPos pos, String const &target_feature_list) {
+	BuildContext *bc = &build_context;
+	mutex_lock(&bc->target_features_mutex);
+	defer (mutex_unlock(&bc->target_features_mutex));
+
+	auto items = split_by_comma(target_feature_list);
+	defer (array_free(&items)); // free after the loop below, not before it
+	for_array(i, items) {
+		String const &item = items.data[i];
+		if (!check_target_feature_is_valid(pos, item)) {
+			error(pos, "Target feature '%.*s' is not valid", LIT(item));
+		} else {
+			// Record the feature so later check_target_feature_is_enabled calls see it
+			string_set_add(&bc->target_features_set, item);
+		}
+	}
+}
+
+
+char const *target_features_set_to_cstring(gbAllocator allocator, bool with_quotes) {
+	isize len = 0;
+	for_array(i, build_context.target_features_set.entries) {
+		if (i != 0) {
+			len += 1;
+		}
+		String feature = build_context.target_features_set.entries[i].value;
+		len += feature.len;
+		if (with_quotes) len += 2;
+	}
+	char *features = gb_alloc_array(allocator, char, len+1);
+	len = 0;
+	for_array(i, build_context.target_features_set.entries) {
+		if (i != 0) {
+			features[len++] = ',';
+		}
+
+		if (with_quotes) features[len++] = '"';
+		String feature = build_context.target_features_set.entries[i].value;
+		gb_memmove(features+len, feature.text, feature.len);
+		len += feature.len;
+		if (with_quotes) features[len++] = '"';
+	}
+	features[len++] = 0;
+
+	return features;
+}
+
 // NOTE(Jeroen): Set/create the output and other paths and report an error as appropriate.
 // We've previously called `parse_build_flags`, so `out_filepath` should be set.
 bool init_build_paths(String init_filename) {
@@ -1197,6 +1303,9 @@ bool init_build_paths(String init_filename) {
 	// NOTE(Jeroen): We're pre-allocating BuildPathCOUNT slots so that certain paths are always at the same enumerated index.
 	array_init(&bc->build_paths, permanent_allocator(), BuildPathCOUNT);
 
+	string_set_init(&bc->target_features_set, heap_allocator(), 1024);
+	mutex_init(&bc->target_features_mutex);
+
 	// [BuildPathMainPackage] Turn given init path into a `Path`, which includes normalizing it into a full path.
 	bc->build_paths[BuildPath_Main_Package] = path_from_string(ha, init_filename);
 
@@ -1377,5 +1486,10 @@ bool init_build_paths(String init_filename) {
 		return false;
 	}
 
+	if (bc->target_features_string.len != 0) {
+		enable_target_feature({}, bc->target_features_string);
+	}
+
 	return true;
-}
+}
+

+ 822 - 55
src/check_builtin.cpp

@@ -246,7 +246,7 @@ bool is_constant_string(CheckerContext *c, String const &builtin_name, Ast *expr
 }
 
 bool check_builtin_objc_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) {
-	String builtin_name = builtin_procs[id].name;
+	String const &builtin_name = builtin_procs[id].name;
 
 	if (build_context.metrics.os != TargetOs_darwin) {
 		// allow on doc generation (e.g. Metal stuff)
@@ -409,6 +409,667 @@ bool check_atomic_memory_order_argument(CheckerContext *c, Ast *expr, String con
 
 }
 
+
+bool check_builtin_simd_operation(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) {
+	ast_node(ce, CallExpr, call);
+
+	String const &builtin_name = builtin_procs[id].name;
+	switch (id) {
+	// Any numeric
+	case BuiltinProc_simd_add:
+	case BuiltinProc_simd_sub:
+	case BuiltinProc_simd_mul:
+	case BuiltinProc_simd_div:
+	case BuiltinProc_simd_min:
+	case BuiltinProc_simd_max:
+		{
+			Operand x = {};
+			Operand y = {};
+			check_expr(c, &x, ce->args[0]);                        if (x.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &y, x.type);                       if (y.mode == Addressing_Invalid) return false;
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!is_type_simd_vector(y.type)) {
+				error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!are_types_identical(x.type, y.type)) {
+				gbString xs = type_to_string(x.type);
+				gbString ys = type_to_string(y.type);
+				error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+				gb_string_free(ys);
+				gb_string_free(xs);
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			if (!is_type_integer(elem) && !is_type_float(elem)) {
+				gbString xs = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = x.type;
+			return true;
+		}
+
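On the user side this corresponds to calls such as the following minimal sketch — both operands must be the identical #simd type with integer or float elements:

	import "core:intrinsics"

	mul_add :: proc (a, b, c: #simd[4]f32) -> #simd[4]f32 {
		return intrinsics.simd_add(intrinsics.simd_mul(a, b), c)
	}
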
+	// Integer only
+	case BuiltinProc_simd_add_sat:
+	case BuiltinProc_simd_sub_sat:
+	case BuiltinProc_simd_rem:
+	case BuiltinProc_simd_and:
+	case BuiltinProc_simd_or:
+	case BuiltinProc_simd_xor:
+	case BuiltinProc_simd_and_not:
+		{
+			Operand x = {};
+			Operand y = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!is_type_simd_vector(y.type)) {
+				error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!are_types_identical(x.type, y.type)) {
+				gbString xs = type_to_string(x.type);
+				gbString ys = type_to_string(y.type);
+				error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+				gb_string_free(ys);
+				gb_string_free(xs);
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+
+			switch (id) {
+			case BuiltinProc_simd_add_sat:
+			case BuiltinProc_simd_sub_sat:
+			case BuiltinProc_simd_rem:
+				if (!is_type_integer(elem)) {
+					gbString xs = type_to_string(x.type);
+					error(x.expr, "'%.*s' expected a #simd type with an integer element, got '%s'", LIT(builtin_name), xs);
+					gb_string_free(xs);
+					return false;
+				}
+				break;
+			default:
+				if (!is_type_integer(elem) && !is_type_boolean(elem)) {
+					gbString xs = type_to_string(x.type);
+					error(x.expr, "'%.*s' expected a #simd type with an integer or boolean element, got '%s'", LIT(builtin_name), xs);
+					gb_string_free(xs);
+					return false;
+				}
+				break;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = x.type;
+			return true;
+		}
+
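Illustrative difference between the wrapping and saturating forms (values assumed; the scalar-to-lane conversion relies on the conversion rule added in check_expr.cpp below, with core:intrinsics imported):

	a: #simd[4]u8 = {200, 200, 200, 200}
	b: #simd[4]u8 = 100                // a scalar converts to every lane
	_ = intrinsics.simd_add(a, b)      // {44, 44, 44, 44}     — wraps modulo 256
	_ = intrinsics.simd_add_sat(a, b)  // {255, 255, 255, 255} — clamps at the lane maximum
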
+	case BuiltinProc_simd_shl:        // Odin-like
+	case BuiltinProc_simd_shr:        // Odin-like
+	case BuiltinProc_simd_shl_masked: // C-like
+	case BuiltinProc_simd_shr_masked: // C-like
+		{
+			Operand x = {};
+			Operand y = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!is_type_simd_vector(y.type)) {
+				error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			GB_ASSERT(x.type->kind == Type_SimdVector);
+			GB_ASSERT(y.type->kind == Type_SimdVector);
+			Type *xt = x.type;
+			Type *yt = y.type;
+
+			if (xt->SimdVector.count != yt->SimdVector.count) {
+				error(x.expr, "'%.*s' mismatched simd vector lengths, got '%lld' vs '%lld'",
+				      LIT(builtin_name),
+				      cast(long long)xt->SimdVector.count,
+				      cast(long long)yt->SimdVector.count);
+				return false;
+			}
+			if (!is_type_integer(base_array_type(x.type))) {
+				gbString xs = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a #simd type with an integer element, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+			if (!is_type_unsigned(base_array_type(y.type))) {
+				gbString ys = type_to_string(y.type);
+				error(y.expr, "'%.*s' expected a #simd type with an unsigned integer element as the shifting operand, got '%s'", LIT(builtin_name), ys);
+				gb_string_free(ys);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = x.type;
+			return true;
+		}
+
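A sketch of the two shift families from the caller's perspective (per-lane amounts must be unsigned; core:intrinsics imported):

	x: #simd[4]u32 = {1, 1, 1, 1}
	n: #simd[4]u32 = {1, 31, 32, 40}
	_ = intrinsics.simd_shl(x, n)        // {2, 0x8000_0000, 0, 0}   — out-of-range amounts produce 0
	_ = intrinsics.simd_shl_masked(x, n) // {2, 0x8000_0000, 1, 256} — amounts are taken mod 32
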
+	// Unary
+	case BuiltinProc_simd_neg:
+	case BuiltinProc_simd_abs:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]);
+			if (x.mode == Addressing_Invalid) {
+				return false;
+			}
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			if (!is_type_integer(elem) && !is_type_float(elem)) {
+				gbString xs = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+			operand->mode = Addressing_Value;
+			operand->type = x.type;
+			return true;
+		}
+
+	// Return integer masks
+	case BuiltinProc_simd_lanes_eq:
+	case BuiltinProc_simd_lanes_ne:
+	case BuiltinProc_simd_lanes_lt:
+	case BuiltinProc_simd_lanes_le:
+	case BuiltinProc_simd_lanes_gt:
+	case BuiltinProc_simd_lanes_ge:
+		{
+			// op(#simd[N]T, #simd[N]T) -> #simd[N]V
+			// where `V` is an integer, `size_of(T) == size_of(V)`
+			// `V` will be all 0s if false and all 1s if true (e.g. 0x00 and 0xff, respectively)
+
+			Operand x = {};
+			Operand y = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			switch (id) {
+			case BuiltinProc_simd_lanes_eq:
+			case BuiltinProc_simd_lanes_ne:
+				if (!is_type_integer(elem) && !is_type_float(elem) && !is_type_boolean(elem)) {
+					gbString xs = type_to_string(x.type);
+					error(x.expr, "'%.*s' expected a #simd type with an integer, floating point, or boolean element, got '%s'", LIT(builtin_name), xs);
+					gb_string_free(xs);
+					return false;
+				}
+				break;
+			default:
+				if (!is_type_integer(elem) && !is_type_float(elem)) {
+					gbString xs = type_to_string(x.type);
+					error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+					gb_string_free(xs);
+					return false;
+				}
+				break;
+			}
+
+
+			Type *vt = base_type(x.type);
+			GB_ASSERT(vt->kind == Type_SimdVector);
+			i64 count = vt->SimdVector.count;
+
+			i64 sz = type_size_of(elem);
+			Type *new_elem = nullptr;
+
+			switch (sz) {
+			case 1: new_elem = t_u8;  break;
+			case 2: new_elem = t_u16; break;
+			case 4: new_elem = t_u32; break;
+			case 8: new_elem = t_u64; break;
+			case 16:
+				error(x.expr, "'%.*s' is not supported for 128-bit integer backed #simd vector types", LIT(builtin_name));
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = alloc_type_simd_vector(count, new_elem);
+			return true;
+		}
+
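The all-0s/all-1s masks combine directly with simd_select; a minimal sketch of a per-lane minimum (core:intrinsics imported):

	a: #simd[4]f32 = {1, 5, 3, 7}
	b: #simd[4]f32 = {4, 4, 4, 4}
	mask := intrinsics.simd_lanes_lt(a, b)     // #simd[4]u32: 0xFFFF_FFFF where a < b, else 0
	lo   := intrinsics.simd_select(mask, a, b) // {1, 4, 3, 4}
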
+	case BuiltinProc_simd_extract:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			i64 max_count = x.type->SimdVector.count;
+			i64 value = -1;
+			if (!check_index_value(c, x.type, false, ce->args[1], max_count, &value)) {
+				return false;
+			}
+			if (value < 0) { // a non-constant index leaves `value` at -1; the vector length itself is always positive
+				error(ce->args[1], "'%.*s' expected a constant integer index, got '%lld'", LIT(builtin_name), cast(long long)value);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = elem;
+			return true;
+		}
+		break;
+	case BuiltinProc_simd_replace:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			i64 max_count = x.type->SimdVector.count;
+			i64 value = -1;
+			if (!check_index_value(c, x.type, false, ce->args[1], max_count, &value)) {
+				return false;
+			}
+			if (value < 0) { // a non-constant index leaves `value` at -1; the vector length itself is always positive
+				error(ce->args[1], "'%.*s' expected a constant integer index, got '%lld'", LIT(builtin_name), cast(long long)value);
+				return false;
+			}
+
+			Operand y = {};
+			check_expr_with_type_hint(c, &y, ce->args[2], elem); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &y, elem); if (y.mode == Addressing_Invalid) return false;
+			if (!are_types_identical(y.type, elem)) {
+				gbString et = type_to_string(elem);
+				gbString yt = type_to_string(y.type);
+				error(y.expr, "'%.*s' expected a type of '%s' to insert, got '%s'", LIT(builtin_name), et, yt);
+				gb_string_free(yt);
+				gb_string_free(et);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = x.type;
+			return true;
+		}
+		break;
+
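Illustrative calls (core:intrinsics imported):

	v: #simd[4]i32 = {10, 20, 30, 40}
	e := intrinsics.simd_extract(v, 2)     // 30; the index must be a constant in 0..<4
	w := intrinsics.simd_replace(v, 2, -1) // {10, 20, -1, 40}
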
+	case BuiltinProc_simd_reduce_add_ordered:
+	case BuiltinProc_simd_reduce_mul_ordered:
+	case BuiltinProc_simd_reduce_min:
+	case BuiltinProc_simd_reduce_max:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			if (!is_type_integer(elem) && !is_type_float(elem)) {
+				gbString xs = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = base_array_type(x.type);
+			return true;
+		}
+
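Sketch of the horizontal reductions — the result is the scalar element type (core:intrinsics imported):

	v: #simd[4]f32 = {1, 2, 3, 4}
	sum := intrinsics.simd_reduce_add_ordered(v) // 10, lanes accumulated in order
	hi  := intrinsics.simd_reduce_max(v)         // 4
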
+	case BuiltinProc_simd_reduce_and:
+	case BuiltinProc_simd_reduce_or:
+	case BuiltinProc_simd_reduce_xor:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			if (!is_type_integer(elem) && !is_type_boolean(elem)) {
+				gbString xs = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a #simd type with an integer or boolean element, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = base_array_type(x.type);
+			return true;
+		}
+
+
+	case BuiltinProc_simd_shuffle:
+		{
+			Operand x = {};
+			Operand y = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!is_type_simd_vector(y.type)) {
+				error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!are_types_identical(x.type, y.type)) {
+				gbString xs = type_to_string(x.type);
+				gbString ys = type_to_string(y.type);
+				error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+				gb_string_free(ys);
+				gb_string_free(xs);
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+
+			i64 max_count = x.type->SimdVector.count + y.type->SimdVector.count;
+
+			i64 arg_count = 0;
+			for_array(i, ce->args) {
+				if (i < 2) {
+					continue;
+				}
+				Ast *arg = ce->args[i];
+				Operand op = {};
+				check_expr(c, &op, arg);
+				if (op.mode == Addressing_Invalid) {
+					return false;
+				}
+				Type *arg_type = base_type(op.type);
+				if (!is_type_integer(arg_type) || op.mode != Addressing_Constant) {
+					error(op.expr, "Indices to '%.*s' must be constant integers", LIT(builtin_name));
+					return false;
+				}
+
+				if (big_int_is_neg(&op.value.value_integer)) {
+					error(op.expr, "Negative '%.*s' index", LIT(builtin_name));
+					return false;
+				}
+
+				BigInt mc = {};
+				big_int_from_i64(&mc, max_count);
+				if (big_int_cmp(&mc, &op.value.value_integer) <= 0) {
+					error(op.expr, "'%.*s' index exceeds length", LIT(builtin_name));
+					return false;
+				}
+
+				arg_count++;
+			}
+
+			if (arg_count > max_count) {
+				error(call, "Too many '%.*s' indices, %td > %td", LIT(builtin_name), arg_count, max_count);
+				return false;
+			}
+
+
+			if (!is_power_of_two(arg_count)) {
+				error(call, "'%.*s' must have a power-of-two number of index arguments, got %lld", LIT(builtin_name), cast(long long)arg_count);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = alloc_type_simd_vector(arg_count, elem);
+			return true;
+		}
+
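Illustrative shuffle — indices are compile-time constants in 0..<2N (the first vector supplies 0..<N, the second N..<2N), and their count must be a power of two, which fixes the result width (core:intrinsics imported):

	a: #simd[4]f32 = {0, 1, 2, 3}
	b: #simd[4]f32 = {4, 5, 6, 7}
	r := intrinsics.simd_shuffle(a, b, 0, 4, 1, 5) // #simd[4]f32{0, 4, 1, 5}
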
+	case BuiltinProc_simd_select:
+		{
+			Operand cond = {};
+			check_expr(c, &cond, ce->args[0]); if (cond.mode == Addressing_Invalid) return false;
+
+			if (!is_type_simd_vector(cond.type)) {
+				error(cond.expr, "'%.*s' expected a simd vector type for the condition", LIT(builtin_name));
+				return false;
+			}
+			Type *cond_elem = base_array_type(cond.type);
+			if (!is_type_boolean(cond_elem) && !is_type_integer(cond_elem)) {
+				gbString cond_str = type_to_string(cond.type);
+				error(cond.expr, "'%.*s' expected a simd vector boolean or integer type, got '%s'", LIT(builtin_name), cond_str);
+				gb_string_free(cond_str);
+				return false;
+			}
+
+			Operand x = {};
+			Operand y = {};
+			check_expr(c, &x, ce->args[1]); if (x.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &y, ce->args[2], x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!is_type_simd_vector(y.type)) {
+				error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!are_types_identical(x.type, y.type)) {
+				gbString xs = type_to_string(x.type);
+				gbString ys = type_to_string(y.type);
+				error(x.expr, "'%.*s' expected 2 value arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+				gb_string_free(ys);
+				gb_string_free(xs);
+				return false;
+			}
+
+			if (cond.type->SimdVector.count != x.type->SimdVector.count) {
+				error(x.expr, "'%.*s' expected the condition vector length to match the result vector length, got '%lld' vs '%lld'",
+				      LIT(builtin_name),
+				      cast(long long)cond.type->SimdVector.count,
+				      cast(long long)x.type->SimdVector.count);
+				return false;
+			}
+
+
+			operand->mode = Addressing_Value;
+			operand->type = x.type;
+			return true;
+		}
+
+	case BuiltinProc_simd_ceil:
+	case BuiltinProc_simd_floor:
+	case BuiltinProc_simd_trunc:
+	case BuiltinProc_simd_nearest:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			if (!is_type_float(elem)) {
+				gbString x_str = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a simd vector floating point type, got '%s'", LIT(builtin_name), x_str);
+				gb_string_free(x_str);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = x.type;
+			return true;
+		}
+
+	case BuiltinProc_simd_lanes_reverse:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			operand->type = x.type;
+			operand->mode = Addressing_Value;
+			return true;
+		}
+
+	case BuiltinProc_simd_lanes_rotate_left:
+	case BuiltinProc_simd_lanes_rotate_right:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Operand offset = {};
+			check_expr(c, &offset, ce->args[1]); if (offset.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &offset, t_i64);
+			if (!is_type_integer(offset.type) || offset.mode != Addressing_Constant) {
+				error(offset.expr, "'%.*s' expected a constant integer offset", LIT(builtin_name));
+				return false;
+			}
+			check_assignment(c, &offset, t_i64, builtin_name);
+
+			operand->type = x.type;
+			operand->mode = Addressing_Value;
+			return true;
+		}
+
+	case BuiltinProc_simd_clamp:
+		{
+			Operand x = {};
+			Operand y = {};
+			Operand z = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &z, ce->args[2], x.type); if (z.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &z, x.type);
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!is_type_simd_vector(y.type)) {
+				error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!is_type_simd_vector(z.type)) {
+				error(z.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			if (!are_types_identical(x.type, y.type)) {
+				gbString xs = type_to_string(x.type);
+				gbString ys = type_to_string(y.type);
+				error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+				gb_string_free(ys);
+				gb_string_free(xs);
+				return false;
+			}
+			if (!are_types_identical(x.type, z.type)) {
+				gbString xs = type_to_string(x.type);
+				gbString zs = type_to_string(z.type);
+				error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, zs);
+				gb_string_free(zs);
+				gb_string_free(xs);
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			if (!is_type_integer(elem) && !is_type_float(elem)) {
+				gbString xs = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = x.type;
+			return true;
+		}
+
+	case BuiltinProc_simd_to_bits:
+		{
+			Operand x = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+			if (!is_type_simd_vector(x.type)) {
+				error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+				return false;
+			}
+			Type *elem = base_array_type(x.type);
+			i64 count = get_array_type_count(x.type);
+			i64 sz = type_size_of(elem);
+			Type *bit_elem = nullptr;
+			switch (sz) {
+			case 1: bit_elem = t_u8;  break;
+			case 2: bit_elem = t_u16; break;
+			case 4: bit_elem = t_u32; break;
+			case 8: bit_elem = t_u64; break;
+			}
+			GB_ASSERT(bit_elem != nullptr);
+
+			operand->type = alloc_type_simd_vector(count, bit_elem);
+			operand->mode = Addressing_Value;
+			return true;
+		}
+
+	case BuiltinProc_simd_x86__MM_SHUFFLE:
+		{
+			Operand x[4] = {};
+			for (unsigned i = 0; i < 4; i++) {
+				check_expr(c, x+i, ce->args[i]); if (x[i].mode == Addressing_Invalid) return false;
+			}
+
+			u32 offsets[4] = {6, 4, 2, 0};
+			u32 result = 0;
+			for (unsigned i = 0; i < 4; i++) {
+				if (!is_type_integer(x[i].type) || x[i].mode != Addressing_Constant) {
+					gbString xs = type_to_string(x[i].type);
+					error(x[i].expr, "'%.*s' expected a constant integer, got %s", LIT(builtin_name), xs);
+					gb_string_free(xs);
+					return false;
+				}
+				i64 val = exact_value_to_i64(x[i].value);
+				if (val < 0 || val > 3) {
+					error(x[i].expr, "'%.*s' expected a constant integer in the range 0..<4, got %lld", LIT(builtin_name), cast(long long)val);
+					return false;
+				}
+				result |= cast(u32)(val) << offsets[i];
+			}
+
+			operand->type = t_untyped_integer;
+			operand->mode = Addressing_Constant;
+			operand->value = exact_value_i64(result);
+			return true;
+		}
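This computes the classic selector byte at compile time — a hedged sketch, assuming the builtin is exposed as `intrinsics.simd_x86__MM_SHUFFLE` per the BuiltinProc name:

	SEL :: intrinsics.simd_x86__MM_SHUFFLE(3, 2, 1, 0) // (3<<6)|(2<<4)|(1<<2)|0 = 0xE4, the identity selector
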
+	default:
+		GB_PANIC("Unhandled simd intrinsic: %.*s", LIT(builtin_name));
+	}
+
+	return false;
+}
+
+
 bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) {
 	ast_node(ce, CallExpr, call);
 	if (ce->inlining != ProcInlining_none) {
@@ -479,7 +1140,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		break;
 	}
 
-	String builtin_name = builtin_procs[id].name;
+	String const &builtin_name = builtin_procs[id].name;
 
 
 	if (ce->args.count > 0) {
@@ -491,6 +1152,17 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		}
 	}
 
+	if (BuiltinProc__simd_begin < id && id < BuiltinProc__simd_end) {
+		bool ok = check_builtin_simd_operation(c, operand, call, id, type_hint);
+		if (!ok) {
+			operand->type = t_invalid;
+		}
+		operand->mode = Addressing_Value;
+		operand->value = {};
+		operand->expr = call;
+		return ok;
+	}
+
 	switch (id) {
 	default:
 		GB_PANIC("Implement built-in procedure: %.*s", LIT(builtin_name));
@@ -1031,6 +1703,11 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 			           bt->Struct.soa_kind == StructSoa_Dynamic) {
 				mode = Addressing_Value;
 			}
+		} else if (is_type_simd_vector(op_type)) {
+			Type *bt = base_type(op_type);
+			mode  = Addressing_Constant;
+			value = exact_value_i64(bt->SimdVector.count);
+			type  = t_untyped_integer;
 		}
 		if (operand->mode == Addressing_Type && mode != Addressing_Constant) {
 			mode = Addressing_Invalid;
@@ -1445,6 +2122,11 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 			operand->mode = Addressing_Value;
 		}
 
+		if (is_type_simd_vector(type) && !is_power_of_two(arg_count)) {
+			error(call, "'swizzle' with a #simd vector must have a power-of-two number of index arguments, got %lld", cast(long long)arg_count);
+			return false;
+		}
+
 		operand->type = determine_swizzle_array_type(original_type, type_hint, arg_count);
 		break;
 	}
@@ -2279,7 +2961,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 					if (i == j) continue;
 					Operand *b = ops[j];
 					convert_to_typed(c, a, b->type);
-					if (a->mode == Addressing_Invalid) { return false; }
+					if (a->mode == Addressing_Invalid) return false;
 				}
 			}
 
@@ -2685,46 +3367,6 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		break;
 	}
 	
-	case BuiltinProc_simd_vector: {
-		Operand x = {};
-		Operand y = {};
-		x = *operand;
-		if (!is_type_integer(x.type) || x.mode != Addressing_Constant) {
-			error(call, "Expected a constant integer for 'intrinsics.simd_vector'");
-			operand->mode = Addressing_Type;
-			operand->type = t_invalid;
-			return false;
-		}
-		if (big_int_is_neg(&x.value.value_integer)) {
-			error(call, "Negative vector element length");
-			operand->mode = Addressing_Type;
-			operand->type = t_invalid;
-			return false;
-		}
-		i64 count = big_int_to_i64(&x.value.value_integer);
-
-		check_expr_or_type(c, &y, ce->args[1]);
-		if (y.mode != Addressing_Type) {
-			error(call, "Expected a type 'intrinsics.simd_vector'");
-			operand->mode = Addressing_Type;
-			operand->type = t_invalid;
-			return false;
-		}
-		Type *elem = y.type;
-		if (!is_type_valid_vector_elem(elem)) {
-			gbString str = type_to_string(elem);
-			error(call, "Invalid element type for 'intrinsics.simd_vector', expected an integer or float with no specific endianness, got '%s'", str);
-			gb_string_free(str);
-			operand->mode = Addressing_Type;
-			operand->type = t_invalid;
-			return false;
-		}
-
-		operand->mode = Addressing_Type;
-		operand->type = alloc_type_simd_vector(count, elem);
-		break;
-	}
-	
 	case BuiltinProc_is_package_imported: {
 		bool value = false;
 
@@ -2944,7 +3586,14 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 				return false;
 			}
 
-			if (!is_type_integer_like(x.type)) {
+			if (is_type_simd_vector(x.type)) {
+				Type *elem = base_array_type(x.type);
+				if (!is_type_integer_like(elem)) {
+					gbString xts = type_to_string(x.type);
+					error(x.expr, "#simd values passed to '%.*s' must have an element of an integer-like type (integer, boolean, enum, bit_set), got %s", LIT(builtin_name), xts);
+					gb_string_free(xts);
+				}
+			} else if (!is_type_integer_like(x.type)) {
 				gbString xts = type_to_string(x.type);
 				error(x.expr, "Values passed to '%.*s' must be an integer-like type (integer, boolean, enum, bit_set), got %s", LIT(builtin_name), xts);
 				gb_string_free(xts);
@@ -3002,7 +3651,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 			if (y.mode == Addressing_Invalid) {
 				return false;
 			}
-			convert_to_typed(c, &y, x.type);
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
 			convert_to_typed(c, &x, y.type);
 			if (is_type_untyped(x.type)) {
 				gbString xts = type_to_string(x.type);
@@ -3039,14 +3688,23 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 			if (x.mode == Addressing_Invalid) {
 				return false;
 			}
-			if (!is_type_float(x.type)) {
+
+			Type *elem = core_array_type(x.type);
+			if (!is_type_float(x.type) && !(is_type_simd_vector(x.type) && is_type_float(elem))) {
 				gbString xts = type_to_string(x.type);
-				error(x.expr, "Expected a floating point value for '%.*s', got %s", LIT(builtin_name), xts);
+				error(x.expr, "Expected a floating point or #simd vector value for '%.*s', got %s", LIT(builtin_name), xts);
 				gb_string_free(xts);
 				return false;
+			} else if (is_type_different_to_arch_endianness(elem)) {
+				GB_ASSERT(elem->kind == Type_Basic);
+				if (elem->Basic.flags & (BasicFlag_EndianLittle|BasicFlag_EndianBig)) {
+					gbString xts = type_to_string(x.type);
+					error(x.expr, "Expected a float which does not specify the explicit endianness for '%.*s', got %s", LIT(builtin_name), xts);
+					gb_string_free(xts);
+					return false;
+				}
 			}
-
-			if (x.mode == Addressing_Constant) {
+			if (is_type_float(x.type) && x.mode == Addressing_Constant) {
 				f64 v = exact_value_to_f64(x.value);
 
 				operand->mode = Addressing_Constant;
@@ -3059,6 +3717,59 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		}
 		break;
 
+	case BuiltinProc_fused_mul_add:
+		{
+			Operand x = {};
+			Operand y = {};
+			Operand z = {};
+			check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+			check_expr(c, &y, ce->args[1]); if (y.mode == Addressing_Invalid) return false;
+			check_expr(c, &z, ce->args[2]); if (z.mode == Addressing_Invalid) return false;
+
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &x, y.type); if (x.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &z, x.type); if (z.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &x, z.type); if (x.mode == Addressing_Invalid) return false;
+			if (is_type_untyped(x.type)) {
+				gbString xts = type_to_string(x.type);
+				error(x.expr, "Expected a typed floating point value or #simd vector for '%.*s', got %s", LIT(builtin_name), xts);
+				gb_string_free(xts);
+				return false;
+			}
+
+			Type *elem = core_array_type(x.type);
+			if (!is_type_float(x.type) && !(is_type_simd_vector(x.type) && is_type_float(elem))) {
+				gbString xts = type_to_string(x.type);
+				error(x.expr, "Expected a floating point or #simd vector value for '%.*s', got %s", LIT(builtin_name), xts);
+				gb_string_free(xts);
+				return false;
+			}
+			if (is_type_different_to_arch_endianness(elem)) {
+				GB_ASSERT(elem->kind == Type_Basic);
+				if (elem->Basic.flags & (BasicFlag_EndianLittle|BasicFlag_EndianBig)) {
+					gbString xts = type_to_string(x.type);
+					error(x.expr, "Expected a float which does not specify the explicit endianness for '%.*s', got %s", LIT(builtin_name), xts);
+					gb_string_free(xts);
+					return false;
+				}
+			}
+
+			if (!are_types_identical(x.type, y.type) || !are_types_identical(y.type, z.type)) {
+				gbString xts = type_to_string(x.type);
+				gbString yts = type_to_string(y.type);
+				gbString zts = type_to_string(z.type);
+				error(x.expr, "Mismatched types for '%.*s', got %s vs %s vs %s", LIT(builtin_name), xts, yts, zts);
+				gb_string_free(zts);
+				gb_string_free(yts);
+				gb_string_free(xts);
+				return false;
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = default_type(x.type);
+		}
+		break;
+
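Sketch of the two accepted shapes — scalars and #simd vectors of floats, with all three arguments of one identical type (core:intrinsics imported):

	r := intrinsics.fused_mul_add(f32(2), 3, 4) // 10, computed with a single rounding
	v: #simd[4]f32 = {1, 2, 3, 4}
	w := intrinsics.fused_mul_add(v, v, v)      // per lane a*b + c: {2, 6, 12, 20}
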
 	case BuiltinProc_mem_copy:
 	case BuiltinProc_mem_copy_non_overlapping:
 		{
@@ -3309,9 +4020,8 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		break;
 
 	case BuiltinProc_volatile_store:
-		/*fallthrough*/
 	case BuiltinProc_unaligned_store:
-		/*fallthrough*/
+	case BuiltinProc_non_temporal_store:
 	case BuiltinProc_atomic_store:
 		{
 			Type *elem = nullptr;
@@ -3358,9 +4068,8 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 
 
 	case BuiltinProc_volatile_load:
-		/*fallthrough*/
 	case BuiltinProc_unaligned_load:
-		/*fallthrough*/
+	case BuiltinProc_non_temporal_load:
 	case BuiltinProc_atomic_load:
 		{
 			Type *elem = nullptr;
@@ -3618,7 +4327,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 			if (x.mode == Addressing_Invalid) {
 				return false;
 			}
-			convert_to_typed(c, &y, x.type);
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
 			if (x.mode == Addressing_Invalid) {
 				return false;
 			}
@@ -3675,7 +4384,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 			if (y.mode == Addressing_Invalid) {
 				return false;
 			}
-			convert_to_typed(c, &y, x.type);
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
 			convert_to_typed(c, &x, y.type);
 			if (!are_types_identical(x.type, y.type)) {
 				gbString xts = type_to_string(x.type);
@@ -4566,6 +5275,64 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		}
 		break;
 
+	case BuiltinProc_x86_cpuid:
+		{
+			if (!is_arch_x86()) {
+				error(call, "'%.*s' is only allowed on x86 targets (i386, amd64)", LIT(builtin_name));
+				return false;
+			}
+
+			Operand ax = {};
+			Operand cx = {};
+
+			check_expr_with_type_hint(c, &ax, ce->args[0], t_u32); if (ax.mode == Addressing_Invalid) return false;
+			check_expr_with_type_hint(c, &cx, ce->args[1], t_u32); if (cx.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &ax, t_u32); if (ax.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &cx, t_u32); if (cx.mode == Addressing_Invalid) return false;
+			if (!are_types_identical(ax.type, t_u32)) {
+				gbString str = type_to_string(ax.type);
+				error(ax.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str);
+				gb_string_free(str);
+				return false;
+			}
+			if (!are_types_identical(cx.type, t_u32)) {
+				gbString str = type_to_string(cx.type);
+				error(cx.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str);
+				gb_string_free(str);
+				return false;
+			}
+			Type *types[4] = {t_u32, t_u32, t_u32, t_u32}; // eax ebx ecx edx
+			operand->type = alloc_type_tuple_from_field_types(types, gb_count_of(types), false, false);
+			operand->mode = Addressing_Value;
+			operand->value = {};
+			return true;
+		}
+		break;
+	case BuiltinProc_x86_xgetbv:
+		{
+			if (!is_arch_x86()) {
+				error(call, "'%.*s' is only allowed on x86 targets (i386, amd64)", LIT(builtin_name));
+				return false;
+			}
+
+			Operand cx = {};
+			check_expr_with_type_hint(c, &cx, ce->args[0], t_u32); if (cx.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &cx, t_u32); if (cx.mode == Addressing_Invalid) return false;
+			if (!are_types_identical(cx.type, t_u32)) {
+				gbString str = type_to_string(cx.type);
+				error(cx.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str);
+				gb_string_free(str);
+				return false;
+			}
+
+			Type *types[2] = {t_u32, t_u32};
+			operand->type = alloc_type_tuple_from_field_types(types, gb_count_of(types), false, false);
+			operand->mode = Addressing_Value;
+			operand->value = {};
+			return true;
+		}
+		break;
+
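These two intrinsics replace the deleted core:sys/cpu package; feature detection becomes a few lines of user code. A minimal sketch, with the bit position taken from the removed cpu_x86.odin (core:intrinsics imported):

	has_sse42 :: proc "contextless" () -> bool {
		_, _, ecx, _ := intrinsics.x86_cpuid(1, 0)
		return ecx & (1 << 20) != 0 // CPUID.1:ECX bit 20 = SSE4.2
	}
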
 	}
 
 	return true;

+ 35 - 15
src/check_decl.cpp

@@ -313,13 +313,19 @@ void check_type_decl(CheckerContext *ctx, Entity *e, Ast *init_expr, Type *def)
 	}
 	named->Named.base = base;
 
-	if (is_distinct && is_type_typeid(e->type)) {
-		error(init_expr, "'distinct' cannot be applied to 'typeid'");
-		is_distinct = false;
-	}
-	if (is_distinct && is_type_any(e->type)) {
-		error(init_expr, "'distinct' cannot be applied to 'any'");
-		is_distinct = false;
+	if (is_distinct) {
+		if (is_type_typeid(e->type)) {
+			error(init_expr, "'distinct' cannot be applied to 'typeid'");
+			is_distinct = false;
+		} else if (is_type_any(e->type)) {
+			error(init_expr, "'distinct' cannot be applied to 'any'");
+			is_distinct = false;
+		} else if (is_type_simd_vector(e->type)) {
+			gbString str = type_to_string(e->type);
+			error(init_expr, "'distinct' cannot be applied to '%s'", str);
+			gb_string_free(str);
+			is_distinct = false;
+		}
 	}
 	if (!is_distinct) {
 		e->type = bt;
@@ -893,6 +899,18 @@ void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
 		}
 	}
 
+	if (ac.require_target_feature.len != 0 && ac.enable_target_feature.len != 0) {
+		error(e->token, "Attributes @(require_target_feature=...) and @(enable_target_feature=...) cannot be used together");
+	} else if (ac.require_target_feature.len != 0) {
+		if (check_target_feature_is_enabled(e->token.pos, ac.require_target_feature)) {
+			e->Procedure.target_feature = ac.require_target_feature;
+		} else {
+			e->Procedure.target_feature_disabled = true;
+		}
+	} else if (ac.enable_target_feature.len != 0) {
+		enable_target_feature(e->token.pos, ac.enable_target_feature);
+		e->Procedure.target_feature = ac.enable_target_feature;
+	}
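The two attributes differ in intent; a hedged Odin-side sketch (assuming `import x86 "core:simd/x86"` and the `_mm_crc32_u8` wrapper defined above):

	@(enable_target_feature="sse4.2")  // compile this body with SSE4.2 turned on
	step :: proc "contextless" (crc: u32, b: u8) -> u32 {
		return x86._mm_crc32_u8(crc, b)
	}

	@(require_target_feature="sse4.2") // only usable when the build target already enables sse4.2
	fast_path :: proc "contextless" () { /* ... */ }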
 
 	switch (e->Procedure.optimization_mode) {
 	case ProcedureOptimizationMode_None:
@@ -996,10 +1014,12 @@ void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
 		}
 	}
 
-	if (pt->result_count == 0 && ac.require_results) {
-		error(pl->type, "'require_results' is not needed on a procedure with no results");
-	} else {
-		pt->require_results = ac.require_results;
+	if (ac.require_results) {
+		if (pt->result_count == 0) {
+			error(pl->type, "'require_results' is not needed on a procedure with no results");
+		} else {
+			pt->require_results = true;
+		}
 	}
 
 	if (ac.link_name.len > 0) {
@@ -1309,20 +1329,20 @@ void check_proc_group_decl(CheckerContext *ctx, Entity *&pg_entity, DeclInfo *d)
 
 			if (!both_have_where_clauses) switch (kind) {
 			case ProcOverload_Identical:
-				error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
+				error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
 				is_invalid = true;
 				break;
 			// case ProcOverload_CallingConvention:
-				// error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
+				// error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
 				// is_invalid = true;
 				// break;
 			case ProcOverload_ParamVariadic:
-				error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
+				error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
 				is_invalid = true;
 				break;
 			case ProcOverload_ResultCount:
 			case ProcOverload_ResultTypes:
-				error(p->token, "Overloaded procedure '%.*s' as the same parameters but different results in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
+				error(p->token, "Overloaded procedure '%.*s' has the same parameters but different results in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
 				is_invalid = true;
 				break;
 			case ProcOverload_Polymorphic:

+ 139 - 102
src/check_expr.cpp

@@ -442,6 +442,14 @@ bool find_or_generate_polymorphic_procedure(CheckerContext *old_c, Entity *base_
 	final_proc_type->Proc.is_poly_specialized = true;
 	final_proc_type->Proc.is_polymorphic = true;
 
+	final_proc_type->Proc.variadic            = src->Proc.variadic;
+	final_proc_type->Proc.require_results     = src->Proc.require_results;
+	final_proc_type->Proc.c_vararg            = src->Proc.c_vararg;
+	final_proc_type->Proc.has_named_results   = src->Proc.has_named_results;
+	final_proc_type->Proc.diverging           = src->Proc.diverging;
+	final_proc_type->Proc.return_by_pointer   = src->Proc.return_by_pointer;
+	final_proc_type->Proc.optional_ok         = src->Proc.optional_ok;
+
 
 	for (isize i = 0; i < operands.count; i++) {
 		Operand o = operands[i];
@@ -777,6 +785,14 @@ i64 check_distance_between_types(CheckerContext *c, Operand *operand, Type *type
 			return distance + 6;
 		}
 	}
+
+	if (is_type_simd_vector(dst)) {
+		Type *dst_elem = base_array_type(dst);
+		i64 distance = check_distance_between_types(c, operand, dst_elem);
+		if (distance >= 0) {
+			return distance + 6;
+		}
+	}
 	
 	if (is_type_matrix(dst)) {
 		Type *dst_elem = base_array_type(dst);
@@ -786,6 +802,7 @@ i64 check_distance_between_types(CheckerContext *c, Operand *operand, Type *type
 		}
 	}
 
+
 	if (is_type_any(dst)) {
 		if (!is_type_polymorphic(src)) {
 			if (operand->mode == Addressing_Context && operand->type == t_context) {
@@ -1328,6 +1345,19 @@ bool is_polymorphic_type_assignable(CheckerContext *c, Type *poly, Type *source,
 			}
 		} 
 		return false;
+
+	case Type_SimdVector:
+		if (source->kind == Type_SimdVector) {
+			if (poly->SimdVector.generic_count != nullptr) {
+				if (!polymorphic_assign_index(&poly->SimdVector.generic_count, &poly->SimdVector.count, source->SimdVector.count)) {
+					return false;
+				}
+			}
+			if (poly->SimdVector.count == source->SimdVector.count) {
+				return is_polymorphic_type_assignable(c, poly->SimdVector.elem, source->SimdVector.elem, true, modify_type);
+			}
+		}
+		return false;
 	}
 	return false;
 }
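The new Type_SimdVector case makes #simd parameters usable in polymorphic signatures, matching both the lane count and the element type. A minimal sketch under that assumption (the procedure name is illustrative):

	import "core:intrinsics"

	// $N binds to the lane count, $T to the element type.
	first_lane :: proc(v: #simd[$N]$T) -> T {
		return intrinsics.simd_extract(v, 0)
	}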
@@ -1567,9 +1597,11 @@ bool check_unary_op(CheckerContext *c, Operand *o, Token op) {
 
 bool check_binary_op(CheckerContext *c, Operand *o, Token op) {
 	Type *main_type = o->type;
+
 	// TODO(bill): Handle errors correctly
 	Type *type = base_type(core_array_type(main_type));
 	Type *ct = core_type(type);
+
 	switch (op.kind) {
 	case Token_Sub:
 	case Token_SubEq:
@@ -1638,14 +1670,6 @@ bool check_binary_op(CheckerContext *c, Operand *o, Token op) {
 			error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string));
 			return false;
 		}
-		if (is_type_simd_vector(o->type)) {
-			switch (op.kind) {
-			case Token_ModMod:
-			case Token_ModModEq:
-				error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string));
-				return false;
-			}
-		}
 		break;
 
 	case Token_AndNot:
@@ -1654,14 +1678,6 @@ bool check_binary_op(CheckerContext *c, Operand *o, Token op) {
 			error(op, "Operator '%.*s' is only allowed with integers and bit sets", LIT(op.string));
 			return false;
 		}
-		if (is_type_simd_vector(o->type)) {
-			switch (op.kind) {
-			case Token_AndNot:
-			case Token_AndNotEq:
-				error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string));
-				return false;
-			}
-		}
 		break;
 
 	case Token_CmpAnd:
@@ -2487,6 +2503,8 @@ void check_shift(CheckerContext *c, Operand *x, Operand *y, Ast *node, Type *typ
 		gb_string_free(err_str);
 	}
 
+	// TODO(bill): Should we support shifts for fixed arrays and #simd vectors?
+
 	if (!is_type_integer(x->type)) {
 		gbString err_str = expr_to_string(y->expr);
 		error(node, "Shift operand '%s' must be an integer", err_str);
@@ -2697,6 +2715,26 @@ bool check_is_castable_to(CheckerContext *c, Operand *operand, Type *y) {
 		return true;
 	}
 
+	if (is_type_simd_vector(src) && is_type_simd_vector(dst)) {
+		if (src->SimdVector.count != dst->SimdVector.count) {
+			return false;
+		}
+		Type *elem_src = base_array_type(src);
+		Type *elem_dst = base_array_type(dst);
+		Operand x = {};
+		x.type = elem_src;
+		x.mode = Addressing_Value;
+		return check_is_castable_to(c, &x, elem_dst);
+	}
+
+	if (is_type_simd_vector(dst)) {
+		Type *elem = base_array_type(dst);
+		if (check_is_castable_to(c, operand, elem)) {
+			return true;
+		}
+	}
+
+
 	return false;
 }
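Taken together, these branches permit lane-wise casts between vectors of equal length and scalar-to-vector broadcasts. An illustrative Odin sketch, hedged from the checks above:

	cast_example :: proc(f: #simd[4]f32) -> (#simd[4]i32, #simd[4]f32) {
		i := cast(#simd[4]i32)f // lane counts must match; each lane is converted
		v := cast(#simd[4]f32)2 // a scalar castable to the element type is splatted
		return i, v
	}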
 
@@ -4116,7 +4154,11 @@ ExactValue get_constant_field(CheckerContext *c, Operand const *operand, Selecti
 
 Type *determine_swizzle_array_type(Type *original_type, Type *type_hint, isize new_count) {
 	Type *array_type = base_type(type_deref(original_type));
-	GB_ASSERT(array_type->kind == Type_Array);
+	GB_ASSERT(array_type->kind == Type_Array || array_type->kind == Type_SimdVector);
+	if (array_type->kind == Type_SimdVector) {
+		Type *elem_type = array_type->SimdVector.elem;
+		return alloc_type_simd_vector(new_count, elem_type);
+	}
 	Type *elem_type = array_type->Array.elem;
 
 	Type *swizzle_array_type = nullptr;
@@ -7738,111 +7780,106 @@ ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *node, Type *
 		}
 
 		if (cl->elems.count > 0 && cl->elems[0]->kind == Ast_FieldValue) {
-			if (is_type_simd_vector(t)) {
-				error(cl->elems[0], "'field = value' is not allowed for SIMD vector literals");
-			} else {
-				RangeCache rc = range_cache_make(heap_allocator());
-				defer (range_cache_destroy(&rc));
+			RangeCache rc = range_cache_make(heap_allocator());
+			defer (range_cache_destroy(&rc));
 
-				for_array(i, cl->elems) {
-					Ast *elem = cl->elems[i];
-					if (elem->kind != Ast_FieldValue) {
-						error(elem, "Mixture of 'field = value' and value elements in a literal is not allowed");
-						continue;
-					}
-					ast_node(fv, FieldValue, elem);
+			for_array(i, cl->elems) {
+				Ast *elem = cl->elems[i];
+				if (elem->kind != Ast_FieldValue) {
+					error(elem, "Mixture of 'field = value' and value elements in a literal is not allowed");
+					continue;
+				}
+				ast_node(fv, FieldValue, elem);
 
-					if (is_ast_range(fv->field)) {
-						Token op = fv->field->BinaryExpr.op;
+				if (is_ast_range(fv->field)) {
+					Token op = fv->field->BinaryExpr.op;
 
-						Operand x = {};
-						Operand y = {};
-						bool ok = check_range(c, fv->field, &x, &y, nullptr);
-						if (!ok) {
-							continue;
-						}
-						if (x.mode != Addressing_Constant || !is_type_integer(core_type(x.type))) {
-							error(x.expr, "Expected a constant integer as an array field");
-							continue;
-						}
+					Operand x = {};
+					Operand y = {};
+					bool ok = check_range(c, fv->field, &x, &y, nullptr);
+					if (!ok) {
+						continue;
+					}
+					if (x.mode != Addressing_Constant || !is_type_integer(core_type(x.type))) {
+						error(x.expr, "Expected a constant integer as an array field");
+						continue;
+					}
 
-						if (y.mode != Addressing_Constant || !is_type_integer(core_type(y.type))) {
-							error(y.expr, "Expected a constant integer as an array field");
-							continue;
-						}
+					if (y.mode != Addressing_Constant || !is_type_integer(core_type(y.type))) {
+						error(y.expr, "Expected a constant integer as an array field");
+						continue;
+					}
 
-						i64 lo = exact_value_to_i64(x.value);
-						i64 hi = exact_value_to_i64(y.value);
-						i64 max_index = hi;
-						if (op.kind == Token_RangeHalf) { // ..< (exclusive)
-							hi -= 1;
-						} else { // .. (inclusive)
-							max_index += 1;
-						}
+					i64 lo = exact_value_to_i64(x.value);
+					i64 hi = exact_value_to_i64(y.value);
+					i64 max_index = hi;
+					if (op.kind == Token_RangeHalf) { // ..< (exclusive)
+						hi -= 1;
+					} else { // .. (inclusive)
+						max_index += 1;
+					}
 
-						bool new_range = range_cache_add_range(&rc, lo, hi);
-						if (!new_range) {
-							error(elem, "Overlapping field range index %lld %.*s %lld for %.*s", lo, LIT(op.string), hi, LIT(context_name));
-							continue;
-						}
+					bool new_range = range_cache_add_range(&rc, lo, hi);
+					if (!new_range) {
+						error(elem, "Overlapping field range index %lld %.*s %lld for %.*s", lo, LIT(op.string), hi, LIT(context_name));
+						continue;
+					}
 
 
-						if (max_type_count >= 0 && (lo < 0 || lo >= max_type_count)) {
-							error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", lo, max_type_count, LIT(context_name));
-							continue;
-						}
-						if (max_type_count >= 0 && (hi < 0 || hi >= max_type_count)) {
-							error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", hi, max_type_count, LIT(context_name));
-							continue;
-						}
+					if (max_type_count >= 0 && (lo < 0 || lo >= max_type_count)) {
+						error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", lo, max_type_count, LIT(context_name));
+						continue;
+					}
+					if (max_type_count >= 0 && (hi < 0 || hi >= max_type_count)) {
+						error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", hi, max_type_count, LIT(context_name));
+						continue;
+					}
 
-						if (max < hi) {
-							max = max_index;
-						}
+					if (max < hi) {
+						max = max_index;
+					}
 
-						Operand operand = {};
-						check_expr_with_type_hint(c, &operand, fv->value, elem_type);
-						check_assignment(c, &operand, elem_type, context_name);
+					Operand operand = {};
+					check_expr_with_type_hint(c, &operand, fv->value, elem_type);
+					check_assignment(c, &operand, elem_type, context_name);
 
-						is_constant = is_constant && operand.mode == Addressing_Constant;
-					} else {
-						Operand op_index = {};
-						check_expr(c, &op_index, fv->field);
+					is_constant = is_constant && operand.mode == Addressing_Constant;
+				} else {
+					Operand op_index = {};
+					check_expr(c, &op_index, fv->field);
 
-						if (op_index.mode != Addressing_Constant || !is_type_integer(core_type(op_index.type))) {
-							error(elem, "Expected a constant integer as an array field");
-							continue;
-						}
-						// add_type_and_value(c->info, op_index.expr, op_index.mode, op_index.type, op_index.value);
+					if (op_index.mode != Addressing_Constant || !is_type_integer(core_type(op_index.type))) {
+						error(elem, "Expected a constant integer as an array field");
+						continue;
+					}
+					// add_type_and_value(c->info, op_index.expr, op_index.mode, op_index.type, op_index.value);
 
-						i64 index = exact_value_to_i64(op_index.value);
+					i64 index = exact_value_to_i64(op_index.value);
 
-						if (max_type_count >= 0 && (index < 0 || index >= max_type_count)) {
-							error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", index, max_type_count, LIT(context_name));
-							continue;
-						}
+					if (max_type_count >= 0 && (index < 0 || index >= max_type_count)) {
+						error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", index, max_type_count, LIT(context_name));
+						continue;
+					}
 
-						bool new_index = range_cache_add_index(&rc, index);
-						if (!new_index) {
-							error(elem, "Duplicate field index %lld for %.*s", index, LIT(context_name));
-							continue;
-						}
+					bool new_index = range_cache_add_index(&rc, index);
+					if (!new_index) {
+						error(elem, "Duplicate field index %lld for %.*s", index, LIT(context_name));
+						continue;
+					}
 
-						if (max < index+1) {
-							max = index+1;
-						}
+					if (max < index+1) {
+						max = index+1;
+					}
 
-						Operand operand = {};
-						check_expr_with_type_hint(c, &operand, fv->value, elem_type);
-						check_assignment(c, &operand, elem_type, context_name);
+					Operand operand = {};
+					check_expr_with_type_hint(c, &operand, fv->value, elem_type);
+					check_assignment(c, &operand, elem_type, context_name);
 
-						is_constant = is_constant && operand.mode == Addressing_Constant;
-					}
+					is_constant = is_constant && operand.mode == Addressing_Constant;
 				}
-
-				cl->max_count = max;
 			}
 
+			cl->max_count = max;
 		} else {
 			isize index = 0;
 			for (; index < cl->elems.count; index++) {
@@ -7887,7 +7924,7 @@ ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *node, Type *
 
 		if (t->kind == Type_SimdVector) {
 			if (!is_constant) {
-				error(node, "Expected all constant elements for a simd vector");
+				// error(node, "Expected all constant elements for a simd vector");
 			}
 		}
 

+ 41 - 15
src/check_stmt.cpp

@@ -1381,6 +1381,18 @@ bool all_operands_valid(Array<Operand> const &operands) {
 	return true;
 }
 
+bool check_stmt_internal_builtin_proc_id(Ast *expr, BuiltinProcId *id_) {
+	BuiltinProcId id = BuiltinProc_Invalid;
+	Entity *e = entity_of_node(expr);
+	if (e != nullptr && e->kind == Entity_Builtin) {
+		if (e->Builtin.id && e->Builtin.id != BuiltinProc_DIRECTIVE) {
+			id = cast(BuiltinProcId)e->Builtin.id;
+		}
+	}
+	if (id_) *id_ = id;
+	return id != BuiltinProc_Invalid;
+}
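This helper lets the statement checker treat builtin calls like ordinary procedure calls: an expression-kind builtin used as a statement is now an error unless its table entry sets ignore_results (as the atomics do below). A sketch of the user-visible effect, with illustrative values:

	import "core:intrinsics"

	example :: proc() {
		x: u64
		intrinsics.atomic_add(&x, 1) // OK as a statement: atomics may discard their result
		// intrinsics.count_ones(x)  // error: requires that its results must be handled
		n := intrinsics.count_ones(x)
		_ = n
	}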
+
 void check_stmt_internal(CheckerContext *ctx, Ast *node, u32 flags) {
 	u32 mod_flags = flags & (~Stmt_FallthroughAllowed);
 	switch (node->kind) {
@@ -1405,29 +1417,43 @@ void check_stmt_internal(CheckerContext *ctx, Ast *node, u32 flags) {
 			if (kind == Expr_Stmt) {
 				return;
 			}
-			Ast *expr = strip_or_return_expr(operand.expr);
 
+			Ast *expr = strip_or_return_expr(operand.expr);
 			if (expr->kind == Ast_CallExpr) {
+				BuiltinProcId builtin_id = BuiltinProc_Invalid;
+				bool do_require = false;
+
 				AstCallExpr *ce = &expr->CallExpr;
-				Type *t = type_of_expr(ce->proc);
-				if (is_type_proc(t)) {
-					if (t->Proc.require_results) {
-						gbString expr_str = expr_to_string(ce->proc);
-						error(node, "'%s' requires that its results must be handled", expr_str);
-						gb_string_free(expr_str);
-					}
+				Type *t = base_type(type_of_expr(ce->proc));
+				if (t->kind == Type_Proc) {
+					do_require = t->Proc.require_results;
+				} else if (check_stmt_internal_builtin_proc_id(ce->proc, &builtin_id)) {
+					auto const &bp = builtin_procs[builtin_id];
+					do_require = bp.kind == Expr_Expr && !bp.ignore_results;
+				}
+				if (do_require) {
+					gbString expr_str = expr_to_string(ce->proc);
+					error(node, "'%s' requires that its results must be handled", expr_str);
+					gb_string_free(expr_str);
 				}
 				return;
 			} else if (expr->kind == Ast_SelectorCallExpr) {
+				BuiltinProcId builtin_id = BuiltinProc_Invalid;
+				bool do_require = false;
+
 				AstSelectorCallExpr *se = &expr->SelectorCallExpr;
 				ast_node(ce, CallExpr, se->call);
-				Type *t = type_of_expr(ce->proc);
-				if (is_type_proc(t)) {
-					if (t->Proc.require_results) {
-						gbString expr_str = expr_to_string(ce->proc);
-						error(node, "'%s' requires that its results must be handled", expr_str);
-						gb_string_free(expr_str);
-					}
+				Type *t = base_type(type_of_expr(ce->proc));
+				if (t->kind == Type_Proc) {
+					do_require = t->Proc.require_results;
+				} else if (check_stmt_internal_builtin_proc_id(ce->proc, &builtin_id)) {
+					auto const &bp = builtin_procs[builtin_id];
+					do_require = bp.kind == Expr_Expr && !bp.ignore_results;
+				}
+				if (do_require) {
+					gbString expr_str = expr_to_string(ce->proc);
+					error(node, "'%s' requires that its results must be handled", expr_str);
+					gb_string_free(expr_str);
 				}
 				return;
 			}

+ 16 - 4
src/check_type.cpp

@@ -1234,7 +1234,7 @@ bool check_type_specialization_to(CheckerContext *ctx, Type *specialization, Typ
 }
 
 
-Type *determine_type_from_polymorphic(CheckerContext *ctx, Type *poly_type, Operand operand) {
+Type *determine_type_from_polymorphic(CheckerContext *ctx, Type *poly_type, Operand const &operand) {
 	bool modify_type = !ctx->no_polymorphic_errors;
 	bool show_error = modify_type && !ctx->hide_polymorphic_errors;
 	if (!is_operand_value(operand)) {
@@ -2795,15 +2795,27 @@ bool check_type_internal(CheckerContext *ctx, Ast *e, Type **type, Type *named_t
 				if (name == "soa") {
 					*type = make_soa_struct_fixed(ctx, e, at->elem, elem, count, generic_type);
 				} else if (name == "simd") {
-					if (!is_type_valid_vector_elem(elem)) {
+					if (!is_type_valid_vector_elem(elem) && !is_type_polymorphic(elem)) {
 						gbString str = type_to_string(elem);
-						error(at->elem, "Invalid element type for 'intrinsics.simd_vector', expected an integer or float with no specific endianness, got '%s'", str);
+						error(at->elem, "Invalid element type for #simd, expected an integer, float, or boolean with no specific endianness, got '%s'", str);
 						gb_string_free(str);
 						*type = alloc_type_array(elem, count, generic_type);
 						goto array_end;
 					}
 
-					*type = alloc_type_simd_vector(count, elem);
+					if (generic_type != nullptr) {
+						// Ignore
+					} else if (count < 1 || !is_power_of_two(count)) {
+						error(at->count, "Invalid length for #simd, expected a power-of-two length, got '%lld'", cast(long long)count);
+						*type = alloc_type_array(elem, count, generic_type);
+						goto array_end;
+					}
+
+					*type = alloc_type_simd_vector(count, elem, generic_type);
+
+					if (count > SIMD_ELEMENT_COUNT_MAX) {
+						error(at->count, "#simd supports a maximum element count of %d, got %lld", SIMD_ELEMENT_COUNT_MAX, cast(long long)count);
+					}
 				} else {
 					error(at->tag, "Invalid tag applied to array, got #%.*s", LIT(name));
 					*type = alloc_type_array(elem, count, generic_type);

+ 16 - 0
src/checker.cpp

@@ -3207,6 +3207,22 @@ DECL_ATTRIBUTE_PROC(proc_decl_attribute) {
 			}
 		}
 		return true;
+	} else if (name == "require_target_feature") {
+		ExactValue ev = check_decl_attribute_value(c, value);
+		if (ev.kind == ExactValue_String) {
+			ac->require_target_feature = ev.value_string;
+		} else {
+			error(elem, "Expected a string value for '%.*s'", LIT(name));
+		}
+		return true;
+	} else if (name == "enable_target_feature") {
+		ExactValue ev = check_decl_attribute_value(c, value);
+		if (ev.kind == ExactValue_String) {
+			ac->enable_target_feature = ev.value_string;
+		} else {
+			error(elem, "Expected a string value for '%.*s'", LIT(name));
+		}
+		return true;
 	}
 	return false;
 }

+ 4 - 0
src/checker.hpp

@@ -60,6 +60,7 @@ struct BuiltinProc {
 	ExprKind kind;
 	BuiltinProcPkg pkg;
 	bool diverging;
+	bool ignore_results; // exempts the builtin from 'require_results' handling
 };
 
 
@@ -124,6 +125,9 @@ struct AttributeContext {
 	String  objc_name;
 	bool    objc_is_class_method;
 	Type *  objc_type;
+
+	String require_target_feature; // required by the target micro-architecture
+	String enable_target_feature;  // will be enabled for the procedure only
 };
 
 AttributeContext make_attribute_context(String link_prefix) {

+ 163 - 27
src/checker_builtin_procs.hpp

@@ -45,7 +45,6 @@ enum BuiltinProcId {
 	// "Intrinsics"
 	BuiltinProc_is_package_imported,
 	
-	BuiltinProc_simd_vector,
 	BuiltinProc_soa_struct,
 
 	BuiltinProc_alloca,
@@ -66,6 +65,7 @@ enum BuiltinProcId {
 	BuiltinProc_overflow_mul,
 
 	BuiltinProc_sqrt,
+	BuiltinProc_fused_mul_add,
 
 	BuiltinProc_mem_copy,
 	BuiltinProc_mem_copy_non_overlapping,
@@ -80,6 +80,8 @@ enum BuiltinProcId {
 	
 	BuiltinProc_unaligned_store,
 	BuiltinProc_unaligned_load,
+	BuiltinProc_non_temporal_store,
+	BuiltinProc_non_temporal_load,
 	
 	BuiltinProc_prefetch_read_instruction,
 	BuiltinProc_prefetch_read_data,
@@ -118,10 +120,76 @@ enum BuiltinProcId {
 	BuiltinProc_fixed_point_div_sat,
 
 	BuiltinProc_expect,
+
+BuiltinProc__simd_begin,
+	BuiltinProc_simd_add,
+	BuiltinProc_simd_sub,
+	BuiltinProc_simd_mul,
+	BuiltinProc_simd_div,
+	BuiltinProc_simd_rem,
+	BuiltinProc_simd_shl,        // Odin logic
+	BuiltinProc_simd_shr,        // Odin logic
+	BuiltinProc_simd_shl_masked, // C logic
+	BuiltinProc_simd_shr_masked, // C logic
+
+	BuiltinProc_simd_add_sat, // saturation arithmetic
+	BuiltinProc_simd_sub_sat, // saturation arithmetic
+
+	BuiltinProc_simd_and,
+	BuiltinProc_simd_or,
+	BuiltinProc_simd_xor,
+	BuiltinProc_simd_and_not,
+
+	BuiltinProc_simd_neg,
+	BuiltinProc_simd_abs,
+
+	BuiltinProc_simd_min,
+	BuiltinProc_simd_max,
+	BuiltinProc_simd_clamp,
+
+	BuiltinProc_simd_lanes_eq,
+	BuiltinProc_simd_lanes_ne,
+	BuiltinProc_simd_lanes_lt,
+	BuiltinProc_simd_lanes_le,
+	BuiltinProc_simd_lanes_gt,
+	BuiltinProc_simd_lanes_ge,
+
+	BuiltinProc_simd_extract,
+	BuiltinProc_simd_replace,
+
+	BuiltinProc_simd_reduce_add_ordered,
+	BuiltinProc_simd_reduce_mul_ordered,
+	BuiltinProc_simd_reduce_min,
+	BuiltinProc_simd_reduce_max,
+	BuiltinProc_simd_reduce_and,
+	BuiltinProc_simd_reduce_or,
+	BuiltinProc_simd_reduce_xor,
+
+	BuiltinProc_simd_shuffle,
+	BuiltinProc_simd_select,
+
+	BuiltinProc_simd_ceil,
+	BuiltinProc_simd_floor,
+	BuiltinProc_simd_trunc,
+	BuiltinProc_simd_nearest,
+
+	BuiltinProc_simd_to_bits,
+
+	BuiltinProc_simd_lanes_reverse,
+	BuiltinProc_simd_lanes_rotate_left,
+	BuiltinProc_simd_lanes_rotate_right,
+
+
+	// Platform specific SIMD intrinsics
+	BuiltinProc_simd_x86__MM_SHUFFLE,
+BuiltinProc__simd_end,
 	
 	// Platform specific intrinsics
 	BuiltinProc_syscall,
 
+	BuiltinProc_x86_cpuid,
+	BuiltinProc_x86_xgetbv,
+
 	// Constant type tests
 
 BuiltinProc__type_begin,
@@ -268,7 +336,6 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	// "Intrinsics"
 	{STR_LIT("is_package_imported"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 		
-	{STR_LIT("simd_vector"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type
 	{STR_LIT("soa_struct"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type
 
 	{STR_LIT("alloca"),    2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
@@ -290,6 +357,7 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	{STR_LIT("overflow_mul"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
 	{STR_LIT("sqrt"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("fused_mul_add"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
 	{STR_LIT("mem_copy"),                 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
 	{STR_LIT("mem_copy_non_overlapping"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
@@ -304,6 +372,8 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	
 	{STR_LIT("unaligned_store"),  2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
 	{STR_LIT("unaligned_load"),   1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("non_temporal_store"),  2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
+	{STR_LIT("non_temporal_load"),   1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	
 	{STR_LIT("prefetch_read_instruction"),  2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
 	{STR_LIT("prefetch_read_data"),         2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
@@ -315,26 +385,26 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	{STR_LIT("atomic_signal_fence"),                     1, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
 	{STR_LIT("atomic_store"),                            2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
 	{STR_LIT("atomic_store_explicit"),                   3, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_load"),                             1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_load_explicit"),                    2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_add"),                              2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_add_explicit"),                     3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_sub"),                              2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_sub_explicit"),                     3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_and"),                              2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_and_explicit"),                     3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_nand"),                             2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_nand_explicit"),                    3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_or"),                               2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_or_explicit"),                      3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_xor"),                              2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_xor_explicit"),                     3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_exchange"),                         2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_exchange_explicit"),                3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_compare_exchange_strong"),          3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_compare_exchange_strong_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_compare_exchange_weak"),            3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("atomic_compare_exchange_weak_explicit"),   5, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("atomic_load"),                             1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_load_explicit"),                    2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_add"),                              2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_add_explicit"),                     3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_sub"),                              2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_sub_explicit"),                     3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_and"),                              2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_and_explicit"),                     3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_nand"),                             2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_nand_explicit"),                    3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_or"),                               2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_or_explicit"),                      3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_xor"),                              2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_xor_explicit"),                     3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_exchange"),                         2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_exchange_explicit"),                3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_compare_exchange_strong"),          3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_compare_exchange_strong_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_compare_exchange_weak"),            3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("atomic_compare_exchange_weak_explicit"),   5, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
 
 	{STR_LIT("fixed_point_mul"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("fixed_point_div"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
@@ -342,8 +412,74 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	{STR_LIT("fixed_point_div_sat"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
 	{STR_LIT("expect"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	
-	{STR_LIT("syscall"), 1, true, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_add"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_sub"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_mul"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_div"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_rem"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_shl"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_shr"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_shl_masked"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_shr_masked"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_add_sat"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_sub_sat"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_and"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_or"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_xor"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_and_not"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_neg"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_abs"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_min"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_max"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_clamp"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_lanes_eq"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_lanes_ne"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_lanes_lt"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_lanes_le"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_lanes_gt"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_lanes_ge"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_min"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_max"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_and"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_or"),          1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_xor"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_shuffle"), 2, true,  Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_select"),  3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_ceil"),  1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_floor"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_trunc"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_nearest"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_to_bits"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_lanes_reverse"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_lanes_rotate_left"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_lanes_rotate_right"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT("simd_x86__MM_SHUFFLE"), 4, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+	{STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
+
+
+	{STR_LIT("syscall"), 1, true, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("x86_cpuid"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("x86_xgetbv"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
 
 	{STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
@@ -429,12 +565,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 
 	{STR_LIT("__entry_point"), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
 
-	{STR_LIT("objc_send"),   3, true,  Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("objc_send"),   3, true,  Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
 
 	{STR_LIT("objc_find_selector"),     1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("objc_find_class"),        1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("objc_register_selector"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-	{STR_LIT("objc_register_class"),    1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("objc_register_selector"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+	{STR_LIT("objc_register_class"),    1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
 
 	{STR_LIT("constant_utf16_cstring"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 

+ 7 - 0
src/common.cpp

@@ -47,6 +47,13 @@ void debugf(char const *fmt, ...);
 #include "range_cache.cpp"
 
 
+bool is_power_of_two(i64 x) {
+	if (x <= 0) {
+		return false;
+	}
+	return !(x & (x-1));
+}
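This helper backs the #simd length validation in check_type.cpp above: a count is accepted only when it is positive and x & (x-1) is zero. Worked bits for illustration:

	8 = 0b1000, 8-1 = 0b0111, 8 & 7 = 0 -> power of two:       #simd[8]f32 is accepted
	6 = 0b0110, 6-1 = 0b0101, 6 & 5 = 4 -> not a power of two: #simd[6]f32 is rejected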
+
 int isize_cmp(isize x, isize y) {
 	if (x < y) {
 		return -1;

+ 5 - 3
src/entity.cpp

@@ -233,10 +233,12 @@ struct Entity {
 			String  link_name;
 			String  link_prefix;
 			DeferredProcedure deferred_procedure;
-			bool    is_foreign;
-			bool    is_export;
-			bool    generated_from_polymorphic;
 			ProcedureOptimizationMode optimization_mode;
+			bool    is_foreign                 : 1;
+			bool    is_export                  : 1;
+			bool    generated_from_polymorphic : 1;
+			bool    target_feature_disabled    : 1;
+			String  target_feature;
 		} Procedure;
 		struct {
 			Array<Entity *> entities;

+ 2 - 2
src/llvm_backend.cpp

@@ -1332,8 +1332,8 @@ void lb_generate_code(lbGenerator *gen) {
 		}
 	}
 
-	if (build_context.target_features.len != 0) {
-		llvm_features = alloc_cstring(permanent_allocator(), build_context.target_features);
+	if (build_context.target_features_set.entries.count != 0) {
+		llvm_features = target_features_set_to_cstring(permanent_allocator(), false);
 	}
 
 	// GB_ASSERT_MSG(LLVMTargetHasAsmBackend(target));

+ 91 - 20
src/llvm_backend_const.cpp

@@ -495,9 +495,9 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
 		res.value = data;
 		return res;
 	} else if (is_type_array(type) &&
-	    value.kind != ExactValue_Invalid &&
-	    value.kind != ExactValue_String &&
-	    value.kind != ExactValue_Compound) {
+		value.kind != ExactValue_Invalid &&
+		value.kind != ExactValue_String &&
+		value.kind != ExactValue_Compound) {
 
 		i64 count  = type->Array.count;
 		Type *elem = type->Array.elem;
@@ -513,8 +513,8 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
 		res.value = llvm_const_array(lb_type(m, elem), elems, cast(unsigned)count);
 		return res;
 	} else if (is_type_matrix(type) &&
-	    value.kind != ExactValue_Invalid &&
-	    value.kind != ExactValue_Compound) {
+		value.kind != ExactValue_Invalid &&
+		value.kind != ExactValue_Compound) {
 		i64 row = type->Matrix.row_count;
 		i64 column = type->Matrix.column_count;
 		GB_ASSERT(row == column);
@@ -537,6 +537,22 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
 		
 		res.value = LLVMConstArray(lb_type(m, elem), elems, cast(unsigned)total_elem_count);
 		return res;
+	} else if (is_type_simd_vector(type) &&
+		value.kind != ExactValue_Invalid &&
+		value.kind != ExactValue_Compound) {
+		i64 count = type->SimdVector.count;
+		Type *elem = type->SimdVector.elem;
+
+		lbValue single_elem = lb_const_value(m, elem, value, allow_local);
+		single_elem.value = llvm_const_cast(single_elem.value, lb_type(m, elem));
+
+		LLVMValueRef *elems = gb_alloc_array(permanent_allocator(), LLVMValueRef, count);
+		for (i64 i = 0; i < count; i++) {
+			elems[i] = single_elem.value;
+		}
+
+		res.value = LLVMConstVector(elems, cast(unsigned)count);
+		return res;
 	}
 
 	switch (value.kind) {
@@ -819,26 +835,81 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
 				return lb_const_nil(m, original_type);
 			}
 			GB_ASSERT(elem_type_can_be_constant(elem_type));
-
 			isize total_elem_count = cast(isize)type->SimdVector.count;
 			LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, total_elem_count);
 
-			for (isize i = 0; i < elem_count; i++) {
-				TypeAndValue tav = cl->elems[i]->tav;
-				GB_ASSERT(tav.mode != Addressing_Invalid);
-				values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value;
-			}
-			LLVMTypeRef et = lb_type(m, elem_type);
+			if (cl->elems[0]->kind == Ast_FieldValue) {
+				// TODO(bill): This is O(N*M) and will be quite slow; it should probably be sorted beforehand
+				isize value_index = 0;
+				for (i64 i = 0; i < total_elem_count; i++) {
+					bool found = false;
 
-			for (isize i = elem_count; i < type->SimdVector.count; i++) {
-				values[i] = LLVMConstNull(et);
-			}
-			for (isize i = 0; i < total_elem_count; i++) {
-				values[i] = llvm_const_cast(values[i], et);
-			}
+					for (isize j = 0; j < elem_count; j++) {
+						Ast *elem = cl->elems[j];
+						ast_node(fv, FieldValue, elem);
+						if (is_ast_range(fv->field)) {
+							ast_node(ie, BinaryExpr, fv->field);
+							TypeAndValue lo_tav = ie->left->tav;
+							TypeAndValue hi_tav = ie->right->tav;
+							GB_ASSERT(lo_tav.mode == Addressing_Constant);
+							GB_ASSERT(hi_tav.mode == Addressing_Constant);
 
-			res.value = LLVMConstVector(values, cast(unsigned)total_elem_count);
-			return res;
+							TokenKind op = ie->op.kind;
+							i64 lo = exact_value_to_i64(lo_tav.value);
+							i64 hi = exact_value_to_i64(hi_tav.value);
+							if (op != Token_RangeHalf) {
+								hi += 1;
+							}
+							if (lo == i) {
+								TypeAndValue tav = fv->value->tav;
+								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+								for (i64 k = lo; k < hi; k++) {
+									values[value_index++] = val;
+								}
+
+								found = true;
+								i += (hi-lo-1);
+								break;
+							}
+						} else {
+							TypeAndValue index_tav = fv->field->tav;
+							GB_ASSERT(index_tav.mode == Addressing_Constant);
+							i64 index = exact_value_to_i64(index_tav.value);
+							if (index == i) {
+								TypeAndValue tav = fv->value->tav;
+								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+								values[value_index++] = val;
+								found = true;
+								break;
+							}
+						}
+					}
+
+					if (!found) {
+						values[value_index++] = LLVMConstNull(lb_type(m, elem_type));
+					}
+				}
+
+				res.value = LLVMConstVector(values, cast(unsigned)total_elem_count);
+				return res;
+			} else {
+				for (isize i = 0; i < elem_count; i++) {
+					TypeAndValue tav = cl->elems[i]->tav;
+					GB_ASSERT(tav.mode != Addressing_Invalid);
+					values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value;
+				}
+				LLVMTypeRef et = lb_type(m, elem_type);
+
+				for (isize i = elem_count; i < total_elem_count; i++) {
+					values[i] = LLVMConstNull(et);
+				}
+				for (isize i = 0; i < total_elem_count; i++) {
+					values[i] = llvm_const_cast(values[i], et);
+				}
+
+				res.value = LLVMConstVector(values, cast(unsigned)total_elem_count);
+				return res;
+			}
 		} else if (is_type_struct(type)) {
 			ast_node(cl, CompoundLit, value.value_compound);
 

+ 207 - 1
src/llvm_backend_expr.cpp

@@ -258,7 +258,13 @@ lbValue lb_emit_unary_arith(lbProcedure *p, TokenKind op, lbValue x, Type *type)
 			LLVMBuildStore(p->builder, v2, LLVMBuildStructGEP(p->builder, addr.addr.value, 2, ""));
 			LLVMBuildStore(p->builder, v3, LLVMBuildStructGEP(p->builder, addr.addr.value, 3, ""));
 			return lb_addr_load(p, addr);
-
+		} else if (is_type_simd_vector(x.type)) {
+			Type *elem = base_array_type(x.type);
+			if (is_type_float(elem)) {
+				res.value = LLVMBuildFNeg(p->builder, x.value, "");
+			} else {
+				res.value = LLVMBuildNeg(p->builder, x.value, "");
+			}
 		} else {
 			GB_PANIC("Unhandled type %s", type_to_string(x.type));
 		}
@@ -1820,6 +1826,59 @@ lbValue lb_emit_conv(lbProcedure *p, lbValue value, Type *t) {
 		return res;
 	}
 
+	if (is_type_simd_vector(dst)) {
+		Type *et = base_array_type(dst);
+		if (is_type_simd_vector(src)) {
+			Type *src_elem = core_array_type(src);
+			Type *dst_elem = core_array_type(dst);
+
+			GB_ASSERT(src->SimdVector.count == dst->SimdVector.count);
+
+			lbValue res = {};
+			res.type = t;
+			if (are_types_identical(src_elem, dst_elem)) {
+				res.value = value.value;
+			} else if (is_type_float(src_elem) && is_type_integer(dst_elem)) {
+				if (is_type_unsigned(dst_elem)) {
+					res.value = LLVMBuildFPToUI(p->builder, value.value, lb_type(m, t), "");
+				} else {
+					res.value = LLVMBuildFPToSI(p->builder, value.value, lb_type(m, t), "");
+				}
+			} else if (is_type_integer(src_elem) && is_type_float(dst_elem)) {
+				if (is_type_unsigned(src_elem)) {
+					res.value = LLVMBuildUIToFP(p->builder, value.value, lb_type(m, t), "");
+				} else {
+					res.value = LLVMBuildSIToFP(p->builder, value.value, lb_type(m, t), "");
+				}
+			} else if ((is_type_integer(src_elem) || is_type_boolean(src_elem)) && is_type_integer(dst_elem)) {
+				res.value = LLVMBuildIntCast2(p->builder, value.value, lb_type(m, t), !is_type_unsigned(src_elem), "");
+			} else if (is_type_float(src_elem) && is_type_float(dst_elem)) {
+				res.value = LLVMBuildFPCast(p->builder, value.value, lb_type(m, t), "");
+			} else if (is_type_integer(src_elem) && is_type_boolean(dst_elem)) {
+				LLVMValueRef i1vector = LLVMBuildICmp(p->builder, LLVMIntNE, value.value, LLVMConstNull(LLVMTypeOf(value.value)), "");
+				res.value = LLVMBuildIntCast2(p->builder, i1vector, lb_type(m, t), !is_type_unsigned(src_elem), "");
+			} else {
+				GB_PANIC("Unhandled simd vector conversion: %s -> %s", type_to_string(src), type_to_string(dst));
+			}
+			return res;
+		} else {
+			i64 count = get_array_type_count(dst);
+			LLVMTypeRef vt = lb_type(m, t);
+			LLVMTypeRef llvm_u32 = lb_type(m, t_u32);
+			LLVMValueRef elem = lb_emit_conv(p, value, et).value;
+			LLVMValueRef vector = LLVMConstNull(vt);
+			for (i64 i = 0; i < count; i++) {
+				LLVMValueRef idx = LLVMConstInt(llvm_u32, i, false);
+				vector = LLVMBuildInsertElement(p->builder, vector, elem, idx, "");
+			}
+			lbValue res = {};
+			res.type = t;
+			res.value = vector;
+			return res;
+		}
+	}
+
+
 	// Pointer <-> uintptr
 	if (is_type_pointer(src) && is_type_uintptr(dst)) {
 		lbValue res = {};
@@ -2506,6 +2565,57 @@ lbValue lb_emit_comp(lbProcedure *p, TokenKind op_kind, lbValue left, lbValue ri
 		case Token_NotEq: pred = LLVMIntNE;  break;
 		}
 		res.value = LLVMBuildICmp(p->builder, pred, left.value, right.value, "");
+	} else if (is_type_simd_vector(a)) {
+		LLVMValueRef mask = nullptr;
+		Type *elem = base_array_type(a);
+		if (is_type_float(elem)) {
+			LLVMRealPredicate pred = {};
+			switch (op_kind) {
+			case Token_CmpEq: pred = LLVMRealOEQ; break;
+			case Token_NotEq: pred = LLVMRealONE; break;
+			}
+			mask = LLVMBuildFCmp(p->builder, pred, left.value, right.value, "");
+		} else {
+			LLVMIntPredicate pred = {};
+			switch (op_kind) {
+			case Token_CmpEq: pred = LLVMIntEQ; break;
+			case Token_NotEq: pred = LLVMIntNE; break;
+			}
+			mask = LLVMBuildICmp(p->builder, pred, left.value, right.value, "");
+		}
+		GB_ASSERT_MSG(mask != nullptr, "Unhandled comparison kind %s (%s) %.*s %s (%s)", type_to_string(left.type), type_to_string(base_type(left.type)), LIT(token_strings[op_kind]), type_to_string(right.type), type_to_string(base_type(right.type)));
+
+		/* NOTE(bill, 2022-05-28):
+			Thanks to Per Vognsen, sign extending <N x i1> to
+			a vector of the same width as the input vector, bit casting to an integer,
+			and then comparing against zero is the better option
+			See: https://lists.llvm.org/pipermail/llvm-dev/2012-September/053046.html
+
+			// Example assuming 128-bit vector
+
+			%1 = <4 x float> ...
+			%2 = <4 x float> ...
+			%3 = fcmp oeq <4 x float> %1, %2
+			%4 = sext <4 x i1> %3 to <4 x i32>
+			%5 = bitcast <4 x i32> %4 to i128
+			%6 = icmp ne i128 %5, 0
+			br i1 %6, label %true1, label %false2
+
+			This will result in 1 cmpps + 1 ptest + 1 br
+			(even without SSE4.1, contrary to what the mailing list states, because of pmovmskb)
+
+		*/
+
+		unsigned count = cast(unsigned)get_array_type_count(a);
+		unsigned elem_sz = cast(unsigned)(type_size_of(elem)*8);
+		LLVMTypeRef mask_type = LLVMVectorType(LLVMIntTypeInContext(p->module->ctx, elem_sz), count);
+		mask = LLVMBuildSExtOrBitCast(p->builder, mask, mask_type, "");
+
+		LLVMTypeRef mask_int_type = LLVMIntTypeInContext(p->module->ctx, cast(unsigned)(8*type_size_of(a)));
+		LLVMValueRef mask_int = LLVMBuildBitCast(p->builder, mask, mask_int_type, "");
+		res.value = LLVMBuildICmp(p->builder, LLVMIntNE, mask_int, LLVMConstNull(LLVMTypeOf(mask_int)), "");
+		return res;
+
 	} else {
 		GB_PANIC("Unhandled comparison kind %s (%s) %.*s %s (%s)", type_to_string(left.type), type_to_string(base_type(left.type)), LIT(token_strings[op_kind]), type_to_string(right.type), type_to_string(base_type(right.type)));
 	}
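At the language level, == and != between two #simd values therefore produce a single bool, computed by reducing the lane mask against zero as the note describes, rather than a lane-wise result. A tiny sketch:

	compare_example :: proc() {
		a := #simd[4]f32{1, 2, 3, 4}
		b := a
		assert(a == b) // one bool; lowers to a cmp + sext + bitcast + icmp-against-zero
	}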
@@ -4609,6 +4719,102 @@ lbAddr lb_build_addr(lbProcedure *p, Ast *expr) {
 			break;
 		}
 		
+		case Type_SimdVector: {
+			if (cl->elems.count > 0) {
+				lbValue vector_value = lb_const_value(p->module, type, exact_value_compound(expr));
+				defer (lb_addr_store(p, v, vector_value));
+
+				auto temp_data = array_make<lbCompoundLitElemTempData>(temporary_allocator(), 0, cl->elems.count);
+
+				// NOTE(bill): Separate value, store into their own chunks
+				for_array(i, cl->elems) {
+					Ast *elem = cl->elems[i];
+					if (elem->kind == Ast_FieldValue) {
+						ast_node(fv, FieldValue, elem);
+						if (lb_is_elem_const(fv->value, et)) {
+							continue;
+						}
+						if (is_ast_range(fv->field)) {
+							ast_node(ie, BinaryExpr, fv->field);
+							TypeAndValue lo_tav = ie->left->tav;
+							TypeAndValue hi_tav = ie->right->tav;
+							GB_ASSERT(lo_tav.mode == Addressing_Constant);
+							GB_ASSERT(hi_tav.mode == Addressing_Constant);
+
+							TokenKind op = ie->op.kind;
+							i64 lo = exact_value_to_i64(lo_tav.value);
+							i64 hi = exact_value_to_i64(hi_tav.value);
+							if (op != Token_RangeHalf) {
+								hi += 1;
+							}
+
+							lbValue value = lb_build_expr(p, fv->value);
+
+							for (i64 k = lo; k < hi; k++) {
+								lbCompoundLitElemTempData data = {};
+								data.value = value;
+								data.elem_index = cast(i32)k;
+								array_add(&temp_data, data);
+							}
+
+						} else {
+							auto tav = fv->field->tav;
+							GB_ASSERT(tav.mode == Addressing_Constant);
+							i64 index = exact_value_to_i64(tav.value);
+
+							lbValue value = lb_build_expr(p, fv->value);
+							lbCompoundLitElemTempData data = {};
+							data.value = lb_emit_conv(p, value, et);
+							data.expr = fv->value;
+							data.elem_index = cast(i32)index;
+							array_add(&temp_data, data);
+						}
+
+					} else {
+						if (lb_is_elem_const(elem, et)) {
+							continue;
+						}
+						lbCompoundLitElemTempData data = {};
+						data.expr = elem;
+						data.elem_index = cast(i32)i;
+						array_add(&temp_data, data);
+					}
+				}
+
+
+				for_array(i, temp_data) {
+					lbValue field_expr = temp_data[i].value;
+					Ast *expr = temp_data[i].expr;
+
+					auto prev_hint = lb_set_copy_elision_hint(p, lb_addr(temp_data[i].gep), expr);
+
+					if (field_expr.value == nullptr) {
+						field_expr = lb_build_expr(p, expr);
+					}
+					Type *t = field_expr.type;
+					GB_ASSERT(t->kind != Type_Tuple);
+					lbValue ev = lb_emit_conv(p, field_expr, et);
+
+					if (!p->copy_elision_hint.used) {
+						temp_data[i].value = ev;
+					}
+
+					lb_reset_copy_elision_hint(p, prev_hint);
+				}
+
+
+				// TODO(bill): reduce the need for individual `insertelement`s; a single
+				// `shufflevector` might be a better option
+
+				for_array(i, temp_data) {
+					if (temp_data[i].value.value != nullptr) {
+						LLVMValueRef index = lb_const_int(p->module, t_u32, temp_data[i].elem_index).value;
+						vector_value.value = LLVMBuildInsertElement(p->builder, vector_value.value, temp_data[i].value.value, index, "");
+					}
+				}
+			}
+			break;
+		}
 		}
 
 		return v;
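With the new Type_SimdVector case, literals containing non-constant elements are lowered too: the constant lanes come from lb_const_value, and each runtime element is patched in with insertelement. A hedged sketch of code that takes this path (the procedure name is illustrative):

	// 'x' is only known at runtime, so lane 0 is filled in with 'insertelement'.
	make_lane0 :: proc(x: f32) -> #simd[4]f32 {
		return #simd[4]f32{x, 0, 0, 0}
	}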

+ 552 - 0
src/llvm_backend_proc.cpp

@@ -169,6 +169,19 @@ lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool ignore_body)
 		}
 	}
 
+	if (!entity->Procedure.target_feature_disabled &&
+	    entity->Procedure.target_feature.len != 0) {
+		auto features = split_by_comma(entity->Procedure.target_feature);
+		for_array(i, features) {
+			String feature = features[i];
+			LLVMAttributeRef ref = LLVMCreateStringAttribute(
+				m->ctx,
+				cast(char const *)feature.text, cast(unsigned)feature.len,
+				"", 0);
+			LLVMAddAttributeAtIndex(p->value, LLVMAttributeIndex_FunctionIndex, ref);
+		}
+	}
+
 	if (entity->flags & EntityFlag_Cold) {
 		lb_add_attribute_to_proc(m, p->value, "cold");
 	}
@@ -981,10 +994,466 @@ lbValue lb_emit_call(lbProcedure *p, lbValue value, Array<lbValue> const &args,
 	return result;
 }
 
+LLVMValueRef llvm_splat_float(i64 count, LLVMTypeRef type, f64 value) {
+	LLVMValueRef v = LLVMConstReal(type, value);
+	LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+	for (i64 i = 0; i < count; i++) {
+		values[i] = v;
+	}
+	return LLVMConstVector(values, cast(unsigned)count);
+}
+LLVMValueRef llvm_splat_int(i64 count, LLVMTypeRef type, i64 value, bool is_signed=false) {
+	LLVMValueRef v = LLVMConstInt(type, value, is_signed);
+	LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+	for (i64 i = 0; i < count; i++) {
+		values[i] = v;
+	}
+	return LLVMConstVector(values, cast(unsigned)count);
+}
+
+
+lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv, BuiltinProcId builtin_id) {
+	ast_node(ce, CallExpr, expr);
+
+	lbModule *m = p->module;
+
+	lbValue res = {};
+	res.type = tv.type;
+
+	lbValue arg0 = {}; if (ce->args.count > 0) arg0 = lb_build_expr(p, ce->args[0]);
+	lbValue arg1 = {}; if (ce->args.count > 1) arg1 = lb_build_expr(p, ce->args[1]);
+	lbValue arg2 = {}; if (ce->args.count > 2) arg2 = lb_build_expr(p, ce->args[2]);
+
+	Type *elem = base_array_type(arg0.type);
+
+	bool is_float = is_type_float(elem);
+	bool is_signed = !is_type_unsigned(elem);
+
+	LLVMOpcode op_code = cast(LLVMOpcode)0;
+
+	switch (builtin_id) {
+	case BuiltinProc_simd_add:
+	case BuiltinProc_simd_sub:
+	case BuiltinProc_simd_mul:
+	case BuiltinProc_simd_div:
+	case BuiltinProc_simd_rem:
+		if (is_float) {
+			switch (builtin_id) {
+			case BuiltinProc_simd_add: op_code = LLVMFAdd; break;
+			case BuiltinProc_simd_sub: op_code = LLVMFSub; break;
+			case BuiltinProc_simd_mul: op_code = LLVMFMul; break;
+			case BuiltinProc_simd_div: op_code = LLVMFDiv; break;
+			}
+		} else {
+			switch (builtin_id) {
+			case BuiltinProc_simd_add: op_code = LLVMAdd; break;
+			case BuiltinProc_simd_sub: op_code = LLVMSub; break;
+			case BuiltinProc_simd_mul: op_code = LLVMMul; break;
+			case BuiltinProc_simd_div:
+				if (is_signed) {
+					op_code = LLVMSDiv;
+				} else {
+					op_code = LLVMUDiv;
+				}
+				break;
+			case BuiltinProc_simd_rem:
+				if (is_signed) {
+					op_code = LLVMSRem;
+				} else {
+					op_code = LLVMURem;
+				}
+				break;
+			}
+		}
+		if (op_code) {
+			res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, "");
+			return res;
+		}
+		break;
+	case BuiltinProc_simd_shl: // Odin logic
+	case BuiltinProc_simd_shr: // Odin logic
+	case BuiltinProc_simd_shl_masked: // C logic
+	case BuiltinProc_simd_shr_masked: // C logic
+		{
+			i64 sz = type_size_of(elem);
+			GB_ASSERT(arg0.type->kind == Type_SimdVector);
+
+			i64 count = arg0.type->SimdVector.count;
+			Type *elem1 = base_array_type(arg1.type);
+
+			bool is_masked = false;
+			switch (builtin_id) {
+			case BuiltinProc_simd_shl:        op_code = LLVMShl;                         is_masked = false; break;
+			case BuiltinProc_simd_shr:        op_code = is_signed ? LLVMAShr : LLVMLShr; is_masked = false; break;
+			case BuiltinProc_simd_shl_masked: op_code = LLVMShl;                         is_masked = true;  break;
+			case BuiltinProc_simd_shr_masked: op_code = is_signed ? LLVMAShr : LLVMLShr; is_masked = true;  break;
+			}
+			if (op_code) {
+				LLVMValueRef bits = llvm_splat_int(count, lb_type(m, elem1), sz*8 - 1);
+				if (is_masked) {
+					// C logic
+					LLVMValueRef shift = LLVMBuildAnd(p->builder, arg1.value, bits, "");
+					res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, shift, "");
+				} else {
+					// Odin logic
+					LLVMValueRef zero = lb_const_nil(m, arg1.type).value;
+					LLVMValueRef mask = LLVMBuildICmp(p->builder, LLVMIntULE, arg1.value, bits, "");
+					LLVMValueRef shift = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, "");
+					res.value = LLVMBuildSelect(p->builder, mask, shift, zero, "");
+				}
+				return res;
+			}
+		}
+		break;
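The two shift families differ exactly as the comments say: the Odin-style shifts produce zero for out-of-range lane amounts via the compare-and-select built here, while the masked variants wrap the amount as C does. An illustrative sketch assuming u32 lanes (mask = 31):

	import "core:intrinsics"

	shift_example :: proc() -> (#simd[4]u32, #simd[4]u32) {
		v := #simd[4]u32{1, 1, 1, 1}
		s := #simd[4]u32{0, 8, 31, 32}
		a := intrinsics.simd_shl(v, s)        // {1, 256, 1 << 31, 0}: amount 32 > 31 yields 0
		b := intrinsics.simd_shl_masked(v, s) // {1, 256, 1 << 31, 1}: amount is taken & 31
		return a, b
	}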
+	case BuiltinProc_simd_and:
+	case BuiltinProc_simd_or:
+	case BuiltinProc_simd_xor:
+	case BuiltinProc_simd_and_not:
+		switch (builtin_id) {
+		case BuiltinProc_simd_and: op_code = LLVMAnd; break;
+		case BuiltinProc_simd_or:  op_code = LLVMOr;  break;
+		case BuiltinProc_simd_xor: op_code = LLVMXor; break;
+		case BuiltinProc_simd_and_not:
+			op_code = LLVMAnd;
+			arg1.value = LLVMBuildNot(p->builder, arg1.value, "");
+			break;
+		}
+		if (op_code) {
+			res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, "");
+			return res;
+		}
+		break;
+	case BuiltinProc_simd_neg:
+		if (is_float) {
+			res.value = LLVMBuildFNeg(p->builder, arg0.value, "");
+		} else {
+			res.value = LLVMBuildNeg(p->builder, arg0.value, "");
+		}
+		return res;
+	case BuiltinProc_simd_abs:
+		if (is_float) {
+			LLVMValueRef pos = arg0.value;
+			LLVMValueRef neg = LLVMBuildFNeg(p->builder, pos, "");
+			LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOGT, pos, neg, "");
+			res.value = LLVMBuildSelect(p->builder, cond, pos, neg, "");
+		} else {
+			LLVMValueRef pos = arg0.value;
+			LLVMValueRef neg = LLVMBuildNeg(p->builder, pos, "");
+			LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? LLVMIntSGT : LLVMIntUGT, pos, neg, "");
+			res.value = LLVMBuildSelect(p->builder, cond, pos, neg, "");
+		}
+		return res;
+	case BuiltinProc_simd_min:
+		if (is_float) {
+			LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOLT, arg0.value, arg1.value, "");
+			res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, "");
+		} else {
+			LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? LLVMIntSLT : LLVMIntULT, arg0.value, arg1.value, "");
+			res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, "");
+		}
+		return res;
+	case BuiltinProc_simd_max:
+		if (is_float) {
+			LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOGT, arg0.value, arg1.value, "");
+			res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, "");
+		} else {
+			LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? LLVMIntSGT : LLVMIntUGT, arg0.value, arg1.value, "");
+			res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, "");
+		}
+		return res;
+	case BuiltinProc_simd_lanes_eq:
+	case BuiltinProc_simd_lanes_ne:
+	case BuiltinProc_simd_lanes_lt:
+	case BuiltinProc_simd_lanes_le:
+	case BuiltinProc_simd_lanes_gt:
+	case BuiltinProc_simd_lanes_ge:
+		if (is_float) {
+			LLVMRealPredicate pred = cast(LLVMRealPredicate)0;
+			switch (builtin_id) {
+			case BuiltinProc_simd_lanes_eq: pred = LLVMRealOEQ; break;
+			case BuiltinProc_simd_lanes_ne: pred = LLVMRealONE; break;
+			case BuiltinProc_simd_lanes_lt: pred = LLVMRealOLT; break;
+			case BuiltinProc_simd_lanes_le: pred = LLVMRealOLE; break;
+			case BuiltinProc_simd_lanes_gt: pred = LLVMRealOGT; break;
+			case BuiltinProc_simd_lanes_ge: pred = LLVMRealOGE; break;
+			}
+			if (pred) {
+				res.value = LLVMBuildFCmp(p->builder, pred, arg0.value, arg1.value, "");
+				res.value = LLVMBuildSExtOrBitCast(p->builder, res.value, lb_type(m, tv.type), "");
+				return res;
+			}
+		} else {
+			LLVMIntPredicate pred = cast(LLVMIntPredicate)0;
+			switch (builtin_id) {
+			case BuiltinProc_simd_lanes_eq: pred = LLVMIntEQ; break;
+			case BuiltinProc_simd_lanes_ne: pred = LLVMIntNE; break;
+			case BuiltinProc_simd_lanes_lt: pred = is_signed ? LLVMIntSLT : LLVMIntULT; break;
+			case BuiltinProc_simd_lanes_le: pred = is_signed ? LLVMIntSLE : LLVMIntULE; break;
+			case BuiltinProc_simd_lanes_gt: pred = is_signed ? LLVMIntSGT : LLVMIntUGT; break;
+			case BuiltinProc_simd_lanes_ge: pred = is_signed ? LLVMIntSGE : LLVMIntUGE; break;
+			}
+			if (pred) {
+				res.value = LLVMBuildICmp(p->builder, pred, arg0.value, arg1.value, "");
+				res.value = LLVMBuildSExtOrBitCast(p->builder, res.value, lb_type(m, tv.type), "");
+				return res;
+			}
+		}
+		break;
+
+	case BuiltinProc_simd_extract:
+		res.value = LLVMBuildExtractElement(p->builder, arg0.value, arg1.value, "");
+		return res;
+	case BuiltinProc_simd_replace:
+		res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, "");
+		return res;
+
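+	// Ordered float reductions take an explicit start accumulator as their first
+	// argument: the identity 0.0 for fadd and 1.0 for fmul.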
+	case BuiltinProc_simd_reduce_add_ordered:
+	case BuiltinProc_simd_reduce_mul_ordered:
+		{
+			LLVMTypeRef llvm_elem = lb_type(m, elem);
+			LLVMValueRef args[2] = {};
+			isize args_count = 0;
+
+			char const *name = nullptr;
+			switch (builtin_id) {
+			case BuiltinProc_simd_reduce_add_ordered:
+				if (is_float) {
+					name = "llvm.vector.reduce.fadd";
+					args[args_count++] = LLVMConstReal(llvm_elem, 0.0);
+				} else {
+					name = "llvm.vector.reduce.add";
+				}
+				break;
+			case BuiltinProc_simd_reduce_mul_ordered:
+				if (is_float) {
+					name = "llvm.vector.reduce.fmul";
+					args[args_count++] = LLVMConstReal(llvm_elem, 1.0);
+				} else {
+					name = "llvm.vector.reduce.mul";
+				}
+				break;
+			}
+			args[args_count++] = arg0.value;
+
+			LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)};
+			unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+			GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+			LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+			res.value = LLVMBuildCall(p->builder, ip, args, cast(unsigned)args_count, "");
+			return res;
+		}
+	case BuiltinProc_simd_reduce_min:
+	case BuiltinProc_simd_reduce_max:
+	case BuiltinProc_simd_reduce_and:
+	case BuiltinProc_simd_reduce_or:
+	case BuiltinProc_simd_reduce_xor:
+		{
+			char const *name = nullptr;
+			switch (builtin_id) {
+			case BuiltinProc_simd_reduce_min:
+				if (is_float) {
+					name = "llvm.vector.reduce.fmin";
+				} else if (is_signed) {
+					name = "llvm.vector.reduce.smin";
+				} else {
+					name = "llvm.vector.reduce.umin";
+				}
+				break;
+			case BuiltinProc_simd_reduce_max:
+				if (is_float) {
+					name = "llvm.vector.reduce.fmax";
+				} else if (is_signed) {
+					name = "llvm.vector.reduce.smax";
+				} else {
+					name = "llvm.vector.reduce.umax";
+				}
+				break;
+			case BuiltinProc_simd_reduce_and: name = "llvm.vector.reduce.and"; break;
+			case BuiltinProc_simd_reduce_or:  name = "llvm.vector.reduce.or";  break;
+			case BuiltinProc_simd_reduce_xor: name = "llvm.vector.reduce.xor"; break;
+			}
+			LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)};
+			unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+			GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+			LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+			LLVMValueRef args[1] = {};
+			args[0] = arg0.value;
+
+			res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+			return res;
+		}
+
+	case BuiltinProc_simd_shuffle:
+		{
+			Type *vt = arg0.type;
+			GB_ASSERT(vt->kind == Type_SimdVector);
+
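+			// The shuffle mask may select lanes from both inputs, so up to
+			// 2*count indices are permitted, and each index must be constant.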
+			i64 indices_count = ce->args.count-2;
+			i64 max_count = vt->SimdVector.count*2;
+			GB_ASSERT(indices_count <= max_count);
+
+			LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, indices_count);
+			for (isize i = 0; i < indices_count; i++) {
+				lbValue idx = lb_build_expr(p, ce->args[i+2]);
+				GB_ASSERT(LLVMIsConstant(idx.value));
+				values[i] = idx.value;
+			}
+			LLVMValueRef indices = LLVMConstVector(values, cast(unsigned)indices_count);
+
+			res.value = LLVMBuildShuffleVector(p->builder, arg0.value, arg1.value, indices, "");
+			return res;
+		}
+
+	case BuiltinProc_simd_select:
+		{
+			LLVMValueRef cond = arg0.value;
+			LLVMValueRef x = lb_build_expr(p, ce->args[1]).value;
+			LLVMValueRef y = lb_build_expr(p, ce->args[2]).value;
+
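+			// Compare each lane of the boolean vector against zero to build the
+			// i1 mask that LLVM's select instruction expects.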
+			cond = LLVMBuildICmp(p->builder, LLVMIntNE, cond, LLVMConstNull(LLVMTypeOf(cond)), "");
+			res.value = LLVMBuildSelect(p->builder, cond, x, y, "");
+			return res;
+		}
+
+	case BuiltinProc_simd_ceil:
+	case BuiltinProc_simd_floor:
+	case BuiltinProc_simd_trunc:
+	case BuiltinProc_simd_nearest:
+		{
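+			// These map directly onto LLVM's rounding intrinsics; simd_nearest
+			// uses llvm.nearbyint, which rounds in the current rounding mode.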
+			char const *name = nullptr;
+			switch (builtin_id) {
+			case BuiltinProc_simd_ceil:    name = "llvm.ceil"; break;
+			case BuiltinProc_simd_floor:   name = "llvm.floor"; break;
+			case BuiltinProc_simd_trunc:   name = "llvm.trunc"; break;
+			case BuiltinProc_simd_nearest: name = "llvm.nearbyint"; break;
+			}
+
+			LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)};
+			unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+			GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+			LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+			LLVMValueRef args[1] = {};
+			args[0] = arg0.value;
+
+			res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+			return res;
+		}
+
+	case BuiltinProc_simd_lanes_reverse:
+		{
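+			// Reverse the lanes by shuffling with the constant mask {count-1, ..., 1, 0}.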
+			i64 count = get_array_type_count(arg0.type);
+			LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+			LLVMTypeRef llvm_u32 = lb_type(m, t_u32);
+			for (i64 i = 0; i < count; i++) {
+				values[i] = LLVMConstInt(llvm_u32, count-1-i, false);
+			}
+			LLVMValueRef mask = LLVMConstVector(values, cast(unsigned)count);
+
+			LLVMValueRef v = arg0.value;
+			res.value = LLVMBuildShuffleVector(p->builder, v, v, mask, "");
+			return res;
+		}
+
+	case BuiltinProc_simd_lanes_rotate_left:
+	case BuiltinProc_simd_lanes_rotate_right:
+		{
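+			// The rotation amount is a compile-time constant, reduced modulo the
+			// lane count; requiring a power-of-two count lets the shuffle index
+			// be computed with a simple mask below.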
+			i64 count = get_array_type_count(arg0.type);
+			GB_ASSERT(is_power_of_two(count));
+			BigInt bi_count = {};
+			big_int_from_i64(&bi_count, count);
+
+			TypeAndValue const &tv = ce->args[1]->tav;
+			ExactValue val = exact_value_to_integer(tv.value);
+			GB_ASSERT(val.kind == ExactValue_Integer);
+			BigInt *bi = &val.value_integer;
+			if (builtin_id == BuiltinProc_simd_lanes_rotate_right) {
+				big_int_neg(bi, bi);
+			}
+			big_int_rem(bi, bi, &bi_count);
+			big_int_dealloc(&bi_count);
+
+			i64 left = big_int_to_i64(bi);
+
+			LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+			LLVMTypeRef llvm_u32 = lb_type(m, t_u32);
+			for (i64 i = 0; i < count; i++) {
+				u64 idx = cast(u64)(i+left) & cast(u64)(count-1);
+				values[i] = LLVMConstInt(llvm_u32, idx, false);
+			}
+			LLVMValueRef mask = LLVMConstVector(values, cast(unsigned)count);
+
+			LLVMValueRef v = arg0.value;
+			res.value = LLVMBuildShuffleVector(p->builder, v, v, mask, "");
+			return res;
+		}
+
+	case BuiltinProc_simd_add_sat:
+	case BuiltinProc_simd_sub_sat:
+		{
+			char const *name = nullptr;
+			switch (builtin_id) {
+			case BuiltinProc_simd_add_sat: name = is_signed ? "llvm.sadd.sat" : "llvm.uadd.sat"; break;
+			case BuiltinProc_simd_sub_sat: name = is_signed ? "llvm.ssub.sat" : "llvm.usub.sat"; break;
+			}
+
+			LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)};
+			unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+			GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+			LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+			LLVMValueRef args[2] = {};
+			args[0] = arg0.value;
+			args[1] = arg1.value;
+
+			res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+			return res;
+		}
+
+	case BuiltinProc_simd_clamp:
+		{
+			LLVMValueRef v = arg0.value;
+			LLVMValueRef min = arg1.value;
+			LLVMValueRef max = arg2.value;
+
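+			// Clamp with two compare+selects: raise v to min first, then cap it at max.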
+			if (is_float) {
+				v = LLVMBuildSelect(p->builder, LLVMBuildFCmp(p->builder, LLVMRealOLT, v, min, ""), min, v, "");
+				res.value = LLVMBuildSelect(p->builder, LLVMBuildFCmp(p->builder, LLVMRealOGT, v, max, ""), max, v, "");
+			} else if (is_signed) {
+				v = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntSLT, v, min, ""), min, v, "");
+				res.value = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntSGT, v, max, ""), max, v, "");
+			} else {
+				v = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntULT, v, min, ""), min, v, "");
+				res.value = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntUGT, v, max, ""), max, v, "");
+			}
+			return res;
+		}
+
+	case BuiltinProc_simd_to_bits:
+		{
+			res.value = LLVMBuildBitCast(p->builder, arg0.value, lb_type(m, tv.type), "");
+			return res;
+		}
+
+	}
+	GB_PANIC("Unhandled simd intrinsic: '%.*s'", LIT(builtin_procs[builtin_id].name));
+
+	return {};
+}
+
 
 lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv, BuiltinProcId id) {
 	ast_node(ce, CallExpr, expr);
 
+	if (BuiltinProc__simd_begin < id && id < BuiltinProc__simd_end) {
+		return lb_build_builtin_simd_proc(p, expr, tv, id);
+	}
+
 	switch (id) {
 	case BuiltinProc_DIRECTIVE: {
 		ast_node(bd, BasicDirective, ce->proc);
@@ -1532,6 +2001,31 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 			return res;
 		}
 
+	case BuiltinProc_fused_mul_add:
+		{
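+			// Lower to llvm.fma, which computes x*y + z with a single rounding step.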
+			Type *type = tv.type;
+			lbValue x = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), type);
+			lbValue y = lb_emit_conv(p, lb_build_expr(p, ce->args[1]), type);
+			lbValue z = lb_emit_conv(p, lb_build_expr(p, ce->args[2]), type);
+
+			char const *name = "llvm.fma";
+			LLVMTypeRef types[1] = {lb_type(p->module, type)};
+			unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+			GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+			LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+			LLVMValueRef args[3] = {};
+			args[0] = x.value;
+			args[1] = y.value;
+			args[2] = z.value;
+
+			lbValue res = {};
+			res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+			res.type = type;
+			return res;
+		}
+
 	case BuiltinProc_mem_copy:
 		{
 			lbValue dst = lb_build_expr(p, ce->args[0]);
@@ -1614,6 +2108,7 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 		return {};
 
 	case BuiltinProc_volatile_store:
+	case BuiltinProc_non_temporal_store:
 	case BuiltinProc_atomic_store:
 	case BuiltinProc_atomic_store_explicit: {
 		lbValue dst = lb_build_expr(p, ce->args[0]);
@@ -1622,6 +2117,13 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 
 		LLVMValueRef instr = LLVMBuildStore(p->builder, val.value, dst.value);
 		switch (id) {
+		case BuiltinProc_non_temporal_store:
+			{
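+				// Tag the store with !nontemporal metadata (i32 1) so the
+				// backend may emit a store that bypasses the cache.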
+				unsigned kind_id = LLVMGetMDKindIDInContext(p->module->ctx, "nontemporal", 11);
+				LLVMMetadataRef node = LLVMValueAsMetadata(LLVMConstInt(lb_type(p->module, t_u32), 1, false));
+				LLVMSetMetadata(instr, kind_id, LLVMMetadataAsValue(p->module->ctx, node));
+			}
+			break;
 		case BuiltinProc_volatile_store:        LLVMSetVolatile(instr, true);                                        break;
 		case BuiltinProc_atomic_store:          LLVMSetOrdering(instr, LLVMAtomicOrderingSequentiallyConsistent);    break;
 		case BuiltinProc_atomic_store_explicit: LLVMSetOrdering(instr, llvm_atomic_ordering_from_odin(ce->args[2])); break;
@@ -1633,12 +2135,21 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 	}
 
 	case BuiltinProc_volatile_load:
+	case BuiltinProc_non_temporal_load:
 	case BuiltinProc_atomic_load:
 	case BuiltinProc_atomic_load_explicit: {
 		lbValue dst = lb_build_expr(p, ce->args[0]);
 
 		LLVMValueRef instr = LLVMBuildLoad(p->builder, dst.value, "");
 		switch (id) {
+		case BuiltinProc_non_temporal_load:
+			{
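+				// Same !nontemporal metadata hint as non_temporal_store above.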
+				unsigned kind_id = LLVMGetMDKindIDInContext(p->module->ctx, "nontemporal", 11);
+				LLVMMetadataRef node = LLVMValueAsMetadata(LLVMConstInt(lb_type(p->module, t_u32), 1, false));
+				LLVMSetMetadata(instr, kind_id, LLVMMetadataAsValue(p->module->ctx, node));
+			}
+			break;
 		case BuiltinProc_volatile_load:        LLVMSetVolatile(instr, true);                                        break;
 		case BuiltinProc_atomic_load:          LLVMSetOrdering(instr, LLVMAtomicOrderingSequentiallyConsistent);    break;
 		case BuiltinProc_atomic_load_explicit: LLVMSetOrdering(instr, llvm_atomic_ordering_from_odin(ce->args[1])); break;
@@ -2232,6 +2743,47 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 			return res;
 		}
 
+
+	case BuiltinProc_x86_cpuid:
+		{
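+			// cpuid takes the leaf in eax and the subleaf in ecx and writes its
+			// results to eax/ebx/ecx/edx, hence this constraint string.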
+			Type *param_types[2] = {t_u32, t_u32};
+			Type *type = alloc_type_proc_from_types(param_types, gb_count_of(param_types), tv.type, false, ProcCC_None);
+			LLVMTypeRef func_type = LLVMGetElementType(lb_type(p->module, type));
+			LLVMValueRef the_asm = llvm_get_inline_asm(
+				func_type,
+				str_lit("cpuid"),
+				str_lit("={ax},={bx},={cx},={dx},{ax},{cx}"),
+				true
+			);
+			GB_ASSERT(the_asm != nullptr);
+
+			LLVMValueRef args[2] = {};
+			args[0] = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), t_u32).value;
+			args[1] = lb_emit_conv(p, lb_build_expr(p, ce->args[1]), t_u32).value;
+			lbValue res = {};
+			res.type = tv.type;
+			res.value = LLVMBuildCall2(p->builder, func_type, the_asm, args, gb_count_of(args), "");
+			return res;
+		}
+	case BuiltinProc_x86_xgetbv:
+		{
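+			// xgetbv selects the XCR register via ecx and returns it in edx:eax.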
+			Type *type = alloc_type_proc_from_types(&t_u32, 1, tv.type, false, ProcCC_None);
+			LLVMTypeRef func_type = LLVMGetElementType(lb_type(p->module, type));
+			LLVMValueRef the_asm = llvm_get_inline_asm(
+				func_type,
+				str_lit("xgetbv"),
+				str_lit("={ax},={dx},{cx}"),
+				true
+			);
+			GB_ASSERT(the_asm != nullptr);
+
+			LLVMValueRef args[1] = {};
+			args[0] = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), t_u32).value;
+			lbValue res = {};
+			res.type = tv.type;
+			res.value = LLVMBuildCall2(p->builder, func_type, the_asm, args, gb_count_of(args), "");
+			return res;
+		}
 	}
 
 	GB_PANIC("Unhandled built-in procedure %.*s", LIT(builtin_procs[id].name));

+ 9 - 2
src/llvm_backend_utility.cpp

@@ -201,6 +201,11 @@ lbValue lb_emit_transmute(lbProcedure *p, lbValue value, Type *t) {
 		return res;
 	}
 
+	if (is_type_simd_vector(src) && is_type_simd_vector(dst)) {
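+		// Same-size #simd vectors transmute as a plain bitcast.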
+		res.value = LLVMBuildBitCast(p->builder, value.value, lb_type(p->module, t), "");
+		return res;
+	}
+
 	if (lb_is_type_aggregate(src) || lb_is_type_aggregate(dst)) {
 		lbValue s = lb_address_from_load_or_generate_local(p, value);
 		lbValue d = lb_emit_transmute(p, s, alloc_type_pointer(t));
@@ -480,8 +485,10 @@ lbValue lb_emit_count_ones(lbProcedure *p, lbValue x, Type *type) {
 }
 
 lbValue lb_emit_count_zeros(lbProcedure *p, lbValue x, Type *type) {
-	i64 sz = 8*type_size_of(type);
-	lbValue size = lb_const_int(p->module, type, cast(u64)sz);
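+	// For #simd vectors the bit width must come from the element type and be
+	// splatted across the lanes: count_zeros(x) = width - count_ones(x) per lane.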
+	Type *elem = base_array_type(type);
+	i64 sz = 8*type_size_of(elem);
+	lbValue size = lb_const_int(p->module, elem, cast(u64)sz);
+	size = lb_emit_conv(p, size, type);
 	lbValue count = lb_emit_count_ones(p, x, type);
 	return lb_emit_arith(p, Token_Sub, size, count, type);
 }

+ 2 - 2
src/main.cpp

@@ -1376,8 +1376,8 @@ bool parse_build_flags(Array<String> args) {
 						}
 						case BuildFlag_TargetFeatures: {
 							GB_ASSERT(value.kind == ExactValue_String);
-							build_context.target_features = value.value_string;
-							string_to_lower(&build_context.target_features);
+							build_context.target_features_string = value.value_string;
+							string_to_lower(&build_context.target_features_string);
 							break;
 						}
 						case BuildFlag_RelocMode: {

+ 13 - 1
src/parser.cpp

@@ -360,6 +360,7 @@ Ast *clone_ast(Ast *node) {
 	case Ast_ArrayType:
 		n->ArrayType.count = clone_ast(n->ArrayType.count);
 		n->ArrayType.elem  = clone_ast(n->ArrayType.elem);
+		n->ArrayType.tag   = clone_ast(n->ArrayType.tag);
 		break;
 	case Ast_DynamicArrayType:
 		n->DynamicArrayType.elem = clone_ast(n->DynamicArrayType.elem);
@@ -2127,7 +2128,18 @@ Ast *parse_operand(AstFile *f, bool lhs) {
 		Token name = expect_token(f, Token_Ident);
 		if (name.string == "type") {
 			return ast_helper_type(f, token, parse_type(f));
-		} else if (name.string == "soa" || name.string == "simd") {
+		} else if (name.string == "simd") {
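+			// #simd reuses the fixed array syntax: attach the directive as the
+			// array type's tag and reject any other type.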
+			Ast *tag = ast_basic_directive(f, token, name);
+			Ast *original_type = parse_type(f);
+			Ast *type = unparen_expr(original_type);
+			switch (type->kind) {
+			case Ast_ArrayType: type->ArrayType.tag = tag; break;
+			default:
+				syntax_error(type, "Expected a fixed array type after #%.*s, got %.*s", LIT(name.string), LIT(ast_strings[type->kind]));
+				break;
+			}
+			return original_type;
+		} else if (name.string == "soa") {
 			Ast *tag = ast_basic_directive(f, token, name);
 			Ast *original_type = parse_type(f);
 			Ast *type = unparen_expr(original_type);

+ 0 - 1
src/parser.hpp

@@ -411,7 +411,6 @@ AST_KIND(_ExprBegin,  "",  bool) \
 		Token        ellipsis; \
 		ProcInlining inlining; \
 		bool         optional_ok_one; \
-		i32          builtin_id; \
 		void *sce_temp_data; \
 	}) \
 	AST_KIND(FieldValue,      "field value",              struct { Token eq; Ast *field, *value; }) \

+ 9 - 0
src/string.cpp

@@ -157,6 +157,15 @@ int string_compare(String const &x, String const &y) {
 	return 0;
 }
 
+isize string_index_byte(String const &s, u8 x) {
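+	// Linear scan for the first occurrence of byte x; returns -1 if absent.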
+	for (isize i = 0; i < s.len; i++) {
+		if (s.text[i] == x) {
+			return i;
+		}
+	}
+	return -1;
+}
+
 GB_COMPARE_PROC(string_cmp_proc) {
 	String x = *(String *)a;
 	String y = *(String *)b;

+ 21 - 3
src/types.cpp

@@ -261,6 +261,7 @@ struct TypeProc {
 	TYPE_KIND(SimdVector, struct {                            \
 		i64   count;                                      \
 		Type *elem;                                       \
+		Type *generic_count;                              \
 	})                                                        \
 	TYPE_KIND(RelativePointer, struct {                       \
 		Type *pointer_type;                               \
@@ -362,6 +363,9 @@ enum : int {
 	MATRIX_ELEMENT_COUNT_MIN = 1,
 	MATRIX_ELEMENT_COUNT_MAX = 16,
 	MATRIX_ELEMENT_MAX_SIZE = MATRIX_ELEMENT_COUNT_MAX * (2 * 8), // complex128
+
+	SIMD_ELEMENT_COUNT_MIN = 1,
+	SIMD_ELEMENT_COUNT_MAX = 64,
 };
 
 
@@ -1085,10 +1089,11 @@ Type *alloc_type_bit_set() {
 
 
 
-Type *alloc_type_simd_vector(i64 count, Type *elem) {
+Type *alloc_type_simd_vector(i64 count, Type *elem, Type *generic_count=nullptr) {
 	Type *t = alloc_type(Type_SimdVector);
 	t->SimdVector.count = count;
 	t->SimdVector.elem = elem;
+	t->SimdVector.generic_count = generic_count;
 	return t;
 }
 
@@ -1593,6 +1598,8 @@ i64 get_array_type_count(Type *t) {
 		return bt->Array.count;
 	} else if (bt->kind == Type_EnumeratedArray) {
 		return bt->EnumeratedArray.count;
+	} else if (bt->kind == Type_SimdVector) {
+		return bt->SimdVector.count;
 	}
 	GB_ASSERT(is_type_array_like(t));
 	return -1;
@@ -1932,11 +1939,14 @@ bool is_type_valid_vector_elem(Type *t) {
 			return false;
 		}
 		if (is_type_integer(t)) {
-			return true;
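+			// Integer lanes are allowed except for the 128-bit integers.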
+			return !is_type_integer_128bit(t);
 		}
 		if (is_type_float(t)) {
 			return true;
 		}
+		if (is_type_boolean(t)) {
+			return true;
+		}
 	}
 	return false;
 }
@@ -2078,6 +2088,11 @@ bool is_type_polymorphic(Type *t, bool or_specialized=false) {
 			return true;
 		}
 		return is_type_polymorphic(t->Array.elem, or_specialized);
+	case Type_SimdVector:
+		if (t->SimdVector.generic_count != nullptr) {
+			return true;
+		}
+		return is_type_polymorphic(t->SimdVector.elem, or_specialized);
 	case Type_DynamicArray:
 		return is_type_polymorphic(t->DynamicArray.elem, or_specialized);
 	case Type_Slice:
@@ -2291,6 +2306,9 @@ bool is_type_comparable(Type *t) {
 			}
 		}
 		return true;
+
+	case Type_SimdVector:
+		return true;
 	}
 	return false;
 }
@@ -3446,7 +3464,7 @@ i64 type_align_of_internal(Type *t, TypePath *path) {
 
 	case Type_SimdVector: {
 		// IMPORTANT TODO(bill): Figure out the alignment of vector types
-		return gb_clamp(next_pow2(type_size_of_internal(t, path)), 1, build_context.max_align);
+		return gb_clamp(next_pow2(type_size_of_internal(t, path)), 1, build_context.max_align*2);
 	}
 	
 	case Type_Matrix: