
`core:simd` helpers: indices and reduce_add/mul

The indices proc simply creates a vector where each lane contains its
own lane index. This can be useful for generating masks for loads and
stores at the beginning/end of slices, among other things.
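
For example, a tail mask for the last partial chunk of a slice can be
built by comparing the lane indices against the number of remaining
elements. A minimal sketch, assuming `core:simd`'s lane-wise `lanes_lt`
comparison (true lanes become all ones, false lanes zero) and the usual
scalar-to-vector conversion for the broadcast:

	package example

	import "core:fmt"
	import "core:simd"

	main :: proc() {
		// Each lane holds its own index: {0, 1, 2, 3}.
		idx := simd.indices(#simd[4]i32)

		// Suppose only 3 valid elements remain at the end of a slice:
		// lanes with index < 3 are "on", the final lane is masked off.
		remaining := #simd[4]i32(3)           // broadcast the scalar 3 to all lanes
		mask := simd.lanes_lt(idx, remaining) // all ones in active lanes, zero elsewhere
		fmt.println(mask)
	}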

The new reduce_add/reduce_mul procs perform the corresponding arithmetic
reduction in orders other than plain sequential order. These alternative
orders can often be faster to compute, as they can offer better SIMD
hardware utilization.

Two different orders are added for these: pair-wise (operating on
adjacent pairs of elements) and split-wise (operating element-wise on
the two halves of the vector).
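
The difference between the orders is only in how the operations are
associated, which is what matters for floats (integer results are
unaffected). A small sketch of how the three orders relate, assuming the
existing ordered reduction is `reduce_add_ordered`; the float results
may differ in the last few bits:

	package example

	import "core:fmt"
	import "core:simd"

	main :: proc() {
		v := #simd[8]f32{0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}

		// Same additions, different association:
		fmt.println(simd.reduce_add_ordered(v)) // ((((((v0+v1)+v2)+v3)+v4)+v5)+v6)+v7
		fmt.println(simd.reduce_add_pairs(v))   // ((v0+v1)+(v2+v3)) + ((v4+v5)+(v6+v7))
		fmt.println(simd.reduce_add_split(v))   // ((v0+v4)+(v2+v6)) + ((v1+v5)+(v3+v7))
	}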

This doesn't actually cover the *fastest* way for arbitrarily-sized
vectors. That would be an ordered reduction across the native vector
width, then reducing the resulting vector to a scalar in an appropriate
parallel fashion. I'd created an implementation of that, but it required
multiple procs and a fair bit more trickery than I was comfortable with
submitting to `core`, so it's not included yet. Maybe in the future.
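
For illustration only, here is one way such an approach could look for a
single fixed shape (32 input lanes with an assumed 8-lane native width);
this is a sketch of the idea, not the implementation referred to above:

	package example

	import "core:simd"

	reduce_add_wide_sketch :: proc "contextless" (v: #simd[32]f32) -> f32 {
		// Accumulate the 8-lane chunks into one 8-lane accumulator, element-wise.
		acc := swizzle(v,  0,  1,  2,  3,  4,  5,  6,  7)
		acc = acc + swizzle(v,  8,  9, 10, 11, 12, 13, 14, 15)
		acc = acc + swizzle(v, 16, 17, 18, 19, 20, 21, 22, 23)
		acc = acc + swizzle(v, 24, 25, 26, 27, 28, 29, 30, 31)
		// Then collapse the accumulator to a scalar in parallel.
		return simd.reduce_add_split(acc)
	}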
Barinzaya 4 months ago
commit 7e34d707bb
1 changed file with 455 additions and 1 deletion

+ 455 - 1
core/simd/simd.odin

@@ -1759,7 +1759,7 @@ Returns:
 replace :: intrinsics.simd_replace
 
 /*
-Reduce a vector to a scalar by adding up all the lanes.
+Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
 
 This procedure returns a scalar that is the ordered sum of all lanes. The
 ordered sum may be important for accounting for precision errors in
@@ -2510,3 +2510,457 @@ Example:
 recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
 	return T(1) / v
 }
+
+/*
+Creates a vector where each lane contains the index of that lane.
+
+Inputs:
+- `V`: The type of the vector to create.
+
+Result:
+- A vector of the given type, where each lane contains the index of that lane.
+
+**Operation**:
+
+	for i in 0 ..< N {
+		res[i] = i
+	}
+*/
+indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where intrinsics.type_is_numeric(E) {
+	when N == 1 {
+		return {0}
+	} else when N == 2 {
+		return {0, 1}
+	} else when N == 4 {
+		return {0, 1, 2, 3}
+	} else when N == 8 {
+		return {0, 1, 2, 3, 4, 5, 6, 7}
+	} else when N == 16 {
+		return {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+	} else when N == 32 {
+		return {
+			0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+		}
+	} else when N == 64 {
+		return {
+			0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+			32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+		}
+	} else {
+		#panic("Unsupported vector size!")
+	}
+}
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+adding each even-numbered element with the following odd-numbered element. This
+is repeated until only a single element remains. This order is supported by
+hardware instructions for some types/architectures (e.g. i16/i32/f32/f64 on x86
+SSE, i8/i16/i32/f32 on ARM NEON).
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] + a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[+]<'     `>[+]<'
+	         |           |
+	         `--->[+]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8 := v }
+	when N == 4  { v4 := v }
+	when N == 2  { v2 := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0,  2,  4,  6,  8,  10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30,
+			32, 34, 36, 38, 40, 42, 44, 46,
+			48, 50, 52, 54, 56, 58, 60, 62)
+		y32 := swizzle(v64,
+			1,  3,  5,  7,  9,  11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31,
+			33, 35, 37, 39, 41, 43, 45, 47,
+			49, 51, 53, 55, 57, 59, 61, 63)
+		v32 := x32 + y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0,  2,  4,  6,  8,  10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30)
+		y16 := swizzle(v32,
+			1,  3,  5,  7,  9,  11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31)
+		v16 := x16 + y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
+		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
+		v8 := x8 + y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 2, 4, 6)
+		y4 := swizzle(v8, 1, 3, 5, 7)
+		v4 := x4 + y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 2)
+		y2 := swizzle(v4, 1, 3)
+		v2 := x2 + y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) + extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in a binary fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+splitting the vector in two parts and adding the two halves together
+element-wise. This is repeated until only a single element remains. This order
+will typically be faster to compute than the ordered sum for floats, as it can
+be better parallelized.
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] += a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [+]<-- | ---'      |
+	        |    [+]<--------'
+	        |     |
+	        `>[+]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8 := v }
+	when N == 4  { v4 := v }
+	when N == 2  { v2 := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0,  1,  2,  3,  4,  5,  6,  7,
+			8,  9,  10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		y32 := swizzle(v64,
+			32, 33, 34, 35, 36, 37, 38, 39,
+			40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55,
+			56, 57, 58, 59, 60, 61, 62, 63)
+		v32 := x32 + y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0,  1,  2,  3,  4,  5,  6,  7,
+			8,  9,  10, 11, 12, 13, 14, 15)
+		y16 := swizzle(v32,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		v16 := x16 + y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 1, 2,  3,  4,  5,  6,  7)
+		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
+		v8 := x8 + y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 1, 2, 3)
+		y4 := swizzle(v8, 4, 5, 6, 7)
+		v4 := x4 + y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 1)
+		y2 := swizzle(v4, 2, 3)
+		v2 := x2 + y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) + extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+multiplying each even-numbered element with the following odd-numbered element.
+This is repeated until only a single element remains. This order may be faster
+to compute than the ordered product for floats, as it can be better
+parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] * a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[x]<'     `>[x]<'
+	         |           |
+	         `--->[x]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8 := v }
+	when N == 4  { v4 := v }
+	when N == 2  { v2 := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0,  2,  4,  6,  8,  10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30,
+			32, 34, 36, 38, 40, 42, 44, 46,
+			48, 50, 52, 54, 56, 58, 60, 62)
+		y32 := swizzle(v64,
+			1,  3,  5,  7,  9,  11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31,
+			33, 35, 37, 39, 41, 43, 45, 47,
+			49, 51, 53, 55, 57, 59, 61, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0,  2,  4,  6,  8,  10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30)
+		y16 := swizzle(v32,
+			1,  3,  5,  7,  9,  11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
+		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 2, 4, 6)
+		y4 := swizzle(v8, 1, 3, 5, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 2)
+		y2 := swizzle(v4, 1, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a binary fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+splitting the vector in two parts and multiplying the two halves together
+element-wise. This is repeated until only a single element remains. This order
+may be faster to compute than the ordered product for floats, as it can be
+better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] *= a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [x]<-- | ---'      |
+	        |    [x]<--------'
+	        |     |
+	        `>[x]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_mul_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8 := v }
+	when N == 4  { v4 := v }
+	when N == 2  { v2 := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0,  1,  2,  3,  4,  5,  6,  7,
+			8,  9,  10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		y32 := swizzle(v64,
+			32, 33, 34, 35, 36, 37, 38, 39,
+			40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55,
+			56, 57, 58, 59, 60, 61, 62, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0,  1,  2,  3,  4,  5,  6,  7,
+			8,  9,  10, 11, 12, 13, 14, 15)
+		y16 := swizzle(v32,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 1, 2,  3,  4,  5,  6,  7)
+		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 1, 2, 3)
+		y4 := swizzle(v8, 4, 5, 6, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 1)
+		y2 := swizzle(v4, 2, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+