|
@@ -1759,7 +1759,7 @@ Returns:
|
|
|
replace :: intrinsics.simd_replace
|
|
|
|
|
|
/*
|
|
|
-Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
|
|
|
+Reduce a vector to a scalar by adding up all the lanes.
|
|
|
|
|
|
This procedure returns a scalar that is the ordered sum of all lanes. The
|
|
|
ordered sum may be important for accounting for precision errors in
|
|
@@ -2511,460 +2511,16 @@ recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where int
|
|
|
return T(1) / v
|
|
|
}
|
|
|
|
|
|
+
|
|
|
/*
|
|
|
Create a vector where each lane contains the index of that lane.
|
|
|
-
|
|
|
Inputs:
|
|
|
- `V`: The type of the vector to create.
|
|
|
-
|
|
|
Result:
|
|
|
- A vector of the given type, where each lane contains the index of that lane.
|
|
|
-
|
|
|
**Operation**:
|
|
|
-
|
|
|
for i in 0 ..< N {
|
|
|
res[i] = i
|
|
|
}
|
|
|
*/
|
|
|
-indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where intrinsics.type_is_numeric(E) {
|
|
|
- when N == 1 {
|
|
|
- return {0}
|
|
|
- } else when N == 2 {
|
|
|
- return {0, 1}
|
|
|
- } else when N == 4 {
|
|
|
- return {0, 1, 2, 3}
|
|
|
- } else when N == 8 {
|
|
|
- return {0, 1, 2, 3, 4, 5, 6, 7}
|
|
|
- } else when N == 16 {
|
|
|
- return {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
|
|
|
- } else when N == 32 {
|
|
|
- return {
|
|
|
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
|
|
- }
|
|
|
- } else when N == 64 {
|
|
|
- return {
|
|
|
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
|
|
- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
|
|
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
|
|
- }
|
|
|
- } else {
|
|
|
- #panic("Unsupported vector size!")
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
-Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
|
|
|
-
|
|
|
-This procedure returns a scalar that is the sum of all lanes, calculated by
|
|
|
-adding each even-indexed element with the following odd-indexed element to
|
|
|
-produce N/2 values. This is repeated until only a single element remains. This
|
|
|
-order is supported by hardware instructions for some types/architectures (e.g.
|
|
|
-i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
|
|
|
-
|
|
|
-The order of the sum may be important for accounting for precision errors in
|
|
|
-floating-point computation, as floating-point addition is not associative, that
|
|
|
-is `(a+b)+c` may not be equal to `a+(b+c)`.
|
|
|
-
|
|
|
-Inputs:
|
|
|
-- `v`: The vector to reduce.
|
|
|
-
|
|
|
-Result:
|
|
|
-- Sum of all lanes, as a scalar.
|
|
|
-
|
|
|
-**Operation**:
|
|
|
-
|
|
|
- for n > 1 {
|
|
|
- n = n / 2
|
|
|
- for i in 0 ..< n {
|
|
|
- a[i] = a[2*i+0] + a[2*i+1]
|
|
|
- }
|
|
|
- }
|
|
|
- res := a[0]
|
|
|
-
|
|
|
-Graphical representation of the operation for N=4:
|
|
|
-
|
|
|
- +-----------------------+
|
|
|
- v: | v0 | v1 | v2 | v3 |
|
|
|
- +-----------------------+
|
|
|
- | | | |
|
|
|
- `>[+]<' `>[+]<'
|
|
|
- | |
|
|
|
- `--->[+]<--'
|
|
|
- |
|
|
|
- v
|
|
|
- +-----+
|
|
|
- result: | y0 |
|
|
|
- +-----+
|
|
|
-*/
|
|
|
-reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
|
|
|
- where intrinsics.type_is_numeric(E) {
|
|
|
- when N == 64 { v64 := v }
|
|
|
- when N == 32 { v32 := v }
|
|
|
- when N == 16 { v16 := v }
|
|
|
- when N == 8 { v8 := v }
|
|
|
- when N == 4 { v4 := v }
|
|
|
- when N == 2 { v2 := v }
|
|
|
-
|
|
|
- when N >= 64 {
|
|
|
- x32 := swizzle(v64,
|
|
|
- 0, 2, 4, 6, 8, 10, 12, 14,
|
|
|
- 16, 18, 20, 22, 24, 26, 28, 30,
|
|
|
- 32, 34, 36, 38, 40, 42, 44, 46,
|
|
|
- 48, 50, 52, 54, 56, 58, 60, 62)
|
|
|
- y32 := swizzle(v64,
|
|
|
- 1, 3, 5, 7, 9, 11, 13, 15,
|
|
|
- 17, 19, 21, 23, 25, 27, 29, 31,
|
|
|
- 33, 35, 37, 39, 41, 43, 45, 47,
|
|
|
- 49, 51, 53, 55, 57, 59, 61, 63)
|
|
|
- v32 := x32 + y32
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 32 {
|
|
|
- x16 := swizzle(v32,
|
|
|
- 0, 2, 4, 6, 8, 10, 12, 14,
|
|
|
- 16, 18, 20, 22, 24, 26, 28, 30)
|
|
|
- y16 := swizzle(v32,
|
|
|
- 1, 3, 5, 7, 9, 11, 13, 15,
|
|
|
- 17, 19, 21, 23, 25, 27, 29, 31)
|
|
|
- v16 := x16 + y16
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 16 {
|
|
|
- x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
|
|
|
- y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
|
|
|
- v8 := x8 + y8
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 8 {
|
|
|
- x4 := swizzle(v8, 0, 2, 4, 6)
|
|
|
- y4 := swizzle(v8, 1, 3, 5, 7)
|
|
|
- v4 := x4 + y4
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 4 {
|
|
|
- x2 := swizzle(v4, 0, 2)
|
|
|
- y2 := swizzle(v4, 1, 3)
|
|
|
- v2 := x2 + y2
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 2 {
|
|
|
- return extract(v2, 0) + extract(v2, 1)
|
|
|
- } else {
|
|
|
- return extract(v, 0)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
-Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
|
|
|
-
|
|
|
-This procedure returns a scalar that is the sum of all lanes, calculated by
|
|
|
-bisecting the vector into two parts, where the first contains lanes [0, N/2)
|
|
|
-and the second contains lanes [N/2, N), and adding the two halves element-wise
|
|
|
-to produce N/2 values. This is repeated until only a single element remains.
|
|
|
-This order may be faster to compute than the ordered sum for floats, as it can
|
|
|
-often be better parallelized.
|
|
|
-
|
|
|
-The order of the sum may be important for accounting for precision errors in
|
|
|
-floating-point computation, as floating-point addition is not associative, that
|
|
|
-is `(a+b)+c` may not be equal to `a+(b+c)`.
|
|
|
-
|
|
|
-Inputs:
|
|
|
-- `v`: The vector to reduce.
|
|
|
-
|
|
|
-Result:
|
|
|
-- Sum of all lanes, as a scalar.
|
|
|
-
|
|
|
-**Operation**:
|
|
|
-
|
|
|
- for n > 1 {
|
|
|
- n = n / 2
|
|
|
- for i in 0 ..< n {
|
|
|
- a[i] += a[i+n]
|
|
|
- }
|
|
|
- }
|
|
|
- res := a[0]
|
|
|
-
|
|
|
-Graphical representation of the operation for N=4:
|
|
|
-
|
|
|
- +-----------------------+
|
|
|
- | v0 | v1 | v2 | v3 |
|
|
|
- +-----------------------+
|
|
|
- | | | |
|
|
|
- [+]<-- | ---' |
|
|
|
- | [+]<--------'
|
|
|
- | |
|
|
|
- `>[+]<'
|
|
|
- |
|
|
|
- v
|
|
|
- +-----+
|
|
|
- result: | y0 |
|
|
|
- +-----+
|
|
|
-*/
|
|
|
-reduce_add_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
|
|
|
- where intrinsics.type_is_numeric(E) {
|
|
|
- when N == 64 { v64 := v }
|
|
|
- when N == 32 { v32 := v }
|
|
|
- when N == 16 { v16 := v }
|
|
|
- when N == 8 { v8 := v }
|
|
|
- when N == 4 { v4 := v }
|
|
|
- when N == 2 { v2 := v }
|
|
|
-
|
|
|
- when N >= 64 {
|
|
|
- x32 := swizzle(v64,
|
|
|
- 0, 1, 2, 3, 4, 5, 6, 7,
|
|
|
- 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
- 16, 17, 18, 19, 20, 21, 22, 23,
|
|
|
- 24, 25, 26, 27, 28, 29, 30, 31)
|
|
|
- y32 := swizzle(v64,
|
|
|
- 32, 33, 34, 35, 36, 37, 38, 39,
|
|
|
- 40, 41, 42, 43, 44, 45, 46, 47,
|
|
|
- 48, 49, 50, 51, 52, 53, 54, 55,
|
|
|
- 56, 57, 58, 59, 60, 61, 62, 63)
|
|
|
- v32 := x32 + y32
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 32 {
|
|
|
- x16 := swizzle(v32,
|
|
|
- 0, 1, 2, 3, 4, 5, 6, 7,
|
|
|
- 8, 9, 10, 11, 12, 13, 14, 15)
|
|
|
- y16 := swizzle(v32,
|
|
|
- 16, 17, 18, 19, 20, 21, 22, 23,
|
|
|
- 24, 25, 26, 27, 28, 29, 30, 31)
|
|
|
- v16 := x16 + y16
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 16 {
|
|
|
- x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
|
|
|
- y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
|
|
|
- v8 := x8 + y8
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 8 {
|
|
|
- x4 := swizzle(v8, 0, 1, 2, 3)
|
|
|
- y4 := swizzle(v8, 4, 5, 6, 7)
|
|
|
- v4 := x4 + y4
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 4 {
|
|
|
- x2 := swizzle(v4, 0, 1)
|
|
|
- y2 := swizzle(v4, 2, 3)
|
|
|
- v2 := x2 + y2
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 2 {
|
|
|
- return extract(v2, 0) + extract(v2, 1)
|
|
|
- } else {
|
|
|
- return extract(v, 0)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
-Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
|
|
|
-
|
|
|
-This procedure returns a scalar that is the product of all lanes, calculated by
|
|
|
-bisecting the vector into two parts, where the first contains lanes [0, N/2)
|
|
|
-and the second contains lanes [N/2, N), and multiplying the two halves together
|
|
|
-multiplying each even-indexed element with the following odd-indexed element to
|
|
|
-produce N/2 values. This is repeated until only a single element remains. This
|
|
|
-order may be faster to compute than the ordered product for floats, as it can
|
|
|
-often be better parallelized.
|
|
|
-
|
|
|
-The order of the product may be important for accounting for precision errors
|
|
|
-in floating-point computation, as floating-point multiplication is not
|
|
|
-associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
|
|
|
-
|
|
|
-Inputs:
|
|
|
-- `v`: The vector to reduce.
|
|
|
-
|
|
|
-Result:
|
|
|
-- Product of all lanes, as a scalar.
|
|
|
-
|
|
|
-**Operation**:
|
|
|
-
|
|
|
- for n > 1 {
|
|
|
- n = n / 2
|
|
|
- for i in 0 ..< n {
|
|
|
- a[i] = a[2*i+0] * a[2*i+1]
|
|
|
- }
|
|
|
- }
|
|
|
- res := a[0]
|
|
|
-
|
|
|
-Graphical representation of the operation for N=4:
|
|
|
-
|
|
|
- +-----------------------+
|
|
|
- v: | v0 | v1 | v2 | v3 |
|
|
|
- +-----------------------+
|
|
|
- | | | |
|
|
|
- `>[x]<' `>[x]<'
|
|
|
- | |
|
|
|
- `--->[x]<--'
|
|
|
- |
|
|
|
- v
|
|
|
- +-----+
|
|
|
- result: | y0 |
|
|
|
- +-----+
|
|
|
-*/
|
|
|
-reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
|
|
|
- where intrinsics.type_is_numeric(E) {
|
|
|
- when N == 64 { v64 := v }
|
|
|
- when N == 32 { v32 := v }
|
|
|
- when N == 16 { v16 := v }
|
|
|
- when N == 8 { v8 := v }
|
|
|
- when N == 4 { v4 := v }
|
|
|
- when N == 2 { v2 := v }
|
|
|
-
|
|
|
- when N >= 64 {
|
|
|
- x32 := swizzle(v64,
|
|
|
- 0, 2, 4, 6, 8, 10, 12, 14,
|
|
|
- 16, 18, 20, 22, 24, 26, 28, 30,
|
|
|
- 32, 34, 36, 38, 40, 42, 44, 46,
|
|
|
- 48, 50, 52, 54, 56, 58, 60, 62)
|
|
|
- y32 := swizzle(v64,
|
|
|
- 1, 3, 5, 7, 9, 11, 13, 15,
|
|
|
- 17, 19, 21, 23, 25, 27, 29, 31,
|
|
|
- 33, 35, 37, 39, 41, 43, 45, 47,
|
|
|
- 49, 51, 53, 55, 57, 59, 61, 63)
|
|
|
- v32 := x32 * y32
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 32 {
|
|
|
- x16 := swizzle(v32,
|
|
|
- 0, 2, 4, 6, 8, 10, 12, 14,
|
|
|
- 16, 18, 20, 22, 24, 26, 28, 30)
|
|
|
- y16 := swizzle(v32,
|
|
|
- 1, 3, 5, 7, 9, 11, 13, 15,
|
|
|
- 17, 19, 21, 23, 25, 27, 29, 31)
|
|
|
- v16 := x16 * y16
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 16 {
|
|
|
- x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
|
|
|
- y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
|
|
|
- v8 := x8 * y8
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 8 {
|
|
|
- x4 := swizzle(v8, 0, 2, 4, 6)
|
|
|
- y4 := swizzle(v8, 1, 3, 5, 7)
|
|
|
- v4 := x4 * y4
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 4 {
|
|
|
- x2 := swizzle(v4, 0, 2)
|
|
|
- y2 := swizzle(v4, 1, 3)
|
|
|
- v2 := x2 * y2
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 2 {
|
|
|
- return extract(v2, 0) * extract(v2, 1)
|
|
|
- } else {
|
|
|
- return extract(v, 0)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
-Reduce a vector to a scalar by multiplying up all the lanes in a bisecting fashion.
|
|
|
-
|
|
|
-This procedure returns a scalar that is the product of all lanes, calculated by
|
|
|
-bisecting the vector into two parts, where the first contains indices [0, N/2)
|
|
|
-and the second contains indices [N/2, N), and multiplying the two halves
|
|
|
-together element-wise to produce N/2 values. This is repeated until only a
|
|
|
-single element remains. This order may be faster to compute than the ordered
|
|
|
-product for floats, as it can often be better parallelized.
|
|
|
-
|
|
|
-The order of the product may be important for accounting for precision errors
|
|
|
-in floating-point computation, as floating-point multiplication is not
|
|
|
-associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
|
|
|
-
|
|
|
-Inputs:
|
|
|
-- `v`: The vector to reduce.
|
|
|
-
|
|
|
-Result:
|
|
|
-- Product of all lanes, as a scalar.
|
|
|
-
|
|
|
-**Operation**:
|
|
|
-
|
|
|
- for n > 1 {
|
|
|
- n = n / 2
|
|
|
- for i in 0 ..< n {
|
|
|
- a[i] *= a[i+n]
|
|
|
- }
|
|
|
- }
|
|
|
- res := a[0]
|
|
|
-
|
|
|
-Graphical representation of the operation for N=4:
|
|
|
-
|
|
|
- +-----------------------+
|
|
|
- | v0 | v1 | v2 | v3 |
|
|
|
- +-----------------------+
|
|
|
- | | | |
|
|
|
- [x]<-- | ---' |
|
|
|
- | [x]<--------'
|
|
|
- | |
|
|
|
- `>[x]<'
|
|
|
- |
|
|
|
- v
|
|
|
- +-----+
|
|
|
- result: | y0 |
|
|
|
- +-----+
|
|
|
-*/
|
|
|
-reduce_mul_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
|
|
|
- where intrinsics.type_is_numeric(E) {
|
|
|
- when N == 64 { v64 := v }
|
|
|
- when N == 32 { v32 := v }
|
|
|
- when N == 16 { v16 := v }
|
|
|
- when N == 8 { v8 := v }
|
|
|
- when N == 4 { v4 := v }
|
|
|
- when N == 2 { v2 := v }
|
|
|
-
|
|
|
- when N >= 64 {
|
|
|
- x32 := swizzle(v64,
|
|
|
- 0, 1, 2, 3, 4, 5, 6, 7,
|
|
|
- 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
- 16, 17, 18, 19, 20, 21, 22, 23,
|
|
|
- 24, 25, 26, 27, 28, 29, 30, 31)
|
|
|
- y32 := swizzle(v64,
|
|
|
- 32, 33, 34, 35, 36, 37, 38, 39,
|
|
|
- 40, 41, 42, 43, 44, 45, 46, 47,
|
|
|
- 48, 49, 50, 51, 52, 53, 54, 55,
|
|
|
- 56, 57, 58, 59, 60, 61, 62, 63)
|
|
|
- v32 := x32 * y32
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 32 {
|
|
|
- x16 := swizzle(v32,
|
|
|
- 0, 1, 2, 3, 4, 5, 6, 7,
|
|
|
- 8, 9, 10, 11, 12, 13, 14, 15)
|
|
|
- y16 := swizzle(v32,
|
|
|
- 16, 17, 18, 19, 20, 21, 22, 23,
|
|
|
- 24, 25, 26, 27, 28, 29, 30, 31)
|
|
|
- v16 := x16 * y16
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 16 {
|
|
|
- x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
|
|
|
- y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
|
|
|
- v8 := x8 * y8
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 8 {
|
|
|
- x4 := swizzle(v8, 0, 1, 2, 3)
|
|
|
- y4 := swizzle(v8, 4, 5, 6, 7)
|
|
|
- v4 := x4 * y4
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 4 {
|
|
|
- x2 := swizzle(v4, 0, 1)
|
|
|
- y2 := swizzle(v4, 2, 3)
|
|
|
- v2 := x2 * y2
|
|
|
- }
|
|
|
-
|
|
|
- when N >= 2 {
|
|
|
- return extract(v2, 0) * extract(v2, 1)
|
|
|
- } else {
|
|
|
- return extract(v, 0)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
+indices :: intrinsics.simd_indices
|