|
@@ -1759,7 +1759,7 @@ Returns:
|
|
replace :: intrinsics.simd_replace
|
|
replace :: intrinsics.simd_replace
|
|
|
|
|
|
/*
|
|
/*
|
|
-Reduce a vector to a scalar by adding up all the lanes.
|
|
|
|
|
|
+Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
|
|
|
|
|
|
This procedure returns a scalar that is the ordered sum of all lanes. The
|
|
This procedure returns a scalar that is the ordered sum of all lanes. The
|
|
ordered sum may be important for accounting for precision errors in
|
|
ordered sum may be important for accounting for precision errors in
|
|
@@ -2510,3 +2510,457 @@ Example:
|
|
recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
|
|
recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
|
|
return T(1) / v
|
|
return T(1) / v
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+Creates a vector where each lane contains the index of that lane.
|
|
|
|
+
|
|
|
|
+Inputs:
|
|
|
|
+- `V`: The type of the vector to create.
|
|
|
|
+
|
|
|
|
+Result:
|
|
|
|
+- A vector of the given type, where each lane contains the index of that lane.
|
|
|
|
+
|
|
|
|
+**Operation**:
|
|
|
|
+
|
|
|
|
+> for i in 0 ..< N {
|
|
|
|
+ res[i] = i
|
|
|
|
+ }
|
|
|
|
+*/
|
|
|
|
+indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where intrinsics.type_is_numeric(E) {
|
|
|
|
+ when N == 1 {
|
|
|
|
+ return {0}
|
|
|
|
+ } else when N == 2 {
|
|
|
|
+ return {0, 1}
|
|
|
|
+ } else when N == 4 {
|
|
|
|
+ return {0, 1, 2, 3}
|
|
|
|
+ } else when N == 8 {
|
|
|
|
+ return {0, 1, 2, 3, 4, 5, 6, 7}
|
|
|
|
+ } else when N == 16 {
|
|
|
|
+ return {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
|
|
|
|
+ } else when N == 32 {
|
|
|
|
+ return {
|
|
|
|
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
|
|
|
+ }
|
|
|
|
+ } else when N == 64 {
|
|
|
|
+ return {
|
|
|
|
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
|
|
|
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
|
|
|
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ #panic("Unsupported vector size!")
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
|
|
|
|
+
|
|
|
|
+This procedure returns a scalar that is the sum of all lanes, calculated by
|
|
|
|
+adding each even-numbered element with the following odd-numbered element. This
|
|
|
|
+is repeated until only a single element remains. This order is supported by
|
|
|
|
+hardware instructions for some types/architectures (e.g. i16/i32/f32/f64 on x86
|
|
|
|
+SSE, i8/i16/i32/f32 on ARM NEON).
|
|
|
|
+
|
|
|
|
+The order of the sum may be important for accounting for precision errors in
|
|
|
|
+floating-point computation, as floating-point addition is not associative, that
|
|
|
|
+is `(a+b)+c` may not be equal to `a+(b+c)`.
|
|
|
|
+
|
|
|
|
+Inputs:
|
|
|
|
+- `v`: The vector to reduce.
|
|
|
|
+
|
|
|
|
+Result:
|
|
|
|
+- Sum of all lanes, as a scalar.
|
|
|
|
+
|
|
|
|
+**Operation**:
|
|
|
|
+
|
|
|
|
+ for n > 1 {
|
|
|
|
+ n = n / 2
|
|
|
|
+ for i in 0 ..< n {
|
|
|
|
+ a[i] = a[2*i+0] + a[2*i+1]
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ res := a[0]
|
|
|
|
+
|
|
|
|
+Graphical representation of the operation for N=4:
|
|
|
|
+
|
|
|
|
+ +-----------------------+
|
|
|
|
+ v: | v0 | v1 | v2 | v3 |
|
|
|
|
+ +-----------------------+
|
|
|
|
+ | | | |
|
|
|
|
+ `>[+]<' `>[+]<'
|
|
|
|
+ | |
|
|
|
|
+ `--->[+]<--'
|
|
|
|
+ |
|
|
|
|
+ v
|
|
|
|
+ +-----+
|
|
|
|
+ result: | y0 |
|
|
|
|
+ +-----+
|
|
|
|
+*/
|
|
|
|
+reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
|
|
|
|
+ where intrinsics.type_is_numeric(E) {
|
|
|
|
+ when N == 64 { v64 := v }
|
|
|
|
+ when N == 32 { v32 := v }
|
|
|
|
+ when N == 16 { v16 := v }
|
|
|
|
+ when N == 8 { v8 := v }
|
|
|
|
+ when N == 4 { v4 := v }
|
|
|
|
+ when N == 2 { v2 := v }
|
|
|
|
+
|
|
|
|
+ when N >= 64 {
|
|
|
|
+ x32 := swizzle(v64,
|
|
|
|
+ 0, 2, 4, 6, 8, 10, 12, 14,
|
|
|
|
+ 16, 18, 20, 22, 24, 26, 28, 30,
|
|
|
|
+ 32, 34, 36, 38, 40, 42, 44, 46,
|
|
|
|
+ 48, 50, 52, 54, 56, 58, 60, 62)
|
|
|
|
+ y32 := swizzle(v64,
|
|
|
|
+ 1, 3, 5, 7, 9, 11, 13, 15,
|
|
|
|
+ 17, 19, 21, 23, 25, 27, 29, 31,
|
|
|
|
+ 33, 35, 37, 39, 41, 43, 45, 47,
|
|
|
|
+ 49, 51, 53, 55, 57, 59, 61, 63)
|
|
|
|
+ v32 := x32 * y32
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 32 {
|
|
|
|
+ x16 := swizzle(v32,
|
|
|
|
+ 0, 2, 4, 6, 8, 10, 12, 14,
|
|
|
|
+ 16, 18, 20, 22, 24, 26, 28, 30)
|
|
|
|
+ y16 := swizzle(v32,
|
|
|
|
+ 1, 3, 5, 7, 9, 11, 13, 15,
|
|
|
|
+ 17, 19, 21, 23, 25, 27, 29, 31)
|
|
|
|
+ v16 := x16 * y16
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 16 {
|
|
|
|
+ x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
|
|
|
|
+ y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
|
|
|
|
+ v8 := x8 * y8
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 8 {
|
|
|
|
+ x4 := swizzle(v8, 0, 2, 4, 6)
|
|
|
|
+ y4 := swizzle(v8, 1, 3, 5, 7)
|
|
|
|
+ v4 := x4 * y4
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 4 {
|
|
|
|
+ x2 := swizzle(v4, 0, 2)
|
|
|
|
+ y2 := swizzle(v4, 1, 3)
|
|
|
|
+ v2 := x2 * y2
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 2 {
|
|
|
|
+ return extract(v2, 0) * extract(v2, 1)
|
|
|
|
+ } else {
|
|
|
|
+ return extract(v, 0)
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+Reduce a vector to a scalar by adding up all the lanes in a binary fashion.
|
|
|
|
+
|
|
|
|
+This procedure returns a scalar that is the sum of all lanes, calculated by
|
|
|
|
+splitting the vector in two parts and adding the two halves together
|
|
|
|
+element-wise. This is repeated until only a single element remains. This order
|
|
|
|
+will typically be faster to compute than the ordered sum for floats, as it can
|
|
|
|
+be better parallelized.
|
|
|
|
+
|
|
|
|
+The order of the sum may be important for accounting for precision errors in
|
|
|
|
+floating-point computation, as floating-point addition is not associative, that
|
|
|
|
+is `(a+b)+c` may not be equal to `a+(b+c)`.
|
|
|
|
+
|
|
|
|
+Inputs:
|
|
|
|
+- `v`: The vector to reduce.
|
|
|
|
+
|
|
|
|
+Result:
|
|
|
|
+- Sum of all lanes, as a scalar.
|
|
|
|
+
|
|
|
|
+**Operation**:
|
|
|
|
+
|
|
|
|
+ for n > 1 {
|
|
|
|
+ n = n / 2
|
|
|
|
+ for i in 0 ..< n {
|
|
|
|
+ a[i] += a[i+n]
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ res := a[0]
|
|
|
|
+
|
|
|
|
+Graphical representation of the operation for N=4:
|
|
|
|
+
|
|
|
|
+ +-----------------------+
|
|
|
|
+ | v0 | v1 | v2 | v3 |
|
|
|
|
+ +-----------------------+
|
|
|
|
+ | | | |
|
|
|
|
+ [+]<-- | ---' |
|
|
|
|
+ | [+]<--------'
|
|
|
|
+ | |
|
|
|
|
+ `>[+]<'
|
|
|
|
+ |
|
|
|
|
+ v
|
|
|
|
+ +-----+
|
|
|
|
+ result: | y0 |
|
|
|
|
+ +-----+
|
|
|
|
+*/
|
|
|
|
+reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
|
|
|
|
+ where intrinsics.type_is_numeric(E) {
|
|
|
|
+ when N == 64 { v64 := v }
|
|
|
|
+ when N == 32 { v32 := v }
|
|
|
|
+ when N == 16 { v16 := v }
|
|
|
|
+ when N == 8 { v8 := v }
|
|
|
|
+ when N == 4 { v4 := v }
|
|
|
|
+ when N == 2 { v2 := v }
|
|
|
|
+
|
|
|
|
+ when N >= 64 {
|
|
|
|
+ x32 := swizzle(v64,
|
|
|
|
+ 0, 1, 2, 3, 4, 5, 6, 7,
|
|
|
|
+ 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
+ 16, 17, 18, 19, 20, 21, 22, 23,
|
|
|
|
+ 24, 25, 26, 27, 28, 29, 30, 31)
|
|
|
|
+ y32 := swizzle(v64,
|
|
|
|
+ 32, 33, 34, 35, 36, 37, 38, 39,
|
|
|
|
+ 40, 41, 42, 43, 44, 45, 46, 47,
|
|
|
|
+ 48, 49, 50, 51, 52, 53, 54, 55,
|
|
|
|
+ 56, 57, 58, 59, 60, 61, 62, 63)
|
|
|
|
+ v32 := x32 + y32
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 32 {
|
|
|
|
+ x16 := swizzle(v32,
|
|
|
|
+ 0, 1, 2, 3, 4, 5, 6, 7,
|
|
|
|
+ 8, 9, 10, 11, 12, 13, 14, 15)
|
|
|
|
+ y16 := swizzle(v32,
|
|
|
|
+ 16, 17, 18, 19, 20, 21, 22, 23,
|
|
|
|
+ 24, 25, 26, 27, 28, 29, 30, 31)
|
|
|
|
+ v16 := x16 + y16
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 16 {
|
|
|
|
+ x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
|
|
|
|
+ y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
|
|
|
|
+ v8 := x8 + y8
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 8 {
|
|
|
|
+ x4 := swizzle(v8, 0, 1, 2, 3)
|
|
|
|
+ y4 := swizzle(v8, 4, 5, 6, 7)
|
|
|
|
+ v4 := x4 + y4
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 4 {
|
|
|
|
+ x2 := swizzle(v4, 0, 1)
|
|
|
|
+ y2 := swizzle(v4, 2, 3)
|
|
|
|
+ v2 := x2 + y2
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 2 {
|
|
|
|
+ return extract(v2, 0) + extract(v2, 1)
|
|
|
|
+ } else {
|
|
|
|
+ return extract(v, 0)
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
|
|
|
|
+
|
|
|
|
+This procedure returns a scalar that is the product of all lanes, calculated by
|
|
|
|
+multiplying each even-numbered element with the following odd-numbered element.
|
|
|
|
+This is repeated until only a single element remains. This order may be faster
|
|
|
|
+to compute than the ordered product for floats, as it can be better
|
|
|
|
+parallelized.
|
|
|
|
+
|
|
|
|
+The order of the product may be important for accounting for precision errors
|
|
|
|
+in floating-point computation, as floating-point multiplication is not
|
|
|
|
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
|
|
|
|
+
|
|
|
|
+Inputs:
|
|
|
|
+- `v`: The vector to reduce.
|
|
|
|
+
|
|
|
|
+Result:
|
|
|
|
+- Product of all lanes, as a scalar.
|
|
|
|
+
|
|
|
|
+**Operation**:
|
|
|
|
+
|
|
|
|
+ for n > 1 {
|
|
|
|
+ n = n / 2
|
|
|
|
+ for i in 0 ..< n {
|
|
|
|
+ a[i] = a[2*i+0] * a[2*i+1]
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ res := a[0]
|
|
|
|
+
|
|
|
|
+Graphical representation of the operation for N=4:
|
|
|
|
+
|
|
|
|
+ +-----------------------+
|
|
|
|
+ v: | v0 | v1 | v2 | v3 |
|
|
|
|
+ +-----------------------+
|
|
|
|
+ | | | |
|
|
|
|
+ `>[x]<' `>[x]<'
|
|
|
|
+ | |
|
|
|
|
+ `--->[x]<--'
|
|
|
|
+ |
|
|
|
|
+ v
|
|
|
|
+ +-----+
|
|
|
|
+ result: | y0 |
|
|
|
|
+ +-----+
|
|
|
|
+*/
|
|
|
|
+reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
|
|
|
|
+ where intrinsics.type_is_numeric(E) {
|
|
|
|
+ when N == 64 { v64 := v }
|
|
|
|
+ when N == 32 { v32 := v }
|
|
|
|
+ when N == 16 { v16 := v }
|
|
|
|
+ when N == 8 { v8 := v }
|
|
|
|
+ when N == 4 { v4 := v }
|
|
|
|
+ when N == 2 { v2 := v }
|
|
|
|
+
|
|
|
|
+ when N >= 64 {
|
|
|
|
+ x32 := swizzle(v64,
|
|
|
|
+ 0, 2, 4, 6, 8, 10, 12, 14,
|
|
|
|
+ 16, 18, 20, 22, 24, 26, 28, 30,
|
|
|
|
+ 32, 34, 36, 38, 40, 42, 44, 46,
|
|
|
|
+ 48, 50, 52, 54, 56, 58, 60, 62)
|
|
|
|
+ y32 := swizzle(v64,
|
|
|
|
+ 1, 3, 5, 7, 9, 11, 13, 15,
|
|
|
|
+ 17, 19, 21, 23, 25, 27, 29, 31,
|
|
|
|
+ 33, 35, 37, 39, 41, 43, 45, 47,
|
|
|
|
+ 49, 51, 53, 55, 57, 59, 61, 63)
|
|
|
|
+ v32 := x32 * y32
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 32 {
|
|
|
|
+ x16 := swizzle(v32,
|
|
|
|
+ 0, 2, 4, 6, 8, 10, 12, 14,
|
|
|
|
+ 16, 18, 20, 22, 24, 26, 28, 30)
|
|
|
|
+ y16 := swizzle(v32,
|
|
|
|
+ 1, 3, 5, 7, 9, 11, 13, 15,
|
|
|
|
+ 17, 19, 21, 23, 25, 27, 29, 31)
|
|
|
|
+ v16 := x16 * y16
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 16 {
|
|
|
|
+ x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
|
|
|
|
+ y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
|
|
|
|
+ v8 := x8 * y8
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 8 {
|
|
|
|
+ x4 := swizzle(v8, 0, 2, 4, 6)
|
|
|
|
+ y4 := swizzle(v8, 1, 3, 5, 7)
|
|
|
|
+ v4 := x4 * y4
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 4 {
|
|
|
|
+ x2 := swizzle(v4, 0, 2)
|
|
|
|
+ y2 := swizzle(v4, 1, 3)
|
|
|
|
+ v2 := x2 * y2
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 2 {
|
|
|
|
+ return extract(v2, 0) * extract(v2, 1)
|
|
|
|
+ } else {
|
|
|
|
+ return extract(v, 0)
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+Reduce a vector to a scalar by multiplying up all the lanes in a binary fashion.
|
|
|
|
+
|
|
|
|
+This procedure returns a scalar that is the product of all lanes, calculated by
|
|
|
|
+splitting the vector in two parts and multiplying the two halves together
|
|
|
|
+element-wise until only a single element remains. This is repeated until only a
|
|
|
|
+single element remains. This order may be faster to compute than the ordered
|
|
|
|
+product for floats, as it can be better parallelized.
|
|
|
|
+
|
|
|
|
+The order of the product may be important for accounting for precision errors
|
|
|
|
+in floating-point computation, as floating-point multiplication is not
|
|
|
|
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
|
|
|
|
+
|
|
|
|
+Inputs:
|
|
|
|
+- `v`: The vector to reduce.
|
|
|
|
+
|
|
|
|
+Result:
|
|
|
|
+- Product of all lanes, as a scalar.
|
|
|
|
+
|
|
|
|
+**Operation**:
|
|
|
|
+
|
|
|
|
+ for n > 1 {
|
|
|
|
+ n = n / 2
|
|
|
|
+ for i in 0 ..< n {
|
|
|
|
+ a[i] *= a[i+n]
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ res := a[0]
|
|
|
|
+
|
|
|
|
+Graphical representation of the operation for N=4:
|
|
|
|
+
|
|
|
|
+ +-----------------------+
|
|
|
|
+ | v0 | v1 | v2 | v3 |
|
|
|
|
+ +-----------------------+
|
|
|
|
+ | | | |
|
|
|
|
+ [x]<-- | ---' |
|
|
|
|
+ | [x]<--------'
|
|
|
|
+ | |
|
|
|
|
+ `>[x]<'
|
|
|
|
+ |
|
|
|
|
+ v
|
|
|
|
+ +-----+
|
|
|
|
+ result: | y0 |
|
|
|
|
+ +-----+
|
|
|
|
+*/
|
|
|
|
+reduce_mul_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
|
|
|
|
+ where intrinsics.type_is_numeric(E) {
|
|
|
|
+ when N == 64 { v64 := v }
|
|
|
|
+ when N == 32 { v32 := v }
|
|
|
|
+ when N == 16 { v16 := v }
|
|
|
|
+ when N == 8 { v8 := v }
|
|
|
|
+ when N == 4 { v4 := v }
|
|
|
|
+ when N == 2 { v2 := v }
|
|
|
|
+
|
|
|
|
+ when N >= 64 {
|
|
|
|
+ x32 := swizzle(v64,
|
|
|
|
+ 0, 1, 2, 3, 4, 5, 6, 7,
|
|
|
|
+ 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
+ 16, 17, 18, 19, 20, 21, 22, 23,
|
|
|
|
+ 24, 25, 26, 27, 28, 29, 30, 31)
|
|
|
|
+ y32 := swizzle(v64,
|
|
|
|
+ 32, 33, 34, 35, 36, 37, 38, 39,
|
|
|
|
+ 40, 41, 42, 43, 44, 45, 46, 47,
|
|
|
|
+ 48, 49, 50, 51, 52, 53, 54, 55,
|
|
|
|
+ 56, 57, 58, 59, 60, 61, 62, 63)
|
|
|
|
+ v32 := x32 * y32
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 32 {
|
|
|
|
+ x16 := swizzle(v32,
|
|
|
|
+ 0, 1, 2, 3, 4, 5, 6, 7,
|
|
|
|
+ 8, 9, 10, 11, 12, 13, 14, 15)
|
|
|
|
+ y16 := swizzle(v32,
|
|
|
|
+ 16, 17, 18, 19, 20, 21, 22, 23,
|
|
|
|
+ 24, 25, 26, 27, 28, 29, 30, 31)
|
|
|
|
+ v16 := x16 * y16
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 16 {
|
|
|
|
+ x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
|
|
|
|
+ y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
|
|
|
|
+ v8 := x8 * y8
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 8 {
|
|
|
|
+ x4 := swizzle(v8, 0, 1, 2, 3)
|
|
|
|
+ y4 := swizzle(v8, 4, 5, 6, 7)
|
|
|
|
+ v4 := x4 * y4
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 4 {
|
|
|
|
+ x2 := swizzle(v4, 0, 1)
|
|
|
|
+ y2 := swizzle(v4, 2, 3)
|
|
|
|
+ v2 := x2 * y2
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ when N >= 2 {
|
|
|
|
+ return extract(v2, 0) * extract(v2, 1)
|
|
|
|
+ } else {
|
|
|
|
+ return extract(v, 0)
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|