7 months ago · b7afbd6d57
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -24,9 +24,9 @@ import "base:intrinsics"
 
				 /*
			
 
				 Check if SIMD is emulated on a target platform.
			
 
				 
			
 
				-This value is `true`, if the compile-time target has the hardware support for
			
 
				+This value is `false`, when the compile-time target has the hardware support for
			
 
				 at least 128-bit SIMD. If the compile-time target lacks the hardware support
			
 
				-for 128-bit SIMD, this value is `false`, and all SIMD operations will be
			
 
				+for 128-bit SIMD, this value is `true`, and all SIMD operations will likely be
			
 
				 emulated.
			
 
				 */
			
 
				 IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
			
@@ -481,7 +481,7 @@ Result:
 
				 
			
 
				 Example:
			
 
				 
			
 
				-This example assumes that the `a` vector is of a signed type.
			
 
				+	// An example for a 4-lane 8-bit signed integer vector `a`.
			
 
				 
			
 
				 	   +-------+-------+-------+-------+
			
 
				 	a: |  0x11 |  0x55 |  0x03 |  0xff |
			
@@ -503,7 +503,7 @@ This procedure returns a vector, such that each lane holds the result of a
 
				 shift-left (aka shift-up) operation, of lane from the vector `a` by the shift
			
 
				 amount from the corresponding lane of the vector `b`.
			
 
				 
			
 
				-The shift amount is rounded to (masked) to the bit-width of the lane.
			
 
				+The shift amount is wrapped (masked) to the bit-width of the lane.
			
 
				 
			
 
				 Inputs:
			
 
				 - `a`: An integer vector of values to shift.
			
@@ -522,6 +522,8 @@ Result:
 
				 
			
 
				 Example:
			
 
				 
			
 
				+	// An example for a 4-lane vector `a` of 8-bit signed integers.
			
 
				+
			
 
				 	   +-------+-------+-------+-------+
			
 
				 	a: |  0x11 |  0x55 |  0x03 |  0xff |
			
 
				 	   +-------+-------+-------+-------+
			
@@ -542,7 +544,7 @@ This procedure returns a vector, such that each lane holds the result of a
 
				 shift-right (aka shift-down) operation, of lane from the vector `a` by the shift
			
 
				 amount from the corresponding lane of the vector `b`.
			
 
				 
			
 
				-The shift amount is rounded to (masked) to the bit-width of the lane.
			
 
				+The shift amount is wrapped (masked) to the bit-width of the lane.
			
 
				 
			
 
				 If the first vector is a vector of signed integers, the arithmetic shift
			
 
				 operation is performed. Otherwise, if the first vector is a vector of unsigned
			
@@ -565,7 +567,7 @@ Result:
 
				 
			
 
				 Example:
			
 
				 
			
 
				-This example assumes that the `a` vector is of a signed type.
			
 
				+	// An example for a 4-lane vector `a` of 8-bit signed integers.
			
 
				 
			
 
				 	   +-------+-------+-------+-------+
			
 
				 	a: |  0x11 |  0x55 |  0x03 |  0xff |
			
@@ -583,9 +585,10 @@ shr_masked :: intrinsics.simd_shr_masked
 
				 /*
			
 
				 Saturated addition of SIMD vectors.
			
 
				 
			
 
				-The *saturated sum* is a sum, that upon overflow or underflow, instead of
			
 
				-round-tripping, keeps the value clamped between the minimum and the maximum
			
 
				-values of the lane type.
			
 
				+The *saturated sum* is just like a normal sum, except the treatment of the
			
 
				+result upon overflow or underflow is different. In saturated operations, the
			
 
				+result is not wrapped to the bit-width of the lane, and instead is kept clamped
			
 
				+between the minimum and the maximum values of the lane type.
			
 
				 
			
 
				 This procedure returns a vector where each lane is the saturated sum of the
			
 
				 corresponding lanes of vectors `a` and `b`.
			
@@ -613,7 +616,7 @@ Returns:
 
				 
			
 
				 Example:
			
 
				 
			
 
				-Assuming unsigned bytes as the type of the element in a lane:
			
 
				+	// An example for a 4-lane vector `a` of 8-bit unsigned integers.
			
 
				 
			
 
				 	   +-----+-----+-----+-----+
			
 
				 	a: |  0  | 255 |  2  |  3  |
			
@@ -631,9 +634,10 @@ saturating_add :: intrinsics.simd_saturating_add
 
				 /*
			
 
				 Saturated subtraction of 2 lanes of vectors.
			
 
				 
			
 
				-The *saturated difference* is a difference, that upon overflow or underflow,
			
 
				-instead of round-tripping, keeps the value clamped between the minimum and the
			
 
				-maximum values of the lane type.
			
 
				+The *saturated difference* is just like a normal difference, except the treatment of the
			
 
				+result upon overflow or underflow is different. In saturated operations, the
			
 
				+result is not wrapped to the bit-width of the lane, and instead is kept clamped
			
 
				+between the minimum and the maximum values of the lane type.
			
 
				 
			
 
				 This procedure returns a vector where each lane is the saturated difference of
			
 
				 the corresponding lanes of vectors `a` and `b`.
			
@@ -661,7 +665,7 @@ Returns:
 
				 
			
 
				 Example:
			
 
				 
			
 
				-Assuming unsigned bytes as the type of the element in a lane:
			
 
				+	// An example for a 4-lane vector `a` of 8-bit unsigned integers.
			
 
				 
			
 
				 	   +-----+-----+-----+-----+
			
 
				 	a: |  0  | 255 |  2  |  3  |
			
@@ -1274,7 +1278,7 @@ for the `ptr` and `mask` parameters.
 
				 Inputs:
			
 
				 - `ptr`: A vector of memory locations. Each pointer points to a single value,
			
 
				 	of a SIMD vector's lane type, that will be loaded into the vector. Pointer
			
 
				-	in this vector can be `nil` or any other invalid value, if the corresponding
			
 
				+	in this vector can be `nil` or any other invalid value if the corresponding
			
 
				 	value in the `mask` parameter is zero.
			
 
				 - `val`: A vector of values that will be used at corresponding positions
			
 
				 	of the result vector, if the corresponding memory location has been
			
@@ -1304,16 +1308,16 @@ from the value vector `val`.
 
				 
			
 
				 Example:
			
 
				 
			
 
				-Example below loads 2 lanes of values from 2 lanes of float vectors, `v1` and
			
 
				-`v2`. From each of these vectors we're loading the second value, into the first
			
 
				-and the third position of the result vector.
			
 
				+	// Example below loads 2 lanes of values from 2 lanes of float vectors, `v1` and
			
 
				+	// `v2`. From each of these vectors we're loading the second value, into the first
			
 
				+	// and the third position of the result vector.
			
 
				 
			
 
				-Therefore the `ptrs` argument is initialized such that the first and the third
			
 
				-value are the addresses of the values that we want to load into the result
			
 
				-vector, and we'll fill in `nil` for the rest of them. To prevent CPU from
			
 
				-dereferencing those `nil` addresses we provide the mask that only allows us
			
 
				-to load valid positions of the `ptrs` array, and the array of defaults which
			
 
				-will have `127` in each position as the default value.
			
 
				+	// Therefore the `ptrs` argument is initialized such that the first and the third
			
 
				+	// value are the addresses of the values that we want to load into the result
			
 
				+	// vector, and we'll fill in `nil` for the rest of them. To prevent CPU from
			
 
				+	// dereferencing those `nil` addresses we provide the mask that only allows us
			
 
				+	// to load valid positions of the `ptrs` array, and the array of defaults which
			
 
				+	// will have `127` in each position as the default value.
			
 
				 
			
 
				 	v1 := [4] f32 {1, 2, 3, 4};
			
 
				 	v2 := [4] f32 {9, 10,11,12};
			
@@ -1323,9 +1327,13 @@ will have `127` in each position as the default value.
 
				 	res := simd.gather(ptrs, defaults, mask)
			
 
				 	fmt.println(res)
			
 
				 
			
 
				-The code would print `<2, 127, 10, 127>`. First and the third positions came
			
 
				-from the `ptrs` array, and the other 2 lanes of from the default vector.
			
 
				-Graphic below shows how the values of the result are decided based on the mask:
			
 
				+Output:
			
 
				+
			
 
				+	<2, 127, 10, 127>
			
 
				+
			
 
				+The first and the third positions came from the `ptrs` array, and the other
			
 
				+2 lanes came from the default vector. The graphic below shows how the values of
			
 
				+the result are decided based on the mask:
			
 
				 
			
 
				 	      +-------------------------------+ 
			
 
				 	mask: |   1   |   0   |   1   |   0   | 
			
@@ -1375,9 +1383,9 @@ Inputs:
 
				 
			
 
				 Example:
			
 
				 
			
 
				-Example below writes value `127` to the second element of two different
			
 
				-vectors. The addresses of store destinations are written to the first and the
			
 
				-third argument of the `ptr` vector, and the `mask` is set accordingly.
			
 
				+	// Example below writes value `127` to the second element of two different
			
 
				+	// vectors. The addresses of store destinations are written to the first and the
			
 
				+	// third argument of the `ptr` vector, and the `mask` is set accordingly.
			
 
				 
			
 
				 	v1 := [4] f32 {1, 2, 3, 4};
			
 
				 	v2 := [4] f32 {5, 6, 7, 8};
			
@@ -1388,12 +1396,12 @@ third argument of the `ptr` vector, and the `mask` is set accordingly.
 
				 	fmt.println(v1)
			
 
				 	fmt.println(v2)
			
 
				 
			
 
				-This code prints the values of the two vectors, after modification by `scatter`:
			
 
				+Output:
			
 
				 
			
 
				 	[1, 127, 3, 4]
			
 
				 	[5, 127, 7, 8]
			
 
				 
			
 
				-Graphic below shows how the data gets written into memory.
			
 
				+The graphic below shows how the data gets written into memory.
			
 
				 
			
 
				 	
			
 
				 	      +-------------------+
			
@@ -1446,9 +1454,9 @@ memory, and the other lanes are loaded from the `val` vector.
 
				 
			
 
				 Example:
			
 
				 
			
 
				-The following code loads two values from the `src` vector, the first and the
			
 
				-third value (selected by the mask). The masked-off values are given the value
			
 
				-of 127 (`0x7f`).
			
 
				+	// The following code loads two values from the `src` vector, the first and the
			
 
				+	// third value (selected by the mask). The masked-off values are given the value
			
 
				+	// of 127 (`0x7f`).
			
 
				 
			
 
				 	src := [4] f32 {1, 2, 3, 4};
			
 
				 	mask := #simd [4]bool { true, false, true, false }
			
@@ -1456,11 +1464,11 @@ of 127 (`0x7f`).
 
				 	res := simd.masked_load(&src, vals, mask)
			
 
				 	fmt.println(res)
			
 
				 
			
 
				-The above code prints the following:
			
 
				+Output:
			
 
				 
			
 
				 	<1, 127, 3, 127>
			
 
				 
			
 
				-Graphic below demonstrates the flow of lanes.
			
 
				+The graphic below demonstrates the flow of lanes.
			
 
				 
			
 
				 	      +-------------------------------+ 
			
 
				 	mask: |   1   |   0   |   1   |   0   | 
			
@@ -1506,8 +1514,8 @@ Inputs:
 
				 
			
 
				 Example:
			
 
				 
			
 
				-Example below stores the value 127 into the first and the third slot of the
			
 
				-vector `v`.
			
 
				+	// Example below stores the value 127 into the first and the third slot of the
			
 
				+	// vector `v`.
			
 
				 
			
 
				 	v := [4] f32 {1, 2, 3, 4};
			
 
				 	mask := #simd [4]bool { true, false, true, false }
			
@@ -1515,11 +1523,11 @@ vector `v`.
 
				 	simd.masked_store(&v, vals, mask)
			
 
				 	fmt.println(v)
			
 
				 
			
 
				-After the masked store the printed result is:
			
 
				+Output:
			
 
				 
			
 
				 	[127, 2, 127, 4]
			
 
				 
			
 
				-Graphic below shows the flow of lanes:
			
 
				+The graphic below shows the flow of lanes:
			
 
				 
			
 
				 	      +-------------------+
			
 
				 	mask: | 1  | 0  | 1  | 0  |
			
@@ -1575,13 +1583,13 @@ Returns:
 
				 
			
 
				 Example:
			
 
				 
			
 
				-The example below loads two values from memory of the vector `v`. Two values in
			
 
				-the mask are set to `true`, meaning only two memory items will be loaded into
			
 
				-the result vector. The mask is set to `true` in the first and the third
			
 
				-position, which specifies that the first memory item will be read into the
			
 
				-first lane of the result vector, and the second memory item will be read into
			
 
				-the third lane of the result vector. All the other lanes of the result vector
			
 
				-will be initialized to the default value `127`.
			
 
				+	// The example below loads two values from memory of the vector `v`. Two values in
			
 
				+	// the mask are set to `true`, meaning only two memory items will be loaded into
			
 
				+	// the result vector. The mask is set to `true` in the first and the third
			
 
				+	// position, which specifies that the first memory item will be read into the
			
 
				+	// first lane of the result vector, and the second memory item will be read into
			
 
				+	// the third lane of the result vector. All the other lanes of the result vector
			
 
				+	// will be initialized to the default value `127`.
			
 
				 
			
 
				 	v := [2] f64 {1, 2};
			
 
				 	mask := #simd [4]bool { true, false, true, false }
			
@@ -1589,7 +1597,7 @@ will be initialized to the default value `127`.
 
				 	res := simd.masked_expand_load(&v, vals, mask)
			
 
				 	fmt.println(res)
			
 
				 
			
 
				-The above code prints the following:
			
 
				+Output:
			
 
				 
			
 
				 	<1, 127, 2, 127>
			
 
				 
			
@@ -1640,9 +1648,9 @@ Inputs:
 
				 
			
 
				 Example:
			
 
				 
			
 
				-The code below fills the vector `v` with two values from a 4-element SIMD
			
 
				-vector, the first and the third value. The items in the mask are set to `true`
			
 
				-in those lanes.
			
 
				+	// The code below fills the vector `v` with two values from a 4-element SIMD
			
 
				+	// vector, the first and the third value. The items in the mask are set to `true`
			
 
				+	// in those lanes.
			
 
				 
			
 
				 	v := [2] f64 { };
			
 
				 	mask := #simd [4]bool { true, false, true, false }
			
@@ -1650,7 +1658,7 @@ in those lanes.
 
				 	simd.masked_compress_store(&v, vals, mask)
			
 
				 	fmt.println(v)
			
 
				 
			
 
				-The code above prints the following:
			
 
				+Output:
			
 
				 
			
 
				 	[1, 3]
			
 
				 
			
@@ -1928,16 +1936,16 @@ Result:
 
				 	}
			
 
				 	return res
			
 
				 
			
 
				-**Example**
			
 
				+Example:
			
 
				 
			
 
				-The example below shows how the indices are used to determine which lanes of the
			
 
				-input vector get written into the result vector.
			
 
				+	// The example below shows how the indices are used to determine which lanes of the
			
 
				+	// input vector get written into the result vector.
			
 
				 	
			
 
				 	x := #simd [4]f32 { 1.5, 2.5, 3.5, 4.5 }
			
 
				 	res := simd.swizzle(x, 0, 3, 1, 1)
			
 
				 	fmt.println(res)
			
 
				 
			
 
				-The above code will print the following to the console:
			
 
				+Output:
			
 
				 
			
 
				 	[ 1.5, 4.5, 2.5, 2.5 ]
			
 
				 
			
@@ -1998,10 +2006,10 @@ Result:
 
				 	}
			
 
				 	return res
			
 
				 
			
 
				-**Example**
			
 
				+Example:
			
 
				 
			
 
				-The example below shows how the indices are used to determine lanes of the
			
 
				-input vector that are shuffled into the result vector.
			
 
				+	// The example below shows how the indices are used to determine lanes of the
			
 
				+	// input vector that are shuffled into the result vector.
			
 
				 	
			
 
				 	a := #simd [4]f32 { 1, 2, 3, 4 }
			
 
				 	b := #simd [4]f32 { 5, 6, 7, 8 }
			
@@ -2009,7 +2017,7 @@ input vector that are shuffled into the result vector.
 
				 	res := simd.swizzle(x, 0, 4, 2, 5)
			
 
				 	fmt.println(res)
			
 
				 
			
 
				-The above code will print the following to the console:
			
 
				+Output:
			
 
				 
			
 
				 	[ 1, 5, 3, 6 ]
			
 
				 
			
@@ -2065,7 +2073,7 @@ Result:
 
				 	}
			
 
				 	return res
			
 
				 
			
 
				-**Example**:
			
 
				+Example:
			
 
				 
			
 
				 The following example selects values from the two input vectors, `a` and `b`
			
 
				 into a single vector.
			
@@ -2348,12 +2356,15 @@ Returns:
 
				 
			
 
				 Example:
			
 
				 
			
 
				+	// An example for a 4-lane float vector. Note that, to reduce clutter,
			
 
				+	// the result is shown as rounded to 2 decimal places.
			
 
				+
			
 
				 	   +------+------+------+------+
			
 
				-	a: |   0  |   1  |   3  |   5  |
			
 
				+	a: |  +0  |   1  |   3  |   5  |
			
 
				 	   +------+------+------+------+
			
 
				 	res:
			
 
				 	   +------+------+------+------+
			
 
				-	   |   0  |   1  | 0.33 |  0.2 |
			
 
				+	   |  +∞  |   1  | 0.33 |  0.2 |
			
 
				 	   +------+------+------+------+
			
 
				 */
			
 
				 recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {