
Implemented non-immediate uniform offset bit shifts for SSE2.

David Piuva 10 months ago
parent
commit
6f9822216d
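
The new overloads take one scalar bit offset that is applied uniformly to every lane, instead of requiring the offset to be broadcast into an offset vector first. Below is a minimal usage sketch; the include path and the dsr namespace are assumptions, while the vector types and shift functions are the ones touched by this commit.

	#include "DFPSR/base/simd.h"  // Assumed include path.

	void shiftExample() {
		using namespace dsr; // Assumed namespace.
		uint32_t offset = 3; // Uniform across lanes, but not a compile-time constant.
		U32x4 a = U32x4(1, 2, 3, 4) << offset;                   // 8, 16, 24, 32
		U16x8 b = U16x8(1, 2, 3, 4, 5, 6, 7, 8) >> uint32_t(1);  // 0, 1, 1, 2, 2, 3, 3, 4
		// Offsets known at compile time should keep using the immediate templates:
		U32x4 c = bitShiftLeftImmediate<2>(a);                   // 32, 64, 96, 128
		(void)b; (void)c;
	}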

+ 0 - 4
Source/DFPSR/History.txt

@@ -46,10 +46,6 @@ Changes from version 0.2.0 to version 0.3.0 (Performance, safety and template im
 			Replace any << or >> operator that takes a constant offset with the new functions to prevent slowing down.
 				Replace a << 3 with bitShiftLeftImmediate<3>(a).
 				Replace a >> 5 with bitShiftRightImmediate<5>(a).
-			To get dynamic offset, cast the bit offset into a SIMD vector of unsigned integers with the same number of lanes.
-				Replace a << b with a << U32x4(b), a << U16x8(b), a << U8x16(b), a << U32x8(b), a << U16x16(b), a << U8x32(b), a << U32xX(b), a << U16xX(b) or a << U8xX(b).
-				Replace a >> b with a >> U32x4(b), a >> U16x8(b), a >> U8x16(b), a >> U32x8(b), a >> U16x16(b), a >> U8x32(b), a >> U32xX(b), a >> U16xX(b) or a >> U8xX(b).
-				The more lanes you use, the slower it becomes when not available in SIMD hardware, so try to use at least 32-bit integers for faster fallback implementations.
 		* clamp, clampLower and clampUpper are global methods instead of member methods, to work the same for scalar operations in template functions.
 			Replace myVector.clamp(min, max) with clamp(VectorType(min), myVector, VectorType(max)).
 			Replace myVector.clampLower(min) with clampLower(VectorType(min), myVector).

+ 0 - 59
Source/DFPSR/api/textureAPI.h

@@ -25,65 +25,6 @@
 // Everything stored directly in the image types is immutable to allow value types to behave like reference types using the data that they point to.
 // Image types can not be dynamically casted, because the inheritance is entirely static without any virtual functions.
 
-// TODO: Create a fast way to generate masks from an exponential scale floating mip level taken from sampling distances.
-// float samplingDistance (input expressed as some kind of distance in the uv coordinates between two adjacent pixels)
-// uint32_t tileXYMask (tiling should be applied to X and Y using the same mask after limiting to 16 bit integers)
-// uint32_t maxLevelMask
-// So how do we get the weights without shifting bits by the actual bit offset?
-//   Maybe add one to the mask to get a single bit and then multiply.
-/*
-	TODO: Try to handle negative texture coordinates and let positive UV be an optimization flag to enable when known to be valid.
-	      Convert to int32_t with less range and convert to unsigned correctly in modulo of 24 bits.
-
-	// Use leading zeroes to create a mask, which can be turned into a power of two by adding one.
-	// 0001000000000000 -> 0001111111111111
-	// 0001011001000100 -> 0001111111111111
-	// 0001111111111111 -> 0001111111111111
-	// 0000010000000000 -> 0000011111111111
-	// 0000010110010001 -> 0000011111111111
-	// 0000011111111111 -> 0000011111111111
-	// 0000000000100000 -> 0000000000111111
-	// 0000000000101100 -> 0000000000111111
-	// 0000000000111111 -> 0000000000111111
-	uint16_t maskFromLeadingZeroes(uint16_t value) {
-		// Turning 10 into 11
-		uint16_t result = value | (value >> 1);
-		// Turning 1100 into 1111
-		result = result | (result >> 2);
-		// Turning 11110000 into 11111111
-		result = result | (result >> 4);
-		// Turning 1111111100000000 into 1111111111111111
-		result = result | (result >> 8);
-	}
-
-	Generate masks for sampling a specific texture at a specific mip level.
-	  They can then be reused for multiple samples.
-	Pre-condition:
-	  0.0f < samplingDistance
-	  Use min, max, absm et cetera to create a positive sampling distance.
-	void createMasks(float samplingDistance) {
-		uint32_t density = truncateToU32(reciprocal(samplingDistance));
-		// Intel SSE2 does not have dynamic offset bit shifts, because it can only shift by constant bit offsets or dynamic byte offsets.
-		// SSE2, AVX2 and NEON have low 16-bit unsigned multiplication.
-		//   _mm_mullo_epi16, _mm256_mullo_epi16 and vmulq_u16
-		//   Using lower bits might however not be enough and might take more time than simply shifting with scalar operations.
-		//   Then we might as well use SIMD comparisons and make bit masks the way to implement it on all platforms.
-		//     Because returning 1 can be used to return a mask as a fallback.
-		//     And one can also create many overloads for direct selection without the mask in between for future optimization.
-		//   Let textures created from images have 4 mip levels by default, and allow increasing the maximum depth with an optional argument.
-		//     Then make three comparisons to select a mip level.
-		uint16_t mask = maskFromLeadingZeroes(density);
-		// scale is a power of two 16-bit integer used to multiply uv coordinates.
-		//   But SSE2 also does not have 32-bit integer multiplication, so stay in 16 bits or use bit shifts!
-		//   Split into whole pixels and weights before the multiplication somehow.
-		uint16_t scale = mask + 1;
-		// Cast directly to uint16_t with saturation.
-		tileXMask = texture.minimumWidth * scale;
-		tileYMask = texture.minimumHeight * scale;
-		startOffset = texture.startOffsetMask * scale * scale;
-	}
-*/
-
 #ifndef DFPSR_API_TEXTURE
 #define DFPSR_API_TEXTURE
 

+ 240 - 104
Source/DFPSR/base/simd.h

@@ -2284,6 +2284,152 @@
 			IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, *)
 		#endif
 	}
+
+	// Bitwise and
+	inline U16x8 operator&(const U16x8& left, const U16x8& right) {
+		#if defined(USE_SSE2)
+			return U16x8(_mm_and_si128(left.v, right.v));
+		#elif defined(USE_NEON)
+			return U16x8(vandq_u16(left.v, right.v));
+		#else
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, &)
+		#endif
+	}
+	// Bitwise or
+	inline U16x8 operator|(const U16x8& left, const U16x8& right) {
+		#if defined(USE_SSE2)
+			return U16x8(_mm_or_si128(left.v, right.v));
+		#elif defined(USE_NEON)
+			return U16x8(vorrq_u16(left.v, right.v));
+		#else
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, |)
+		#endif
+	}
+	// Bitwise xor
+	inline U16x8 operator^(const U16x8& left, const U16x8& right) {
+		#if defined(USE_SSE2)
+			return U16x8(_mm_xor_si128(left.v, right.v));
+		#elif defined(USE_NEON)
+			return U16x8(veorq_u16(left.v, right.v));
+		#else
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, ^)
+		#endif
+	}
+	// Bitwise negation
+	inline U16x8 operator~(const U16x8& value) {
+		#if defined(USE_NEON)
+			return U16x8(vmvnq_u16(value.v));
+		#elif defined(USE_BASIC_SIMD)
+			// Fall back on xor against all ones.
+			return value ^ U16x8(~uint16_t(0));
+		#else
+			// TODO: Generate automatically using a macro.
+			return U16x8(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3], ~value.scalars[4], ~value.scalars[5], ~value.scalars[6], ~value.scalars[7]);
+		#endif
+	}
+	inline U16x8 operator<<(const U16x8& left, const U16x8 &bitOffsets) {
+		#ifdef SAFE_POINTER_CHECKS
+			if(!allLanesLesser(bitOffsets, U16x8(16u))) {
+				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
+			}
+		#endif
+		#if defined(USE_SSE2)
+			IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
+		#elif defined(USE_NEON)
+			return U16x8(vshlq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
+		#else
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
+		#endif
+	}
+	inline U16x8 operator>>(const U16x8& left, const U16x8 &bitOffsets) {
+		#ifdef SAFE_POINTER_CHECKS
+			if(!allLanesLesser(bitOffsets, U16x8(16u))) {
+				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
+			}
+		#endif
+		#if defined(USE_SSE2)
+			IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
+		#elif defined(USE_NEON)
+			//return U16x8(vshrq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
+			return U16x8(vshlq_u16(left.v, vnegq_s16(vreinterpretq_s16_u16(bitOffsets.v))));
+		#else
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
+		#endif
+	}
+	inline U16x8 operator<<(const U16x8& left, const uint32_t &bitOffset) {
+		#if defined(USE_SSE2)
+			#ifdef SAFE_POINTER_CHECKS
+				if(bitOffset >= 16u) {
+					throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..15!\n");
+				}
+			#endif
+			// Write the content to aligned stack memory.
+			ALIGN16 __m128i values;
+			left.writeAlignedUnsafe((uint16_t*)&values);
+			// Cast a pointer to the data into two 64-bit elements.
+			uint64_t *largeLanes = (uint64_t*)&values;
+			// Shift the 128 bits as two 64-bit values.
+			largeLanes[0] = largeLanes[0] << bitOffset;
+			largeLanes[1] = largeLanes[1] << bitOffset;
+			// Create a mask.
+			U16x8 mask = U16x8(uint16_t(~0u) << bitOffset);
+			// Return the shifted 64-bit elements masked to remove spill across lanes.
+			return U16x8::readAlignedUnsafe((uint16_t*)&values) & mask;
+		#else
+			return left << U16x8(bitOffset);
+		#endif
+	}
+	inline U16x8 operator>>(const U16x8& left, const uint32_t &bitOffset) {
+		#if defined(USE_SSE2)
+			#ifdef SAFE_POINTER_CHECKS
+				if(bitOffset >= 16u) {
+					throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..15!\n");
+				}
+			#endif
+			// Write the content to aligned stack memory.
+			ALIGN16 __m128i values;
+			left.writeAlignedUnsafe((uint16_t*)&values);
+			// Cast a pointer to the data into two 64-bit elements.
+			uint64_t *largeLanes = (uint64_t*)&values;
+			// Shift the 128 bits as two 64-bit values.
+			largeLanes[0] = largeLanes[0] >> bitOffset;
+			largeLanes[1] = largeLanes[1] >> bitOffset;
+			// Create a mask.
+			U16x8 mask = U16x8(uint16_t(~0u) >> bitOffset);
+			// Return the shifted 64-bit elements masked to remove spill across lanes.
+			return U16x8::readAlignedUnsafe((uint16_t*)&values) & mask;
+		#else
+			return left >> U16x8(bitOffset);
+		#endif
+	}
+	// bitOffset must be an immediate constant, so a template argument is used.
+	template <uint32_t bitOffset>
+	inline U16x8 bitShiftLeftImmediate(const U16x8& left) {
+		static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
+		#if defined(USE_SSE2)
+			return U16x8(_mm_slli_epi16(left.v, bitOffset));
+		#elif defined(USE_NEON)
+			return U16x8(vshlq_u16(left.v, vdupq_n_s16(bitOffset)));
+		#else
+			U16x8 bitOffsets = U16x8(bitOffset);
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
+		#endif
+	}
+	// bitOffset must be an immediate constant.
+	template <uint32_t bitOffset>
+	inline U16x8 bitShiftRightImmediate(const U16x8& left) {
+		static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
+		#if defined(USE_SSE2)
+			return U16x8(_mm_srli_epi16(left.v, bitOffset));
+		#elif defined(USE_NEON)
+			//return U16x8(vshrq_u16(left.v, vdupq_n_s16(bitOffset)));
+			return U16x8(vshlq_u16(left.v, vdupq_n_s16(-(int32_t)bitOffset)));
+		#else
+			U16x8 bitOffsets = U16x8(bitOffset);
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
+		#endif
+	}
+
 	// Bitwise and
 	inline U32x4 operator&(const U32x4& left, const U32x4& right) {
 		#if defined(USE_BASIC_SIMD)
@@ -2297,7 +2443,7 @@
 		#if defined(USE_BASIC_SIMD)
 			return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
 		#else
-			IMPL_SCALAR_REFERENCE_INFIX(left, right, U32x4, uint32_t, 4, |)
+			IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, |)
 		#endif
 	}
 	// Bitwise xor
@@ -2305,7 +2451,7 @@
 		#if defined(USE_BASIC_SIMD)
 			return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
 		#else
-			IMPL_SCALAR_REFERENCE_INFIX(left, right, U32x4, uint32_t, 4, ^)
+			IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, ^)
 		#endif
 	}
 	// Bitwise negation
@@ -2350,6 +2496,52 @@
 			IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, bitOffsets, U32x4, uint32_t, >>)
 		#endif
 	}
+	inline U32x4 operator<<(const U32x4& left, const uint32_t &bitOffset) {
+		#if defined(USE_SSE2)
+			#ifdef SAFE_POINTER_CHECKS
+				if(bitOffset >= 32u) {
+					throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..31!\n");
+				}
+			#endif
+			// Write the content to aligned stack memory.
+			ALIGN16 __m128i values;
+			left.writeAlignedUnsafe((uint32_t*)&values);
+			// Cast a pointer to the data into two 64-bit elements.
+			uint64_t *largeLanes = (uint64_t*)&values;
+			// Shift the 128 bits as two 64-bit values.
+			largeLanes[0] = largeLanes[0] << bitOffset;
+			largeLanes[1] = largeLanes[1] << bitOffset;
+			// Create a mask.
+			U32x4 mask = U32x4(uint32_t(~0u) << bitOffset);
+			// Return the shifted 64-bit elements masked to remove spill across lanes.
+			return U32x4::readAlignedUnsafe((uint32_t*)&values) & mask;
+		#else
+			return left << U32x4(bitOffset);
+		#endif
+	}
+	inline U32x4 operator>>(const U32x4& left, const uint32_t &bitOffset) {
+		#if defined(USE_SSE2)
+			#ifdef SAFE_POINTER_CHECKS
+				if(bitOffset >= 32u) {
+					throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..31!\n");
+				}
+			#endif
+			// Write the content to aligned stack memory.
+			ALIGN16 __m128i values;
+			left.writeAlignedUnsafe((uint32_t*)&values);
+			// Cast a pointer to the data into two 64-bit elements.
+			uint64_t *largeLanes = (uint64_t*)&values;
+			// Shift the 128 bits as two 64-bit values.
+			largeLanes[0] = largeLanes[0] >> bitOffset;
+			largeLanes[1] = largeLanes[1] >> bitOffset;
+			// Create a mask.
+			U32x4 mask = U32x4(uint32_t(~0u) >> bitOffset);
+			// Return the shifted 64-bit elements masked to remove spill across lanes.
+			return U32x4::readAlignedUnsafe((uint32_t*)&values) & mask;
+		#else
+			return left >> U32x4(bitOffset);
+		#endif
+	}
 	// bitOffset must be an immediate constant, so a template argument is used.
 	template <uint32_t bitOffset>
 	inline U32x4 bitShiftLeftImmediate(const U32x4& left) {
@@ -2379,63 +2571,6 @@
 		#endif
 	}
 
-	inline U16x8 operator<<(const U16x8& left, const U16x8 &bitOffsets) {
-		#ifdef SAFE_POINTER_CHECKS
-			if(!allLanesLesser(bitOffsets, U16x8(16u))) {
-				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
-			}
-		#endif
-		#if defined(USE_SSE2)
-			IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
-		#elif defined(USE_NEON)
-			return U16x8(vshlq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
-		#else
-			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
-		#endif
-	}
-	inline U16x8 operator>>(const U16x8& left, const U16x8 &bitOffsets) {
-		#ifdef SAFE_POINTER_CHECKS
-			if(!allLanesLesser(bitOffsets, U16x8(16u))) {
-				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
-			}
-		#endif
-		#if defined(USE_SSE2)
-			IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
-		#elif defined(USE_NEON)
-			//return U16x8(vshrq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
-			return U16x8(vshlq_u16(left.v, vnegq_s16(vreinterpretq_s16_u16(bitOffsets.v))));
-		#else
-			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
-		#endif
-	}
-	// bitOffset must be an immediate constant, so a template argument is used.
-	template <uint32_t bitOffset>
-	inline U16x8 bitShiftLeftImmediate(const U16x8& left) {
-		static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
-		#if defined(USE_SSE2)
-			return U16x8(_mm_slli_epi16(left.v, bitOffset));
-		#elif defined(USE_NEON)
-			return U16x8(vshlq_u32(left.v, vdupq_n_s16(bitOffset)));
-		#else
-			U16x8 bitOffsets = U16x8(bitOffset);
-			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
-		#endif
-	}
-	// bitOffset must be an immediate constant.
-	template <uint32_t bitOffset>
-	inline U16x8 bitShiftRightImmediate(const U16x8& left) {
-		static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
-		#if defined(USE_SSE2)
-			return U16x8(_mm_srli_epi16(left.v, bitOffset));
-		#elif defined(USE_NEON)
-			//return U16x8(vshrq_u16(left.v, vdupq_n_s16(bitOffset)));
-			return U16x8(vshlq_u16(left.v, vdupq_n_s16(-(int32_t)bitOffset)));
-		#else
-			U16x8 bitOffsets = U16x8(bitOffset);
-			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
-		#endif
-	}
-
 	inline U16x8 operator+(const U16x8& left, const U16x8& right) {
 		#if defined(USE_BASIC_SIMD)
 			return U16x8(ADD_U16_SIMD(left.v, right.v));
@@ -3004,102 +3139,103 @@
 		#endif
 	}
 
-	// ARM NEON does not support 256-bit vectors and Intel's AVX2 does not support variable shifting.
-	inline U32x8 operator<<(const U32x8& left, const U32x8 &bitOffsets) {
-		assert((uintptr_t(&left) & 31u) == 0);
+	// TODO: Implement bit shifts with non-immediate uniform offsets.
+
+	inline U16x16 operator<<(const U16x16& left, const U16x16 &bitOffsets) {
 		#ifdef SAFE_POINTER_CHECKS
-			if(!allLanesLesser(bitOffsets, U32x8(32u))) {
-				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
+			if(!allLanesLesser(bitOffsets, U16x16(16u))) {
+				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
 			}
 		#endif
 		#if defined(USE_AVX2)
-			IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
+			IMPL_SCALAR_FALLBACK_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
 		#else
-			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
+			IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
 		#endif
 	}
-	inline U32x8 operator>>(const U32x8& left, const U32x8 &bitOffsets) {
-		assert((uintptr_t(&left) & 31u) == 0);
+	inline U16x16 operator>>(const U16x16& left, const U16x16 &bitOffsets) {
 		#ifdef SAFE_POINTER_CHECKS
-			if(!allLanesLesser(bitOffsets, U32x8(32u))) {
-				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
+			if(!allLanesLesser(bitOffsets, U16x16(16u))) {
+				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
 			}
 		#endif
 		#if defined(USE_AVX2)
-			IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
+			IMPL_SCALAR_FALLBACK_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, >>)
 		#else
-			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
+			IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, >>)
 		#endif
 	}
 	// bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
 	template <uint32_t bitOffset>
-	inline U32x8 bitShiftLeftImmediate(const U32x8& left) {
-		assert((uintptr_t(&left) & 31u) == 0);
-		static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
+	inline U16x16 bitShiftLeftImmediate(const U16x16& left) {
+		static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
 		#if defined(USE_AVX2)
-			return U32x8(_mm256_slli_epi32(left.v, bitOffset));
+			return U16x16(_mm256_slli_epi16(left.v, bitOffset));
 		#else
-			U32x8 bitOffsets = U32x8(bitOffset);
-			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
+			U16x16 bitOffsets = U16x16(bitOffset);
+			IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
 		#endif
 	}
 	// bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
 	template <uint32_t bitOffset>
-	inline U32x8 bitShiftRightImmediate(const U32x8& left) {
-		assert((uintptr_t(&left) & 31u) == 0);
-		static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
+	inline U16x16 bitShiftRightImmediate(const U16x16& left) {
+		static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
 		#if defined(USE_AVX2)
-			return U32x8(_mm256_srli_epi32(left.v, bitOffset));
+			return U16x16(_mm256_srli_epi16(left.v, bitOffset));
 		#else
-			U32x8 bitOffsets = U32x8(bitOffset);
-			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
+			U16x16 bitOffsets = U16x16(bitOffset);
+			IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, >>)
 		#endif
 	}
 
-	inline U16x16 operator<<(const U16x16& left, const U16x16 &bitOffsets) {
+	inline U32x8 operator<<(const U32x8& left, const U32x8 &bitOffsets) {
+		assert((uintptr_t(&left) & 31u) == 0);
 		#ifdef SAFE_POINTER_CHECKS
-			if(!allLanesLesser(bitOffsets, U16x16(16u))) {
-				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
+			if(!allLanesLesser(bitOffsets, U32x8(32u))) {
+				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
 			}
 		#endif
 		#if defined(USE_AVX2)
-			IMPL_SCALAR_FALLBACK_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
+			IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
 		#else
-			IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
 		#endif
 	}
-	inline U16x16 operator>>(const U16x16& left, const U16x16 &bitOffsets) {
+	inline U32x8 operator>>(const U32x8& left, const U32x8 &bitOffsets) {
+		assert((uintptr_t(&left) & 31u) == 0);
 		#ifdef SAFE_POINTER_CHECKS
-			if(!allLanesLesser(bitOffsets, U16x16(16u))) {
-				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
+			if(!allLanesLesser(bitOffsets, U32x8(32u))) {
+				throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
 			}
 		#endif
 		#if defined(USE_AVX2)
-			IMPL_SCALAR_FALLBACK_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, >>)
+			IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
 		#else
-			IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, >>)
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
 		#endif
 	}
 	// bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
 	template <uint32_t bitOffset>
-	inline U16x16 bitShiftLeftImmediate(const U16x16& left) {
-		static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
+	inline U32x8 bitShiftLeftImmediate(const U32x8& left) {
+		assert((uintptr_t(&left) & 31u) == 0);
+		static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
 		#if defined(USE_AVX2)
-			return U16x16(_mm256_slli_epi16(left.v, bitOffset));
+			return U32x8(_mm256_slli_epi32(left.v, bitOffset));
 		#else
-			U16x16 bitOffsets = U16x16(bitOffset);
-			IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
+			U32x8 bitOffsets = U32x8(bitOffset);
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
 		#endif
 	}
 	// bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
 	template <uint32_t bitOffset>
-	inline U16x16 bitShiftRightImmediate(const U16x16& left) {
-		static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
+	inline U32x8 bitShiftRightImmediate(const U32x8& left) {
+		assert((uintptr_t(&left) & 31u) == 0);
+		static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
 		#if defined(USE_AVX2)
-			return U16x16(_mm256_srli_epi16(left.v, bitOffset));
+			return U32x8(_mm256_srli_epi32(left.v, bitOffset));
 		#else
-			U16x16 bitOffsets = U16x16(bitOffset);
-			IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
+			U32x8 bitOffsets = U32x8(bitOffset);
+			IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
 		#endif
 	}
 

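For the SSE2 fallbacks above there is no per-lane variable shift instruction, so the 128-bit register is shifted as two 64-bit halves and the result is masked so that bits which crossed a 16-bit or 32-bit lane boundary are discarded. A minimal scalar sketch of that masking idea, using four hypothetical 16-bit lanes packed into one uint64_t (names are illustrative and not part of the library):

	#include <cstdint>
	#include <cstdio>

	// Shift four packed 16-bit lanes left by the same offset without letting
	// bits spill from one lane into the next (scalar model of the SSE2 fallback).
	static uint64_t shiftLanesLeft16(uint64_t packedLanes, uint32_t bitOffset) {
		// Shift all 64 bits at once; the low bits of each lane leak into the lane above it.
		uint64_t shifted = packedLanes << bitOffset;
		// Per-lane mask keeping only the bits that a plain 16-bit shift would have kept.
		uint16_t laneMask = uint16_t(uint16_t(~0u) << bitOffset);
		uint64_t mask = 0x0001000100010001ull * laneMask; // Replicate the mask into all four lanes.
		return shifted & mask;
	}

	int main() {
		uint64_t lanes = 0x00FF0F0F12345678ull;
		std::printf("%016llx\n", (unsigned long long)shiftLanesLeft16(lanes, 4));
		// Prints 0ff0f0f023406780: each 16-bit lane shifted left by 4 independently.
		return 0;
	}

A right shift works the same way, shifting the 64-bit halves right and masking with the lane-sized value of ~0 shifted right by the same offset, which is what the U16x8 and U32x4 scalar-offset operators above do.
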
+ 307 - 9
Source/test/tests/SimdTest.cpp

@@ -401,20 +401,318 @@ START_TEST(Simd)
 	  ^ U32x8(0b00101101001110100011010010100001, 0b10101110100101000011101001010011, 0b00101011100101001011000010100100, 0b11010011101001000110010110110111, 0b00111100101000101010001101001010, 0b00101110100110000111110011010101, 0b11001010010101010010110010101000, 0b11110000111100001111000011110000),
 	    U32x8(0b11100001111110010110000000010010, 0b10000101101110101001100100101010, 0b11100001100011110001010110001000, 0b10000100111000010100111000100001, 0b10010010001110001000100110010011, 0b00000000000001010110011001100101, 0b00100000011110110000011100100111, 0b11011010000000011001010101111000));
 
-	// Bit shift with dynamic offset.
-	uint32_t offset = 1;
-	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << U32x4(offset), U32x4(2, 4, 6, 8));
+	// Bit shift with dynamic uniform offset.
+	uint32_t offset = 0;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010));
+	offset = 1;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1000110110010110, 0b1010101101001100, 0b1001000101100110, 0b1101001011001010, 0b1011001100101010, 0b0110011000011100, 0b0100101010010110, 0b0101101100100100));
+	offset = 2;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b0001101100101100, 0b0101011010011000, 0b0010001011001100, 0b1010010110010100, 0b0110011001010100, 0b1100110000111000, 0b1001010100101100, 0b1011011001001000));
+	offset = 3;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b0011011001011000, 0b1010110100110000, 0b0100010110011000, 0b0100101100101000, 0b1100110010101000, 0b1001100001110000, 0b0010101001011000, 0b0110110010010000));
+	offset = 4;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b0110110010110000, 0b0101101001100000, 0b1000101100110000, 0b1001011001010000, 0b1001100101010000, 0b0011000011100000, 0b0101010010110000, 0b1101100100100000));
+	offset = 5;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1101100101100000, 0b1011010011000000, 0b0001011001100000, 0b0010110010100000, 0b0011001010100000, 0b0110000111000000, 0b1010100101100000, 0b1011001001000000));
+	offset = 6;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1011001011000000, 0b0110100110000000, 0b0010110011000000, 0b0101100101000000, 0b0110010101000000, 0b1100001110000000, 0b0101001011000000, 0b0110010010000000));
+	offset = 7;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b0110010110000000, 0b1101001100000000, 0b0101100110000000, 0b1011001010000000, 0b1100101010000000, 0b1000011100000000, 0b1010010110000000, 0b1100100100000000));
+	offset = 8;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1100101100000000, 0b1010011000000000, 0b1011001100000000, 0b0110010100000000, 0b1001010100000000, 0b0000111000000000, 0b0100101100000000, 0b1001001000000000));
+	offset = 9;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1001011000000000, 0b0100110000000000, 0b0110011000000000, 0b1100101000000000, 0b0010101000000000, 0b0001110000000000, 0b1001011000000000, 0b0010010000000000));
+	offset = 10;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b0010110000000000, 0b1001100000000000, 0b1100110000000000, 0b1001010000000000, 0b0101010000000000, 0b0011100000000000, 0b0010110000000000, 0b0100100000000000));
+	offset = 11;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b0101100000000000, 0b0011000000000000, 0b1001100000000000, 0b0010100000000000, 0b1010100000000000, 0b0111000000000000, 0b0101100000000000, 0b1001000000000000));
+	offset = 12;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1011000000000000, 0b0110000000000000, 0b0011000000000000, 0b0101000000000000, 0b0101000000000000, 0b1110000000000000, 0b1011000000000000, 0b0010000000000000));
+	offset = 13;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b0110000000000000, 0b1100000000000000, 0b0110000000000000, 0b1010000000000000, 0b1010000000000000, 0b1100000000000000, 0b0110000000000000, 0b0100000000000000));
+	offset = 14;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1100000000000000, 0b1000000000000000, 0b1100000000000000, 0b0100000000000000, 0b0100000000000000, 0b1000000000000000, 0b1100000000000000, 0b1000000000000000));
+	offset = 15;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset,
+	                  U16x8(0b1000000000000000, 0b0000000000000000, 0b1000000000000000, 0b1000000000000000, 0b1000000000000000, 0b0000000000000000, 0b1000000000000000, 0b0000000000000000));
+	offset = 16;
+	ASSERT_CRASH(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) << offset, U"Tried to shift ");
+	offset = 0;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010));
+	offset = 1;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0110001101100101, 0b0010101011010011, 0b0110010001011001, 0b0011010010110010, 0b0010110011001010, 0b0001100110000111, 0b0101001010100101, 0b0001011011001001));
+	offset = 2;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0011000110110010, 0b0001010101101001, 0b0011001000101100, 0b0001101001011001, 0b0001011001100101, 0b0000110011000011, 0b0010100101010010, 0b0000101101100100));
+	offset = 3;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0001100011011001, 0b0000101010110100, 0b0001100100010110, 0b0000110100101100, 0b0000101100110010, 0b0000011001100001, 0b0001010010101001, 0b0000010110110010));
+	offset = 4;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000110001101100, 0b0000010101011010, 0b0000110010001011, 0b0000011010010110, 0b0000010110011001, 0b0000001100110000, 0b0000101001010100, 0b0000001011011001));
+	offset = 5;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000011000110110, 0b0000001010101101, 0b0000011001000101, 0b0000001101001011, 0b0000001011001100, 0b0000000110011000, 0b0000010100101010, 0b0000000101101100));
+	offset = 6;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000001100011011, 0b0000000101010110, 0b0000001100100010, 0b0000000110100101, 0b0000000101100110, 0b0000000011001100, 0b0000001010010101, 0b0000000010110110));
+	offset = 7;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000110001101, 0b0000000010101011, 0b0000000110010001, 0b0000000011010010, 0b0000000010110011, 0b0000000001100110, 0b0000000101001010, 0b0000000001011011));
+	offset = 8;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000011000110, 0b0000000001010101, 0b0000000011001000, 0b0000000001101001, 0b0000000001011001, 0b0000000000110011, 0b0000000010100101, 0b0000000000101101));
+	offset = 9;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000001100011, 0b0000000000101010, 0b0000000001100100, 0b0000000000110100, 0b0000000000101100, 0b0000000000011001, 0b0000000001010010, 0b0000000000010110));
+	offset = 10;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000000110001, 0b0000000000010101, 0b0000000000110010, 0b0000000000011010, 0b0000000000010110, 0b0000000000001100, 0b0000000000101001, 0b0000000000001011));
+	offset = 11;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000000011000, 0b0000000000001010, 0b0000000000011001, 0b0000000000001101, 0b0000000000001011, 0b0000000000000110, 0b0000000000010100, 0b0000000000000101));
+	offset = 12;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000000001100, 0b0000000000000101, 0b0000000000001100, 0b0000000000000110, 0b0000000000000101, 0b0000000000000011, 0b0000000000001010, 0b0000000000000010));
+	offset = 13;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000000000110, 0b0000000000000010, 0b0000000000000110, 0b0000000000000011, 0b0000000000000010, 0b0000000000000001, 0b0000000000000101, 0b0000000000000001));
+	offset = 14;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000000000011, 0b0000000000000001, 0b0000000000000011, 0b0000000000000001, 0b0000000000000001, 0b0000000000000000, 0b0000000000000010, 0b0000000000000000));
+	offset = 15;
+	ASSERT_EQUAL_SIMD(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset,
+	                  U16x8(0b0000000000000001, 0b0000000000000000, 0b0000000000000001, 0b0000000000000000, 0b0000000000000000, 0b0000000000000000, 0b0000000000000001, 0b0000000000000000));
+	offset = 16;
+	ASSERT_CRASH(U16x8(0b1100011011001011, 0b0101010110100110, 0b1100100010110011, 0b0110100101100101, 0b0101100110010101, 0b0011001100001110, 0b1010010101001011, 0b0010110110010010) >> offset, U"Tried to shift ");
+
+	offset = 0;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010));
+	offset = 1;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10001101100101101010101101001100, 0b10010001011001101101001011001010, 0b10110011001010100110011000011100, 0b01001010100101100101101100100100));
+	offset = 2;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b00011011001011010101011010011000, 0b00100010110011011010010110010100, 0b01100110010101001100110000111000, 0b10010101001011001011011001001000));
+	offset = 3;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b00110110010110101010110100110000, 0b01000101100110110100101100101000, 0b11001100101010011001100001110000, 0b00101010010110010110110010010000));
+	offset = 4;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01101100101101010101101001100000, 0b10001011001101101001011001010000, 0b10011001010100110011000011100000, 0b01010100101100101101100100100000));
+	offset = 5;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b11011001011010101011010011000000, 0b00010110011011010010110010100000, 0b00110010101001100110000111000000, 0b10101001011001011011001001000000));
+	offset = 6;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10110010110101010110100110000000, 0b00101100110110100101100101000000, 0b01100101010011001100001110000000, 0b01010010110010110110010010000000));
+	offset = 7;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01100101101010101101001100000000, 0b01011001101101001011001010000000, 0b11001010100110011000011100000000, 0b10100101100101101100100100000000));
+	offset = 8;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b11001011010101011010011000000000, 0b10110011011010010110010100000000, 0b10010101001100110000111000000000, 0b01001011001011011001001000000000));
+	offset = 9;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10010110101010110100110000000000, 0b01100110110100101100101000000000, 0b00101010011001100001110000000000, 0b10010110010110110010010000000000));
+	offset = 10;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b00101101010101101001100000000000, 0b11001101101001011001010000000000, 0b01010100110011000011100000000000, 0b00101100101101100100100000000000));
+	offset = 11;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01011010101011010011000000000000, 0b10011011010010110010100000000000, 0b10101001100110000111000000000000, 0b01011001011011001001000000000000));
+	offset = 12;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10110101010110100110000000000000, 0b00110110100101100101000000000000, 0b01010011001100001110000000000000, 0b10110010110110010010000000000000));
+	offset = 13;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01101010101101001100000000000000, 0b01101101001011001010000000000000, 0b10100110011000011100000000000000, 0b01100101101100100100000000000000));
+	offset = 14;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b11010101011010011000000000000000, 0b11011010010110010100000000000000, 0b01001100110000111000000000000000, 0b11001011011001001000000000000000));
+	offset = 15;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10101010110100110000000000000000, 0b10110100101100101000000000000000, 0b10011001100001110000000000000000, 0b10010110110010010000000000000000));
+	offset = 16;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01010101101001100000000000000000, 0b01101001011001010000000000000000, 0b00110011000011100000000000000000, 0b00101101100100100000000000000000));
+	offset = 17;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10101011010011000000000000000000, 0b11010010110010100000000000000000, 0b01100110000111000000000000000000, 0b01011011001001000000000000000000));
+	offset = 18;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01010110100110000000000000000000, 0b10100101100101000000000000000000, 0b11001100001110000000000000000000, 0b10110110010010000000000000000000));
+	offset = 19;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10101101001100000000000000000000, 0b01001011001010000000000000000000, 0b10011000011100000000000000000000, 0b01101100100100000000000000000000));
+	offset = 20;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01011010011000000000000000000000, 0b10010110010100000000000000000000, 0b00110000111000000000000000000000, 0b11011001001000000000000000000000));
+	offset = 21;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10110100110000000000000000000000, 0b00101100101000000000000000000000, 0b01100001110000000000000000000000, 0b10110010010000000000000000000000));
+	offset = 22;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01101001100000000000000000000000, 0b01011001010000000000000000000000, 0b11000011100000000000000000000000, 0b01100100100000000000000000000000));
+	offset = 23;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b11010011000000000000000000000000, 0b10110010100000000000000000000000, 0b10000111000000000000000000000000, 0b11001001000000000000000000000000));
+	offset = 24;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10100110000000000000000000000000, 0b01100101000000000000000000000000, 0b00001110000000000000000000000000, 0b10010010000000000000000000000000));
+	offset = 25;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01001100000000000000000000000000, 0b11001010000000000000000000000000, 0b00011100000000000000000000000000, 0b00100100000000000000000000000000));
+	offset = 26;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10011000000000000000000000000000, 0b10010100000000000000000000000000, 0b00111000000000000000000000000000, 0b01001000000000000000000000000000));
+	offset = 27;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b00110000000000000000000000000000, 0b00101000000000000000000000000000, 0b01110000000000000000000000000000, 0b10010000000000000000000000000000));
+	offset = 28;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b01100000000000000000000000000000, 0b01010000000000000000000000000000, 0b11100000000000000000000000000000, 0b00100000000000000000000000000000));
+	offset = 29;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b11000000000000000000000000000000, 0b10100000000000000000000000000000, 0b11000000000000000000000000000000, 0b01000000000000000000000000000000));
+	offset = 30;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b10000000000000000000000000000000, 0b01000000000000000000000000000000, 0b10000000000000000000000000000000, 0b10000000000000000000000000000000));
+	offset = 31;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset,
+	                  U32x4(0b00000000000000000000000000000000, 0b10000000000000000000000000000000, 0b00000000000000000000000000000000, 0b00000000000000000000000000000000));
+	offset = 32;
+	ASSERT_CRASH(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) << offset, U"Tried to shift ");
+	offset = 0;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010));
+	offset = 1;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b01100011011001011010101011010011, 0b01100100010110011011010010110010, 0b00101100110010101001100110000111, 0b01010010101001011001011011001001));
+	offset = 2;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00110001101100101101010101101001, 0b00110010001011001101101001011001, 0b00010110011001010100110011000011, 0b00101001010100101100101101100100));
+	offset = 3;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00011000110110010110101010110100, 0b00011001000101100110110100101100, 0b00001011001100101010011001100001, 0b00010100101010010110010110110010));
+	offset = 4;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00001100011011001011010101011010, 0b00001100100010110011011010010110, 0b00000101100110010101001100110000, 0b00001010010101001011001011011001));
+	offset = 5;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000110001101100101101010101101, 0b00000110010001011001101101001011, 0b00000010110011001010100110011000, 0b00000101001010100101100101101100));
+	offset = 6;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000011000110110010110101010110, 0b00000011001000101100110110100101, 0b00000001011001100101010011001100, 0b00000010100101010010110010110110));
+	offset = 7;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000001100011011001011010101011, 0b00000001100100010110011011010010, 0b00000000101100110010101001100110, 0b00000001010010101001011001011011));
+	offset = 8;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000110001101100101101010101, 0b00000000110010001011001101101001, 0b00000000010110011001010100110011, 0b00000000101001010100101100101101));
+	offset = 9;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000011000110110010110101010, 0b00000000011001000101100110110100, 0b00000000001011001100101010011001, 0b00000000010100101010010110010110));
+	offset = 10;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000001100011011001011010101, 0b00000000001100100010110011011010, 0b00000000000101100110010101001100, 0b00000000001010010101001011001011));
+	offset = 11;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000110001101100101101010, 0b00000000000110010001011001101101, 0b00000000000010110011001010100110, 0b00000000000101001010100101100101));
+	offset = 12;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000011000110110010110101, 0b00000000000011001000101100110110, 0b00000000000001011001100101010011, 0b00000000000010100101010010110010));
+	offset = 13;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000001100011011001011010, 0b00000000000001100100010110011011, 0b00000000000000101100110010101001, 0b00000000000001010010101001011001));
+	offset = 14;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000110001101100101101, 0b00000000000000110010001011001101, 0b00000000000000010110011001010100, 0b00000000000000101001010100101100));
+	offset = 15;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000011000110110010110, 0b00000000000000011001000101100110, 0b00000000000000001011001100101010, 0b00000000000000010100101010010110));
+	offset = 16;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000001100011011001011, 0b00000000000000001100100010110011, 0b00000000000000000101100110010101, 0b00000000000000001010010101001011));
+	offset = 17;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000110001101100101, 0b00000000000000000110010001011001, 0b00000000000000000010110011001010, 0b00000000000000000101001010100101));
+	offset = 18;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000011000110110010, 0b00000000000000000011001000101100, 0b00000000000000000001011001100101, 0b00000000000000000010100101010010));
+	offset = 19;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000001100011011001, 0b00000000000000000001100100010110, 0b00000000000000000000101100110010, 0b00000000000000000001010010101001));
+	offset = 20;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000110001101100, 0b00000000000000000000110010001011, 0b00000000000000000000010110011001, 0b00000000000000000000101001010100));
+	offset = 21;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000011000110110, 0b00000000000000000000011001000101, 0b00000000000000000000001011001100, 0b00000000000000000000010100101010));
+	offset = 22;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000001100011011, 0b00000000000000000000001100100010, 0b00000000000000000000000101100110, 0b00000000000000000000001010010101));
+	offset = 23;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000110001101, 0b00000000000000000000000110010001, 0b00000000000000000000000010110011, 0b00000000000000000000000101001010));
+	offset = 24;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000011000110, 0b00000000000000000000000011001000, 0b00000000000000000000000001011001, 0b00000000000000000000000010100101));
+	offset = 25;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000001100011, 0b00000000000000000000000001100100, 0b00000000000000000000000000101100, 0b00000000000000000000000001010010));
+	offset = 26;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000000110001, 0b00000000000000000000000000110010, 0b00000000000000000000000000010110, 0b00000000000000000000000000101001));
+	offset = 27;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000000011000, 0b00000000000000000000000000011001, 0b00000000000000000000000000001011, 0b00000000000000000000000000010100));
+	offset = 28;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000000001100, 0b00000000000000000000000000001100, 0b00000000000000000000000000000101, 0b00000000000000000000000000001010));
+	offset = 29;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000000000110, 0b00000000000000000000000000000110, 0b00000000000000000000000000000010, 0b00000000000000000000000000000101));
+	offset = 30;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000000000011, 0b00000000000000000000000000000011, 0b00000000000000000000000000000001, 0b00000000000000000000000000000010));
+	offset = 31;
+	ASSERT_EQUAL_SIMD(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset,
+	                  U32x4(0b00000000000000000000000000000001, 0b00000000000000000000000000000001, 0b00000000000000000000000000000000, 0b00000000000000000000000000000001));
+	offset = 32;
+	ASSERT_CRASH(U32x4(0b11000110110010110101010110100110, 0b11001000101100110110100101100101, 0b01011001100101010011001100001110, 0b10100101010010110010110110010010) >> offset, U"Tried to shift ");
+
+	offset = 1;
+	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << offset, U32x4(2, 4, 6, 8));
 	offset = 2;
-	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << U32x4(offset), U32x4(4, 8, 12, 16));
+	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << offset, U32x4(4, 8, 12, 16));
 	offset = 3;
-	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << U32x4(offset), U32x4(8, 16, 24, 32));
+	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << offset, U32x4(8, 16, 24, 32));
 	offset = 4;
-	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << U32x4(offset), U32x4(16, 32, 48, 64));
+	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << offset, U32x4(16, 32, 48, 64));
 	offset = 1;
-	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) >> U32x4(offset), U32x4(0, 1, 1, 2));
-	ASSERT_EQUAL_SIMD(U32x4(2, 4, 6, 8) >> U32x4(offset), U32x4(1, 2, 3, 4));
+	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) >> offset, U32x4(0, 1, 1, 2));
+	ASSERT_EQUAL_SIMD(U32x4(2, 4, 6, 8) >> offset, U32x4(1, 2, 3, 4));
 	offset = 2;
-	ASSERT_EQUAL_SIMD(U32x4(2, 4, 6, 8) >> U32x4(offset), U32x4(0, 1, 1, 2));
+	ASSERT_EQUAL_SIMD(U32x4(2, 4, 6, 8) >> offset, U32x4(0, 1, 1, 2));
 
 	// Bit shift with multiple offsets.
 	ASSERT_EQUAL_SIMD(U32x4(1, 2, 3, 4) << U32x4(0, 3, 1, 2), U32x4(1, 16, 6, 16));