
Implemented 256-bit SIMD vectors for AVX2, along with some general clean-up.

David Piuva 2 years ago
commit 9440208038

+ 0 - 3
Doc/Generator/Input/Manual.txt

@@ -6,9 +6,6 @@ It should not grow obsolete and bloated like OpenGL from not breaking anything,
 This library is entirely statically linked, so that your source code can be saved together with a specific version of the library, a slimmed down modification of the library, or even a mix of different versions.
 By being compiled from source code automatically when making changes to the library, it is easy to mix different versions of the library if something eventually has to be removed to reduce bloat.
 
-SIMD registers are expected to get bigger in the future, but the library does not use any inline assembly, so the old intrinsics will simply be translated into the equivalent machine instructions on the next version of the instruction set.
-If larger SIMD registers become so widely adopted that it can be used as a minimum requirement in the future, the default alignment of Buffer will likely become a multiple of the old alignment to support both new and old SIMD vectors.
-
 Anything with "impl" in the name is expected to change at any time, so don't do hacks with the internals unless you copy the code into your own project or stay with the same version of the library.
 Anything that requires defining DFPSR_INTERNAL_ACCESS before a header is also considered internal.
 

+ 0 - 4
Doc/Manual.html

@@ -31,10 +31,6 @@ It should not grow obsolete and bloated like OpenGL from not breaking anything,
 This library is entirely statically linked, so that your source code can be saved together with a specific version of the library, a slimmed down modification of the library, or even a mix of different versions.
 By being compiled from source code automatically when making changes to the library, it is easy to mix different versions of the library if something eventually has to be removed to reduce bloat.
 
-</P><P>
-SIMD registers are expected to get bigger in the future, but the library does not use any inline assembly, so the old intrinsics will simply be translated into the equivalent machine instructions on the next version of the instruction set.
-If larger SIMD registers become so widely adopted that it can be used as a minimum requirement in the future, the default alignment of Buffer will likely become a multiple of the old alignment to support both new and old SIMD vectors.
-
 </P><P>
 Anything with "impl" in the name is expected to change at any time, so don't do hacks with the internals unless you copy the code into your own project or stay with the same version of the library.
 Anything that requires defining DFPSR_INTERNAL_ACCESS before a header is also considered internal.

+ 3 - 2
Source/DFPSR/api/bufferAPI.cpp

@@ -26,6 +26,7 @@
 #include "bufferAPI.h"
 #include "stringAPI.h"
 #include "../math/scalar.h"
+#include "../base/simd.h"
 
 namespace dsr {
 
@@ -54,8 +55,8 @@ public:
 
 // Internal methods
 
-// buffer_alignment must be a power of two for buffer_alignment_mask to work
-static const int buffer_alignment = 16;
+// Buffers are aligned and padded for the default SIMD vector size, so that vectorization can be efficient.
+static const int buffer_alignment = DSR_DEFAULT_ALIGNMENT;
 
 // If this C++ version additionally includes the C11 features then we may assume that aligned_alloc is available
 #ifdef _ISOC11_SOURCE
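
For orientation, a minimal sketch of what the new constant implies for allocation, assuming DSR_DEFAULT_ALIGNMENT is defined in simd.h as the byte width of the largest enabled SIMD vector (32 bytes with AVX2, 16 bytes otherwise) and that it stays a power of two, as the removed buffer_alignment_mask comment required; the helper name roundedSize is hypothetical:

	// Round a requested size up to a whole number of SIMD vectors, so that
	// vectorized loops can read and write full vectors up to the padded end.
	static int64_t roundedSize(int64_t size) {
		return (size + (DSR_DEFAULT_ALIGNMENT - 1)) & ~int64_t(DSR_DEFAULT_ALIGNMENT - 1);
	}

The masking trick only works for power-of-two alignments, which is why the old comment insisted on it.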

+ 6 - 5
Source/DFPSR/api/imageAPI.cpp

@@ -33,24 +33,25 @@
 #include "../image/internal/imageInternal.h"
 #include "../image/stbImage/stbImageWrapper.h"
 #include "../math/scalar.h"
+#include "../base/simd.h"
 
 using namespace dsr;
 
 // Constructors
 AlignedImageU8 dsr::image_create_U8(int32_t width, int32_t height) {
-	return AlignedImageU8(std::make_shared<ImageU8Impl>(width, height));
+	return AlignedImageU8(std::make_shared<ImageU8Impl>(width, height, DSR_DEFAULT_ALIGNMENT));
 }
 AlignedImageU16 dsr::image_create_U16(int32_t width, int32_t height) {
-	return AlignedImageU16(std::make_shared<ImageU16Impl>(width, height));
+	return AlignedImageU16(std::make_shared<ImageU16Impl>(width, height, DSR_DEFAULT_ALIGNMENT));
 }
 AlignedImageF32 dsr::image_create_F32(int32_t width, int32_t height) {
-	return AlignedImageF32(std::make_shared<ImageF32Impl>(width, height));
+	return AlignedImageF32(std::make_shared<ImageF32Impl>(width, height, DSR_DEFAULT_ALIGNMENT));
 }
 OrderedImageRgbaU8 dsr::image_create_RgbaU8(int32_t width, int32_t height) {
-	return OrderedImageRgbaU8(std::make_shared<ImageRgbaU8Impl>(width, height));
+	return OrderedImageRgbaU8(std::make_shared<ImageRgbaU8Impl>(width, height, DSR_DEFAULT_ALIGNMENT));
 }
 AlignedImageRgbaU8 dsr::image_create_RgbaU8_native(int32_t width, int32_t height, PackOrderIndex packOrderIndex) {
-	return AlignedImageRgbaU8(std::make_shared<ImageRgbaU8Impl>(width, height, packOrderIndex));
+	return AlignedImageRgbaU8(std::make_shared<ImageRgbaU8Impl>(width, height, packOrderIndex, DSR_DEFAULT_ALIGNMENT));
 }
 
 // Loading from data pointer
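
Note that the public constructors keep their signatures; only the internal Image*Impl constructors gain an explicit alignment argument. A caller-side sketch, assuming nothing beyond what the diff shows:

	// The image API is unchanged; the default SIMD alignment is applied internally.
	OrderedImageRgbaU8 canvas = image_create_RgbaU8(640, 480);
	AlignedImageU8 mask = image_create_U8(640, 480);
	// With DSR_DEFAULT_ALIGNMENT at 32 bytes under AVX2, each row can be padded so that
	// 256-bit loops never straddle the end of a row (an assumption about the Aligned*
	// image types, not something shown in this diff).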

File diff suppressed because it is too large
+ 1847 - 527
Source/DFPSR/base/simd.h
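
Since the simd.h diff is suppressed, here is a hedged sketch of the general shape the new 256-bit types appear to take, based only on the names used in SimdTest.cpp (F32x8, U32x8, USE_AVX2); the member layout and the _sketch suffix are assumptions, not the file's actual contents:

	#ifdef USE_AVX2
		#include <immintrin.h>
		// AVX2 path: wrap a native 256-bit register.
		struct F32x8_sketch {
			__m256 v;
			explicit F32x8_sketch(float s) : v(_mm256_set1_ps(s)) {}
			explicit F32x8_sketch(const __m256 &v) : v(v) {}
		};
		inline F32x8_sketch operator+(const F32x8_sketch &a, const F32x8_sketch &b) {
			return F32x8_sketch(_mm256_add_ps(a.v, b.v));
		}
	#else
		// Emulated path: the same interface over eight scalars, as the test comment describes.
		struct F32x8_sketch {
			float v[8];
			explicit F32x8_sketch(float s) { for (int i = 0; i < 8; i++) v[i] = s; }
		};
		inline F32x8_sketch operator+(F32x8_sketch a, const F32x8_sketch &b) {
			for (int i = 0; i < 8; i++) a.v[i] += b.v[i];
			return a;
		}
	#endif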


+ 1 - 1
Source/DFPSR/gui/VisualComponent.cpp

@@ -410,7 +410,7 @@ void VisualComponent::updateIndirectStates() {
 		childStates |= this->children[i]->currentState;
 	}
 	// Direct and indirect inheritance.
-	ComponentState expectedIndirectStates = ((childStates & componentState_direct) << 1) | childStates & componentState_indirect;
+	ComponentState expectedIndirectStates = ((childStates & componentState_direct) << 1) | (childStates & componentState_indirect);
 	this->currentState = (this->currentState & componentState_direct) | expectedIndirectStates;
 }
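
A small aside on the added parentheses: in C++, & already binds tighter than |, so the grouping is unchanged; the explicit parentheses mainly document the intent and may quiet compiler warnings about mixing bitwise operators. A minimal check:

	// & binds tighter than |, so both spellings group the same way.
	static_assert((4 | 2 & 3) == (4 | (2 & 3)), "same grouping");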
 

+ 0 - 1
Source/DFPSR/gui/components/Menu.cpp

@@ -238,7 +238,6 @@ static void closeEntireMenu(VisualComponent* menu) {
 
 void Menu::receiveMouseEvent(const MouseEvent& event) {
 	int childCount = this->getChildCount();
-	IVector2D positionFromParent = event.position;
 	MouseEvent localEvent = event;
 	localEvent.position -= this->location.upperLeft();
 	if (this->showingOverlay() && this->pointIsInsideOfOverlay(event.position)) {

+ 1 - 1
Source/DFPSR/image/ImageF32.h

@@ -35,7 +35,7 @@ public:
 	// Inherit constructors
 	using ImageImpl::ImageImpl;
 	ImageF32Impl(int32_t newWidth, int32_t newHeight, int32_t newStride, Buffer buffer, intptr_t startOffset);
-	ImageF32Impl(int32_t newWidth, int32_t newHeight, int32_t alignment = 16);
+	ImageF32Impl(int32_t newWidth, int32_t newHeight, int32_t alignment);
 	// Macro defined functions
 	IMAGE_DECLARATION(ImageF32Impl, 1, float, float);
 };

+ 4 - 3
Source/DFPSR/image/ImageRgbaU8.cpp

@@ -25,6 +25,7 @@
 #include "internal/imageInternal.h"
 #include "internal/imageTemplate.h"
 #include <algorithm>
+#include "../base/simd.h"
 
 using namespace dsr;
 
@@ -42,7 +43,7 @@ ImageRgbaU8Impl::ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, int32_t al
 }
 
 // Native canvas constructor
-ImageRgbaU8Impl::ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, PackOrderIndex packOrderIndex) :
+ImageRgbaU8Impl::ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, PackOrderIndex packOrderIndex, int32_t alignment) :
   ImageImpl(newWidth, newHeight, roundUp(newWidth * sizeof(Color4xU8), 16), sizeof(Color4xU8)) {
 	this->packOrder = PackOrder::getPackOrder(packOrderIndex);
 	this->initializeRgbaImage();
@@ -62,7 +63,7 @@ ImageRgbaU8Impl ImageRgbaU8Impl::getWithoutPadding() const {
 		return *this;
 	} else {
 		// Copy each row without padding
-		ImageRgbaU8Impl result = ImageRgbaU8Impl(this->width, this->height, this->packOrder.packOrderIndex);
+		ImageRgbaU8Impl result = ImageRgbaU8Impl(this->width, this->height, this->packOrder.packOrderIndex, DSR_DEFAULT_ALIGNMENT);
 		const SafePointer<uint8_t> sourceRow = imageInternal::getSafeData<uint8_t>(*this);
 		int32_t sourceStride = this->stride;
 		SafePointer<uint8_t> targetRow = imageInternal::getSafeData<uint8_t>(result);
@@ -98,7 +99,7 @@ ImageU8Impl ImageRgbaU8Impl::getChannel(int32_t channelIndex) const {
 	// Safety for release mode
 	if (channelIndex < 0) { channelIndex = 0; }
 	if (channelIndex > channelCount) { channelIndex = channelCount; }
-	ImageU8Impl result(this->width, this->height);
+	ImageU8Impl result(this->width, this->height, DSR_DEFAULT_ALIGNMENT);
 	extractChannel(imageInternal::getSafeData<uint8_t>(result), result.stride, imageInternal::getSafeData<uint8_t>(*this), this->stride, channelCount, channelIndex, this->width, this->height);
 	return result;
 }

+ 2 - 2
Source/DFPSR/image/ImageRgbaU8.h

@@ -65,9 +65,9 @@ public:
 	IMAGE_DECLARATION(ImageRgbaU8Impl, 4, Color4xU8, uint8_t);
 	// Constructors
 	ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, int32_t newStride, Buffer buffer, intptr_t startOffset, const PackOrder &packOrder);
-	ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, int32_t alignment = 16);
+	ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, int32_t alignment);
 	// Native canvas constructor
-	ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, PackOrderIndex packOrderIndex);
+	ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, PackOrderIndex packOrderIndex, int32_t alignment);
 	// Fast reading
 	TextureRgba texture; // The texture view
 	void initializeRgbaImage(); // Points to level 0 from all bins to allow rendering

+ 1 - 1
Source/DFPSR/image/ImageU16.h

@@ -36,7 +36,7 @@ public:
 	// Inherit constructors
 	using ImageImpl::ImageImpl;
 	ImageU16Impl(int32_t newWidth, int32_t newHeight, int32_t newStride, Buffer buffer, intptr_t startOffset);
-	ImageU16Impl(int32_t newWidth, int32_t newHeight, int32_t alignment = 16);
+	ImageU16Impl(int32_t newWidth, int32_t newHeight, int32_t alignment);
 	// Macro defined functions
 	IMAGE_DECLARATION(ImageU16Impl, 1, uint16_t, uint16_t);
 };

+ 1 - 1
Source/DFPSR/image/ImageU8.h

@@ -36,7 +36,7 @@ public:
 	using ImageImpl::ImageImpl;
 	// Constructors
 	ImageU8Impl(int32_t newWidth, int32_t newHeight, int32_t newStride, Buffer buffer, intptr_t startOffset);
-	ImageU8Impl(int32_t newWidth, int32_t newHeight, int32_t alignment = 16);
+	ImageU8Impl(int32_t newWidth, int32_t newHeight, int32_t alignment);
 	// Macro defined functions
 	IMAGE_DECLARATION(ImageU8Impl, 1, uint8_t, uint8_t);
 };

+ 2 - 2
Source/DFPSR/image/draw.cpp

@@ -1070,10 +1070,10 @@ static void resize_aux(ImageU8Impl& target, const ImageU8Impl& source, bool inte
 
 // Creates a replacement image with the same pack order as originalImage when applicable to the image format.
 static ImageRgbaU8Impl createWithSamePackOrder(const ImageRgbaU8Impl& originalImage, int32_t width, int32_t height) {
-	return ImageRgbaU8Impl(width, height, originalImage.packOrder.packOrderIndex);
+	return ImageRgbaU8Impl(width, height, originalImage.packOrder.packOrderIndex, DSR_DEFAULT_ALIGNMENT);
 }
 static ImageU8Impl createWithSamePackOrder(const ImageU8Impl& originalImage, int32_t width, int32_t height) {
-	return ImageU8Impl(width, height);
+	return ImageU8Impl(width, height, DSR_DEFAULT_ALIGNMENT);
 }
 
 template <typename IMAGE_TYPE>

+ 3 - 3
Source/DFPSR/render/shader/RgbaMultiply.h

@@ -81,7 +81,7 @@ public:
 		Shader_RgbaMultiply tempShader(triangleInput);
 		tempShader.fillShape(colorBuffer, depthBuffer, triangle, projection, shape, filter);
 	}
-	rgba_F32 getPixels_2x2(const F32x4x3 &vertexWeights) const override {
+	Rgba_F32 getPixels_2x2(const F32x4x3 &vertexWeights) const override {
 		if (HAS_DIFFUSE_MAP && !HAS_LIGHT_MAP && COLORLESS) {
 			// Optimized for diffuse only
 			ALIGN16 F32x4 u1(shaderMethods::interpolate(this->texCoords.u1, vertexWeights));
@@ -98,9 +98,9 @@ public:
 			return shaderMethods::sample_F32<Interpolation::BL, false>(this->lightLayer, u2, v2);
 		} else {
 			// Interpolate the vertex color
-			ALIGN16 rgba_F32 color = HAS_VERTEX_FADING ?
+			ALIGN16 Rgba_F32 color = HAS_VERTEX_FADING ?
 			  shaderMethods::interpolateVertexColor(this->colors.red, this->colors.green, this->colors.blue, this->colors.alpha, vertexWeights) :
-			  rgba_F32(F32x4(this->colors.red.x), F32x4(this->colors.green.x), F32x4(this->colors.blue.x), F32x4(this->colors.alpha.x));
+			  Rgba_F32(F32x4(this->colors.red.x), F32x4(this->colors.green.x), F32x4(this->colors.blue.x), F32x4(this->colors.alpha.x));
 			// Sample diffuse
 			if (HAS_DIFFUSE_MAP) {
 				ALIGN16 F32x4 u1(shaderMethods::interpolate(this->texCoords.u1, vertexWeights));

+ 2 - 2
Source/DFPSR/render/shader/Shader.cpp

@@ -138,7 +138,7 @@ inline static void fillQuadSuper(const Shader& shader, int x, SafePointer<uint32
 			// Get the color
 			ALIGN16 U32x4 packedColor(0u); // Allow uninitialized memory?
 			// Execute the shader
-			ALIGN16 rgba_F32 planarSourceColor = shader.getPixels_2x2(weights);
+			ALIGN16 Rgba_F32 planarSourceColor = shader.getPixels_2x2(weights);
 			// Apply alpha filtering
 			if (FILTER == Filter::Alpha) {
 				// Get opacity from the source color
@@ -146,7 +146,7 @@ inline static void fillQuadSuper(const Shader& shader, int x, SafePointer<uint32
 				// Read the packed colors for alpha blending
 				ALIGN16 U32x4 packedTargetColor = clippedRead<CLIP_SIDES>(pixelDataUpper, pixelDataLower, vis0, vis1, vis2, vis3);
 				// Unpack the target color into planar RGBA format so that it can be mixed with the source color
-				ALIGN16 rgba_F32 planarTargetColor(packedTargetColor, targetPackingOrder);
+				ALIGN16 Rgba_F32 planarTargetColor(packedTargetColor, targetPackingOrder);
 				// Blend linearly using floats
 				planarSourceColor = (planarSourceColor * opacity) + (planarTargetColor * (1.0f - opacity));
 			}

+ 1 - 1
Source/DFPSR/render/shader/Shader.h

@@ -88,7 +88,7 @@ class Shader {
 public:
 	void fillShape(ImageRgbaU8Impl *colorBuffer, ImageF32Impl *depthBuffer, const ITriangle2D &triangle, const Projection &projection, const RowShape &shape, Filter filter);
 	// The main call that defines the pixel shader
-	virtual rgba_F32 getPixels_2x2(const F32x4x3 &vertexWeights) const = 0;
+	virtual Rgba_F32 getPixels_2x2(const F32x4x3 &vertexWeights) const = 0;
 };
 
 }

+ 55 - 55
Source/DFPSR/render/shader/shaderMethods.h

@@ -43,8 +43,8 @@ namespace shaderMethods {
 		return vMA + vMB + vMC;
 	}
 
-	inline rgba_F32 interpolateVertexColor(const FVector3D &red, const FVector3D &green, const FVector3D &blue, const FVector3D &alpha, const F32x4x3 &vertexWeights) {
-		return rgba_F32(
+	inline Rgba_F32 interpolateVertexColor(const FVector3D &red, const FVector3D &green, const FVector3D &blue, const FVector3D &alpha, const F32x4x3 &vertexWeights) {
+		return Rgba_F32(
 		  interpolate(red,   vertexWeights),
 		  interpolate(green, vertexWeights),
 		  interpolate(blue,  vertexWeights),
@@ -87,10 +87,10 @@ namespace shaderMethods {
 
 	inline U32x4 mix_BL(const U32x4 &colorA, const U32x4 &colorB, const U32x4 &colorC, const U32x4 &colorD, const U32x4 &weightX, const U32x4 &weightY) {
 		// Get inverse weights
-		ALIGN16 U16x8 weightXR = repeatAs16Bits(weightX);
-		ALIGN16 U16x8 weightYB = repeatAs16Bits(weightY);
-		ALIGN16 U16x8 weightXL = invertWeight(weightXR);
-		ALIGN16 U16x8 weightYT = invertWeight(weightYB);
+		U16x8 weightXR = repeatAs16Bits(weightX);
+		U16x8 weightYB = repeatAs16Bits(weightY);
+		U16x8 weightXL = invertWeight(weightXR);
+		U16x8 weightYT = invertWeight(weightYB);
 		// Multiply
 		return weightColors(weightColors(colorA, weightXL, colorB, weightXR), weightYT, weightColors(colorC, weightXL, colorD, weightXR), weightYB);
 	}
@@ -98,8 +98,9 @@ namespace shaderMethods {
 	// Single layer sampling methods
 	inline U32x4 sample_U32(const TextureRgbaLayer *source, const U32x4 &col, const U32x4 &row) {
 		#ifdef USE_AVX2
-			ALIGN16 U32x4 pixelOffset((col + (row << (source->strideShift - 2)))); // PixelOffset = Column + Row * PixelStride
-			return U32x4(GATHER_U32_AVX2(source->data, pixelOffset.v, 4));
+			U32x4 pixelOffset((col + (row << (source->strideShift - 2)))); // PixelOffset = Column + Row * PixelStride
+			return U32x4(GATHER_U32x4_AVX2(source->data, pixelOffset.v, 4));
+			// return gather(source->data, pixelOffset.v); TODO: Needs SafePointer, so that this function can use the gather function with automatic emulation instead of hardcoding for AVX2
 		#else
 			UVector4D byteOffset = ((col << 2) + (row << source->strideShift)).get(); // ByteOffset = Column * 4 + Row * ByteStride
 			return U32x4(
@@ -143,70 +144,69 @@ namespace shaderMethods {
 	template<Interpolation INTERPOLATION>
 	inline U32x4 sample_U32(const TextureRgbaLayer *source, const F32x4 &u, const F32x4 &v) {
 		if (INTERPOLATION == Interpolation::BL) {
-			ALIGN16 F32x4 uLow(u + source->halfPixelOffsetU);
-			ALIGN16 F32x4 vLow(v + source->halfPixelOffsetV);
-			ALIGN16 U32x4 subPixLowX(truncateToU32(uLow * source->subWidth)); // SubPixelLowX = ULow * (Width * 256)
-			ALIGN16 U32x4 subPixLowY(truncateToU32(vLow * source->subHeight)); // SubPixelLowY = VLow * (Height * 256)
-			ALIGN16 U32x4 weightX = subPixLowX & 255; // WeightX = SubPixelLowX % 256
-			ALIGN16 U32x4 weightY = subPixLowY & 255; // WeightY = SubPixelLowY % 256
-			ALIGN16 U32x4 pixLowX(subPixLowX >> 8); // PixelLowX = SubPixelLowX / 256
-			ALIGN16 U32x4 pixLowY(subPixLowY >> 8); // PixelLowY = SubPixelLowY / 256
-			ALIGN16 U32x4 wMask(source->widthMask);
-			ALIGN16 U32x4 hMask(source->heightMask);
-			ALIGN16 U32x4 colLow(pixLowX & wMask); // ColumnLow = PixelLowX % Width
-			ALIGN16 U32x4 rowLow(pixLowY & hMask); // RowLow = PixelLowY % Height
-			ALIGN16 U32x4 colHigh(((colLow + 1) & wMask)); // ColumnHigh = (ColumnLow + 1) % Width
-			ALIGN16 U32x4 rowHigh(((rowLow + 1) & hMask)); // RowHigh = (RowLow + 1) % Height
+			F32x4 uLow(u + source->halfPixelOffsetU);
+			F32x4 vLow(v + source->halfPixelOffsetV);
+			U32x4 subPixLowX(truncateToU32(uLow * source->subWidth)); // SubPixelLowX = ULow * (Width * 256)
+			U32x4 subPixLowY(truncateToU32(vLow * source->subHeight)); // SubPixelLowY = VLow * (Height * 256)
+			U32x4 weightX = subPixLowX & 255; // WeightX = SubPixelLowX % 256
+			U32x4 weightY = subPixLowY & 255; // WeightY = SubPixelLowY % 256
+			U32x4 pixLowX(subPixLowX >> 8); // PixelLowX = SubPixelLowX / 256
+			U32x4 pixLowY(subPixLowY >> 8); // PixelLowY = SubPixelLowY / 256
+			U32x4 wMask(source->widthMask);
+			U32x4 hMask(source->heightMask);
+			U32x4 colLow(pixLowX & wMask); // ColumnLow = PixelLowX % Width
+			U32x4 rowLow(pixLowY & hMask); // RowLow = PixelLowY % Height
+			U32x4 colHigh(((colLow + 1) & wMask)); // ColumnHigh = (ColumnLow + 1) % Width
+			U32x4 rowHigh(((rowLow + 1) & hMask)); // RowHigh = (RowLow + 1) % Height
 			// Sample colors in the 4 closest pixels
-			ALIGN16 U32x4 colorA(sample_U32(source, colLow, rowLow));
-			ALIGN16 U32x4 colorB(sample_U32(source, colHigh, rowLow));
-			ALIGN16 U32x4 colorC(sample_U32(source, colLow, rowHigh));
-			ALIGN16 U32x4 colorD(sample_U32(source, colHigh, rowHigh));
+			U32x4 colorA(sample_U32(source, colLow, rowLow));
+			U32x4 colorB(sample_U32(source, colHigh, rowLow));
+			U32x4 colorC(sample_U32(source, colLow, rowHigh));
+			U32x4 colorD(sample_U32(source, colHigh, rowHigh));
 			// Take a weighted average
 			return shaderMethods::mix_BL(colorA, colorB, colorC, colorD, weightX, weightY);
 		} else { // Interpolation::NN or unhandled
-			ALIGN16 U32x4 pixX(truncateToU32(u * source->width)); // PixelX = U * Width
-			ALIGN16 U32x4 pixY(truncateToU32(v * source->height)); // PixelY = V * Height
-			ALIGN16 U32x4 col(pixX & source->widthMask); // Column = PixelX % Width
-			ALIGN16 U32x4 row(pixY & source->heightMask); // Row = PixelY % Height
+			U32x4 pixX(truncateToU32(u * source->width)); // PixelX = U * Width
+			U32x4 pixY(truncateToU32(v * source->height)); // PixelY = V * Height
+			U32x4 col(pixX & source->widthMask); // Column = PixelX % Width
+			U32x4 row(pixY & source->heightMask); // Row = PixelY % Height
 			return sample_U32(source, col, row);
 		}
 	}
 
 	// Precondition: u, v > -0.875f = 1 - (0.5 / minimumMipSize)
 	template<Interpolation INTERPOLATION, bool HIGH_QUALITY>
-	inline rgba_F32 sample_F32(const TextureRgbaLayer *source, const F32x4 &u, const F32x4 &v) {
+	inline Rgba_F32 sample_F32(const TextureRgbaLayer *source, const F32x4 &u, const F32x4 &v) {
 		if (INTERPOLATION == Interpolation::BL) {
 			if (HIGH_QUALITY) { // High quality interpolation
-				ALIGN16 F32x4 uLow(u + source->halfPixelOffsetU);
-				ALIGN16 F32x4 vLow(v + source->halfPixelOffsetV);
-				ALIGN16 F32x4 pixX = uLow * source->width; // PixelX = ULow * Width
-				ALIGN16 F32x4 pixY = vLow * source->height; // PixelY = VLow * Height
+				F32x4 uLow(u + source->halfPixelOffsetU);
+				F32x4 vLow(v + source->halfPixelOffsetV);
+				F32x4 pixX = uLow * source->width; // PixelX = ULow * Width
+				F32x4 pixY = vLow * source->height; // PixelY = VLow * Height
 				// Truncation can be used as floor for positive input
-				ALIGN16 U32x4 pixLowX(truncateToU32(pixX)); // PixelLowX = floor(PixelX)
-				ALIGN16 U32x4 pixLowY(truncateToU32(pixY)); // PixelLowY = floor(PixelY)
-				ALIGN16 U32x4 wMask(source->widthMask);
-				ALIGN16 U32x4 hMask(source->heightMask);
-				ALIGN16 U32x4 colLow(pixLowX & wMask); // ColumnLow = PixelLowX % Width
-				ALIGN16 U32x4 rowLow(pixLowY & hMask); // RowLow = PixelLowY % Height
-				ALIGN16 U32x4 colHigh(((colLow + 1) & wMask)); // ColumnHigh = (ColumnLow + 1) % Width
-				ALIGN16 U32x4 rowHigh(((rowLow + 1) & hMask)); // RowHigh = (RowLow + 1) % Height
+				U32x4 pixLowX(truncateToU32(pixX)); // PixelLowX = floor(PixelX)
+				U32x4 pixLowY(truncateToU32(pixY)); // PixelLowY = floor(PixelY)
+				U32x4 wMask(source->widthMask);
+				U32x4 hMask(source->heightMask);
+				U32x4 colLow(pixLowX & wMask); // ColumnLow = PixelLowX % Width
+				U32x4 rowLow(pixLowY & hMask); // RowLow = PixelLowY % Height
+				U32x4 colHigh(((colLow + 1) & wMask)); // ColumnHigh = (ColumnLow + 1) % Width
+				U32x4 rowHigh(((rowLow + 1) & hMask)); // RowHigh = (RowLow + 1) % Height
 				// Sample colors in the 4 closest pixels
-				ALIGN16 rgba_F32 colorA(rgba_F32(sample_U32(source, colLow, rowLow)));
-				ALIGN16 rgba_F32 colorB(rgba_F32(sample_U32(source, colHigh, rowLow)));
-				ALIGN16 rgba_F32 colorC(rgba_F32(sample_U32(source, colLow, rowHigh)));
-				ALIGN16 rgba_F32 colorD(rgba_F32(sample_U32(source, colHigh, rowHigh)));
-
-				ALIGN16 F32x4 weightX = pixX - floatFromU32(pixLowX);
-				ALIGN16 F32x4 weightY = pixY - floatFromU32(pixLowY);
-				ALIGN16 F32x4 invWeightX = 1.0f - weightX;
-				ALIGN16 F32x4 invWeightY = 1.0f - weightY;
+				Rgba_F32 colorA(Rgba_F32(sample_U32(source, colLow, rowLow)));
+				Rgba_F32 colorB(Rgba_F32(sample_U32(source, colHigh, rowLow)));
+				Rgba_F32 colorC(Rgba_F32(sample_U32(source, colLow, rowHigh)));
+				Rgba_F32 colorD(Rgba_F32(sample_U32(source, colHigh, rowHigh)));
+				F32x4 weightX = pixX - floatFromU32(pixLowX);
+				F32x4 weightY = pixY - floatFromU32(pixLowY);
+				F32x4 invWeightX = 1.0f - weightX;
+				F32x4 invWeightY = 1.0f - weightY;
 				return (colorA * invWeightX + colorB * weightX) * invWeightY + (colorC * invWeightX + colorD * weightX) * weightY;
 			} else { // Fast interpolation
-				return rgba_F32(sample_U32<Interpolation::BL>(source, u, v));
+				return Rgba_F32(sample_U32<Interpolation::BL>(source, u, v));
 			}
 		} else { // Interpolation::NN or unhandled
-			return rgba_F32(sample_U32<Interpolation::NN>(source, u, v));
+			return Rgba_F32(sample_U32<Interpolation::NN>(source, u, v));
 		}
 	}
 
@@ -220,7 +220,7 @@ namespace shaderMethods {
 
 	// Precondition: u, v > -0.875f = 1 - (0.5 / minimumMipSize)
 	template<Interpolation INTERPOLATION, bool HIGH_QUALITY>
-	inline rgba_F32 sample_F32(const TextureRgba *source, const F32x4 &u, const F32x4 &v) {
+	inline Rgba_F32 sample_F32(const TextureRgba *source, const F32x4 &u, const F32x4 &v) {
 		int mipLevel = getMipLevel(source, u, v);
 		return sample_F32<INTERPOLATION, HIGH_QUALITY>(&(source->mips[mipLevel]), u, v);
 	}
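
The sample_U32 change above still hardcodes an AVX2 gather, and the in-line TODO points toward a gather function with automatic emulation. A hedged sketch of what that fallback could look like, using a raw pointer instead of the SafePointer the TODO asks for (the name gather_sketch and the UVector4D member access are assumptions):

	inline U32x4 gather_sketch(const uint32_t *data, const U32x4 &elementOffsets) {
		#ifdef USE_AVX2
			// Scale 4 turns 32-bit element offsets into byte offsets.
			return U32x4(_mm_i32gather_epi32((const int*)data, elementOffsets.v, 4));
		#else
			UVector4D o = elementOffsets.get();
			return U32x4(data[o.x], data[o.y], data[o.z], data[o.w]);
		#endif
	}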

+ 12 - 12
Source/DFPSR/render/shader/shaderTypes.h

@@ -31,22 +31,22 @@
 
 namespace dsr {
 
-struct rgba_F32 {
+struct Rgba_F32 {
 	F32x4 red;
 	F32x4 green;
 	F32x4 blue;
 	F32x4 alpha;
-	explicit rgba_F32(const U32x4 &color) :
+	explicit Rgba_F32(const U32x4 &color) :
 	  red(  floatFromU32(getRed(  color))),
 	  green(floatFromU32(getGreen(color))),
 	  blue( floatFromU32(getBlue( color))),
 	  alpha(floatFromU32(getAlpha(color))) {}
-	rgba_F32(const U32x4 &color, const PackOrder &order) :
+	Rgba_F32(const U32x4 &color, const PackOrder &order) :
 	  red(  floatFromU32(getRed(  color, order))),
 	  green(floatFromU32(getGreen(color, order))),
 	  blue( floatFromU32(getBlue( color, order))),
 	  alpha(floatFromU32(getAlpha(color, order))) {}
-	rgba_F32(const F32x4 &red, const F32x4 &green, const F32x4 &blue, const F32x4 &alpha) : red(red), green(green), blue(blue), alpha(alpha) {}
+	Rgba_F32(const F32x4 &red, const F32x4 &green, const F32x4 &blue, const F32x4 &alpha) : red(red), green(green), blue(blue), alpha(alpha) {}
 	// TODO: Use a template argument for deciding the packing order for external image formats
 	U32x4 toSaturatedByte() const {
 		return floatToSaturatedByte(this->red, this->green, this->blue, this->alpha);
@@ -55,17 +55,17 @@ struct rgba_F32 {
 		return floatToSaturatedByte(this->red, this->green, this->blue, this->alpha, order);
 	}
 };
-inline rgba_F32 operator+(const rgba_F32 &left, const rgba_F32 &right) {
-	return rgba_F32(left.red + right.red, left.green + right.green, left.blue + right.blue, left.alpha + right.alpha);
+inline Rgba_F32 operator+(const Rgba_F32 &left, const Rgba_F32 &right) {
+	return Rgba_F32(left.red + right.red, left.green + right.green, left.blue + right.blue, left.alpha + right.alpha);
 }
-inline rgba_F32 operator-(const rgba_F32 &left, const rgba_F32 &right) {
-	return rgba_F32(left.red - right.red, left.green - right.green, left.blue - right.blue, left.alpha - right.alpha);
+inline Rgba_F32 operator-(const Rgba_F32 &left, const Rgba_F32 &right) {
+	return Rgba_F32(left.red - right.red, left.green - right.green, left.blue - right.blue, left.alpha - right.alpha);
 }
-inline rgba_F32 operator*(const rgba_F32 &left, const rgba_F32 &right) {
-	return rgba_F32(left.red * right.red, left.green * right.green, left.blue * right.blue, left.alpha * right.alpha);
+inline Rgba_F32 operator*(const Rgba_F32 &left, const Rgba_F32 &right) {
+	return Rgba_F32(left.red * right.red, left.green * right.green, left.blue * right.blue, left.alpha * right.alpha);
 }
-inline rgba_F32 operator*(const rgba_F32 &left, const F32x4 &right) {
-	return rgba_F32(left.red * right, left.green * right, left.blue * right, left.alpha * right);
+inline Rgba_F32 operator*(const Rgba_F32 &left, const F32x4 &right) {
+	return Rgba_F32(left.red * right, left.green * right, left.blue * right, left.alpha * right);
 }
 
 }

+ 1 - 1
Source/soundManagers/AlsaSound.cpp

@@ -15,7 +15,7 @@ static SafePointer<int16_t> outputData;
 static SafePointer<float> floatData;
 
 static void allocateBuffers(int neededElements) {
-	int64_t roundedElements = roundUp(neededElements, 8); // Using the same padding for both allow loading two whole SIMD vectors for large input and writing a single output vector.
+	int64_t roundedElements = roundUp(neededElements, 8); // Using the same padding for both allows loading two whole 128-bit SIMD vectors for large input and writing a single output vector.
 	outputBuffer = buffer_create(roundedElements * sizeof(int16_t));
 	floatBuffer = buffer_create(roundedElements * sizeof(float));
 	outputData = buffer_getSafeData<int16_t>(outputBuffer, "Output data");
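
The arithmetic behind the 8-element rounding, as the updated comment describes it (assuming 128-bit vectors):

	8 elements x 4 bytes (float)   = 32 bytes = two 128-bit input vectors
	8 elements x 2 bytes (int16_t) = 16 bytes = one 128-bit output vector

Rounding neededElements up to a multiple of 8 therefore lets the float-to-int16 conversion run in whole vectors with no scalar tail.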

+ 411 - 2
Source/test/tests/SimdTest.cpp

@@ -136,7 +136,7 @@ START_TEST(Simd)
 		}
 	#endif
 
-	// Reinterpret
+	// Reinterpret (Depends on endianness!)
 	ASSERT_EQUAL(U16x8(U32x4(12, 34, 56, 78)), U16x8(12, 0, 34, 0, 56, 0, 78, 0));
 	ASSERT_EQUAL(U16x8(12, 0, 34, 0, 56, 0, 78, 0).get_U32(), U32x4(12, 34, 56, 78));
 
@@ -333,6 +333,7 @@ START_TEST(Simd)
 	ASSERT_EQUAL(vectorExtract_15(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31));
 	ASSERT_EQUAL(vectorExtract_16(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32));
 
+	// TODO: Move SIMD extra into simd.h and implement emulation.
 	#ifdef USE_SIMD_EXTRA
 		SIMD_U32x4 a = U32x4(1, 2, 3, 4).v;
 		SIMD_U32x4 b = U32x4(5, 6, 7, 8).v;
@@ -344,5 +345,413 @@ START_TEST(Simd)
 		ASSERT_EQUAL(U32x4(d), U32x4(1, 5, 2, 6));
 		ASSERT_EQUAL(U32x4(e), U32x4(3, 7, 4, 8));
 	#endif
-END_TEST
 
+	// 256-bit SIMD tests (emulated using scalar operations if the test is not compiled with AVX2 enabled)
+
+	// F32x8 Comparisons
+	ASSERT_EQUAL(F32x8(1.5f), F32x8(1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f));
+	ASSERT_EQUAL(F32x8(-1.5f), F32x8(-1.5f, -1.5f, -1.5f, -1.5f, -1.5f, -1.5f, -1.5f, -1.5f));
+	ASSERT_EQUAL(F32x8(1.2f, 3.4f, 5.6f, 7.8f, -2.4f, 452.351f, 1000000.0f, -1000.0f), F32x8(1.2f, 3.4f, 5.6f, 7.8f, -2.4f, 452.351f, 1000000.0f, -1000.0f));
+	ASSERT_NOT_EQUAL(F32x8(1.3f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f), F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f));
+	ASSERT_NOT_EQUAL(F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f), F32x8(1.2f, -1.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f));
+	ASSERT_NOT_EQUAL(F32x8(1.2f, 3.4f, 5.5f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f), F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f));
+	ASSERT_NOT_EQUAL(F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f), F32x8(1.2f, 3.4f, 5.6f, -7.8f, 5.3f, 6.7f, 1.4f, -5.2f));
+	ASSERT_NOT_EQUAL(F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f), F32x8(1.2f, 3.4f, 5.6f, 7.8f, 0.0f, 6.7f, 1.4f, -5.2f));
+	ASSERT_NOT_EQUAL(F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f), F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.69f, 1.4f, -5.2f));
+	ASSERT_NOT_EQUAL(F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f), F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.3f, -5.2f));
+	ASSERT_NOT_EQUAL(F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, -5.2f), F32x8(1.2f, 3.4f, 5.6f, 7.8f, 5.3f, 6.7f, 1.4f, 5.2f));
+
+	// I32x8 Comparisons
+	ASSERT_EQUAL(I32x8(4), I32x8(4, 4, 4, 4, 4, 4, 4, 4));
+	ASSERT_EQUAL(I32x8(-4), I32x8(-4, -4, -4, -4, -4, -4, -4, -4));
+	ASSERT_EQUAL(I32x8(-1, 2, -3, 4, -5, 6, -7, 8), I32x8(-1, 2, -3, 4, -5, 6, -7, 8));
+	ASSERT_NOT_EQUAL(I32x8(-1, 2, 7, 4, 8, 3, 5, 45), I32x8(-1, 2, -3, 4, 8, 3, 5, 45));
+
+	// U32x8 Comparisons
+	ASSERT_EQUAL(U32x8(4), U32x8(4, 4, 4, 4, 4, 4, 4, 4));
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8), U32x8(1, 2, 3, 4, 5, 6, 7, 8));
+	ASSERT_NOT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 12, 8), U32x8(1, 2, 3, 4, 5, 6, 7, 8));
+
+	// U16x16 Comparisons
+	ASSERT_EQUAL(U16x16((uint16_t)8), U16x16(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8));
+	ASSERT_EQUAL(U16x16((uint32_t)8), U16x16(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0));
+	ASSERT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 0, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9,  0, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  0, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  0, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,  0, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,  0, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,  0, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  0), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 2, 0, 4, 5, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(1, 0, 3, 4, 5, 6, 0, 0, 9, 10, 11, 12, 13,  0, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(0, 2, 3, 4, 0, 6, 7, 8, 9, 10, 11, 0,  13, 14, 15, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_NOT_EQUAL(U16x16(0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 11, 0,  13, 14,  0, 16), U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+
+	// U8x32 Comparisons
+	ASSERT_EQUAL(U8x32((uint8_t)250), U8x32(250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250));
+	ASSERT_NOT_EQUAL(U8x32((uint8_t)250), U8x32(250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 100, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250));
+	ASSERT_NOT_EQUAL(U8x32((uint8_t)250), U8x32(0, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250));
+	ASSERT_NOT_EQUAL(U8x32((uint8_t)250), U8x32(250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 0));
+
+	// Reinterpret (Depends on endianness!)
+	ASSERT_EQUAL(U16x16(U32x8(12, 34, 56, 78, 11, 22, 33, 44)), U16x16(12, 0, 34, 0, 56, 0, 78, 0, 11, 0, 22, 0, 33, 0, 44, 0));
+	ASSERT_EQUAL(U16x16(U32x8(12, 34, 56, 78, 11, 22, 33, 131116)), U16x16(12, 0, 34, 0, 56, 0, 78, 0, 11, 0, 22, 0, 33, 0, 44, 2));
+	ASSERT_EQUAL(U16x16(12, 0, 34, 0, 56, 0, 78, 0, 11, 0, 22, 0, 33, 0, 44, 2).get_U32(), U32x8(12, 34, 56, 78, 11, 22, 33, 131116));
+
+	// Reciprocal: 1 / x
+	ASSERT_EQUAL(F32x8(0.5f, 1.0f, 2.0f, 4.0f, 8.0f, 10.0f, 100.0f, 1000.0f).reciprocal(), F32x8(2.0f, 1.0f, 0.5f, 0.25f, 0.125f, 0.1f, 0.01f, 0.001f));
+
+	// Square root: sqrt(x)
+	ASSERT_EQUAL(F32x8(1.0f, 4.0f, 9.0f, 100.0f, 64.0f, 256.0f, 1024.0f, 4096.0f).squareRoot(), F32x8(1.0f, 2.0f, 3.0f, 10.0f, 8.0f, 16.0f, 32.0f, 64.0f));
+
+	// Reciprocal square root: 1 / sqrt(x)
+	ASSERT_EQUAL(F32x8(1.0f, 4.0f, 16.0f, 100.0f, 400.0f, 64.0f, 25.0f, 100.0f).reciprocalSquareRoot(), F32x8(1.0f, 0.5f, 0.25f, 0.1f, 0.05f, 0.125f, 0.2f, 0.1f));
+
+	// Minimum
+	ASSERT_EQUAL(min(F32x8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f), F32x8(5.0f, 3.0f, 1.0f, -1.0f, 4.0f, 5.0f, -2.5f, 10.0f)), F32x8(1.1f, 2.2f, 1.0f, -1.0f, 4.0f, 5.0f, -2.5f, 8.8f));
+
+	// Maximum
+	ASSERT_EQUAL(max(F32x8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f), F32x8(5.0f, 3.0f, 1.0f, -1.0f, 4.0f, 5.0f, -2.5f, 10.0f)), F32x8(5.0f, 3.0f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 10.0f));
+
+	// Clamp
+	ASSERT_EQUAL(F32x8(-35.1f, 1.0f, 2.0f, 45.7f, 0.0f, -1.0f, 2.1f, -1.9f).clamp(-1.5f, 1.5f), F32x8(-1.5f, 1.0f, 1.5f, 1.5f, 0.0f, -1.0f, 1.5f, -1.5f));
+
+	// F32x8 operations
+	ASSERT_EQUAL(F32x8(1.1f, -2.2f, 3.3f, 4.0f, 1.4f, 2.3f, 3.2f, 4.1f) + F32x8(2.2f, -4.4f, 6.6f, 8.0f, 4.11f, 3.22f, 2.33f, 1.44f), F32x8(3.3f, -6.6f, 9.9f, 12.0f, 5.51f, 5.52f, 5.53f, 5.54f));
+	ASSERT_EQUAL(F32x8(-1.5f, -0.5f, 0.5f, 1.5f, 1000.0f, 2000.0f, -4000.0f, -1500.0f) + 1.0f, F32x8(-0.5f, 0.5f, 1.5f, 2.5f, 1001.0f, 2001.0f, -3999.0f, -1499.0f));
+	ASSERT_EQUAL(1.0f + F32x8(-1.5f, -0.5f, 0.5f, 1.5f, 1000.0f, 2000.0f, -4000.0f, -1500.0f), F32x8(-0.5f, 0.5f, 1.5f, 2.5f, 1001.0f, 2001.0f, -3999.0f, -1499.0f));
+	ASSERT_EQUAL(F32x8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f) - F32x8(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f), F32x8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
+	ASSERT_EQUAL(F32x8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f) - 0.5f, F32x8(0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f));
+	ASSERT_EQUAL(0.5f - F32x8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f), F32x8(-0.5f, -1.5f, -2.5f, -3.5f, -4.5f, -5.5f, -6.5f, -7.5f));
+	ASSERT_EQUAL(2.0f * F32x8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f), F32x8(2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f, 14.0f, 16.0f));
+	ASSERT_EQUAL(F32x8(1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f) * -2.0f, F32x8(-2.0f, 4.0f, -6.0f, 8.0f, -10.0f, 12.0f, -14.0f, 16.0f));
+	ASSERT_EQUAL(F32x8(1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f) * F32x8(1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f), F32x8(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f));
+	ASSERT_EQUAL(-F32x8(1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f), F32x8(-1.0f, 2.0f, -3.0f, 4.0f, -5.0f, 6.0f, -7.0f, 8.0f));
+
+	// I32x8 operations
+	ASSERT_EQUAL(I32x8(1, 2, 3, 4, 5, 6, 7, 8) - 1, I32x8(0, 1, 2, 3, 4, 5, 6, 7));
+	ASSERT_EQUAL(1 - I32x8(1, 2, 3, 4, 5, 6, 7, 8), I32x8(0, -1, -2, -3, -4, -5, -6, -7));
+	ASSERT_EQUAL(2 * I32x8(1, 2, 3, 4, 5, 6, 7, 8), I32x8(2, 4, 6, 8, 10, 12, 14, 16));
+	ASSERT_EQUAL(I32x8(1, -2, 3, -4, 5, -6, 7, -8) * -2, I32x8(-2, 4, -6, 8, -10, 12, -14, 16));
+	ASSERT_EQUAL(I32x8(1, -2, 3, -4, 5, -6, 7, -8) * I32x8(1, -2, 3, -4, 5, -6, 7, -8), I32x8(1, 4, 9, 16, 25, 36, 49, 64));
+	ASSERT_EQUAL(-I32x8(1, -2, 3, -4, 5, -6, 7, -8), I32x8(-1, 2, -3, 4, -5, 6, -7, 8));
+
+	// U32x8 operations
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8) - 1, U32x8(0, 1, 2, 3, 4, 5, 6, 7));
+	ASSERT_EQUAL(10 - U32x8(1, 2, 3, 4, 5, 6, 7, 8), U32x8(9, 8, 7, 6, 5, 4, 3, 2));
+	ASSERT_EQUAL(2 * U32x8(1, 2, 3, 4, 5, 6, 7, 8), U32x8(2, 4, 6, 8, 10, 12, 14, 16));
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8) * 2, U32x8(2, 4, 6, 8, 10, 12, 14, 16));
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8) * U32x8(1, 2, 3, 4, 5, 6, 7, 8), U32x8(1, 4, 9, 16, 25, 36, 49, 64));
+
+	// U16x16 operations
+	ASSERT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + U16x16(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32), U16x16(3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48));
+	ASSERT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + 8, U16x16(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24));
+	ASSERT_EQUAL(8 + U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24));
+	ASSERT_EQUAL(U16x16(3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48) - U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32));
+	ASSERT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) - 1, U16x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
+	ASSERT_EQUAL(16 - U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+	ASSERT_EQUAL(U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) * 2, U16x16(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32));
+	ASSERT_EQUAL(2 * U16x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U16x16(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32));
+
+	// U8x32 operations
+	ASSERT_EQUAL(
+	      U8x32( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)
+	    + U8x32( 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64),
+	      U8x32( 3,  6,  9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96));
+	ASSERT_EQUAL(
+	      U8x32( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32) + 5,
+	      U8x32( 6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37));
+	ASSERT_EQUAL(
+	  5 + U8x32( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32),
+	      U8x32( 6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37));
+	ASSERT_EQUAL(
+	      U8x32( 3,  6,  9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96)
+	    - U8x32( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32),
+	      U8x32( 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64));
+	ASSERT_EQUAL(
+	      U8x32( 6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37) - 5,
+	      U8x32( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32));
+	ASSERT_EQUAL(
+	 33 - U8x32( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32),
+	      U8x32(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1));
+	ASSERT_EQUAL(
+	  saturatedAddition(
+	    U8x32(  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,255),
+	    U8x32((uint8_t)240)),
+	    U8x32(241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255)
+	);
+	ASSERT_EQUAL(
+	  saturatedSubtraction(
+	    U8x32(  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,255),
+	    U8x32((uint8_t)16)),
+	    U8x32(  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,239)
+	);
+
+	// Saturated unsigned integer packing
+	ASSERT_EQUAL(saturateToU8(
+	  U16x16(1, 2, 3, 4, 65535, 6, 7, 8, 9, 10, 11, 12, 1000, 14, 15, 16), U16x16(17, 18, 19, 20, 21, 22, 23, 65535, 25, 26, 27, 28, 29, 30, 31, 32)),
+	  U8x32( 1, 2, 3, 4, 255,   6, 7, 8, 9, 10, 11, 12,  255, 14, 15, 16,         17, 18, 19, 20, 21, 22, 23,   255, 25, 26, 27, 28, 29, 30, 31, 32));
+
+	// Unsigned integer unpacking
+	ASSERT_EQUAL(lowerToU32(U16x16(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)), U32x8(1,2,3,4,5,6,7,8));
+	ASSERT_EQUAL(higherToU32(U16x16(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)), U32x8(9,10,11,12,13,14,15,16));
+	ASSERT_EQUAL(lowerToU32(U16x16(1,2,3,4,5,6,65535,8,9,10,11,12,13,1000,15,16)), U32x8(1,2,3,4,5,6,65535,8));
+	ASSERT_EQUAL(higherToU32(U16x16(1,2,3,4,5,6,65535,8,9,10,11,12,13,1000,15,16)), U32x8(9,10,11,12,13,1000,15,16));
+	ASSERT_EQUAL(lowerToU16(U8x32(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,255,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,255)), U16x16(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,255));
+	ASSERT_EQUAL(higherToU16(U8x32(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,255,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,255)), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,255));
+
+	// Bitwise operations
+	ASSERT_EQUAL(
+	    U32x8(0xFFFFFFFF, 0x12345678, 0xF0F0F0F0, 0x00000000, 0xEEEEEEEE, 0x87654321, 0x0F0F0F0F, 0x00010001)
+	  & 0x0000FFFF,
+	    U32x8(0x0000FFFF, 0x00005678, 0x0000F0F0, 0x00000000, 0x0000EEEE, 0x00004321, 0x00000F0F, 0x00000001));
+	ASSERT_EQUAL(
+	    U32x8(0xFFFFFFFF, 0x12345678, 0xF0F0F0F0, 0x00000000, 0xEEEEEEEE, 0x87654321, 0x0F0F0F0F, 0x00010001)
+	  & 0xFFFF0000,
+	    U32x8(0xFFFF0000, 0x12340000, 0xF0F00000, 0x00000000, 0xEEEE0000, 0x87650000, 0x0F0F0000, 0x00010000));
+	ASSERT_EQUAL(
+	    U32x8(0xFFFFFFFF, 0x12345678, 0xF0F0F0F0, 0x00000000, 0xEEEEEEEE, 0x87654321, 0x0F0F0F0F, 0x00010001)
+	  | 0x0000FFFF,
+	    U32x8(0xFFFFFFFF, 0x1234FFFF, 0xF0F0FFFF, 0x0000FFFF, 0xEEEEFFFF, 0x8765FFFF, 0x0F0FFFFF, 0x0001FFFF));
+	ASSERT_EQUAL(
+	    U32x8(0xFFFFFFFF, 0x12345678, 0xF0F0F0F0, 0x00000000, 0xEEEEEEEE, 0x87654321, 0x0F0F0F0F, 0x00010001)
+	  | 0xFFFF0000,
+	    U32x8(0xFFFFFFFF, 0xFFFF5678, 0xFFFFF0F0, 0xFFFF0000, 0xFFFFEEEE, 0xFFFF4321, 0xFFFF0F0F, 0xFFFF0001));
+	ASSERT_EQUAL(
+	    U32x8(0xFFFFFFFF, 0xFFF000FF, 0xF0F0F0F0, 0x12345678, 0xEEEEEEEE, 0x87654321, 0x0F0F0F0F, 0x00010001)
+	  & U32x8(0xFF00FF00, 0xFFFF0000, 0x000FF000, 0x0FF00FF0, 0xF00FF00F, 0x00FFFF00, 0xF0F0F0F0, 0x0000FFFF),
+	    U32x8(0xFF00FF00, 0xFFF00000, 0x0000F000, 0x02300670, 0xE00EE00E, 0x00654300, 0x00000000, 0x00000001));
+	ASSERT_EQUAL(
+	    U32x8(0xFFFFFFFF, 0xFFF000FF, 0xF0F0F0F0, 0x12345678, 0xEEEEEEEE, 0x87654321, 0x0F0F0F0F, 0x00010001)
+	  | U32x8(0xFF00FF00, 0xFFFF0000, 0x000FF000, 0x0FF00FF0, 0xF00FF00F, 0x00FFFF00, 0xF0F0F0F0, 0x0000FFFF),
+	    U32x8(0xFFFFFFFF, 0xFFFF00FF, 0xF0FFF0F0, 0x1FF45FF8, 0xFEEFFEEF, 0x87FFFF21, 0xFFFFFFFF, 0x0001FFFF));
+	ASSERT_EQUAL(
+	    U32x8(0b11001100110000110101010010110011, 0b00101011001011101010001101111001, 0b11001010000110111010010100101100, 0b01010111010001010010101110010110, 0b10101110100110100010101011011001, 0b00101110100111010001101010110000, 0b11101010001011100010101110001111, 0b00101010111100010110010110001000)
+	  ^ U32x8(0b00101101001110100011010010100001, 0b10101110100101000011101001010011, 0b00101011100101001011000010100100, 0b11010011101001000110010110110111, 0b00111100101000101010001101001010, 0b00101110100110000111110011010101, 0b11001010010101010010110010101000, 0b11110000111100001111000011110000),
+	    U32x8(0b11100001111110010110000000010010, 0b10000101101110101001100100101010, 0b11100001100011110001010110001000, 0b10000100111000010100111000100001, 0b10010010001110001000100110010011, 0b00000000000001010110011001100101, 0b00100000011110110000011100100111, 0b11011010000000011001010101111000));
+
+	// Bit shift
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8) << 1, U32x8( 2, 4, 6, 8,10,12,14,16));
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8) << 2, U32x8( 4, 8,12,16,20,24,28,32));
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8) << 3, U32x8( 8,16,24,32,40,48,56,64));
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8) << 4, U32x8(16,32,48,64,80,96,112,128));
+	ASSERT_EQUAL(U32x8(1, 2, 3, 4, 5, 6, 7, 8) >> 1, U32x8( 0, 1, 1, 2, 2, 3, 3, 4));
+	ASSERT_EQUAL(U32x8(2, 4, 6, 8, 10, 12, 14, 16) >> 1, U32x8(1, 2, 3, 4, 5, 6, 7, 8));
+	ASSERT_EQUAL(U32x8(2, 4, 6, 8, 10, 12, 14, 16) >> 2, U32x8(0, 1, 1, 2, 2, 3, 3, 4));
+	ASSERT_EQUAL(
+	    U32x8(0x0AB12CD0, 0xFFFFFFFF, 0x12345678, 0xF0000000, 0x87654321, 0x48484848, 0x76437643, 0x11111111)
+	 << 4,
+	    U32x8(0xAB12CD00, 0xFFFFFFF0, 0x23456780, 0x00000000, 0x76543210, 0x84848480, 0x64376430, 0x11111110));
+	ASSERT_EQUAL(
+	    U32x8(0x0AB12CD0, 0xFFFFFFFF, 0x12345678, 0x0000000F, 0x87654321, 0x48484848, 0x76437643, 0x11111111)
+	 >> 4,
+	    U32x8(0x00AB12CD, 0x0FFFFFFF, 0x01234567, 0x00000000, 0x08765432, 0x04848484, 0x07643764, 0x01111111));
+
+	// Element shift with insert
+	ASSERT_EQUAL(vectorExtract_0(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                             U32x8( 1, 2, 3, 4, 5, 6, 7, 8));
+	ASSERT_EQUAL(vectorExtract_1(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                                U32x8( 2, 3, 4, 5, 6, 7, 8,         9));
+	ASSERT_EQUAL(vectorExtract_2(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                                   U32x8( 3, 4, 5, 6, 7, 8,         9,10));
+	ASSERT_EQUAL(vectorExtract_3(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                                      U32x8( 4, 5, 6, 7, 8,         9,10,11));
+	ASSERT_EQUAL(vectorExtract_4(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                                         U32x8( 5, 6, 7, 8,         9,10,11,12));
+	ASSERT_EQUAL(vectorExtract_5(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                                            U32x8( 6, 7, 8,         9,10,11,12,13));
+	ASSERT_EQUAL(vectorExtract_6(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                                               U32x8( 7, 8,         9,10,11,12,13,14));
+	ASSERT_EQUAL(vectorExtract_7(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                                                  U32x8( 8,         9,10,11,12,13,14,15));
+	ASSERT_EQUAL(vectorExtract_8(U32x8( 1, 2, 3, 4, 5, 6, 7, 8), U32x8( 9,10,11,12,13,14,15,16)),
+	                                                             U32x8( 9,10,11,12,13,14,15,16));
+	ASSERT_EQUAL(vectorExtract_5(U32x8( 1, 2, 3, 4, 5, 6, 7, 4294967295), U32x8( 9,10,11,1000,13,14,15,16)),
+	                                            U32x8( 6, 7, 4294967295,         9,10,11,1000,13));
+	ASSERT_EQUAL(vectorExtract_0(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                             I32x8( 1,-2, 3, 4,-5, 6, 7, 8));
+	ASSERT_EQUAL(vectorExtract_1(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                                I32x8(-2, 3, 4,-5, 6, 7, 8,         9));
+	ASSERT_EQUAL(vectorExtract_2(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                                   I32x8( 3, 4,-5, 6, 7, 8,         9,10));
+	ASSERT_EQUAL(vectorExtract_3(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                                      I32x8( 4,-5, 6, 7, 8,         9,10,11));
+	ASSERT_EQUAL(vectorExtract_4(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                                         I32x8(-5, 6, 7, 8,         9,10,11,-12));
+	ASSERT_EQUAL(vectorExtract_5(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                                            I32x8( 6, 7, 8,         9,10,11,-12,13));
+	ASSERT_EQUAL(vectorExtract_6(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                                               I32x8( 7, 8,         9,10,11,-12,13,14));
+	ASSERT_EQUAL(vectorExtract_7(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                                                  I32x8( 8,         9,10,11,-12,13,14,15));
+	ASSERT_EQUAL(vectorExtract_8(I32x8( 1,-2, 3, 4,-5, 6, 7, 8), I32x8( 9,10,11,-12,13,14,15,-16)),
+	                                                             I32x8( 9,10,11,-12,13,14,15,-16));
+	ASSERT_EQUAL(vectorExtract_0(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                             F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f));
+	ASSERT_EQUAL(vectorExtract_1(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                                  F32x8( -2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f,         9.0f));
+	ASSERT_EQUAL(vectorExtract_2(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                                         F32x8( 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f,         9.0f, 10.0f));
+	ASSERT_EQUAL(vectorExtract_3(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                                               F32x8( 4.0f,-5.0f, 6.0f, 7.0f, 8.0f,         9.0f, 10.0f, 11.0f));
+	ASSERT_EQUAL(vectorExtract_4(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                                                     F32x8(-5.0f, 6.0f, 7.0f, 8.0f,         9.0f, 10.0f, 11.0f,-12.0f));
+	ASSERT_EQUAL(vectorExtract_5(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                                                           F32x8( 6.0f, 7.0f, 8.0f,         9.0f, 10.0f, 11.0f,-12.0f, 13.0f));
+	ASSERT_EQUAL(vectorExtract_6(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                                                                 F32x8( 7.0f, 8.0f,         9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f));
+	ASSERT_EQUAL(vectorExtract_7(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                                                                       F32x8( 8.0f,         9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f));
+	ASSERT_EQUAL(vectorExtract_8(F32x8( 1.1f,-2.2f, 3.0f, 4.0f,-5.0f, 6.0f, 7.0f, 8.0f), F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f)),
+	                                                                                     F32x8( 9.0f, 10.0f, 11.0f,-12.0f, 13.0f, 14.0f, 15.0f,-16.0f));
+	ASSERT_EQUAL(vectorExtract_0 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                             U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16));
+	ASSERT_EQUAL(vectorExtract_1 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                U16x16( 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,         17));
+	ASSERT_EQUAL(vectorExtract_2 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                   U16x16( 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,         17,18));
+	ASSERT_EQUAL(vectorExtract_3 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                      U16x16( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,         17,18,19));
+	ASSERT_EQUAL(vectorExtract_4 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                         U16x16( 5, 6, 7, 8, 9,10,11,12,13,14,15,16,         17,18,19,20));
+	ASSERT_EQUAL(vectorExtract_5 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                            U16x16( 6, 7, 8, 9,10,11,12,13,14,15,16,         17,18,19,20,21));
+	ASSERT_EQUAL(vectorExtract_6 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                               U16x16( 7, 8, 9,10,11,12,13,14,15,16,         17,18,19,20,21,22));
+	ASSERT_EQUAL(vectorExtract_7 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                  U16x16( 8, 9,10,11,12,13,14,15,16,         17,18,19,20,21,22,23));
+	ASSERT_EQUAL(vectorExtract_8 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                     U16x16( 9,10,11,12,13,14,15,16,         17,18,19,20,21,22,23,24));
+	ASSERT_EQUAL(vectorExtract_9 (U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                        U16x16(10,11,12,13,14,15,16,         17,18,19,20,21,22,23,24,25));
+	ASSERT_EQUAL(vectorExtract_10(U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                            U16x16(11,12,13,14,15,16,         17,18,19,20,21,22,23,24,25,26));
+	ASSERT_EQUAL(vectorExtract_11(U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                               U16x16(12,13,14,15,16,         17,18,19,20,21,22,23,24,25,26,27));
+	ASSERT_EQUAL(vectorExtract_12(U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                                  U16x16(13,14,15,16,         17,18,19,20,21,22,23,24,25,26,27,28));
+	ASSERT_EQUAL(vectorExtract_13(U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                                     U16x16(14,15,16,         17,18,19,20,21,22,23,24,25,26,27,28,29));
+	ASSERT_EQUAL(vectorExtract_14(U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                                        U16x16(15,16,         17,18,19,20,21,22,23,24,25,26,27,28,29,30));
+	ASSERT_EQUAL(vectorExtract_15(U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                                           U16x16(16,         17,18,19,20,21,22,23,24,25,26,27,28,29,30,31));
+	ASSERT_EQUAL(vectorExtract_16(U16x16( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16), U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),
+	                                                                                       U16x16(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32));
+	ASSERT_EQUAL(vectorExtract_0(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                             U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32));
+	ASSERT_EQUAL(vectorExtract_1 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32( 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33));
+	ASSERT_EQUAL(vectorExtract_2 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32( 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34));
+	ASSERT_EQUAL(vectorExtract_3 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35));
+	ASSERT_EQUAL(vectorExtract_4 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32( 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36));
+	ASSERT_EQUAL(vectorExtract_5 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32( 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37));
+	ASSERT_EQUAL(vectorExtract_6 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32( 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38));
+	ASSERT_EQUAL(vectorExtract_7 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32( 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39));
+	ASSERT_EQUAL(vectorExtract_8 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32( 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40));
+	ASSERT_EQUAL(vectorExtract_9 (U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41));
+	ASSERT_EQUAL(vectorExtract_10(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42));
+	ASSERT_EQUAL(vectorExtract_11(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43));
+	ASSERT_EQUAL(vectorExtract_12(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44));
+	ASSERT_EQUAL(vectorExtract_13(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45));
+	ASSERT_EQUAL(vectorExtract_14(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46));
+	ASSERT_EQUAL(vectorExtract_15(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47));
+	ASSERT_EQUAL(vectorExtract_16(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48));
+	ASSERT_EQUAL(vectorExtract_17(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49));
+	ASSERT_EQUAL(vectorExtract_18(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50));
+	ASSERT_EQUAL(vectorExtract_19(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51));
+	ASSERT_EQUAL(vectorExtract_20(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52));
+	ASSERT_EQUAL(vectorExtract_21(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53));
+	ASSERT_EQUAL(vectorExtract_22(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54));
+	ASSERT_EQUAL(vectorExtract_23(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55));
+	ASSERT_EQUAL(vectorExtract_24(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56));
+	ASSERT_EQUAL(vectorExtract_25(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57));
+	ASSERT_EQUAL(vectorExtract_26(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58));
+	ASSERT_EQUAL(vectorExtract_27(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59));
+	ASSERT_EQUAL(vectorExtract_28(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60));
+	ASSERT_EQUAL(vectorExtract_29(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61));
+	ASSERT_EQUAL(vectorExtract_30(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62));
+	ASSERT_EQUAL(vectorExtract_31(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63));
+	ASSERT_EQUAL(vectorExtract_32(U8x32( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32), U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64)),
+	                              U8x32(33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64));
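+	// The vectorExtract_N assertions above verify that the result begins at element N of the first input and
+	// continues into the second input, as if both inputs were concatenated and one full vector was read from offset N.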
+	{ // Gather test
+		// The Buffer must be kept alive during the pointer's lifetime, because the reference counted memory would otherwise be freed too early.
+		//   A SafePointer does not keep the Buffer alive; it exists only to be faster than Buffer while safer than a raw pointer.
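+		//   As a purely hypothetical sketch of the mistake this avoids (not executed here), taking a pointer from a temporary Buffer:
+		//     SafePointer<float> dangling = buffer_getSafeData<float>(buffer_create(sizeof(float) * 32), "temporary");
+		//   would leave dangling pointing at memory that is released as soon as the temporary Buffer is discarded.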
+		Buffer gatherTestBuffer = buffer_create(sizeof(int32_t) * 32);
+		{
+			// 32-bit floating-point gather
+			SafePointer<float> pointerF = buffer_getSafeData<float>(gatherTestBuffer, "float gather test data");
+			for (int i = 0; i < 32; i++) { // -32.0f, -30.0f, -28.0f, -26.0f ... 24.0f, 26.0f, 28.0f, 30.0f
+				pointerF[i] = i * 2.0f - 32.0f;
+			}
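+			// Each assertion checks that gather(basePointer, indices) loads one element per lane,
+			// so that lane n of the result equals basePointer[indices[n]], also when an element offset is added to the base pointer.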
+			ASSERT_EQUAL(gather(pointerF     , U32x4(2, 1, 30, 31)), F32x4(-28.0f, -30.0f, 28.0f, 30.0f));
+			ASSERT_EQUAL(gather(pointerF + 10, U32x4(0, 1, 2, 3)), F32x4(-12.0f, -10.0f, -8.0f, -6.0f));
+			ASSERT_EQUAL(gather(pointerF     , U32x8(2, 1, 28, 29, 3, 0, 30, 31)), F32x8(-28.0f, -30.0f, 24.0f, 26.0f, -26.0f, -32.0f, 28.0f, 30.0f));
+			ASSERT_EQUAL(gather(pointerF + 10, U32x8(0, 1, 2, 3, 4, 5, 6, 7)), F32x8(-12.0f, -10.0f, -8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 2.0f));
+		}
+		{
+			// Signed 32-bit integer gather
+			SafePointer<int32_t> pointerI = buffer_getSafeData<int32_t>(gatherTestBuffer, "int32_t gather test data");
+			for (int i = 0; i < 32; i++) { // -32, -30, -28, -26 ... 24, 26, 28, 30
+				pointerI[i] = i * 2 - 32;
+			}
+			ASSERT_EQUAL(gather(pointerI     , U32x4(2, 1, 30, 31)), I32x4(-28, -30, 28, 30));
+			ASSERT_EQUAL(gather(pointerI + 10, U32x4(0, 1, 2, 3)), I32x4(-12, -10, -8, -6));
+			ASSERT_EQUAL(gather(pointerI     , U32x8(2, 1, 28, 29, 3, 0, 30, 31)), I32x8(-28, -30, 24, 26, -26, -32, 28, 30));
+			ASSERT_EQUAL(gather(pointerI + 10, U32x8(0, 1, 2, 3, 4, 5, 6, 7)), I32x8(-12, -10, -8, -6, -4, -2, 0, 2));
+		}
+		{
+			// Unsigned 32-bit integer gather
+			SafePointer<uint32_t> pointerU = buffer_getSafeData<uint32_t>(gatherTestBuffer, "uint32_t gather test data");
+			for (int i = 0; i < 32; i++) { // 100, 102, 104, 106 ... 156, 158, 160, 162
+				pointerU[i] = 100 + i * 2;
+			}
+			ASSERT_EQUAL(gather(pointerU     , U32x4(2, 1, 30, 31)), U32x4(104, 102, 160, 162));
+			ASSERT_EQUAL(gather(pointerU + 10, U32x4(0, 1, 2, 3)), U32x4(120, 122, 124, 126));
+			ASSERT_EQUAL(gather(pointerU     , U32x8(2, 1, 28, 29, 3, 0, 30, 31)), U32x8(104, 102, 156, 158, 106, 100, 160, 162));
+			ASSERT_EQUAL(gather(pointerU + 10, U32x8(0, 1, 2, 3, 4, 5, 6, 7)), U32x8(120, 122, 124, 126, 128, 130, 132, 134));
+		}
+	}
+
+END_TEST
