2 months ago · 244ae39b30
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -870,28 +870,17 @@ static void ConvertAltivec32to32_prefetch(SDL_BlitInfo *info)
 
				     vec_dss(DST_CHAN_DEST);
			
 
				 }
			
 
				 
			
 
				-static Uint32 GetBlitFeatures(void)
			
 
				-{
			
 
				-    static Uint32 features = ~0u;
			
 
				-    if (features == ~0u) {
			
 
				-        features = (0
			
 
				-                    // Feature 1 is has-SSE41
			
 
				-                    | ((SDL_HasSSE41()) ? BLIT_FEATURE_HAS_SSE41 : 0)
			
 
				-                    // Feature 2 is has-AltiVec
			
 
				-                    | ((SDL_HasAltiVec()) ? BLIT_FEATURE_HAS_ALTIVEC : 0)
			
 
				-                    // Feature 4 is dont-use-prefetch
			
 
				-                    // !!!! FIXME: Check for G5 or later, not the cache size! Always prefetch on a G4.
			
 
				-                    | ((GetL3CacheSize() == 0) ? BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH : 0));
			
 
				-    }
			
 
				-    return features;
			
 
				-}
			
 
				+// !!!! FIXME: Check for G5 or later, not the cache size! Always prefetch on a G4.
			
 
				+#define GetBlitFeatures()   \
			
 
				+            ((SDL_HasAltiVec() ? BLIT_FEATURE_HAS_ALTIVEC : 0) | \
			
 
				+             ((GetL3CacheSize() == 0) ? BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH : 0))
			
 
				 
			
 
				 #ifdef __MWERKS__
			
 
				 #pragma altivec_model off
			
 
				 #endif
			
 
				 #else
			
 
				-// Feature 1 is has-SSE41
			
 
				-#define GetBlitFeatures() ((SDL_HasSSE41() ? BLIT_FEATURE_HAS_SSE41 : 0))
			
 
				+#define GetBlitFeatures()   \
			
 
				+             (SDL_HasSSE41() ? BLIT_FEATURE_HAS_SSE41 : 0)
			
 
				 #endif
			
 
				 
			
 
				 // This is now endian dependent
			
@@ -1165,7 +1154,7 @@ static void Blit_XRGB8888_RGB565(SDL_BlitInfo *info)
 
				 
			
 
				 #ifdef SDL_SSE4_1_INTRINSICS
			
 
				 
			
 
				-static void SDL_TARGETING("sse4.1") Blit_RGB565_32_SSE41(SDL_BlitInfo *info, int Rshift, int Gshift, int Bshift, int Amask)
			
 
				+static void SDL_TARGETING("sse4.1") Blit_RGB565_32_SSE41(SDL_BlitInfo *info)
			
 
				 {
			
 
				     int c;
			
 
				     int width, height;
			
@@ -1182,47 +1171,104 @@ static void SDL_TARGETING("sse4.1") Blit_RGB565_32_SSE41(SDL_BlitInfo *info, int
 
				     dst = (Uint32 *)info->dst;
			
 
				     dstskip = info->dst_skip / 4;
			
 
				 
			
 
				+    // Red and blue channel multiplier to repeat 5 bits
			
 
				+    __m128i rb_mult = _mm_shuffle_epi32(_mm_cvtsi32_si128(0x01080108), 0);
			
 
				+
			
 
				+    // Green channel multiplier to shift by 5 and then repeat 6 bits
			
 
				+    __m128i g_mult = _mm_shuffle_epi32(_mm_cvtsi32_si128(0x20802080), 0);
			
 
				+
			
 
				+    // Red channel mask
			
 
				+    __m128i r_mask = _mm_shuffle_epi32(_mm_cvtsi32_si128(0xf800f800), 0);
			
 
				+
			
 
				+    // Green channel mask
			
 
				+    __m128i g_mask = _mm_shuffle_epi32(_mm_cvtsi32_si128(0x07e007e0), 0);
			
 
				+
			
 
				+    // Alpha channel mask
			
 
				+    __m128i a_mask = _mm_shuffle_epi32(_mm_cvtsi32_si128(0xff00ff00), 0);
			
 
				+
			
 
				+    // Get the masks for converting from ARGB
			
 
				+    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
			
 
				+    const Uint32 Rshift = dstfmt->Rshift;
			
 
				+    const Uint32 Gshift = dstfmt->Gshift;
			
 
				+    const Uint32 Bshift = dstfmt->Bshift;
			
 
				+    Uint32 Amask, Ashift;
			
 
				+
			
 
				+    SDL_Get8888AlphaMaskAndShift(dstfmt, &Amask, &Ashift);
			
 
				+
			
 
				+    // The byte offsets for the start of each pixel
			
 
				+    const __m128i mask_offsets = _mm_set_epi8(12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
			
 
				+    const __m128i convert_mask = _mm_add_epi32(
			
 
				+            _mm_set1_epi32(
			
 
				+                ((16 >> 3) << Rshift) |
			
 
				+                (( 8 >> 3) << Gshift) |
			
 
				+                (( 0 >> 3) << Bshift) |
			
 
				+                ((24 >> 3) << Ashift)),
			
 
				+            mask_offsets);
			
 
				+
			
 
				     while (height--) {
			
 
				-        // Copy in 4 pixel chunks
			
 
				-        for (c = width / 4; c; --c) {
			
 
				-            // Load 4 16-bit RGB565 pixels into an SSE register
			
 
				-            __m128i pixels_rgb565 = _mm_loadu_si128((__m128i*)src);
			
 
				-
			
 
				-            // Extract Red components (5 bits)
			
 
				-            __m128i red_5bit = _mm_and_si128(pixels_rgb565, _mm_set1_epi16(0xF800)); // Mask for Red
			
 
				-            red_5bit = _mm_srli_epi16(red_5bit, 11); // Shift to get 5-bit value
			
 
				-            __m128i red_8bit = _mm_cvtepu16_epi32(red_5bit); // Convert to 32-bit and zero-extend
			
 
				-            red_8bit = _mm_slli_epi32(red_8bit, 3); // Scale to 8 bits (multiply by 8)
			
 
				-            red_8bit = _mm_or_si128(red_8bit, _mm_srli_epi32(red_8bit, 5)); // Replicate top 3 bits for better scaling
			
 
				-
			
 
				-            // Extract Green components (6 bits)
			
 
				-            __m128i green_6bit = _mm_and_si128(pixels_rgb565, _mm_set1_epi16(0x07E0)); // Mask for Green
			
 
				-            green_6bit = _mm_srli_epi16(green_6bit, 5); // Shift to get 6-bit value
			
 
				-            __m128i green_8bit = _mm_cvtepu16_epi32(green_6bit); // Convert to 32-bit and zero-extend
			
 
				-            green_8bit = _mm_slli_epi32(green_8bit, 2); // Scale to 8 bits (multiply by 4)
			
 
				-            green_8bit = _mm_or_si128(green_8bit, _mm_srli_epi32(green_8bit, 6)); // Replicate top 2 bits
			
 
				-
			
 
				-            // Extract Blue components (5 bits)
			
 
				-            __m128i blue_5bit = _mm_and_si128(pixels_rgb565, _mm_set1_epi16(0x001F)); // Mask for Blue
			
 
				-            __m128i blue_8bit = _mm_cvtepu16_epi32(blue_5bit); // Convert to 32-bit and zero-extend
			
 
				-            blue_8bit = _mm_slli_epi32(blue_8bit, 3); // Scale to 8 bits (multiply by 8)
			
 
				-            blue_8bit = _mm_or_si128(blue_8bit, _mm_srli_epi32(blue_8bit, 5)); // Replicate top 3 bits
			
 
				-
			
 
				-            // Set Alpha to opaque (0xFF)
			
 
				-            __m128i alpha_8bit = _mm_set1_epi32(Amask);
			
 
				-
			
 
				-            // Combine into 32-bit values
			
 
				-            __m128i argb_pixels_low = _mm_or_si128(alpha_8bit, _mm_slli_epi32(red_8bit, Rshift));
			
 
				-            argb_pixels_low = _mm_or_si128(argb_pixels_low, _mm_slli_epi32(green_8bit, Gshift));
			
 
				-            argb_pixels_low = _mm_or_si128(argb_pixels_low, _mm_slli_epi32(blue_8bit, Bshift));
			
 
				-
			
 
				-            // Store the results
			
 
				-            _mm_storeu_si128((__m128i*)dst, argb_pixels_low);
			
 
				-            src += 4;
			
 
				-            dst += 4;
			
 
				+        // Copy in 8 pixel chunks
			
 
				+        for (c = width / 8; c; --c) {
			
 
				+            __m128i pixel = _mm_loadu_si128((__m128i *)src);
			
 
				+            __m128i red = pixel;
			
 
				+            __m128i green = pixel;
			
 
				+            __m128i blue = pixel;
			
 
				+
			
 
				+            // Get red in the upper 5 bits and then multiply
			
 
				+            red = _mm_and_si128(red, r_mask);
			
 
				+            red = _mm_mulhi_epu16(red, rb_mult);
			
 
				+
			
 
				+            // Get blue in the upper 5 bits and then multiply
			
 
				+            blue = _mm_slli_epi16(blue, 11);
			
 
				+            blue = _mm_mulhi_epu16(blue, rb_mult);
			
 
				+
			
 
				+            // Combine the red and blue channels
			
 
				+            __m128i red_blue = _mm_or_si128(_mm_slli_epi16(red, 8), blue);
			
 
				+
			
 
				+            // Get the green channel and then multiply into place
			
 
				+            green = _mm_and_si128(green, g_mask);
			
 
				+            green = _mm_mulhi_epu16(green, g_mult);
			
 
				+
			
 
				+            // Combine the green and alpha channels
			
 
				+            __m128i green_alpha = _mm_or_si128(green, a_mask);
			
 
				+
			
 
				+            // Unpack them into output ARGB pixels
			
 
				+            __m128i out1 = _mm_unpacklo_epi8(red_blue, green_alpha);
			
 
				+            __m128i out2 = _mm_unpackhi_epi8(red_blue, green_alpha);
			
 
				+
			
 
				+            // Convert to dst format and save!
			
 
				+            // This is an SSSE3 instruction
			
 
				+            out1 = _mm_shuffle_epi8(out1, convert_mask);
			
 
				+            out2 = _mm_shuffle_epi8(out2, convert_mask);
			
 
				+
			
 
				+            _mm_storeu_si128((__m128i*)dst, out1);
			
 
				+            _mm_storeu_si128((__m128i*)(dst + 4), out2);
			
 
				+
			
 
				+            src += 8;
			
 
				+            dst += 8;
			
 
				         }
			
 
				+
			
 
				         // Get any leftovers
			
 
				-        switch (width & 3) {
			
 
				+        switch (width & 7) {
			
 
				+        case 7:
			
 
				+            RGB_FROM_RGB565(*src, r, g, b);
			
 
				+            *dst++ = (r << Rshift) | (g << Gshift) | (b << Bshift) | Amask;
			
 
				+            ++src;
			
 
				+            SDL_FALLTHROUGH;
			
 
				+        case 6:
			
 
				+            RGB_FROM_RGB565(*src, r, g, b);
			
 
				+            *dst++ = (r << Rshift) | (g << Gshift) | (b << Bshift) | Amask;
			
 
				+            ++src;
			
 
				+            SDL_FALLTHROUGH;
			
 
				+        case 5:
			
 
				+            RGB_FROM_RGB565(*src, r, g, b);
			
 
				+            *dst++ = (r << Rshift) | (g << Gshift) | (b << Bshift) | Amask;
			
 
				+            ++src;
			
 
				+            SDL_FALLTHROUGH;
			
 
				+        case 4:
			
 
				+            RGB_FROM_RGB565(*src, r, g, b);
			
 
				+            *dst++ = (r << Rshift) | (g << Gshift) | (b << Bshift) | Amask;
			
 
				+            ++src;
			
 
				+            SDL_FALLTHROUGH;
			
 
				         case 3:
			
 
				             RGB_FROM_RGB565(*src, r, g, b);
			
 
				             *dst++ = (r << Rshift) | (g << Gshift) | (b << Bshift) | Amask;
			
@@ -1244,26 +1290,6 @@ static void SDL_TARGETING("sse4.1") Blit_RGB565_32_SSE41(SDL_BlitInfo *info, int
 
				     }
			
 
				 }
			
 
				 
			
 
				-static void Blit_RGB565_ARGB8888_SSE41(SDL_BlitInfo * info)
			
 
				-{
			
 
				-    Blit_RGB565_32_SSE41(info, 16, 8, 0, 0xFF000000);
			
 
				-}
			
 
				-
			
 
				-static void Blit_RGB565_ABGR8888_SSE41(SDL_BlitInfo * info)
			
 
				-{
			
 
				-    Blit_RGB565_32_SSE41(info, 0, 8, 16, 0xFF000000);
			
 
				-}
			
 
				-
			
 
				-static void Blit_RGB565_RGBA8888_SSE41(SDL_BlitInfo * info)
			
 
				-{
			
 
				-    Blit_RGB565_32_SSE41(info, 24, 16, 8, 0x000000FF);
			
 
				-}
			
 
				-
			
 
				-static void Blit_RGB565_BGRA8888_SSE41(SDL_BlitInfo * info)
			
 
				-{
			
 
				-    Blit_RGB565_32_SSE41(info, 8, 16, 24, 0x000000FF);
			
 
				-}
			
 
				-
			
 
				 #endif // SDL_SSE4_1_INTRINSICS
			
 
				 
			
 
				 #ifdef SDL_HAVE_BLIT_N_RGB565
			
@@ -2555,6 +2581,7 @@ static void SDL_TARGETING("sse4.1") Blit8888to8888PixelSwizzleSSE41(SDL_BlitInfo
 
				             __m128i src128 = _mm_loadu_si128((__m128i *)src);
			
 
				 
			
 
				             // Convert to dst format
			
 
				+            // This is an SSSE3 instruction
			
 
				             src128 = _mm_shuffle_epi8(src128, convert_mask);
			
 
				 
			
 
				             if (fill_alpha) {
			
@@ -2950,13 +2977,13 @@ static const struct blit_table normal_blit_2[] = {
 
				 #endif
			
 
				 #ifdef SDL_SSE4_1_INTRINSICS
			
 
				     { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00FF0000, 0x0000FF00, 0x000000FF,
			
 
				-      BLIT_FEATURE_HAS_SSE41, Blit_RGB565_ARGB8888_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
			
 
				+      BLIT_FEATURE_HAS_SSE41, Blit_RGB565_32_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
			
 
				     { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0x000000FF, 0x0000FF00, 0x00FF0000,
			
 
				-      BLIT_FEATURE_HAS_SSE41, Blit_RGB565_ABGR8888_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
			
 
				+      BLIT_FEATURE_HAS_SSE41, Blit_RGB565_32_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
			
 
				     { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0xFF000000, 0x00FF0000, 0x0000FF00,
			
 
				-      BLIT_FEATURE_HAS_SSE41, Blit_RGB565_RGBA8888_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
			
 
				+      BLIT_FEATURE_HAS_SSE41, Blit_RGB565_32_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
			
 
				     { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0x0000FF00, 0x00FF0000, 0xFF000000,
			
 
				-      BLIT_FEATURE_HAS_SSE41, Blit_RGB565_BGRA8888_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
			
 
				+      BLIT_FEATURE_HAS_SSE41, Blit_RGB565_32_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
			
 
				 #endif
			
 
				 #ifdef SDL_HAVE_BLIT_N_RGB565
			
 
				     { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00FF0000, 0x0000FF00, 0x000000FF,
			
--- a/test/testautomation_surface.c
+++ b/test/testautomation_surface.c
@@ -1680,39 +1680,53 @@ static Uint32 Calculate(int v, int bits, int vmax, int shift)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-static Uint32 Calculate565toARGB(int v)
			
 
				+static Uint32 Calculate565toARGB(int v, const SDL_PixelFormatDetails *fmt)
			
 
				 {
			
 
				     Uint8 r = (v & 0xF800) >> 11;
			
 
				     Uint8 g = (v & 0x07E0) >> 5;
			
 
				     Uint8 b = (v & 0x001F);
			
 
				-    return 0xFF000000 |
			
 
				-            Calculate(r, 5, 31, 16) |
			
 
				-            Calculate(g, 6, 63, 8) |
			
 
				-            Calculate(b, 5, 31, 0);
			
 
				+    return fmt->Amask |
			
 
				+            Calculate(r, 5, 31, fmt->Rshift) |
			
 
				+            Calculate(g, 6, 63, fmt->Gshift) |
			
 
				+            Calculate(b, 5, 31, fmt->Bshift);
			
 
				 }
			
 
				 
			
 
				 static int SDLCALL surface_test16BitTo32Bit(void *arg)
			
 
				 {
			
 
				+    static const SDL_PixelFormat formats[] = {
			
 
				+        SDL_PIXELFORMAT_ARGB8888,
			
 
				+        SDL_PIXELFORMAT_ABGR8888,
			
 
				+        SDL_PIXELFORMAT_RGBA8888,
			
 
				+        SDL_PIXELFORMAT_BGRA8888
			
 
				+    };
			
 
				     static Uint16 pixels[1 << 16];
			
 
				     static Uint32 expected[1 << 16];
			
 
				-    int i, ret;
			
 
				+    int i, p, ret;
			
 
				     SDL_Surface *surface16;
			
 
				     SDL_Surface *surface32;
			
 
				     SDL_Surface *expected32;
			
 
				 
			
 
				-    for (i = 0; i < SDL_arraysize(pixels); ++i) {
			
 
				-        pixels[i] = i;
			
 
				-        expected[i] = Calculate565toARGB(i);
			
 
				+    for (p = 0; p < SDL_arraysize(pixels); ++p) {
			
 
				+        pixels[p] = p;
			
 
				     }
			
 
				-
			
 
				     surface16 = SDL_CreateSurfaceFrom(SDL_arraysize(pixels), 1, SDL_PIXELFORMAT_RGB565, pixels, sizeof(pixels));
			
 
				-    surface32 = SDL_ConvertSurface(surface16, SDL_PIXELFORMAT_ARGB8888);
			
 
				-    expected32 = SDL_CreateSurfaceFrom(SDL_arraysize(expected), 1, SDL_PIXELFORMAT_ARGB8888, expected, sizeof(expected));
			
 
				-    ret = SDLTest_CompareSurfaces(surface32, expected32, 0);
			
 
				-    SDLTest_AssertCheck(ret == 0, "Validate result from SDLTest_CompareSurfaces, expected: 0, got: %i", ret);
			
 
				+
			
 
				+    for (i = 0; i < SDL_arraysize(formats); ++i) {
			
 
				+        SDL_PixelFormat format = formats[i];
			
 
				+        const SDL_PixelFormatDetails *fmt = SDL_GetPixelFormatDetails(format);
			
 
				+
			
 
				+        SDLTest_Log("Checking conversion from SDL_PIXELFORMAT_RGB565 to %s", SDL_GetPixelFormatName(format));
			
 
				+        surface32 = SDL_ConvertSurface(surface16, format);
			
 
				+        for (p = 0; p < SDL_arraysize(pixels); ++p) {
			
 
				+            expected[p] = Calculate565toARGB(p, fmt);
			
 
				+        }
			
 
				+        expected32 = SDL_CreateSurfaceFrom(SDL_arraysize(expected), 1, format, expected, sizeof(expected));
			
 
				+        ret = SDLTest_CompareSurfaces(surface32, expected32, 0);
			
 
				+        SDLTest_AssertCheck(ret == 0, "Validate result from SDLTest_CompareSurfaces, expected: 0, got: %i", ret);
			
 
				+        SDL_DestroySurface(surface32);
			
 
				+        SDL_DestroySurface(expected32);
			
 
				+    }
			
 
				     SDL_DestroySurface(surface16);
			
 
				-    SDL_DestroySurface(surface32);
			
 
				-    SDL_DestroySurface(expected32);
			
 
				 
			
 
				     return TEST_COMPLETED;
			
 
				 }