Pārlūkot izejas kodu

Implement faster no simd version of pixel blending

rexim 5 gadi atpakaļ
vecāks
revīzija
8ddd5943f7
1 mainītis faili ar 69 papildinājumiem un 18 dzēšanām
  1. 69 18
      sse2/main.cpp

+ 69 - 18
sse2/main.cpp

@@ -16,8 +16,6 @@
 #define STB_IMAGE_WRITE_IMPLEMENTATION
 #include "./stb_image_write.h"
 
-#define SIMD
-
 struct Pixel32
 {
     uint8_t r, g, b, a;
@@ -48,6 +46,17 @@ Pixel32 mix_pixels(Pixel32 a32, Pixel32 b32)
     return r;
 }
 
+static inline Pixel32 mix_pixels_no_float(Pixel32 src, Pixel32 dst)
+{
+    uint8_t rev_src_a = 255 - src.a;
+    Pixel32 result;
+    result.r = ((uint16_t) src.r * (uint16_t) src.a + (uint16_t) dst.r * rev_src_a) >> 8;
+    result.g = ((uint16_t) src.g * (uint16_t) src.a + (uint16_t) dst.g * rev_src_a) >> 8;
+    result.b = ((uint16_t) src.b * (uint16_t) src.a + (uint16_t) dst.b * rev_src_a) >> 8;
+    result.a = dst.a;
+    return result;
+}
+
 // NOTE: Stolen from https://stackoverflow.com/a/53707227
 void mix_pixels_sse(Pixel32 *src, Pixel32 *dst, Pixel32 *c)
 {
@@ -73,7 +82,8 @@ void mix_pixels_sse(Pixel32 *src, Pixel32 *dst, Pixel32 *c)
 
     __m128i _dst = _mm_loadu_si128((__m128i*)dst);
     __m128i _dst_a = _mm_shuffle_epi8(_dst, _aa);
-    __m128i _one_minus_src_a = _mm_subs_epu8(_mm_set1_epi8(-1), _src_a);
+    __m128i _one_minus_src_a = _mm_subs_epu8(
+        _mm_set1_epi8(-1), _src_a);
 
     __m128i _out = {};
     {
@@ -124,6 +134,21 @@ void mix_pixels_sse(Pixel32 *src, Pixel32 *dst, Pixel32 *c)
     _mm_storeu_si128( (__m128i_u*) c, _ret );
 }
 
+void slap_image32_onto_image32_no_float(Image32 src, Image32 dst,
+                                        size_t x0, size_t y0)
+{
+    size_t x1 = std::min(x0 + src.width, dst.width);
+    size_t y1 = std::min(y0 + src.height, dst.height);
+    for (size_t y = y0; y < y1; ++y) {
+        for (size_t x = x0; x < x1; ++x) {
+            dst.pixels[y * dst.width + x] =
+                mix_pixels_no_float(
+                    src.pixels[(y - y0) * src.width + (x - x0)],
+                    dst.pixels[y * dst.width + x]);
+        }
+    }
+}
+
 void slap_image32_onto_image32(Image32 src, Image32 dst,
                               size_t x0, size_t y0)
 {
@@ -167,17 +192,48 @@ Image32 load_image32(const char *filepath)
     return result;
 }
 
+int main_(int argc, char *argv[])
+{
+    Pixel32 a[] = {
+        {1, 2, 3, 4},
+        {5, 6, 7, 8},
+        {9, 10, 11, 12},
+        {13, 14, 15, 16},
+    };
+
+    Pixel32 b[] = {
+        {17, 18, 19, 20},
+        {21, 22, 23, 24},
+        {25, 26, 27, 28},
+        {29, 30, 31, 32},
+    };
+
+    Pixel32 c[4] = {};
+
+    mix_pixels_sse(a, b, c);
+
+    return 0;
+}
+
+template <typename Slap>
+void benchmark(Slap slap,
+               Image32 src, Image32 dst,
+               size_t pos_x, size_t pos_y,
+               size_t N, const char *message)
+{
+    printf("%s\n", message);
+    clock_t begin = clock();
+    for (size_t i = 0; i < N; ++i) {
+        slap(src, dst, pos_x, pos_y);
+    }
+    printf("    %fs\n", (float)(clock() - begin) / (float) CLOCKS_PER_SEC);
+}
+
 int main(int argc, char *argv[])
 {
     static_assert(sizeof(Pixel32) == sizeof(uint32_t),
                   "Size of Pixel32 is scuffed on your platform lol");
 
-#ifdef SIMD
-    printf("SIMD ON\n");
-#else
-    printf("SIMD OFF\n");
-#endif
-
     const char * const DST_FILENAME = "maxresdefault.jpg";
     Image32 dst = load_image32(DST_FILENAME);
 
@@ -190,16 +246,11 @@ int main(int argc, char *argv[])
 
     size_t pos_x = (dst.width >> 1) - (src.width >> 1);
     size_t pos_y = (dst.height >> 1) - (src.height >> 1);
+
     const size_t N = 100'000;
-    clock_t begin = clock();
-    for (size_t i = 0; i < N; ++i) {
-#ifdef SIMD
-        slap_image32_onto_image32_simd(src, dst, pos_x, pos_y);
-#else
-        slap_image32_onto_image32(src, dst, pos_x, pos_y);
-#endif
-    }
-    printf("    %fs\n", (float)(clock() - begin) / (float) CLOCKS_PER_SEC);
+    benchmark(slap_image32_onto_image32, src, dst, pos_x, pos_y, N, "Original NO SIMD");
+    benchmark(slap_image32_onto_image32_no_float, src, dst, pos_x, pos_y, N, "Faster NO SIMD");
+    benchmark(slap_image32_onto_image32_simd, src, dst, pos_x, pos_y, N, "SIMD");
 
     const char * const OUT_FILENAME = "output.png";
     int ret = stbi_write_png(OUT_FILENAME, dst.width, dst.height, 4, dst.pixels, dst.width * 4);