123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259 |
- #include <cstdio>
- #include <cstdlib>
- #include <cstdint>
- #include <ctime>
- #include <algorithm>
- #include <tmmintrin.h>
- #include <smmintrin.h>
- #define STB_IMAGE_IMPLEMENTATION
- #define STBI_ONLY_JPEG
- #define STBI_ONLY_PNG
- #include "./stb_image.h"
- #define STB_IMAGE_WRITE_IMPLEMENTATION
- #include "./stb_image_write.h"
// One 32-bit pixel: four 8-bit channels stored in r, g, b, a order.
// load_image32() reinterprets the 4-channel byte buffer returned by
// stbi_load() as an array of these, so the field order must not change.
struct Pixel32
{
    uint8_t r;  // red
    uint8_t g;  // green
    uint8_t b;  // blue
    uint8_t a;  // alpha (0 = transparent, 255 = opaque)
};
- struct Image32
- {
- Pixel32 *pixels;
- size_t width;
- size_t height;
- };
- const size_t SIMD_PIXEL_PACK_SIZE = sizeof(__m128i) / sizeof(Pixel32);
- Pixel32 mix_pixels(Pixel32 a32, Pixel32 b32)
- {
- const float a32_alpha = a32.a / 255.0;
- const float b32_alpha = b32.a / 255.0;
- const float r_alpha = b32_alpha + a32_alpha * (1.0f - b32_alpha);
- Pixel32 r = {};
- r.r = (uint8_t) ((b32.r * b32_alpha + a32.r * a32_alpha * (1.0f - b32_alpha)) / r_alpha);
- r.g = (uint8_t) ((b32.g * b32_alpha + a32.g * a32_alpha * (1.0f - b32_alpha)) / r_alpha);
- r.b = (uint8_t) ((b32.b * b32_alpha + a32.b * a32_alpha * (1.0f - b32_alpha)) / r_alpha);
- r.a = (uint8_t) (r_alpha * 255.0);
- return r;
- }
- static inline Pixel32 mix_pixels_no_float(Pixel32 src, Pixel32 dst)
- {
- uint8_t rev_src_a = 255 - src.a;
- Pixel32 result;
- result.r = ((uint16_t) src.r * (uint16_t) src.a + (uint16_t) dst.r * rev_src_a) >> 8;
- result.g = ((uint16_t) src.g * (uint16_t) src.a + (uint16_t) dst.g * rev_src_a) >> 8;
- result.b = ((uint16_t) src.b * (uint16_t) src.a + (uint16_t) dst.b * rev_src_a) >> 8;
- result.a = dst.a;
- return result;
- }
- // NOTE: Stolen from https://stackoverflow.com/a/53707227
- void mix_pixels_sse(Pixel32 *src, Pixel32 *dst, Pixel32 *c)
- {
- const __m128i _swap_mask =
- _mm_set_epi8(7, 6, 5, 4,
- 3, 2, 1, 0,
- 15, 14, 13, 12,
- 11, 10, 9, 8
- );
- const __m128i _aa =
- _mm_set_epi8( 15,15,15,15,
- 11,11,11,11,
- 7,7,7,7,
- 3,3,3,3 );
- const __m128i _mask1 = _mm_set_epi16(-1,0,0,0, -1,0,0,0);
- const __m128i _mask2 = _mm_set_epi16(0,-1,-1,-1, 0,-1,-1,-1);
- const __m128i _v1 = _mm_set1_epi16( 1 );
- __m128i _src = _mm_loadu_si128((__m128i*)src);
- __m128i _src_a = _mm_shuffle_epi8(_src, _aa);
- __m128i _dst = _mm_loadu_si128((__m128i*)dst);
- __m128i _dst_a = _mm_shuffle_epi8(_dst, _aa);
- __m128i _one_minus_src_a = _mm_subs_epu8(
- _mm_set1_epi8(-1), _src_a);
- __m128i _out = {};
- {
- __m128i _s_a = _mm_cvtepu8_epi16( _src_a );
- __m128i _s = _mm_cvtepu8_epi16( _src );
- __m128i _d = _mm_cvtepu8_epi16( _dst );
- __m128i _d_a = _mm_cvtepu8_epi16( _one_minus_src_a );
- _out = _mm_adds_epu16(
- _mm_mullo_epi16(_s, _s_a),
- _mm_mullo_epi16(_d, _d_a));
- _out = _mm_srli_epi16(
- _mm_adds_epu16(
- _mm_adds_epu16( _v1, _out ),
- _mm_srli_epi16( _out, 8 ) ), 8 );
- _out = _mm_or_si128(
- _mm_and_si128(_out,_mask2),
- _mm_and_si128(
- _mm_adds_epu16(
- _s_a,
- _mm_cvtepu8_epi16(_dst_a)), _mask1));
- }
- // compute _out2 using high quadword of of the _src and _dst
- //...
- __m128i _out2 = {};
- {
- __m128i _s_a = _mm_cvtepu8_epi16(_mm_shuffle_epi8(_src_a, _swap_mask));
- __m128i _s = _mm_cvtepu8_epi16(_mm_shuffle_epi8(_src, _swap_mask));
- __m128i _d = _mm_cvtepu8_epi16(_mm_shuffle_epi8(_dst, _swap_mask));
- __m128i _d_a = _mm_cvtepu8_epi16(_mm_shuffle_epi8(_one_minus_src_a, _swap_mask));
- _out2 = _mm_adds_epu16(
- _mm_mullo_epi16(_s, _s_a),
- _mm_mullo_epi16(_d, _d_a));
- _out2 = _mm_srli_epi16(
- _mm_adds_epu16(
- _mm_adds_epu16( _v1, _out2 ),
- _mm_srli_epi16( _out2, 8 ) ), 8 );
- _out2 = _mm_or_si128(
- _mm_and_si128(_out2,_mask2),
- _mm_and_si128(
- _mm_adds_epu16(
- _s_a,
- _mm_cvtepu8_epi16(_dst_a)), _mask1));
- }
- __m128i _ret = _mm_packus_epi16( _out, _out2 );
- _mm_storeu_si128( (__m128i_u*) c, _ret );
- }
- void slap_image32_onto_image32_no_float(Image32 src, Image32 dst,
- size_t x0, size_t y0)
- {
- size_t x1 = std::min(x0 + src.width, dst.width);
- size_t y1 = std::min(y0 + src.height, dst.height);
- for (size_t y = y0; y < y1; ++y) {
- for (size_t x = x0; x < x1; ++x) {
- dst.pixels[y * dst.width + x] =
- mix_pixels_no_float(
- src.pixels[(y - y0) * src.width + (x - x0)],
- dst.pixels[y * dst.width + x]);
- }
- }
- }
- void slap_image32_onto_image32(Image32 src, Image32 dst,
- size_t x0, size_t y0)
- {
- size_t x1 = std::min(x0 + src.width, dst.width);
- size_t y1 = std::min(y0 + src.height, dst.height);
- for (size_t y = y0; y < y1; ++y) {
- for (size_t x = x0; x < x1; ++x) {
- dst.pixels[y * dst.width + x] =
- mix_pixels(
- dst.pixels[y * dst.width + x],
- src.pixels[(y - y0) * src.width + (x - x0)]);
- }
- }
- }
- void slap_image32_onto_image32_simd(Image32 src, Image32 dst,
- size_t x0, size_t y0)
- {
- Pixel32 out[SIMD_PIXEL_PACK_SIZE] = {};
- size_t x1 = std::min(x0 + src.width, dst.width);
- size_t y1 = std::min(y0 + src.height, dst.height);
- for (size_t y = y0; y < y1; ++y) {
- for (size_t x = x0; x < x1; x += SIMD_PIXEL_PACK_SIZE) {
- mix_pixels_sse(
- &src.pixels[(y - y0) * src.width + (x - x0)],
- &dst.pixels[y * dst.width + x],
- &dst.pixels[y * dst.width + x]);
- // TODO: tail of the row is not taken into account
- }
- }
- }
- Image32 load_image32(const char *filepath)
- {
- Image32 result = {};
- int x, y, n;
- result.pixels = (Pixel32*) stbi_load(filepath, &x, &y, &n, 4);
- result.width = x;
- result.height = y;
- return result;
- }
- int main_(int argc, char *argv[])
- {
- Pixel32 a[] = {
- {1, 2, 3, 4},
- {5, 6, 7, 8},
- {9, 10, 11, 12},
- {13, 14, 15, 16},
- };
- Pixel32 b[] = {
- {17, 18, 19, 20},
- {21, 22, 23, 24},
- {25, 26, 27, 28},
- {29, 30, 31, 32},
- };
- Pixel32 c[4] = {};
- mix_pixels_sse(a, b, c);
- return 0;
- }
- template <typename Slap>
- void benchmark(Slap slap,
- Image32 src, Image32 dst,
- size_t pos_x, size_t pos_y,
- size_t N, const char *message)
- {
- printf("%s\n", message);
- clock_t begin = clock();
- for (size_t i = 0; i < N; ++i) {
- slap(src, dst, pos_x, pos_y);
- }
- printf(" %fs\n", (float)(clock() - begin) / (float) CLOCKS_PER_SEC);
- }
- int main(int argc, char *argv[])
- {
- static_assert(sizeof(Pixel32) == sizeof(uint32_t),
- "Size of Pixel32 is scuffed on your platform lol");
- const char * const DST_FILENAME = "maxresdefault.jpg";
- Image32 dst = load_image32(DST_FILENAME);
- const char * const SRC_FILENAME = "tsodinFeels.png";
- Image32 src = load_image32(SRC_FILENAME);
- for (size_t i = 0; i < src.width * src.height; ++i) {
- src.pixels[i].a = src.pixels[i].a >> 1;
- }
- size_t pos_x = (dst.width >> 1) - (src.width >> 1);
- size_t pos_y = (dst.height >> 1) - (src.height >> 1);
- const size_t N = 100'000;
- benchmark(slap_image32_onto_image32, src, dst, pos_x, pos_y, N, "Original NO SIMD");
- benchmark(slap_image32_onto_image32_no_float, src, dst, pos_x, pos_y, N, "Faster NO SIMD");
- benchmark(slap_image32_onto_image32_simd, src, dst, pos_x, pos_y, N, "SIMD");
- const char * const OUT_FILENAME = "output.png";
- int ret = stbi_write_png(OUT_FILENAME, dst.width, dst.height, 4, dst.pixels, dst.width * 4);
- printf(" ret = %d\n", ret);
- return 0;
- }
|