// tsoding.simd
// mirror of https://github.com/tsoding/simd
							#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <ctime>

#include <algorithm>

#include <tmmintrin.h>
#include <smmintrin.h>

#define STB_IMAGE_IMPLEMENTATION
#define STBI_ONLY_JPEG
#define STBI_ONLY_PNG
#include "./stb_image.h"

#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "./stb_image_write.h"

// A single 32-bit pixel made of four 8-bit channels, stored in memory in the
// order r, g, b, a. The blending routines in this file treat `a` as the alpha
// (opacity) channel, where 255 is fully opaque.
struct Pixel32
{
    uint8_t r;
    uint8_t g;
    uint8_t b;
    uint8_t a;
};

// A 32-bit-per-pixel image: a pixel buffer plus its dimensions.
// The functions in this file address it row by row, i.e. the pixel at
// column x, row y is pixels[y * width + x].
// The struct has no destructor; it never frees `pixels` on its own.
struct Image32
{
    Pixel32 *pixels;  // width * height pixels, row after row
    size_t width;     // number of pixels in one row
    size_t height;    // number of rows
};

// Number of Pixel32 values that fit into one __m128i register, i.e. how many
// pixels mix_pixels_sse() loads, blends and stores per call.
const size_t SIMD_PIXEL_PACK_SIZE = sizeof(__m128i) / sizeof(Pixel32);

// Composites pixel b32 on top of pixel a32 using floating-point math.
//
// a32 is the bottom (background) pixel and b32 the top (foreground) pixel.
// Both alphas are normalized to [0, 1]; the result alpha is
//     b_alpha + a_alpha * (1 - b_alpha)
// and each color channel is the alpha-weighted mix of the two inputs divided
// by that result alpha.
//
// Returns the blended pixel. If both inputs are fully transparent the result
// alpha is zero and the color would be 0/0; a zeroed (transparent black)
// pixel is returned in that case instead of converting NaN to uint8_t, which
// is undefined behavior.
Pixel32 mix_pixels(Pixel32 a32, Pixel32 b32)
{
    const float a32_alpha = a32.a / 255.0;
    const float b32_alpha = b32.a / 255.0;
    const float r_alpha = b32_alpha + a32_alpha * (1.0f - b32_alpha);

    Pixel32 r = {};

    // Both terms of r_alpha are non-negative, so it is zero only when both
    // input alphas are zero. Nothing is visible then; avoid dividing by zero.
    if (r_alpha <= 0.0f) {
        return r;
    }

    r.r = (uint8_t) ((b32.r * b32_alpha + a32.r * a32_alpha * (1.0f - b32_alpha)) / r_alpha);
    r.g = (uint8_t) ((b32.g * b32_alpha + a32.g * a32_alpha * (1.0f - b32_alpha)) / r_alpha);
    r.b = (uint8_t) ((b32.b * b32_alpha + a32.b * a32_alpha * (1.0f - b32_alpha)) / r_alpha);
    r.a = (uint8_t) (r_alpha * 255.0);

    return r;
}

// Integer-only blend of `src` over `dst`.
//
// Each color channel is (src * src.a + dst * (255 - src.a)) >> 8, i.e. the
// weighted sum is scaled down by 256 with a shift rather than divided by 255.
// The alpha of the result is taken from `dst` unchanged.
static inline Pixel32 mix_pixels_no_float(Pixel32 src, Pixel32 dst)
{
    const uint32_t alpha = src.a;
    const uint32_t inv_alpha = 255u - alpha;

    // Start from dst so that its alpha carries over, then overwrite colors.
    Pixel32 blended = dst;
    blended.r = (uint8_t) ((src.r * alpha + dst.r * inv_alpha) >> 8);
    blended.g = (uint8_t) ((src.g * alpha + dst.g * inv_alpha) >> 8);
    blended.b = (uint8_t) ((src.b * alpha + dst.b * inv_alpha) >> 8);
    return blended;
}

// Blends two pixels at once.
//
// Only the low 8 bytes (two RGBA pixels) of every argument are used:
//   src             - source pixels
//   src_a           - source alpha, repeated in all four bytes of each pixel
//   dst             - destination pixels
//   dst_a           - destination alpha, repeated in all four bytes of each pixel
//   one_minus_src_a - 255 - source alpha, repeated the same way
//
// Returns eight 16-bit lanes (r, g, b, a for each of the two pixels):
//   color = (src * src_a + dst * (255 - src_a)) scaled back down to 8 bits
//   alpha = saturating src_a + dst_a
static inline __m128i mix_pixel_pair_sse(__m128i src, __m128i src_a,
                                         __m128i dst, __m128i dst_a,
                                         __m128i one_minus_src_a)
{
    // 16-bit lane 3 of each pixel is its alpha; lanes 0..2 are r, g, b.
    const __m128i alpha_lanes = _mm_set_epi16(-1, 0, 0, 0, -1, 0, 0, 0);
    const __m128i color_lanes = _mm_set_epi16(0, -1, -1, -1, 0, -1, -1, -1);
    const __m128i one = _mm_set1_epi16(1);

    // Widen the low 8 bytes to 16 bits so the products do not overflow.
    const __m128i s = _mm_cvtepu8_epi16(src);
    const __m128i s_a = _mm_cvtepu8_epi16(src_a);
    const __m128i d = _mm_cvtepu8_epi16(dst);
    const __m128i d_w = _mm_cvtepu8_epi16(one_minus_src_a);

    const __m128i weighted =
        _mm_adds_epu16(_mm_mullo_epi16(s, s_a), _mm_mullo_epi16(d, d_w));

    // (x + 1 + (x >> 8)) >> 8: integer approximation of x / 255.
    const __m128i color =
        _mm_srli_epi16(
            _mm_adds_epu16(
                _mm_adds_epu16(one, weighted),
                _mm_srli_epi16(weighted, 8)),
            8);

    const __m128i alpha = _mm_adds_epu16(s_a, _mm_cvtepu8_epi16(dst_a));

    return _mm_or_si128(
        _mm_and_si128(color, color_lanes),
        _mm_and_si128(alpha, alpha_lanes));
}

// NOTE: Stolen from https://stackoverflow.com/a/53707227
//
// Blends SIMD_PIXEL_PACK_SIZE (4) pixels of `src` over 4 pixels of `dst` and
// stores the 4 resulting pixels to `c`. All three pointers must refer to at
// least 4 readable/writable pixels; no alignment is required (unaligned
// loads/stores are used). `c` may be the same pointer as `dst`, because both
// inputs are loaded before the result is stored.
void mix_pixels_sse(Pixel32 *src, Pixel32 *dst, Pixel32 *c)
{
    // Moves the high quadword (pixels 2 and 3) into the low quadword and
    // vice versa.
    const __m128i swap_mask =
        _mm_set_epi8(7,  6,   5,  4,
                     3,  2,   1,  0,
                     15, 14, 13, 12,
                     11, 10,  9,  8);

    // Replaces every byte of a pixel with that pixel's alpha byte.
    const __m128i alpha_broadcast =
        _mm_set_epi8(15, 15, 15, 15,
                     11, 11, 11, 11,
                     7,  7,  7,  7,
                     3,  3,  3,  3);

    const __m128i src_px = _mm_loadu_si128((const __m128i*) src);
    const __m128i dst_px = _mm_loadu_si128((const __m128i*) dst);

    const __m128i src_a = _mm_shuffle_epi8(src_px, alpha_broadcast);
    const __m128i dst_a = _mm_shuffle_epi8(dst_px, alpha_broadcast);
    const __m128i one_minus_src_a = _mm_subs_epu8(_mm_set1_epi8(-1), src_a);

    // Pixels 0 and 1 live in the low quadword already.
    const __m128i lo = mix_pixel_pair_sse(
        src_px, src_a, dst_px, dst_a, one_minus_src_a);

    // Pixels 2 and 3: bring the high quadword of every operand down first.
    // dst_a has to be swapped as well, otherwise pixels 2 and 3 would be
    // given the destination alpha of pixels 0 and 1.
    const __m128i hi = mix_pixel_pair_sse(
        _mm_shuffle_epi8(src_px, swap_mask),
        _mm_shuffle_epi8(src_a, swap_mask),
        _mm_shuffle_epi8(dst_px, swap_mask),
        _mm_shuffle_epi8(dst_a, swap_mask),
        _mm_shuffle_epi8(one_minus_src_a, swap_mask));

    // Narrow the 16-bit lanes back to bytes with unsigned saturation.
    _mm_storeu_si128((__m128i*) c, _mm_packus_epi16(lo, hi));
}

// Blends `src` onto `dst` in place with its top-left corner at (x0, y0),
// using the integer-only mix_pixels_no_float() for every pixel.
// The drawn area is clipped to the right and bottom edges of `dst`.
void slap_image32_onto_image32_no_float(Image32 src, Image32 dst,
                                        size_t x0, size_t y0)
{
    const size_t x_end = std::min(x0 + src.width, dst.width);
    const size_t y_end = std::min(y0 + src.height, dst.height);

    for (size_t row = y0; row < y_end; ++row) {
        // Index of the first pixel of the current row in each image.
        const size_t dst_row_start = row * dst.width;
        const size_t src_row_start = (row - y0) * src.width;

        for (size_t col = x0; col < x_end; ++col) {
            Pixel32 &target = dst.pixels[dst_row_start + col];
            target = mix_pixels_no_float(
                src.pixels[src_row_start + (col - x0)],
                target);
        }
    }
}

// Blends `src` onto `dst` in place with its top-left corner at (x0, y0),
// using the floating-point mix_pixels() for every pixel (the dst pixel is the
// background, the src pixel the foreground).
// The drawn area is clipped to the right and bottom edges of `dst`.
void slap_image32_onto_image32(Image32 src, Image32 dst,
                              size_t x0, size_t y0)
{
    const size_t x_end = std::min(x0 + src.width, dst.width);
    const size_t y_end = std::min(y0 + src.height, dst.height);

    for (size_t row = y0; row < y_end; ++row) {
        // Index of the first pixel of the current row in each image.
        const size_t dst_row_start = row * dst.width;
        const size_t src_row_start = (row - y0) * src.width;

        for (size_t col = x0; col < x_end; ++col) {
            Pixel32 &target = dst.pixels[dst_row_start + col];
            target = mix_pixels(
                target,
                src.pixels[src_row_start + (col - x0)]);
        }
    }
}

// Blends `src` onto `dst` in place with its top-left corner at (x0, y0),
// processing SIMD_PIXEL_PACK_SIZE pixels per step with mix_pixels_sse().
// The drawn area is clipped to the right and bottom edges of `dst`.
//
// When the clipped row width is not a multiple of the pack size, the
// remaining pixels of each row are blended through zero-padded temporary
// packs, so nothing outside the row is read from `src` or written to `dst`.
void slap_image32_onto_image32_simd(Image32 src, Image32 dst,
                               size_t x0, size_t y0)
{
    const size_t x1 = std::min(x0 + src.width, dst.width);
    const size_t y1 = std::min(y0 + src.height, dst.height);

    // Nothing to draw; also guarantees x1 - x below never underflows.
    if (x0 >= x1 || y0 >= y1) {
        return;
    }

    for (size_t y = y0; y < y1; ++y) {
        Pixel32 *src_row = &src.pixels[(y - y0) * src.width];
        Pixel32 *dst_row = &dst.pixels[y * dst.width];

        // Full packs.
        size_t x = x0;
        for (; x1 - x >= SIMD_PIXEL_PACK_SIZE; x += SIMD_PIXEL_PACK_SIZE) {
            mix_pixels_sse(&src_row[x - x0], &dst_row[x], &dst_row[x]);
        }

        // Tail of the row: fewer than SIMD_PIXEL_PACK_SIZE pixels are left.
        const size_t tail = x1 - x;
        if (tail > 0) {
            Pixel32 src_pack[SIMD_PIXEL_PACK_SIZE] = {};
            Pixel32 dst_pack[SIMD_PIXEL_PACK_SIZE] = {};

            for (size_t i = 0; i < tail; ++i) {
                src_pack[i] = src_row[x - x0 + i];
                dst_pack[i] = dst_row[x + i];
            }

            mix_pixels_sse(src_pack, dst_pack, dst_pack);

            // Copy back only the pixels that belong to the row.
            for (size_t i = 0; i < tail; ++i) {
                dst_row[x + i] = dst_pack[i];
            }
        }
    }
}

// Loads the image at `filepath` through stb_image, forcing 4 channels per
// pixel so the data can be viewed as Pixel32.
//
// Returns the loaded image. If loading fails, an empty image is returned
// (pixels is null, width and height are 0); callers should check `pixels`.
// A successfully loaded buffer comes from stbi_load() and should be released
// with stbi_image_free().
Image32 load_image32(const char *filepath)
{
    Image32 result = {};

    // Initialized so they are never read uninitialized if the load fails.
    int x = 0;
    int y = 0;
    int n = 0;

    Pixel32 *pixels = (Pixel32*) stbi_load(filepath, &x, &y, &n, 4);
    if (!pixels) {
        return result;
    }

    result.pixels = pixels;
    result.width = (size_t) x;
    result.height = (size_t) y;
    return result;
}

int main_(int argc, char *argv[])
{
    Pixel32 a[] = {
        {1, 2, 3, 4},
        {5, 6, 7, 8},
        {9, 10, 11, 12},
        {13, 14, 15, 16},
    };

    Pixel32 b[] = {
        {17, 18, 19, 20},
        {21, 22, 23, 24},
        {25, 26, 27, 28},
        {29, 30, 31, 32},
    };

    Pixel32 c[4] = {};

    mix_pixels_sse(a, b, c);

    return 0;
}

// Prints `message`, calls slap(src, dst, pos_x, pos_y) N times in a row and
// prints the elapsed clock() time of the whole run in seconds.
// `slap` is any callable that accepts (Image32, Image32, size_t, size_t).
template <typename Slap>
void benchmark(Slap slap,
               Image32 src, Image32 dst,
               size_t pos_x, size_t pos_y,
               size_t N, const char *message)
{
    printf("%s\n", message);

    const clock_t started_at = clock();
    for (size_t remaining = N; remaining > 0; --remaining) {
        slap(src, dst, pos_x, pos_y);
    }
    const clock_t elapsed = clock() - started_at;

    printf("    %fs\n", (float) elapsed / (float) CLOCKS_PER_SEC);
}

int main(int argc, char *argv[])
{
    static_assert(sizeof(Pixel32) == sizeof(uint32_t),
                  "Size of Pixel32 is scuffed on your platform lol");

    const char * const DST_FILENAME = "maxresdefault.jpg";
    Image32 dst = load_image32(DST_FILENAME);

    const char * const SRC_FILENAME = "tsodinFeels.png";
    Image32 src = load_image32(SRC_FILENAME);

    for (size_t i = 0; i < src.width * src.height; ++i) {
        src.pixels[i].a = src.pixels[i].a >> 1;
    }

    size_t pos_x = (dst.width >> 1) - (src.width >> 1);
    size_t pos_y = (dst.height >> 1) - (src.height >> 1);

    const size_t N = 100'000;
    benchmark(slap_image32_onto_image32, src, dst, pos_x, pos_y, N, "Original NO SIMD");
    benchmark(slap_image32_onto_image32_no_float, src, dst, pos_x, pos_y, N, "Faster NO SIMD");
    benchmark(slap_image32_onto_image32_simd, src, dst, pos_x, pos_y, N, "SIMD");

    const char * const OUT_FILENAME = "output.png";
    int ret = stbi_write_png(OUT_FILENAME, dst.width, dst.height, 4, dst.pixels, dst.width * 4);
    printf("    ret = %d\n", ret);
    return 0;
}