
stb_image: Add SSE2 IDCT for JPEG decoder.

Also add SSE2 detection for MSVC++. Detection on GCC will follow
later.
Fabian Giesen, 10 years ago
Commit e5db25f637
1 file changed, 196 insertions(+), 7 deletions(-)

stb_image.h

@@ -422,7 +422,16 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #include <emmintrin.h>
 
 #ifdef _MSC_VER
+#include <intrin.h> // __cpuid
 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+static int stbi__sse2_available()
+{
+   int info[4];
+   __cpuid(info, 1);
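+   // CPUID leaf 1 returns feature flags; bit 26 of EDX (info[3]) is SSE2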
+   return ((info[3] >> 26) & 1) != 0;
+}
+
 #else // assume GCC-style if not VC++
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 #endif
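
The commit message defers GCC-side detection to a later change. A minimal sketch of what that branch could look like (hypothetical, not part of this commit; assumes GCC 4.8+ for __builtin_cpu_supports):

   static int stbi__sse2_available(void)
   {
   #if defined(__x86_64__) || defined(_M_X64)
      return 1; // x86-64 mandates SSE2
   #elif defined(__SSE2__)
      return 1; // built with -msse2, so the compiler may emit SSE2 anywhere
   #else
      return __builtin_cpu_supports("sse2") != 0; // runtime check, GCC 4.8+
   #endif
   }
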
@@ -1311,7 +1320,7 @@ stbi_inline static stbi_uc stbi__clamp(int x)
    return (stbi_uc) x;
 }
 
-#define stbi__f2f(x)  (int) (((x) * 4096 + 0.5))
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
 #define stbi__fsh(x)  ((x) << 12)
 
 // derived from jidctint -- DCT_ISLOW
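
For reference, the macro converts a float constant to 12-bit fixed point with rounding: stbi__f2f(0.5411961f) expands to ((int)(0.5411961f * 4096 + 0.5)) = 2217. The added outer parentheses are standard macro hygiene, so the expansion composes safely inside larger expressions such as the sums of stbi__f2f values used for the rotation constants below.
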
@@ -1421,17 +1430,191 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
    }
 }
 
-#ifdef STBI_SIMD
-static void stbi__idct_block_wrapper(stbi_uc *out, int out_stride, short data[64], unsigned short dequant[64])
-{
-   stbi__idct_block(out, out_stride, data);
+#ifdef STBI_SSE2
+
+// sse2 integer IDCT. not the fastest possible implementation but it
+// produces bit-identical results to the generic C version so it's
+// fully "transparent".
+static void stbi__idct_sse2(stbi_uc *out, int out_stride, short data[64])
+{
+   // This is constructed to match our regular (generic) integer IDCT exactly.
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp;
+
+   // dot product constant: even elems=x, odd elems=y
+   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
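+   // (used with _mm_madd_epi16 in dct_rot: each 32-bit result lane is
+   //  x*even + y*odd of the interleaved 16-bit inputs)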
+
+   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+   // out(1) = c1[even]*x + c1[odd]*y
+   #define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit)
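+   // (implemented as in << 16, by unpacking zeros into the low halves,
+   //  then an arithmetic >> 4: net effect in << 12 with sign extension)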
+   #define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add
+   #define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub
+   #define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+   #define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
+
+   // 8-bit interleave step (for transposes)
+   #define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+   #define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
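+   // one 1-D 8-point IDCT over all eight 16-bit lanes at once; bias and
+   // shift supply the rounding, which differs between column and row pass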
+   #define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
+   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
+   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
+   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
+   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
+   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
+
+   // rounding biases in column/row passes, see stbi__idct_block for explanation.
+   __m128i bias_0 = _mm_set1_epi32(512);
+   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
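+   // (bias_0 rounds the column pass's >>10; bias_1 rounds the row pass's
+   //  >>17 and folds in the +128 level shift applied to the final output)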
+
+   // load
+   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
+   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
+   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
+   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
+   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
+   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
+   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
+   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
+
+   // column pass
+   dct_pass(bias_0, 10);
+
+   {
+      // 16bit 8x8 transpose pass 1
+      dct_interleave16(row0, row4);
+      dct_interleave16(row1, row5);
+      dct_interleave16(row2, row6);
+      dct_interleave16(row3, row7);
+
+      // transpose pass 2
+      dct_interleave16(row0, row2);
+      dct_interleave16(row1, row3);
+      dct_interleave16(row4, row6);
+      dct_interleave16(row5, row7);
+
+      // transpose pass 3
+      dct_interleave16(row0, row1);
+      dct_interleave16(row2, row3);
+      dct_interleave16(row4, row5);
+      dct_interleave16(row6, row7);
+   }
+
+   // row pass
+   dct_pass(bias_1, 17);
+
+   {
+      // pack
+      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+      __m128i p1 = _mm_packus_epi16(row2, row3);
+      __m128i p2 = _mm_packus_epi16(row4, row5);
+      __m128i p3 = _mm_packus_epi16(row6, row7);
+
+      // 8bit 8x8 transpose pass 1
+      dct_interleave8(p0, p2); // a0e0a1e1...
+      dct_interleave8(p1, p3); // c0g0c1g1...
+
+      // transpose pass 2
+      dct_interleave8(p0, p1); // a0c0e0g0...
+      dct_interleave8(p2, p3); // b0d0f0h0...
+
+      // transpose pass 3
+      dct_interleave8(p0, p2); // a0b0c0d0...
+      dct_interleave8(p1, p3); // a4b4c4d4...
+
+      // store
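+      // (_mm_shuffle_epi32 with 0x4e swaps the two 64-bit halves, so the
+      //  upper eight pixels of each register can also go through storel)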
+      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
+   }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
 }
 
-static stbi_idct_8x8 stbi__idct_installed = stbi__idct_block_wrapper;
+#endif // STBI_SSE2
 
+#ifdef STBI_SIMD
 STBIDEF void stbi_install_idct(stbi_idct_8x8 func)
 {
-   stbi__idct_installed = func;
 }
 #endif
 
@@ -1887,6 +2070,12 @@ STBIDEF void stbi_install_YCbCr_to_RGB(stbi_YCbCr_to_RGB_run func)
 static void stbi__setup_jpeg(stbi__jpeg *j)
 {
    j->idct_block_kernel = stbi__idct_block;
+
+#ifdef STBI_SSE2
+   if (stbi__sse2_available()) {
+      j->idct_block_kernel = stbi__idct_sse2;
+   }
+#endif
 }
 
 // clean up the temporary component buffers
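
Kernel selection is transparent to callers: with STBI_SSE2 enabled in the build, a plain stbi_load picks the SSE2 IDCT at runtime whenever the CPU supports it. A minimal usage sketch (the filename is a placeholder):

   #define STB_IMAGE_IMPLEMENTATION
   #include "stb_image.h"

   int main(void)
   {
      int w, h, comp;
      // JPEG decode; stbi__setup_jpeg installs stbi__idct_sse2 if available
      unsigned char *pixels = stbi_load("photo.jpg", &w, &h, &comp, 0);
      if (pixels)
         stbi_image_free(pixels);
      return 0;
   }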