10 năm trước cách đây · a32d73dc3b
--- a/stb_image.h
+++ b/stb_image.h
@@ -1631,6 +1631,214 @@ static void stbi__idct_sse2(stbi_uc *out, int out_stride, short data[64])
 
				 
			
 
				 #endif // STBI_SSE2
			
 
				 
			
 
				+#ifdef STBI_NEON
			
 
				+
			
 
				+// NEON integer IDCT. should produce bit-identical
			
 
				+// results to the generic C version.
			
 
				+static void stbi__idct_neon(stbi_uc *out, int out_stride, short data[64])
			
 
				+{
			
 
				+   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
			
 
				+
			
 
				+   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
			
 
				+   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
			
 
				+   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
			
 
				+   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
			
 
				+   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
			
 
				+   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
			
 
				+   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
			
 
				+   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
			
 
				+   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
			
 
				+   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
			
 
				+   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
			
 
				+   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
			
 
				+
			
 
				+#define dct_long_mul(out, inq, coeff) \
			
 
				+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
			
 
				+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
			
 
				+
			
 
				+#define dct_long_mac(out, acc, inq, coeff) \
			
 
				+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
			
 
				+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
			
 
				+
			
 
				+#define dct_widen(out, inq) \
			
 
				+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
			
 
				+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
			
 
				+
			
 
				+// wide add
			
 
				+#define dct_wadd(out, a, b) \
			
 
				+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
			
 
				+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
			
 
				+
			
 
				+// wide sub
			
 
				+#define dct_wsub(out, a, b) \
			
 
				+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
			
 
				+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
			
 
				+
			
 
				+// butterfly a/b, then shift using "shiftop" by "s" and pack
			
 
				+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
			
 
				+   { \
			
 
				+      dct_wadd(sum, a, b); \
			
 
				+      dct_wsub(dif, a, b); \
			
 
				+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
			
 
				+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
			
 
				+   }
			
 
				+
			
 
				+#define dct_pass(shiftop, shift) \
			
 
				+   { \
			
 
				+      /* even part */ \
			
 
				+      int16x8_t sum26 = vaddq_s16(row2, row6); \
			
 
				+      dct_long_mul(p1e, sum26, rot0_0); \
			
 
				+      dct_long_mac(t2e, p1e, row6, rot0_1); \
			
 
				+      dct_long_mac(t3e, p1e, row2, rot0_2); \
			
 
				+      int16x8_t sum04 = vaddq_s16(row0, row4); \
			
 
				+      int16x8_t dif04 = vsubq_s16(row0, row4); \
			
 
				+      dct_widen(t0e, sum04); \
			
 
				+      dct_widen(t1e, dif04); \
			
 
				+      dct_wadd(x0, t0e, t3e); \
			
 
				+      dct_wsub(x3, t0e, t3e); \
			
 
				+      dct_wadd(x1, t1e, t2e); \
			
 
				+      dct_wsub(x2, t1e, t2e); \
			
 
				+      /* odd part */ \
			
 
				+      int16x8_t sum15 = vaddq_s16(row1, row5); \
			
 
				+      int16x8_t sum17 = vaddq_s16(row1, row7); \
			
 
				+      int16x8_t sum35 = vaddq_s16(row3, row5); \
			
 
				+      int16x8_t sum37 = vaddq_s16(row3, row7); \
			
 
				+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
			
 
				+      dct_long_mul(p5o, sumodd, rot1_0); \
			
 
				+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
			
 
				+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
			
 
				+      dct_long_mul(p3o, sum37, rot2_0); \
			
 
				+      dct_long_mul(p4o, sum15, rot2_1); \
			
 
				+      dct_wadd(sump13o, p1o, p3o); \
			
 
				+      dct_wadd(sump24o, p2o, p4o); \
			
 
				+      dct_wadd(sump23o, p2o, p3o); \
			
 
				+      dct_wadd(sump14o, p1o, p4o); \
			
 
				+      dct_long_mac(x4, sump13o, row7, rot3_0); \
			
 
				+      dct_long_mac(x5, sump24o, row5, rot3_1); \
			
 
				+      dct_long_mac(x6, sump23o, row3, rot3_2); \
			
 
				+      dct_long_mac(x7, sump14o, row1, rot3_3); \
			
 
				+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
			
 
				+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
			
 
				+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
			
 
				+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
			
 
				+   }
			
 
				+
			
 
				+   // load
			
 
				+   row0 = vld1q_s16(data + 0*8);
			
 
				+   row1 = vld1q_s16(data + 1*8);
			
 
				+   row2 = vld1q_s16(data + 2*8);
			
 
				+   row3 = vld1q_s16(data + 3*8);
			
 
				+   row4 = vld1q_s16(data + 4*8);
			
 
				+   row5 = vld1q_s16(data + 5*8);
			
 
				+   row6 = vld1q_s16(data + 6*8);
			
 
				+   row7 = vld1q_s16(data + 7*8);
			
 
				+
			
 
				+   // add DC bias
			
 
				+   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
			
 
				+
			
 
				+   // column pass
			
 
				+   dct_pass(vrshrn_n_s32, 10);
			
 
				+
			
 
				+   // 16bit 8x8 transpose
			
 
				+   {
			
 
				+// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
			
 
				+// whether compilers actually get this is another story, sadly.
			
 
				+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
			
 
				+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
			
 
				+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
			
 
				+
			
 
				+      // pass 1
			
 
				+      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
			
 
				+      dct_trn16(row2, row3);
			
 
				+      dct_trn16(row4, row5); 
			
 
				+      dct_trn16(row6, row7);
			
 
				+
			
 
				+      // pass 2
			
 
				+      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
			
 
				+      dct_trn32(row1, row3);
			
 
				+      dct_trn32(row4, row6);
			
 
				+      dct_trn32(row5, row7);
			
 
				+
			
 
				+      // pass 3
			
 
				+      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
			
 
				+      dct_trn64(row1, row5);
			
 
				+      dct_trn64(row2, row6);
			
 
				+      dct_trn64(row3, row7);
			
 
				+
			
 
				+#undef dct_trn16
			
 
				+#undef dct_trn32
			
 
				+#undef dct_trn64
			
 
				+   }
			
 
				+
			
 
				+   // row pass
			
 
				+   // vrshrn_n_s32 only supports shifts up to 16, we need
			
 
				+   // 17. so do a non-rounding shift of 16 first then follow
			
 
				+   // up with a rounding shift by 1.
			
 
				+   dct_pass(vshrn_n_s32, 16);
			
 
				+
			
 
				+   {
			
 
				+      // pack and round
			
 
				+      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
			
 
				+      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
			
 
				+      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
			
 
				+      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
			
 
				+      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
			
 
				+      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
			
 
				+      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
			
 
				+      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
			
 
				+
			
 
				+      // again, these can translate into one instruction, but often don't.
			
 
				+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
			
 
				+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
			
 
				+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
			
 
				+
			
 
				+      // sadly can't use interleaved stores here since we only write
			
 
				+      // 8 bytes to each scan line!
			
 
				+
			
 
				+      // 8x8 8-bit transpose pass 1
			
 
				+      dct_trn8_8(p0, p1);
			
 
				+      dct_trn8_8(p2, p3);
			
 
				+      dct_trn8_8(p4, p5);
			
 
				+      dct_trn8_8(p6, p7);
			
 
				+
			
 
				+      // pass 2
			
 
				+      dct_trn8_16(p0, p2);
			
 
				+      dct_trn8_16(p1, p3);
			
 
				+      dct_trn8_16(p4, p6);
			
 
				+      dct_trn8_16(p5, p7);
			
 
				+
			
 
				+      // pass 3
			
 
				+      dct_trn8_32(p0, p4);
			
 
				+      dct_trn8_32(p1, p5);
			
 
				+      dct_trn8_32(p2, p6);
			
 
				+      dct_trn8_32(p3, p7);
			
 
				+
			
 
				+      // store
			
 
				+      vst1_u8(out, p0); out += out_stride;
			
 
				+      vst1_u8(out, p1); out += out_stride;
			
 
				+      vst1_u8(out, p2); out += out_stride;
			
 
				+      vst1_u8(out, p3); out += out_stride;
			
 
				+      vst1_u8(out, p4); out += out_stride;
			
 
				+      vst1_u8(out, p5); out += out_stride;
			
 
				+      vst1_u8(out, p6); out += out_stride;
			
 
				+      vst1_u8(out, p7);
			
 
				+
			
 
				+#undef dct_trn8_8
			
 
				+#undef dct_trn8_16
			
 
				+#undef dct_trn8_32
			
 
				+   }
			
 
				+
			
 
				+#undef dct_long_mul
			
 
				+#undef dct_long_mac
			
 
				+#undef dct_widen
			
 
				+#undef dct_wadd
			
 
				+#undef dct_wsub
			
 
				+#undef dct_bfly32o
			
 
				+#undef dct_pass
			
 
				+}
			
 
				+
			
 
				+#endif // STBI_NEON
			
 
				+
			
 
				 #define STBI__MARKER_none  0xff
			
 
				 // if there's a pending marker from the entropy stream, return that
			
 
				 // otherwise, fetch from the stream and get a marker. if there's no
			
@@ -2321,6 +2529,7 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 
				 #endif
			
 
				 
			
 
				 #ifdef STBI_NEON
			
 
				+   j->idct_block_kernel = stbi__idct_neon;
			
 
				    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
			
 
				 #endif
			
 
				 }