inv_txfm_sse2.h 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
  11. #define VPX_DSP_X86_INV_TXFM_SSE2_H_
  12. #include <emmintrin.h> // SSE2
  13. #include "./vpx_config.h"
  14. #include "vpx/vpx_integer.h"
  15. #include "vpx_dsp/inv_txfm.h"
  16. #include "vpx_dsp/x86/txfm_common_sse2.h"
  17. // perform 8x8 transpose
  18. static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  19. const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  20. const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  21. const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  22. const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  23. const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  24. const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  25. const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  26. const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
  27. const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  28. const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  29. const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  30. const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  31. const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  32. const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  33. const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  34. const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  35. res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  36. res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  37. res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  38. res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  39. res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  40. res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  41. res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  42. res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
  43. }
  // Transposes the low four 16-bit lanes of the rows in0..in3 into two vectors:
  //   out0 = in0[0] in1[0] in2[0] in3[0]  in0[1] in1[1] in2[1] in3[1]
  //   out1 = in0[2] in1[2] in2[2] in3[2]  in0[3] in1[3] in2[3] in3[3]
  // The upper four lanes of every input row are ignored.
  //
  // The results are stored to out0/out1.  Earlier revisions of this macro
  // ignored out0/out1 and wrote only to in0/in1; to stay compatible with
  // callers that rely on that in-place behaviour, in0/in1 are still
  // overwritten with the same two vectors (and are assigned last, so their
  // final values match the old macro even if the arguments alias).
  // All arguments must be modifiable __m128i lvalues.
  #define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)              \
    {                                                                \
      const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);            \
      const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);            \
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);        \
      const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);        \
                                                                     \
      out0 = tr1_0; /* columns 0 and 1 */                            \
      out1 = tr1_1; /* columns 2 and 3 */                            \
      in0 = tr1_0;  /* legacy in-place result */                     \
      in1 = tr1_1;  /* legacy in-place result */                     \
    }
  52. static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
  53. const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  54. const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  55. const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  56. const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  57. const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  58. const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  59. const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  60. const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  61. out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
  62. out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
  63. out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
  64. out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
  65. }
  66. static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
  67. __m128i tbuf[8];
  68. array_transpose_8x8(res0, res0);
  69. array_transpose_8x8(res1, tbuf);
  70. array_transpose_8x8(res0 + 8, res1);
  71. array_transpose_8x8(res1 + 8, res1 + 8);
  72. res0[8] = tbuf[0];
  73. res0[9] = tbuf[1];
  74. res0[10] = tbuf[2];
  75. res0[11] = tbuf[3];
  76. res0[12] = tbuf[4];
  77. res0[13] = tbuf[5];
  78. res0[14] = tbuf[6];
  79. res0[15] = tbuf[7];
  80. }
  81. // Function to allow 8 bit optimisations to be used when profile 0 is used with
  82. // highbitdepth enabled
  83. static INLINE __m128i load_input_data(const tran_low_t *data) {
  84. #if CONFIG_VP9_HIGHBITDEPTH
  85. return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
  86. data[6], data[7]);
  87. #else
  88. return _mm_load_si128((const __m128i *)data);
  89. #endif
  90. }
  91. static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
  92. in[0] = load_input_data(input + 0 * 16);
  93. in[1] = load_input_data(input + 1 * 16);
  94. in[2] = load_input_data(input + 2 * 16);
  95. in[3] = load_input_data(input + 3 * 16);
  96. in[4] = load_input_data(input + 4 * 16);
  97. in[5] = load_input_data(input + 5 * 16);
  98. in[6] = load_input_data(input + 6 * 16);
  99. in[7] = load_input_data(input + 7 * 16);
  100. in[8] = load_input_data(input + 8 * 16);
  101. in[9] = load_input_data(input + 9 * 16);
  102. in[10] = load_input_data(input + 10 * 16);
  103. in[11] = load_input_data(input + 11 * 16);
  104. in[12] = load_input_data(input + 12 * 16);
  105. in[13] = load_input_data(input + 13 * 16);
  106. in[14] = load_input_data(input + 14 * 16);
  107. in[15] = load_input_data(input + 15 * 16);
  108. }
  // Adds eight 16-bit residuals (in_x) to the eight 8-bit pixels at dest and
  // writes the eight sums back to dest, saturated to 0..255 by
  // _mm_packus_epi16.
  //   dest - pointer to at least 8 readable and writable bytes; it is
  //          evaluated exactly once, so an argument with side effects
  //          (e.g. p++) is safe.
  //   in_x - __m128i holding eight signed 16-bit values; evaluated once.
  // Requires an all-zero __m128i named `zero` to be visible at the point of
  // use; it supplies the high bytes when the pixels are widened to 16 bits.
  #define RECON_AND_STORE(dest, in_x)                         \
    {                                                         \
      __m128i *const recon_dst = (__m128i *)(dest);           \
      __m128i d0 = _mm_loadl_epi64(recon_dst);                \
      d0 = _mm_unpacklo_epi8(d0, zero);                       \
      d0 = _mm_add_epi16((in_x), d0);                         \
      d0 = _mm_packus_epi16(d0, d0);                          \
      _mm_storel_epi64(recon_dst, d0);                        \
    }
  117. static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
  118. const __m128i final_rounding = _mm_set1_epi16(1<<5);
  119. const __m128i zero = _mm_setzero_si128();
  120. // Final rounding and shift
  121. in[0] = _mm_adds_epi16(in[0], final_rounding);
  122. in[1] = _mm_adds_epi16(in[1], final_rounding);
  123. in[2] = _mm_adds_epi16(in[2], final_rounding);
  124. in[3] = _mm_adds_epi16(in[3], final_rounding);
  125. in[4] = _mm_adds_epi16(in[4], final_rounding);
  126. in[5] = _mm_adds_epi16(in[5], final_rounding);
  127. in[6] = _mm_adds_epi16(in[6], final_rounding);
  128. in[7] = _mm_adds_epi16(in[7], final_rounding);
  129. in[8] = _mm_adds_epi16(in[8], final_rounding);
  130. in[9] = _mm_adds_epi16(in[9], final_rounding);
  131. in[10] = _mm_adds_epi16(in[10], final_rounding);
  132. in[11] = _mm_adds_epi16(in[11], final_rounding);
  133. in[12] = _mm_adds_epi16(in[12], final_rounding);
  134. in[13] = _mm_adds_epi16(in[13], final_rounding);
  135. in[14] = _mm_adds_epi16(in[14], final_rounding);
  136. in[15] = _mm_adds_epi16(in[15], final_rounding);
  137. in[0] = _mm_srai_epi16(in[0], 6);
  138. in[1] = _mm_srai_epi16(in[1], 6);
  139. in[2] = _mm_srai_epi16(in[2], 6);
  140. in[3] = _mm_srai_epi16(in[3], 6);
  141. in[4] = _mm_srai_epi16(in[4], 6);
  142. in[5] = _mm_srai_epi16(in[5], 6);
  143. in[6] = _mm_srai_epi16(in[6], 6);
  144. in[7] = _mm_srai_epi16(in[7], 6);
  145. in[8] = _mm_srai_epi16(in[8], 6);
  146. in[9] = _mm_srai_epi16(in[9], 6);
  147. in[10] = _mm_srai_epi16(in[10], 6);
  148. in[11] = _mm_srai_epi16(in[11], 6);
  149. in[12] = _mm_srai_epi16(in[12], 6);
  150. in[13] = _mm_srai_epi16(in[13], 6);
  151. in[14] = _mm_srai_epi16(in[14], 6);
  152. in[15] = _mm_srai_epi16(in[15], 6);
  153. RECON_AND_STORE(dest + 0 * stride, in[0]);
  154. RECON_AND_STORE(dest + 1 * stride, in[1]);
  155. RECON_AND_STORE(dest + 2 * stride, in[2]);
  156. RECON_AND_STORE(dest + 3 * stride, in[3]);
  157. RECON_AND_STORE(dest + 4 * stride, in[4]);
  158. RECON_AND_STORE(dest + 5 * stride, in[5]);
  159. RECON_AND_STORE(dest + 6 * stride, in[6]);
  160. RECON_AND_STORE(dest + 7 * stride, in[7]);
  161. RECON_AND_STORE(dest + 8 * stride, in[8]);
  162. RECON_AND_STORE(dest + 9 * stride, in[9]);
  163. RECON_AND_STORE(dest + 10 * stride, in[10]);
  164. RECON_AND_STORE(dest + 11 * stride, in[11]);
  165. RECON_AND_STORE(dest + 12 * stride, in[12]);
  166. RECON_AND_STORE(dest + 13 * stride, in[13]);
  167. RECON_AND_STORE(dest + 14 * stride, in[14]);
  168. RECON_AND_STORE(dest + 15 * stride, in[15]);
  169. }
  // Transform kernels declared here and defined outside this header.
  // NOTE(review): judging by the names these are the 4-, 8- and 16-point
  // inverse DCT / inverse ADST passes, presumably operating in place on the
  // supplied vector arrays (the 16-point variants take two arrays) -- confirm
  // against the definitions.
  void idct4_sse2(__m128i *in);
  void idct8_sse2(__m128i *in);
  void idct16_sse2(__m128i *in0, __m128i *in1);
  void iadst4_sse2(__m128i *in);
  void iadst8_sse2(__m128i *in);
  void iadst16_sse2(__m128i *in0, __m128i *in1);
  176. #endif // VPX_DSP_X86_INV_TXFM_SSE2_H_