avg_intrin_sse2.c
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"
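
// Compute the minimum and maximum absolute difference between the 8x8 block
// at s (stride p) and the 8x8 block at d (stride dp). Each row is widened to
// 16 bits, abs(diff) is taken as max(diff, -diff), and the per-lane results
// are folded down with shift-and-max / shift-and-min reductions.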
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}
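
// Average of the 8x8 block of 8-bit pixels at s (stride p): the rows are
// accumulated with saturating 16-bit adds, summed horizontally, and rounded
// with (sum + 32) >> 6.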
unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}
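
// Average of the 4x4 block at s (stride p): same approach as the 8x8
// version, with only the first four pixels of each row contributing and a
// final rounding of (sum + 8) >> 4.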
unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}
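
// One stage of an 8-point Hadamard butterfly applied to eight vectors of
// eight 16-bit values. With iter == 0 the 8x8 result is also transposed via
// the unpack sequence so the second call can process the other dimension;
// with iter == 1 the results are written back directly in the output order
// used by the callers below.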
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}
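
// 8x8 Hadamard transform of a block of 16-bit residuals: load eight rows,
// run the butterfly over one dimension (which also transposes), run it again
// over the other, and store the 64 coefficients.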
void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           tran_low_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  store_tran_low(src[0], coeff);
  coeff += 8;
  store_tran_low(src[1], coeff);
  coeff += 8;
  store_tran_low(src[2], coeff);
  coeff += 8;
  store_tran_low(src[3], coeff);
  coeff += 8;
  store_tran_low(src[4], coeff);
  coeff += 8;
  store_tran_low(src[5], coeff);
  coeff += 8;
  store_tran_low(src[6], coeff);
  coeff += 8;
  store_tran_low(src[7], coeff);
}
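
// 16x16 Hadamard: apply the 8x8 transform to each quadrant, then combine the
// four 64-coefficient blocks with one more butterfly stage, halving the
// intermediate sums (>> 1) to keep them within 16 bits.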
void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             tran_low_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = load_tran_low(coeff);
    __m128i coeff1 = load_tran_low(coeff + 64);
    __m128i coeff2 = load_tran_low(coeff + 128);
    __m128i coeff3 = load_tran_low(coeff + 192);

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 64);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 128);
    store_tran_low(coeff3, coeff + 192);

    coeff += 8;
  }
}
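
// Sum of absolute values of `length` transform coefficients, accumulated in
// 32 bits and reduced horizontally at the end.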
int vpx_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}
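
// Row projection: sum each of the 16 pixel columns over `height` rows and
// store the 16 column sums to hbuf, shifted right by an amount that depends
// on the block height.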
void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}
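
// Column projection: sum `width` consecutive pixels of ref, 16 at a time,
// using SAD against zero and return the total.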
int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_load_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_load_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}
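
// Variance measure of the difference between two vectors of 4 << bwl 16-bit
// values: accumulates the sum and the sum of squares of the differences and
// returns sse - (sum * sum) / n, with n = 4 << bwl.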
int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = _mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}