/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
// -----------------------------------------------------------------------------
// Copy and average

void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int width, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 16) {  // width = 32
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 8) {  // width = 16
    __m256i p0, p1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      p1 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (width > 4) {  // width = 8
    __m128i p0, p1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      _mm_storeu_si128((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // width = 4
    __m128i p0, p1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  }
}
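
// Note: unaligned load/store intrinsics (loadu/storeu) are used throughout
// this file since neither the pixel buffers nor the strides passed by the
// callers are guaranteed to be 32-byte aligned.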
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int width, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 16) {  // width = 32
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 8) {  // width = 16
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else if (width > 4) {  // width = 8
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadu_si128((const __m128i *)dst);
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {  // width = 4
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadl_epi64((const __m128i *)dst);
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}
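
// Note: _mm256_avg_epu16() computes a rounded average per lane,
//   avg = (a + b + 1) >> 1,
// which matches ROUND_POWER_OF_TWO(a + b, 1) as used by the C reference
// implementation of the averaging convolve functions.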
// -----------------------------------------------------------------------------
// Horizontal and vertical filtering

#define CONV8_ROUNDING_BITS (7)

static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6, 7, 6, 7, 8, 9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6, 7, 8, 9, 8, 9, 10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
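
// Note on the tables above: the byte patterns gather overlapping pairs of
// adjacent 16-bit pixels for _mm256_madd_epi16(). For instance, the byte
// indices { 0, 1, 2, 3, 2, 3, 4, 5, ... } of signal_pattern_0 select the
// pixel pairs (x0, x1), (x1, x2), (x2, x3), (x3, x4), i.e. the sliding window
// multiplied by filter taps 0 and 1. signal_index feeds
// _mm256_permutevar8x32_epi32() to pre-shift the source by four pixels so the
// same byte patterns can reach taps further along the window.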
// -----------------------------------------------------------------------------
// Horizontal Filtering

static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}

// Note:
//  Shared by 8x2 and 16x1 block

static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}

static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}

static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&s0, &s1, x);
}

static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&s0, &s1, x);
}

// Note:
//  Shared by horizontal and vertical filtering

static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}
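
// Note: after pack_filters(), f[k] holds the 16-bit tap pair
// (filter[2 * k], filter[2 * k + 1]) broadcast to every 32-bit lane, matching
// the pixel pairs produced by pack_pixels() for _mm256_madd_epi16().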
static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
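
// Note: min + max equals a0 + a1, so the result above is exactly
//   y = ROUND_POWER_OF_TWO(sum of all eight products, CONV8_ROUNDING_BITS).
// Splitting the addition into min-then-max presumably orders the partial sums
// so the intermediate 32-bit accumulator cannot overflow with 12-bit input.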
static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y);
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm256_storeu_si256((__m256i *)dst, a);
}
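
// Note: packus saturates the 32-bit sums to [0, 65535] and the min against
// *mask then clamps to [0, (1 << bd) - 1]. The signed 16-bit min is safe here
// on the assumption that the rounded filter output never exceeds 15 bits for
// the bit depths (8/10/12) this path serves.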
static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
// -----------------------------------------------------------------------------
// 2-tap horizontal filtering

static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}
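
// Note: 0x09080706 selects bytes 6..9 of the kernel, i.e. the 16-bit tap pair
// (filter[3], filter[4]). These are the only nonzero taps of the bilinear
// kernels that reach the 2-tap path, broadcast to every 32-bit lane.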
// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(); the
// difference is whether s0/s1 hold the first and second rows, or the first 16
// samples and the same 16 samples shifted by 8.
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}

static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}

// Used by the 8-wide and 16-wide 2-tap filtering paths below.
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
                                       __m256i *y0, __m256i *y1) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  x1 = _mm256_add_epi32(x1, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
                                        __m256i *y0) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
}
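
// Note: per output pixel the 2-tap path computes
//   y = (src[0] * filter[3] + src[1] * filter[4] + 64) >> 7,
// where src points at the current pixel; clamping to [0, (1 << bd) - 1] is
// left to the store helpers.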
static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
// -----------------------------------------------------------------------------
// Vertical Filtering

static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}

static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}

static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig, f, y0);
  filter_8x1_pixels(&sig[4], f, y1);
}

static INLINE void update_pixels(__m256i *sig) {
  int i;
  for (i = 0; i < 3; ++i) {
    sig[i] = sig[i + 1];
    sig[i + 4] = sig[i + 5];
  }
}
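
// Note: the vertical 8-tap kernels keep a sliding window of nine input rows
// in sig[]: producing two output rows per iteration needs rows r .. r + 8,
// and update_pixels() shifts the packed row pairs down so only the two newest
// rows have to be loaded each iteration.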
static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high
  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);
  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);
  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);
  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}
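
// Note: for the 16-wide vertical path the packed rows are split across
// sig[0..7] (columns 0-7) and sig[8..15] (columns 8-15), with sig[16] caching
// the most recently loaded row for the next iteration.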
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);
  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);
  sig[16] = s8;
}

static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}

static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  __m256i p = _mm256_min_epi16(*y0, *mask);
  _mm256_storeu_si256((__m256i *)dst, p);
  p = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static void update_16x9_pixels(__m256i *sig) {
  update_pixels(&sig[0]);
  update_pixels(&sig[8]);
}

static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
// -----------------------------------------------------------------------------
// 2-tap vertical filtering

static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}

static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  // load the next row
  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}

static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m128i p = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(h, p);
}

static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}

static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  // load the next row
  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], u);
  sig[1] = _mm_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
                                      __m128i *y0, __m128i *y1) {
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
  x0 = _mm_add_epi32(x0, rounding);
  x1 = _mm_add_epi32(x1, rounding);
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  res = _mm_min_epi16(res, *mask);
  _mm_storeu_si128((__m128i *)dst, res);
}

static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
// Calculation with the result averaged against the existing dst pixels
// (used by the *_avg variants below).

static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
                                        uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y0);
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                        const __m256i *mask, uint16_t *dst,
                                        ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
  const __m256i pix =
      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm256_storeu_si256((__m256i *)dst, a);
}

static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst,
                                         ptrdiff_t pitch) {
  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
  __m256i p = _mm256_min_epi16(*y0, *mask);
  p = _mm256_avg_epu16(p, pix0);
  _mm256_storeu_si256((__m256i *)dst, p);
  p = _mm256_min_epi16(*y1, *mask);
  p = _mm256_avg_epu16(p, pix1);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
                                               const __m128i *y1,
                                               const __m128i *mask,
                                               uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, *mask);
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}
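
// Note: the *_avg stores mirror the plain stores above, but fold in
//   dst[i] = (clamp(filtered) + dst[i] + 1) >> 1,
// matching the semantics of the vpx_highbd_convolve8_avg* functions.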
static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d8_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);
  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);
    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d16_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);
  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d16_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
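
// Note: 4-wide blocks reuse the SSE2 kernels; at this width a 256-bit
// register cannot be filled with useful work, so an AVX2 version would
// presumably bring no gain.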
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
HIGH_FUN_CONV_2D(, avx2);
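
// Note: the HIGH_FUN_CONV_1D/2D macros come from vpx_dsp/x86/convolve.h; they
// expand to the public vpx_highbd_convolve8_horiz/vert/2D entry points that
// dispatch to the vpx_highbd_filter_block1d* kernels defined above.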
void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
  vpx_highbd_filter_block1d4_h2_avg_sse2
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
                 avx2);
HIGH_FUN_CONV_2D(avg_, avx2);

#undef HIGHBD_FUNC