/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// immintrin.h.

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_ports/mem.h"
// filters for 16_h8 and 16_v8
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
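
// Each table above is a 256 bit shuffle mask (the same 16 byte pattern in
// both 128 bit lanes) used with _mm256_shuffle_epi8 to gather overlapping
// byte pairs from a row of source pixels: filt1 pairs src[0..7] with
// src[1..8], filt2 pairs src[2..9] with src[3..10], and so on.  Each
// shuffled register then feeds _mm256_maddubs_epi16, which multiplies the
// two bytes of a pair by two adjacent filter taps and adds the products
// into one 16 bit partial sum.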
#if defined(__clang__)
// -- GODOT start --
# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
     (!defined(__MACPORTS__) && defined(__APPLE__) && \
      ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
       (__clang_major__ == 5 && __clang_minor__ == 0)))
// -- GODOT end --
#  define MM256_BROADCASTSI128_SI256(x) \
     _mm_broadcastsi128_si256((__m128i const *)&(x))
# else  // clang > 3.3, and not 5.0 on macosx.
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // clang <= 3.3
#elif defined(__GNUC__)
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#  define MM256_BROADCASTSI128_SI256(x) \
     _mm_broadcastsi128_si256((__m128i const *)&(x))
# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
# else  // gcc > 4.7
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // gcc <= 4.6
#else  // !(gcc || clang)
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
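
// Note: the wrapper above appears to exist because older gcc/clang releases
// spell the broadcast intrinsic differently (or declare it with a pointer
// argument), so MM256_BROADCASTSI128_SI256 selects whichever form the
// compiler provides to copy a 128 bit value into both lanes of a 256 bit
// register.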
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
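
  // For reference, each output pixel computed below is the usual 8-tap
  // convolution (a rough scalar sketch of one row, not part of the build):
  //
  //   for (x = 0; x < 16; ++x) {
  //     int sum = 0, k;
  //     for (k = 0; k < 8; ++k) sum += src_ptr[x - 3 + k] * filter[k];
  //     output_ptr[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, 7));
  //   }
  //
  // The taps sum to 128, so adding 64 (addFilterReg64) before the arithmetic
  // shift right by 7 gives the same round-to-nearest, and packus clamps the
  // result to the 0..255 pixel range.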
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr + src_pixels_per_line - 3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr + src_pixels_per_line + 5)), 1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
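    // Note: the second/third-tap products are folded in as min() first and
    // max() second; ordering the saturating adds this way appears intended
    // to keep an intermediate sum from clamping before the other, possibly
    // negative, half has been added.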
    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift right each 16 bit value by 7 bits
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }
  // if the number of strides is odd,
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));

    // shift right each 16 bit value by 7 bits
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the low 64 bits contain the first
    // 8 output pixels and the high 64 bits contain the next 8
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;
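
  // The vertical filter keeps a sliding window of rows: seven rows are
  // loaded up front, consecutive rows are interleaved with unpacklo/unpackhi
  // so that _mm256_maddubs_epi16 multiplies vertically adjacent pixels by a
  // pair of taps, and each loop iteration below loads two more rows and
  // produces two output rows.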
  // load 16 bytes from 7 consecutive rows, src_pitch apart
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // have each pair of consecutive loads in the same 256 bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                 _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                 _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_max_epi16(srcReg32b8, srcReg32b12));

    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift right each 16 bit value by 7 bits
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));
    output_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;

    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));

    // shift right each 16 bit value by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bit each 16 bits, the low 64 bits contain the first
    // 8 output pixels and the high 64 bits contain the next 8
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
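// FUN_CONV_1D and FUN_CONV_2D come from vpx_dsp/x86/convolve.h; they are
// expected to expand into the vpx_convolve8_horiz_avx2, vpx_convolve8_vert_avx2
// and vpx_convolve8_avx2 wrappers prototyped in the comments here, dispatching
// to the vpx_filter_block1d{4,8,16}_{h,v}{2,8}_avx2 kernels by block width and
// filter length.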
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const int16_t *filter_x, int x_step_q4,
//                         const int16_t *filter_y, int y_step_q4,
//                         int w, int h);
FUN_CONV_2D(, avx2);
#endif  // HAVE_AVX2 && HAVE_SSSE3