quantize_avx.c

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
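
// Quantizes n_coeffs coefficients (n_coeffs is expected to be a multiple of
// 16), 16 at a time: writes the quantized values to qcoeff_ptr, the
// dequantized values to dqcoeff_ptr, and the end-of-block position (index of
// the last nonzero coefficient in scan order, plus one) to eob_ptr.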
void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block, const int16_t *zbin_ptr,
                        const int16_t *round_ptr, const int16_t *quant_ptr,
                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                        uint16_t *eob_ptr, const int16_t *scan_ptr,
                        const int16_t *iscan_ptr) {
  const __m128i zero = _mm_setzero_si128();
  const __m256i big_zero = _mm256_setzero_si256();
  int index;

  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i qtmp0, qtmp1;
  __m128i zero_coeff0, zero_coeff1, iscan0, iscan1;
  __m128i eob = zero, eob0, eob1;

  (void)scan_ptr;
  (void)skip_block;
  assert(!skip_block);

  *eob_ptr = 0;

  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
  // it is a strict "greater" comparison.
  zbin = _mm_sub_epi16(zbin, _mm_set1_epi16(1));
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
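
  // Each quantization pass below computes, in 16-bit fixed point:
  //   tmp = saturate(abs(coeff) + round)
  //   abs_qcoeff = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16
  // using saturating adds and high 16-bit multiplies, then restores the
  // sign of the input coefficient.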

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
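  // _mm_test_all_zeros() (PTEST) returns 1 when no lane exceeded zbin, i.e.
  // this whole group of 16 coefficients quantizes to zero.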
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH

    if (n_coeffs == 16) return;

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    qcoeff0 = _mm_adds_epi16(qcoeff0, round);
    round = _mm_unpackhi_epi64(round, round);
    qcoeff1 = _mm_adds_epi16(qcoeff1, round);

    qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
    quant = _mm_unpackhi_epi64(quant, quant);
    qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

    qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
    qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

    qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
    shift = _mm_unpackhi_epi64(shift, shift);
    qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

    // Reinsert signs
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);

    coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

    store_tran_low(coeff0, dqcoeff_ptr);
    store_tran_low(coeff1, dqcoeff_ptr + 8);

    // Scan for eob.
    zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
    iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
    // Add one to convert from indices to counts
    iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
    iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
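    // (cmp_mask lanes are 0 or -1, so subtracting the mask adds exactly 1.)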
    eob = _mm_andnot_si128(zero_coeff0, iscan0);
    eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
    eob = _mm_max_epi16(eob, eob1);
  }

  // AC only loop.
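  // The DC coefficient and first 15 AC coefficients were handled above, so
  // every remaining group of 16 uses the AC (upper-half) parameters.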
  for (index = 16; index < n_coeffs; index += 16) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    qcoeff0 = _mm_abs_epi16(coeff0);
    qcoeff1 = _mm_abs_epi16(coeff1);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_test_all_zeros(all_zero, all_zero)) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
      continue;
    }

    qcoeff0 = _mm_adds_epi16(qcoeff0, round);
    qcoeff1 = _mm_adds_epi16(qcoeff1, round);

    qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
    qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

    qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
    qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

    qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
    qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr + index);
    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

    coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
    coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

    store_tran_low(coeff0, dqcoeff_ptr + index);
    store_tran_low(coeff1, dqcoeff_ptr + index + 8);

    zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index));
    iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8));
    iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
    iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
    eob0 = _mm_andnot_si128(zero_coeff0, iscan0);
    eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
    eob0 = _mm_max_epi16(eob0, eob1);
    eob = _mm_max_epi16(eob, eob0);
  }

  // Accumulate eob.
  {
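    // Horizontal maximum across the eight 16-bit lanes: fold 8 -> 4 -> 2 -> 1.
    // After the final max the overall maximum sits in lane 1, which is
    // extracted below.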
    __m128i eob_shuffled;
    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
    eob = _mm_max_epi16(eob, eob_shuffled);
    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
    eob = _mm_max_epi16(eob, eob_shuffled);
    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
    eob = _mm_max_epi16(eob, eob_shuffled);
    *eob_ptr = _mm_extract_epi16(eob, 1);
  }
}
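
// 32x32 variant, matching vpx_quantize_b_32x32_c: zbin and round are halved
// (with rounding), quant_shift is doubled so the final shift is effectively
// by 15 bits rather than 16, and the dequantized output is halved.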
void vpx_quantize_b_32x32_avx(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m256i big_zero = _mm256_setzero_si256();
  int index;

  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i qtmp0, qtmp1;
  __m128i zero_coeff0, zero_coeff1, iscan0, iscan1;
  __m128i eob = zero, eob0, eob1;

  (void)scan_ptr;
  (void)n_coeffs;
  (void)skip_block;
  assert(!skip_block);

  *eob_ptr = 0;

  // Setup global values.
  // The 32x32 halves zbin and round.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  // Shift with rounding.
  zbin = _mm_add_epi16(zbin, one);
  zbin = _mm_srli_epi16(zbin, 1);
  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
  // it is a strict "greater" comparison.
  zbin = _mm_sub_epi16(zbin, one);

  round = _mm_load_si128((const __m128i *)round_ptr);
  round = _mm_add_epi16(round, one);
  round = _mm_srli_epi16(round, 1);

  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
  shift = _mm_slli_epi16(shift, 1);
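  // Doubling quant_shift while keeping the ">> 16" high multiply below is
  // the same as the C version's single shift by 15.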

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    qcoeff0 = _mm_adds_epi16(qcoeff0, round);
    round = _mm_unpackhi_epi64(round, round);
    qcoeff1 = _mm_adds_epi16(qcoeff1, round);

    qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
    quant = _mm_unpackhi_epi64(quant, quant);
    qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

    qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
    qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

    qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
    shift = _mm_unpackhi_epi64(shift, shift);
    qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

    // Reinsert signs
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);

    // Un-sign to bias rounding like C.
    // dequant is almost always negative, so this is probably the backwards
    // way to handle the sign. However, it matches the previous assembly.
    coeff0 = _mm_abs_epi16(qcoeff0);
    coeff1 = _mm_abs_epi16(qcoeff1);

    coeff0 = _mm_mullo_epi16(coeff0, dequant);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    coeff1 = _mm_mullo_epi16(coeff1, dequant);

    // "Divide" by 2.
    coeff0 = _mm_srli_epi16(coeff0, 1);
    coeff1 = _mm_srli_epi16(coeff1, 1);
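    // Shifting the absolute value truncates toward zero, matching C's
    // (qcoeff * dequant) / 2 once the sign is restored below.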
    coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
    coeff1 = _mm_sign_epi16(coeff1, qcoeff1);

    store_tran_low(coeff0, dqcoeff_ptr);
    store_tran_low(coeff1, dqcoeff_ptr + 8);

    // Scan for eob.
    zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
    iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
    // Add one to convert from indices to counts
    iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
    iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
    eob = _mm_andnot_si128(zero_coeff0, iscan0);
    eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
    eob = _mm_max_epi16(eob, eob1);
  }

  // AC only loop.
  for (index = 16; index < 32 * 32; index += 16) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    qcoeff0 = _mm_abs_epi16(coeff0);
    qcoeff1 = _mm_abs_epi16(coeff1);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_test_all_zeros(all_zero, all_zero)) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
      continue;
    }

    qcoeff0 = _mm_adds_epi16(qcoeff0, round);
    qcoeff1 = _mm_adds_epi16(qcoeff1, round);

    qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
    qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

    qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
    qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

    qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
    qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr + index);
    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

    coeff0 = _mm_abs_epi16(qcoeff0);
    coeff1 = _mm_abs_epi16(qcoeff1);

    coeff0 = _mm_mullo_epi16(coeff0, dequant);
    coeff1 = _mm_mullo_epi16(coeff1, dequant);

    coeff0 = _mm_srli_epi16(coeff0, 1);
    coeff1 = _mm_srli_epi16(coeff1, 1);

    coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
    coeff1 = _mm_sign_epi16(coeff1, qcoeff1);

    store_tran_low(coeff0, dqcoeff_ptr + index);
    store_tran_low(coeff1, dqcoeff_ptr + index + 8);

    zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index));
    iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8));
    iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
    iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
    eob0 = _mm_andnot_si128(zero_coeff0, iscan0);
    eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
    eob0 = _mm_max_epi16(eob0, eob1);
    eob = _mm_max_epi16(eob, eob0);
  }

  // Accumulate eob.
  {
    __m128i eob_shuffled;
    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
    eob = _mm_max_epi16(eob, eob_shuffled);
    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
    eob = _mm_max_epi16(eob, eob_shuffled);
    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
    eob = _mm_max_epi16(eob, eob_shuffled);
    *eob_ptr = _mm_extract_epi16(eob, 1);
  }
}