/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"

extern const int16_t vpx_rv[];

static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1,
                               const uint8x8_t v0, const uint8x8_t b1,
                               const uint8x8_t b2) {
  const uint8x8_t k1 = vrhadd_u8(a2, a1);
  const uint8x8_t k2 = vrhadd_u8(b2, b1);
  const uint8x8_t k3 = vrhadd_u8(k1, k2);
  return vrhadd_u8(k3, v0);
}
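
// Note: vrhadd_u8 is a rounding halving add, so the cascade above computes
// approximately (a2 + a1 + b1 + b2 + 4 * v0) / 8: the center pixel keeps half
// of the total weight and each neighbor contributes roughly one eighth.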

static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1,
                               const uint8x8_t v0, const uint8x8_t b1,
                               const uint8x8_t b2, const uint8x8_t filter) {
  const uint8x8_t a2_v0 = vabd_u8(a2, v0);
  const uint8x8_t a1_v0 = vabd_u8(a1, v0);
  const uint8x8_t b1_v0 = vabd_u8(b1, v0);
  const uint8x8_t b2_v0 = vabd_u8(b2, v0);

  uint8x8_t max = vmax_u8(a2_v0, a1_v0);
  max = vmax_u8(b1_v0, max);
  max = vmax_u8(b2_v0, max);

  return vclt_u8(max, filter);
}

static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1,
                                 const uint8x8_t v0, const uint8x8_t b1,
                                 const uint8x8_t b2, const uint8x8_t filter) {
  const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2);
  const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter);

  return vbsl_u8(mask, k_out, v0);
}
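
// For reference, a scalar sketch of what generate_output() computes for one
// pixel, derived from the intrinsics above (not copied from the C reference
// implementation):
//   if (|a2 - v0| < f && |a1 - v0| < f && |b1 - v0| < f && |b2 - v0| < f) {
//     k1 = (a2 + a1 + 1) >> 1;
//     k2 = (b2 + b1 + 1) >> 1;
//     k3 = (k1 + k2 + 1) >> 1;
//     v0 = (k3 + v0 + 1) >> 1;
//   }
// a2/a1 are the two pixels before v0, b1/b2 the two after, and f is the
// per-column filter limit.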

// Same functions but for uint8x16_t.
static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1,
                                 const uint8x16_t v0, const uint8x16_t b1,
                                 const uint8x16_t b2) {
  const uint8x16_t k1 = vrhaddq_u8(a2, a1);
  const uint8x16_t k2 = vrhaddq_u8(b2, b1);
  const uint8x16_t k3 = vrhaddq_u8(k1, k2);
  return vrhaddq_u8(k3, v0);
}

static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1,
                                 const uint8x16_t v0, const uint8x16_t b1,
                                 const uint8x16_t b2, const uint8x16_t filter) {
  const uint8x16_t a2_v0 = vabdq_u8(a2, v0);
  const uint8x16_t a1_v0 = vabdq_u8(a1, v0);
  const uint8x16_t b1_v0 = vabdq_u8(b1, v0);
  const uint8x16_t b2_v0 = vabdq_u8(b2, v0);

  uint8x16_t max = vmaxq_u8(a2_v0, a1_v0);
  max = vmaxq_u8(b1_v0, max);
  max = vmaxq_u8(b2_v0, max);

  return vcltq_u8(max, filter);
}

static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1,
                                   const uint8x16_t v0, const uint8x16_t b1,
                                   const uint8x16_t b2,
                                   const uint8x16_t filter) {
  const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2);
  const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter);

  return vbslq_u8(mask, k_out, v0);
}

void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int dst_stride, int cols,
                                               uint8_t *f, int size) {
  uint8_t *src, *dst;
  int row;
  int col;

  // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for
  // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or
  // 8 (for U/V).
  assert((size == 8 || size == 16) && cols % 8 == 0);
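
  // The function runs in two passes: the loops below filter vertically from
  // src_ptr into dst_ptr (the "down" pass), then dst_ptr is rewound and
  // re-filtered horizontally in place with the help of transposes (the
  // "across" pass).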

  // While columns of length 16 can be processed, load them.
  for (col = 0; col < cols - 8; col += 16) {
    uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7;
    src = src_ptr - 2 * src_stride;
    dst = dst_ptr;

    a0 = vld1q_u8(src);
    src += src_stride;
    a1 = vld1q_u8(src);
    src += src_stride;
    a2 = vld1q_u8(src);
    src += src_stride;
    a3 = vld1q_u8(src);
    src += src_stride;

    for (row = 0; row < size; row += 4) {
      uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3;
      const uint8x16_t filterq = vld1q_u8(f + col);

      a4 = vld1q_u8(src);
      src += src_stride;
      a5 = vld1q_u8(src);
      src += src_stride;
      a6 = vld1q_u8(src);
      src += src_stride;
      a7 = vld1q_u8(src);
      src += src_stride;

      v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq);
      v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq);
      v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq);
      v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq);

      vst1q_u8(dst, v_out_0);
      dst += dst_stride;
      vst1q_u8(dst, v_out_1);
      dst += dst_stride;
      vst1q_u8(dst, v_out_2);
      dst += dst_stride;
      vst1q_u8(dst, v_out_3);
      dst += dst_stride;

      // Rotate over to the next slot.
      a0 = a4;
      a1 = a5;
      a2 = a6;
      a3 = a7;
    }

    src_ptr += 16;
    dst_ptr += 16;
  }

  // Clean up any left over column of length 8.
  if (col != cols) {
    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
    src = src_ptr - 2 * src_stride;
    dst = dst_ptr;

    a0 = vld1_u8(src);
    src += src_stride;
    a1 = vld1_u8(src);
    src += src_stride;
    a2 = vld1_u8(src);
    src += src_stride;
    a3 = vld1_u8(src);
    src += src_stride;

    for (row = 0; row < size; row += 4) {
      uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3;
      const uint8x8_t filter = vld1_u8(f + col);

      a4 = vld1_u8(src);
      src += src_stride;
      a5 = vld1_u8(src);
      src += src_stride;
      a6 = vld1_u8(src);
      src += src_stride;
      a7 = vld1_u8(src);
      src += src_stride;

      v_out_0 = generate_output(a0, a1, a2, a3, a4, filter);
      v_out_1 = generate_output(a1, a2, a3, a4, a5, filter);
      v_out_2 = generate_output(a2, a3, a4, a5, a6, filter);
      v_out_3 = generate_output(a3, a4, a5, a6, a7, filter);

      vst1_u8(dst, v_out_0);
      dst += dst_stride;
      vst1_u8(dst, v_out_1);
      dst += dst_stride;
      vst1_u8(dst, v_out_2);
      dst += dst_stride;
      vst1_u8(dst, v_out_3);
      dst += dst_stride;

      // Rotate over to the next slot.
      a0 = a4;
      a1 = a5;
      a2 = a6;
      a3 = a7;
    }

    // Not strictly necessary but makes resetting dst_ptr easier.
    dst_ptr += 8;
  }

  dst_ptr -= cols;

  for (row = 0; row < size; row += 8) {
    uint8x8_t a0, a1, a2, a3;
    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;

    src = dst_ptr;
    dst = dst_ptr;

    // Load 8 values, transpose 4 of them, and discard 2 because they will be
    // reloaded later.
    load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3);
    a3 = a1;
    a2 = a1 = a0;  // Extend left border.

    src += 2;

    for (col = 0; col < cols; col += 8) {
      uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6,
          v_out_7;
      // Although the filter is meant to be applied vertically and is instead
      // being applied horizontally here it's OK because it's set in blocks of
      // 8 (or 16).
      const uint8x8_t filter = vld1_u8(f + col);

      load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5,
                                &b6, &b7);

      if (col + 8 == cols) {
        // Last block of columns: extend the right border by duplicating b5.
        b6 = b7 = b5;
      }

      v_out_0 = generate_output(a0, a1, a2, a3, b0, filter);
      v_out_1 = generate_output(a1, a2, a3, b0, b1, filter);
      v_out_2 = generate_output(a2, a3, b0, b1, b2, filter);
      v_out_3 = generate_output(a3, b0, b1, b2, b3, filter);
      v_out_4 = generate_output(b0, b1, b2, b3, b4, filter);
      v_out_5 = generate_output(b1, b2, b3, b4, b5, filter);
      v_out_6 = generate_output(b2, b3, b4, b5, b6, filter);
      v_out_7 = generate_output(b3, b4, b5, b6, b7, filter);

      transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2,
                                 v_out_3, v_out_4, v_out_5, v_out_6, v_out_7);

      a0 = b4;
      a1 = b5;
      a2 = b6;
      a3 = b7;

      src += 8;
      dst += 8;
    }

    dst_ptr += 8 * dst_stride;
  }
}

// sum += x;
// sumsq += x * y;
static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy,
                                 int16x4_t *const sum, int32x4_t *const sumsq) {
  const int16x4_t zero = vdup_n_s16(0);
  const int32x4_t zeroq = vdupq_n_s32(0);

  // Add in the first set because vext doesn't work with '0'.
  *sum = vadd_s16(*sum, x);
  *sumsq = vaddq_s32(*sumsq, xy);

  // Shift x and xy to the right and sum. vext requires an immediate.
  *sum = vadd_s16(*sum, vext_s16(zero, x, 1));
  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1));

  *sum = vadd_s16(*sum, vext_s16(zero, x, 2));
  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2));

  *sum = vadd_s16(*sum, vext_s16(zero, x, 3));
  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3));
}
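
// After the vext steps above, lane i of *sum has been increased by
// x[0] + ... + x[i] (a per-lane prefix sum), and lane i of *sumsq by
// xy[0] + ... + xy[i], so four consecutive sliding-window sums are updated in
// a single call.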

// Generate mask based on (sumsq * 15 - sum * sum < flimit)
static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq,
                                 const int32x4_t f, const int32x4_t fifteen) {
  const int32x4_t a = vmulq_s32(sumsq, fifteen);
  const int32x4_t b = vmlsl_s16(a, sum, sum);
  const uint32x4_t mask32 = vcltq_s32(b, f);
  return vmovn_u32(mask32);
}
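
// For a window of N = 15 pixels, N * sumsq - sum * sum is N^2 times the
// (biased) variance of the window, so this is a variance test: the filter is
// only applied where the local variance is below flimit / 225.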

static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high,
                              const int32x4_t sumsq_low,
                              const int32x4_t sumsq_high, const int32x4_t f) {
  const int32x4_t fifteen = vdupq_n_s32(15);
  const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen);
  const uint16x4_t mask16_high =
      calculate_mask(sum_high, sumsq_high, f, fifteen);
  return vmovn_u16(vcombine_u16(mask16_low, mask16_high));
}

// Apply filter of (8 + sum + s[c]) >> 4.
static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) {
  const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
  const int16x8_t sum_s = vaddq_s16(sum, s16);

  return vqrshrun_n_s16(sum_s, 4);
}
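
// vqrshrun_n_s16(x, 4) rounds (adds 8), shifts right by 4 and saturates to
// [0, 255], which is where the "+ 8" in the filter comment comes from.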

void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
                                    int flimit) {
  int row, col;
  const int32x4_t f = vdupq_n_s32(flimit);

  assert(cols % 8 == 0);

  for (row = 0; row < rows; ++row) {
    // Sum the first 8 elements, which are extended from s[0].
    // sumsq gets primed with +16.
    int sumsq = src[0] * src[0] * 9 + 16;
    int sum = src[0] * 9;

    uint8x8_t left_context, s, right_context;
    int16x4_t sum_low, sum_high;
    int32x4_t sumsq_low, sumsq_high;

    // Sum (+square) the next 6 elements.
    // Skip [0] because it's included above.
    for (col = 1; col <= 6; ++col) {
      sumsq += src[col] * src[col];
      sum += src[col];
    }

    // Prime the sums. Later the loop uses the _high values to prime the new
    // vectors.
    sumsq_high = vdupq_n_s32(sumsq);
    sum_high = vdup_n_s16(sum);

    // Manually extend the left border.
    left_context = vdup_n_u8(src[0]);
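    // At this point sum / sumsq describe the 15-tap window centered one pixel
    // to the left of col 0: nine border-extended copies of src[0] plus
    // src[1..6].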

    for (col = 0; col < cols; col += 8) {
      uint8x8_t mask, output;
      int16x8_t x, y;
      int32x4_t xy_low, xy_high;

      s = vld1_u8(src + col);

      if (col + 8 == cols) {
        // End of the row: extend the right border.
        right_context = vdup_n_u8(src[col + 7]);
      } else {
        right_context = vld1_u8(src + col + 7);
      }
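
      // x * y gives right^2 - left^2 for each new window position, since
      // (r - l) * (r + l) == r^2 - l^2. Adding x to sum and x * y to sumsq
      // slides the 15-tap window across the next 8 output pixels.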
      x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));
      y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));
      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));

      // Catch up to the last sum'd value.
      sum_low = vdup_lane_s16(sum_high, 3);
      sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);

      accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low);

      // Need to do this sequentially because sum_high is primed with the last
      // lane of sum_low.
      sum_high = vdup_lane_s16(sum_low, 3);
      sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);

      accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high);

      mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f);

      output = filter_pixels(vcombine_s16(sum_low, sum_high), s);
      output = vbsl_u8(mask, output, s);
      vst1_u8(src + col, output);

      left_context = s;
    }

    src += pitch;
  }
}

// Apply filter of (vpx_rv + sum + s[c]) >> 4.
static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s,
                                  const int16x8_t rv) {
  const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
  const int16x8_t sum_s = vaddq_s16(sum, s16);
  const int16x8_t rounded = vaddq_s16(sum_s, rv);

  return vqshrun_n_s16(rounded, 4);
}
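
// The non-rounding vqshrun_n_s16 is used here because the rounding term comes
// from the vpx_rv table of per-position rounding offsets rather than a fixed
// + 8.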

void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols,
                               int flimit) {
  int row, col, i;
  const int32x4_t f = vdupq_n_s32(flimit);
  uint8x8_t below_context = vdup_n_u8(0);

  // 8 columns are processed at a time.
  // If rows is less than 8 the bottom border extension fails.
  assert(cols % 8 == 0);
  assert(rows >= 8);

  // Load and keep the first 8 values in memory. Process a vertical stripe that
  // is 8 wide.
  for (col = 0; col < cols; col += 8) {
    uint8x8_t s, above_context[8];
    int16x8_t sum, sum_tmp;
    int32x4_t sumsq_low, sumsq_high;

    // Load and extend the top border.
    s = vld1_u8(dst);
    for (i = 0; i < 8; i++) {
      above_context[i] = s;
    }

    sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s));

    // sum * 9
    sum = vmulq_n_s16(sum_tmp, 9);

    // (sum * 9) * sum == sum * sum * 9
    sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp));
    sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp));

    // Load and discard the next 6 values to prime sum and sumsq.
    for (i = 1; i <= 6; ++i) {
      const uint8x8_t a = vld1_u8(dst + i * pitch);
      const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a));
      sum = vaddq_s16(sum, b);

      sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b));
      sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b));
    }
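
    // sum / sumsq now cover a 15-row window with the top border replicated;
    // the loop below slides it down one row at a time using the same
    // (below - above) * (below + above) == below^2 - above^2 identity as the
    // horizontal filter.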
    for (row = 0; row < rows; ++row) {
      uint8x8_t mask, output;
      int16x8_t x, y;
      int32x4_t xy_low, xy_high;

      s = vld1_u8(dst + row * pitch);

      // Reload below_context while in range; past the end it keeps its last
      // value, which extends the bottom border.
      if (row + 7 < rows) {
        below_context = vld1_u8(dst + (row + 7) * pitch);
      }

      x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0]));
      y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0]));
      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));

      sum = vaddq_s16(sum, x);

      sumsq_low = vaddq_s32(sumsq_low, xy_low);
      sumsq_high = vaddq_s32(sumsq_high, xy_high);

      mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low,
                          sumsq_high, f);

      output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127)));
      output = vbsl_u8(mask, output, s);
      vst1_u8(dst + row * pitch, output);

      // Rotate over to the next slot.
      above_context[0] = above_context[1];
      above_context[1] = above_context[2];
      above_context[2] = above_context[3];
      above_context[3] = above_context[4];
      above_context[4] = above_context[5];
      above_context[5] = above_context[6];
      above_context[6] = above_context[7];
      above_context[7] = s;
    }

    dst += 8;
  }
}