intrapred_neon.c

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

//------------------------------------------------------------------------------
// DC 4x4

// 'do_above' and 'do_left' facilitate branch removal when inlined.
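//
// DC prediction fills the whole block with a single value: the rounded
// average of whichever boundary samples are available (the row above, the
// column to the left, both, or neither, in which case 128 is used). A scalar
// sketch of what the intrinsics below compute for the both-edges 4x4 case
// (illustrative only, not part of the build):
//
//   dc = (above[0] + ... + above[3] + left[0] + ... + left[3] + 4) >> 3;
//   for (r = 0; r < 4; ++r) memset(dst + r * stride, dc, 4);
//
// The larger DC variants below follow the same pattern; only the number of
// samples summed and the rounding shift (log2 of that count) change.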
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_top = vcombine_u16(p1, p1);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);  // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_left = vcombine_u16(p1, p1);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 3);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 2);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 2);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 4; ++i) {
      vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
    }
  }
}

void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_4x4(dst, stride, above, left, 1, 1);
}

void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_4x4(dst, stride, NULL, left, 0, 1);
}

void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_4x4(dst, stride, above, NULL, 1, 0);
}

void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_4x4(dst, stride, NULL, NULL, 0, 0);
}
//------------------------------------------------------------------------------
// DC 8x8

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);  // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
}

void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_8x8(dst, stride, above, left, 1, 1);
}

void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_8x8(dst, stride, NULL, left, 0, 1);
}

void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_8x8(dst, stride, above, NULL, 1, 0);
}

void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_8x8(dst, stride, NULL, NULL, 0, 0);
}
//------------------------------------------------------------------------------
// DC 16x16

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left column
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}

void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_16x16(dst, stride, above, left, 1, 1);
}

void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_16x16(dst, stride, NULL, left, 0, 1);
}

void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_16x16(dst, stride, above, NULL, 1, 0);
}

void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_16x16(dst, stride, NULL, NULL, 0, 0);
}

//------------------------------------------------------------------------------
// DC 32x32

// 'do_above' and 'do_left' facilitate branch removal when inlined.
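// Each 32-sample edge is loaded as two 16-byte vectors whose pairwise sums
// are folded down to a single lane, mirroring the 16x16 case; only the final
// rounding shifts differ.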
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left column
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}

void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_32x32(dst, stride, above, left, 1, 1);
}

void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_32x32(dst, stride, NULL, left, 0, 1);
}

void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_32x32(dst, stride, above, NULL, 1, 0);
}

void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_32x32(dst, stride, NULL, NULL, 0, 0);
}

// -----------------------------------------------------------------------------
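// D45 prediction extrapolates along the 45-degree (up-right) diagonal using
// only the 'above' row. Each output pixel is the rounded 3-tap filter of
// consecutive above samples,
//   (a + 2 * b + c + 2) >> 2,
// which vrhadd(vhadd(a, c), b) computes exactly for 8-bit inputs. Each
// successive row reuses the previous one shifted left by one pixel, with the
// top-right sample repeated to fill the tail.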
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  (void)left;
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
  dst[3 * stride + 3] = above[7];
}
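// The 8x8 variant does the shift with table lookups: the shuffle indices step
// each lane forward by one while clamping at the last lane, so the vacated
// tail positions are filled with the top-right sample, above[7].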
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
  const uint8x8_t A0 = vld1_u8(above);  // top row
  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
  const uint8x8_t avg1 = vhadd_u8(A0, A2);
  uint8x8_t row = vrhadd_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 7; ++i) {
    vst1_u8(dst + i * stride, row);
    row = vtbl1_u8(row, sh_12345677);
  }
  vst1_u8(dst + i * stride, row);
}
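// The 16x16 variant does the same shift with VEXT against a vector of
// replicated above[15] values, so the row tail stays pinned to the top-right
// sample.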
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t A0 = vld1q_u8(above);  // top row
  const uint8x16_t above_right = vld1q_dup_u8(above + 15);
  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
  uint8x16_t row = vrhaddq_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 15; ++i) {
    vst1q_u8(dst + i * stride, row);
    row = vextq_u8(row, above_right, 1);
  }
  vst1q_u8(dst + i * stride, row);
}

// -----------------------------------------------------------------------------
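// D135 prediction extrapolates along the 135-degree (down-right) diagonal and
// uses the left column, the top-left corner and the above row. The 4x4 code
// below assembles the boundary into one byte vector,
//   { left[3], left[2], left[1], left[0], above[-1], above[0], above[1], above[2] },
// applies the same (a + 2 * b + c + 2) >> 2 filter along it (with above[3]
// patched into the last tap), and writes four overlapping 4-byte windows of
// the filtered result as the rows: the bottom row takes the lowest window,
// and each higher row slides one byte further along.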
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}

#if !HAVE_NEON_ASM
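// The straight-copy and TrueMotion predictors below are intrinsics versions
// that are compiled only when the standalone NEON assembly implementations of
// the same functions are not in use.
//
// Vertical (V) prediction copies the 'above' row into every row of the block.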
void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint32x2_t d0u32 = vdup_n_u32(0);
  (void)left;

  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
  for (i = 0; i < 4; i++, dst += stride)
    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
}

void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x8_t d0u8 = vdup_n_u8(0);
  (void)left;

  d0u8 = vld1_u8(above);
  for (i = 0; i < 8; i++, dst += stride)
    vst1_u8(dst, d0u8);
}

void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  for (i = 0; i < 16; i++, dst += stride)
    vst1q_u8(dst, q0u8);
}

void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  q1u8 = vld1q_u8(above + 16);
  for (i = 0; i < 32; i++, dst += stride) {
    vst1q_u8(dst, q0u8);
    vst1q_u8(dst + 16, q1u8);
  }
}
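// Horizontal (H) prediction fills row r with left[r]: the left column is
// loaded once and each of its lanes is broadcast across a full output row.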
void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d1u32 = vdup_n_u32(0);
  (void)above;

  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}

void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint64x1_t d1u64 = vdup_n_u64(0);
  (void)above;

  d1u64 = vld1_u64((const uint64_t *)left);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
  vst1_u8(dst, d0u8);
}

void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  q1u8 = vld1q_u8(left);
  d2u8 = vget_low_u8(q1u8);
  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
    q0u8 = vdupq_lane_u8(d2u8, 0);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 1);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 2);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 3);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 4);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 5);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 6);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 7);
    vst1q_u8(dst, q0u8);
    dst += stride;
  }
}

void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  for (k = 0; k < 2; k++, left += 16) {
    q1u8 = vld1q_u8(left);
    d2u8 = vget_low_u8(q1u8);
    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
      q0u8 = vdupq_lane_u8(d2u8, 0);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 1);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 2);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 3);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 4);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 5);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 6);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 7);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
    }
  }
}
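// TrueMotion (TM) prediction: each output pixel is
//   pred[r][c] = clip_to_uint8(left[r] + above[c] - above[-1]).
// A scalar sketch of the 4x4 case (illustrative only; clip_to_uint8() here
// just means clamping to [0, 255]):
//
//   for (r = 0; r < 4; ++r)
//     for (c = 0; c < 4; ++c)
//       dst[r * stride + c] = clip_to_uint8(left[r] + above[c] - above[-1]);
//
// The NEON code precomputes (above[c] - above[-1]) as 16-bit lanes, adds the
// broadcast left[r], and narrows with saturation (vqmovun_s16) to perform the
// clamp.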
void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int i;
  uint16x8_t q1u16, q3u16;
  int16x8_t q1s16;
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d2u32 = vdup_n_u32(0);

  d0u8 = vld1_dup_u8(above - 1);
  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
  for (i = 0; i < 4; i++, dst += stride) {
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
                      vreinterpretq_s16_u16(q3u16));
    d0u8 = vqmovun_s16(q1s16);
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
}

void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int j;
  uint16x8_t q0u16, q3u16, q10u16;
  int16x8_t q0s16;
  uint16x4_t d20u16;
  uint8x8_t d0u8, d2u8, d30u8;

  d0u8 = vld1_dup_u8(above - 1);
  d30u8 = vld1_u8(left);
  d2u8 = vld1_u8(above);
  q10u16 = vmovl_u8(d30u8);
  q3u16 = vsubl_u8(d2u8, d0u8);
  d20u16 = vget_low_u16(q10u16);
  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
    q0u16 = vdupq_lane_u16(d20u16, 0);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 1);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 2);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 3);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
  }
}

void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
  uint8x16_t q0u8, q1u8;
  int16x8_t q0s16, q1s16, q8s16, q11s16;
  uint16x4_t d20u16;
  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;

  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  for (k = 0; k < 2; k++, left += 8) {
    d18u8 = vld1_u8(left);
    q10u16 = vmovl_u8(d18u8);
    d20u16 = vget_low_u16(q10u16);
    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
      q0u16 = vdupq_lane_u16(d20u16, 0);
      q8u16 = vdupq_lane_u16(d20u16, 1);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d20u16, 2);
      q8u16 = vdupq_lane_u16(d20u16, 3);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += stride;
    }
  }
}

void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
  uint8x16_t q0u8, q1u8, q2u8;
  int16x8_t q12s16, q13s16, q14s16, q15s16;
  uint16x4_t d6u16;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;

  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u8 = vld1q_u8(above + 16);
  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
  for (k = 0; k < 4; k++, left += 8) {
    d26u8 = vld1_u8(left);
    q3u16 = vmovl_u8(d26u8);
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      q0u16 = vdupq_lane_u16(d6u16, 0);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 1);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 2);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 3);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;
    }
  }
}
#endif  // !HAVE_NEON_ASM