idct_neon.h

/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_ARM_IDCT_NEON_H_
#define VPX_DSP_ARM_IDCT_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"

static const int16_t kCospi[16] = {
  16384 /* cospi_0_64 */,  15137 /* cospi_8_64 */,
  11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
  16069 /* cospi_4_64 */,  13623 /* cospi_12_64 */,
  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
  16305 /* cospi_2_64 */,  1606 /* cospi_30_64 */,
  14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
  15679 /* cospi_6_64 */,  -4756 /* -cospi_26_64 */,
  12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};

static const int32_t kCospi32[16] = {
  16384 /* cospi_0_64 */,  15137 /* cospi_8_64 */,
  11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
  16069 /* cospi_4_64 */,  13623 /* cospi_12_64 */,
  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
  16305 /* cospi_2_64 */,  1606 /* cospi_30_64 */,
  14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
  15679 /* cospi_6_64 */,  -4756 /* -cospi_26_64 */,
  12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};

//------------------------------------------------------------------------------
// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
#if CONFIG_VP9_HIGHBITDEPTH
  return vqaddq_s16(a, b);
#else
  return vaddq_s16(a, b);
#endif
}

static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) {
#if CONFIG_VP9_HIGHBITDEPTH
  return vqsubq_s16(a, b);
#else
  return vsubq_s16(a, b);
#endif
}

//------------------------------------------------------------------------------
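// Element-wise add/subtract of 8 lanes of 32-bit data held as a pair of
// int32x4_t registers (the layout the high bit-depth idct paths use for one
// 8-wide row).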
static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0,
                                               const int32x4x2_t s1) {
  int32x4x2_t t;
  t.val[0] = vaddq_s32(s0.val[0], s1.val[0]);
  t.val[1] = vaddq_s32(s0.val[1], s1.val[1]);
  return t;
}

static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0,
                                               const int32x4x2_t s1) {
  int32x4x2_t t;
  t.val[0] = vsubq_s32(s0.val[0], s1.val[0]);
  t.val[1] = vsubq_s32(s0.val[1], s1.val[1]);
  return t;
}

//------------------------------------------------------------------------------

// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                      const int16_t a_const) {
  // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well-formed
  // streams. See WRAPLOW and dct_const_round_shift for details.
  // This instruction doubles the result and returns the high half, essentially
  // resulting in a right shift by 15. By doubling the constant first, that
  // becomes a right shift by DCT_CONST_BITS.
  // The largest possible value used here is
  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
  // within the range of int16_t (+32767 / -32768) even when negated.
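  // Illustrative check (not part of the build, hypothetical values): with
  // a = 100 and a_const = cospi_16_64 = 11585, dct_const_round_shift() yields
  // (100 * 11585 + (1 << 13)) >> 14 = 71, and vqrdmulhq_n_s16(a, 2 * 11585)
  // computes the same 71 per lane via (2 * 100 * 23170 + (1 << 15)) >> 16.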
  return vqrdmulhq_n_s16(a, a_const * 2);
}

// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
  // In both add_ and its pair, sub_, the input for well-formed streams will be
  // well within 16 bits (input to the idct is the difference between two
  // frames and will be within -255 to 255, or 9 bits). However, for inputs
  // over about 25,000 (valid for int16_t, but not for idct input) this
  // function cannot use vaddq_s16.
  // In order to match existing behavior and intentionally out of range tests,
  // expand the addition up to 32 bits to prevent truncation.
  int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
  int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
  temp_low = vmulq_n_s32(temp_low, ab_const);
  temp_high = vmulq_n_s32(temp_high, ab_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}

// Subtract b from a, then multiply by ab_const. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
  int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
  int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
  temp_low = vmulq_n_s32(temp_low, ab_const);
  temp_high = vmulq_n_s32(temp_high, ab_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}

// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
    const int16x8_t a, const int16_t a_const, const int16x8_t b,
    const int16_t b_const) {
  int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const);
  int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
  temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
  temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}

//------------------------------------------------------------------------------

// Note: The following 4 functions could use 32-bit operations for bit-depth
// 10. However, although it's 20% faster with gcc, it's 20% slower with clang.
// Use 64-bit operations for now.

// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
static INLINE int32x4x2_t
multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) {
  int64x2_t b[4];
  int32x4x2_t c;
  b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
  b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
  b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
  b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
  c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS),
                          vrshrn_n_s64(b[1], DCT_CONST_BITS));
  c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS),
                          vrshrn_n_s64(b[3], DCT_CONST_BITS));
  return c;
}

// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
  const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]);
  const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]);
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const);
  c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const);
  c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const);
  c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Subtract b from a, then multiply by ab_const. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
  const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]);
  const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]);
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const);
  c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const);
  c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const);
  c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b,
    const int32_t b_const) {
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
  c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
  c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
  c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
  c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const);
  c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const);
  c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const);
  c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Shift the output down by 6 and add it to the destination buffer.
static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
                                        const int16x8_t a2, const int16x8_t a3,
                                        const int16x8_t a4, const int16x8_t a5,
                                        const int16x8_t a6, const int16x8_t a7,
                                        uint8_t *b, const int b_stride) {
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;

  b0 = vld1_u8(b);
  b += b_stride;
  b1 = vld1_u8(b);
  b += b_stride;
  b2 = vld1_u8(b);
  b += b_stride;
  b3 = vld1_u8(b);
  b += b_stride;
  b4 = vld1_u8(b);
  b += b_stride;
  b5 = vld1_u8(b);
  b += b_stride;
  b6 = vld1_u8(b);
  b += b_stride;
  b7 = vld1_u8(b);
  b -= (7 * b_stride);

  // c = b + (a >> 6)
  c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6);
  c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6);
  c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6);
  c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6);
  c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6);
  c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6);
  c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6);
  c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6);

  b0 = vqmovun_s16(c0);
  b1 = vqmovun_s16(c1);
  b2 = vqmovun_s16(c2);
  b3 = vqmovun_s16(c3);
  b4 = vqmovun_s16(c4);
  b5 = vqmovun_s16(c5);
  b6 = vqmovun_s16(c6);
  b7 = vqmovun_s16(c7);

  vst1_u8(b, b0);
  b += b_stride;
  vst1_u8(b, b1);
  b += b_stride;
  vst1_u8(b, b2);
  b += b_stride;
  vst1_u8(b, b3);
  b += b_stride;
  vst1_u8(b, b4);
  b += b_stride;
  vst1_u8(b, b5);
  b += b_stride;
  vst1_u8(b, b6);
  b += b_stride;
  vst1_u8(b, b7);
}

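// Duplicate the clipped dc value into all 16 lanes of a uint8x16_t; the
// dc-only idct shortcuts add this vector to every row of the destination
// block.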
static INLINE uint8x16_t create_dcq(const int16_t dc) {
  // Clip to [0, 255] on both sides; gcc may compile this to a 'usat'
  // instruction.
  const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
  return vdupq_n_u8((uint8_t)t);
}

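// One dimension of the 4x4 idct, with two rows packed per 8-lane register:
// transpose, rotate the even/odd halves by cospi_16_64 and the
// cospi_8_64/cospi_24_64 pair, round-shift by DCT_CONST_BITS, then apply the
// final butterfly.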
static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
                                         int16x8_t *const a0,
                                         int16x8_t *const a1) {
  int16x4_t b0, b1, b2, b3;
  int32x4_t c0, c1, c2, c3;
  int16x8_t d0, d1;

  transpose_s16_4x4q(a0, a1);
  b0 = vget_low_s16(*a0);
  b1 = vget_high_s16(*a0);
  b2 = vget_low_s16(*a1);
  b3 = vget_high_s16(*a1);
  c0 = vmull_lane_s16(b0, cospis, 2);
  c2 = vmull_lane_s16(b1, cospis, 2);
  c1 = vsubq_s32(c0, c2);
  c0 = vaddq_s32(c0, c2);
  c2 = vmull_lane_s16(b2, cospis, 3);
  c3 = vmull_lane_s16(b2, cospis, 1);
  c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
  c3 = vmlal_lane_s16(c3, b3, cospis, 3);
  b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
  b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
  b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
  b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
  d0 = vcombine_s16(b0, b1);
  d1 = vcombine_s16(b3, b2);
  *a0 = vaddq_s16(d0, d1);
  *a1 = vsubq_s16(d0, d1);
}

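// First pass of the 8x8 idct for the sparse case where only the top-left 4x4
// coefficients are nonzero: the butterflies run on 4-lane vectors and the
// single-operand constant multiplies are folded into vqrdmulh_lane_s16.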
static INLINE void idct8x8_12_pass1_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
    int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2,
    int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5,
    int16x4_t *const io6, int16x4_t *const io7) {
  int16x4_t step1[8], step2[8];
  int32x4_t t32[2];

  transpose_s16_4x4d(io0, io1, io2, io3);

  // stage 1
  step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3);
  step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2);
  step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1);
  step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0);

  // stage 2
  step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2);
  step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3);
  step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1);
  step2[4] = vadd_s16(step1[4], step1[5]);
  step2[5] = vsub_s16(step1[4], step1[5]);
  step2[6] = vsub_s16(step1[7], step1[6]);
  step2[7] = vadd_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vadd_s16(step2[1], step2[3]);
  step1[1] = vadd_s16(step2[1], step2[2]);
  step1[2] = vsub_s16(step2[1], step2[2]);
  step1[3] = vsub_s16(step2[1], step2[3]);
  t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
  step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);

  // stage 4
  *io0 = vadd_s16(step1[0], step2[7]);
  *io1 = vadd_s16(step1[1], step1[6]);
  *io2 = vadd_s16(step1[2], step1[5]);
  *io3 = vadd_s16(step1[3], step2[4]);
  *io4 = vsub_s16(step1[3], step2[4]);
  *io5 = vsub_s16(step1[2], step1[5]);
  *io6 = vsub_s16(step1[1], step1[6]);
  *io7 = vsub_s16(step1[0], step2[7]);
}

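// Second pass of the sparse 8x8 idct: the same butterflies as pass 1, widened
// to 8-lane vectors, producing the eight final output rows.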
static INLINE void idct8x8_12_pass2_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
    const int16x4_t input0, const int16x4_t input1, const int16x4_t input2,
    const int16x4_t input3, const int16x4_t input4, const int16x4_t input5,
    const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0,
    int16x8_t *const output1, int16x8_t *const output2,
    int16x8_t *const output3, int16x8_t *const output4,
    int16x8_t *const output5, int16x8_t *const output6,
    int16x8_t *const output7) {
  int16x8_t in[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6,
                    input7, &in[0], &in[1], &in[2], &in[3]);

  // stage 1
  step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
  step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2);
  step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1);
  step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0);

  // stage 2
  step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2);
  step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3);
  step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[1], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[1], step2[3]);
  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *output0 = vaddq_s16(step1[0], step2[7]);
  *output1 = vaddq_s16(step1[1], step1[6]);
  *output2 = vaddq_s16(step1[2], step1[5]);
  *output3 = vaddq_s16(step1[3], step2[4]);
  *output4 = vsubq_s16(step1[3], step2[4]);
  *output5 = vsubq_s16(step1[2], step1[5]);
  *output6 = vsubq_s16(step1[1], step1[6]);
  *output7 = vsubq_s16(step1[0], step2[7]);
}

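// One full dimension of the dense 8x8 idct (all 64 input coefficients may be
// nonzero); the same routine serves both the row and the column pass.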
static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
                                     const int16x4_t cospis1,
                                     int16x8_t *const io0, int16x8_t *const io1,
                                     int16x8_t *const io2, int16x8_t *const io3,
                                     int16x8_t *const io4, int16x8_t *const io5,
                                     int16x8_t *const io6,
                                     int16x8_t *const io7) {
  int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
      input_7l, input_7h;
  int16x4_t step1l[4], step1h[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  input_1l = vget_low_s16(*io1);
  input_1h = vget_high_s16(*io1);
  input_3l = vget_low_s16(*io3);
  input_3h = vget_high_s16(*io3);
  input_5l = vget_low_s16(*io5);
  input_5h = vget_high_s16(*io5);
  input_7l = vget_low_s16(*io7);
  input_7h = vget_high_s16(*io7);
  step1l[0] = vget_low_s16(*io0);
  step1h[0] = vget_high_s16(*io0);
  step1l[1] = vget_low_s16(*io2);
  step1h[1] = vget_high_s16(*io2);
  step1l[2] = vget_low_s16(*io4);
  step1h[2] = vget_high_s16(*io4);
  step1l[3] = vget_low_s16(*io6);
  step1h[3] = vget_high_s16(*io6);
  t32[0] = vmull_lane_s16(input_1l, cospis1, 3);
  t32[1] = vmull_lane_s16(input_1h, cospis1, 3);
  t32[2] = vmull_lane_s16(input_3l, cospis1, 2);
  t32[3] = vmull_lane_s16(input_3h, cospis1, 2);
  t32[4] = vmull_lane_s16(input_3l, cospis1, 1);
  t32[5] = vmull_lane_s16(input_3h, cospis1, 1);
  t32[6] = vmull_lane_s16(input_1l, cospis1, 0);
  t32[7] = vmull_lane_s16(input_1h, cospis1, 0);
  t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0);
  t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0);
  t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1);
  t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1);
  t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2);
  t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
  t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
  t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
  step1[4] = vcombine_s16(t16[0], t16[1]);
  step1[5] = vcombine_s16(t16[2], t16[3]);
  step1[6] = vcombine_s16(t16[4], t16[5]);
  step1[7] = vcombine_s16(t16[6], t16[7]);

  // stage 2
  t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
  t32[3] = vmull_lane_s16(step1h[0], cospis0, 2);
  t32[4] = vmull_lane_s16(step1l[1], cospis0, 3);
  t32[5] = vmull_lane_s16(step1h[1], cospis0, 3);
  t32[6] = vmull_lane_s16(step1l[1], cospis0, 1);
  t32[7] = vmull_lane_s16(step1h[1], cospis0, 1);
  t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1);
  t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
  t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
  t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
  step2[0] = vcombine_s16(t16[0], t16[1]);
  step2[1] = vcombine_s16(t16[2], t16[3]);
  step2[2] = vcombine_s16(t16[4], t16[5]);
  step2[3] = vcombine_s16(t16[6], t16[7]);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *io0 = vaddq_s16(step1[0], step2[7]);
  *io1 = vaddq_s16(step1[1], step1[6]);
  *io2 = vaddq_s16(step1[2], step1[5]);
  *io3 = vaddq_s16(step1[3], step2[4]);
  *io4 = vsubq_s16(step1[3], step2[4]);
  *io5 = vsubq_s16(step1[2], step1[5]);
  *io6 = vsubq_s16(step1[1], step1[6]);
  *io7 = vsubq_s16(step1[0], step2[7]);
}

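// Round-shift four 32-bit accumulators by DCT_CONST_BITS and repack the
// results into two 8-lane 16-bit vectors.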
static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
                                              int16x8_t *const d0,
                                              int16x8_t *const d1) {
  int16x4_t t16[4];
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  *d0 = vcombine_s16(t16[0], t16[1]);
  *d1 = vcombine_s16(t16[2], t16[3]);
}

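// Butterfly rotation by the (cospi_8_64, cospi_24_64) pair on two 8-lane rows:
// t32[0..1] hold s0 * cospi_24_64 - s1 * cospi_8_64 and t32[2..3] hold
// s1 * cospi_24_64 + s0 * cospi_8_64, still at 32-bit precision.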
static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
                                            const int16x8_t s1,
                                            const int16x4_t cospi_0_8_16_24,
                                            int32x4_t *const t32) {
  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
}

static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
                                     const int16x4_t cospi_0_8_16_24,
                                     int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
                                         const int16x4_t cospi_0_8_16_24,
                                         int16x8_t *const d0,
                                         int16x8_t *const d1) {
  int32x4_t t32[4];

  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
  t32[2] = vnegq_s32(t32[2]);
  t32[3] = vnegq_s32(t32[3]);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
                                      const int16x4_t cospi_0_8_16_24,
                                      int16x8_t *const d0,
                                      int16x8_t *const d1) {
  int32x4_t t32[6];

  t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
  t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
  t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
  t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
  t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
  t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

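// The idct_cospi_<a>_<b> helpers below repeat the same rotate-and-round
// pattern for the remaining constant pairs of the 16x16 idct; the lane index
// selects the constant from the packed cospi vector named by the parameter.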
static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_2_30_10_22,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_4_12_20N_28,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_6_26N_14_18N,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_2_30_10_22,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_4_12_20N_28,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_6_26N_14_18N,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct16x16_add_stage7(const int16x8_t *const step2,
                                        int16x8_t *const out) {
#if CONFIG_VP9_HIGHBITDEPTH
  // Use saturating add/sub to avoid overflow in 2nd pass
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
#else
  out[0] = vaddq_s16(step2[0], step2[15]);
  out[1] = vaddq_s16(step2[1], step2[14]);
  out[2] = vaddq_s16(step2[2], step2[13]);
  out[3] = vaddq_s16(step2[3], step2[12]);
  out[4] = vaddq_s16(step2[4], step2[11]);
  out[5] = vaddq_s16(step2[5], step2[10]);
  out[6] = vaddq_s16(step2[6], step2[9]);
  out[7] = vaddq_s16(step2[7], step2[8]);
  out[8] = vsubq_s16(step2[7], step2[8]);
  out[9] = vsubq_s16(step2[6], step2[9]);
  out[10] = vsubq_s16(step2[5], step2[10]);
  out[11] = vsubq_s16(step2[4], step2[11]);
  out[12] = vsubq_s16(step2[3], step2[12]);
  out[13] = vsubq_s16(step2[2], step2[13]);
  out[14] = vsubq_s16(step2[1], step2[14]);
  out[15] = vsubq_s16(step2[0], step2[15]);
#endif
}

static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
                                         int16_t *output) {
  // Save the result into output
  vst1q_s16(output, out[0]);
  output += 16;
  vst1q_s16(output, out[1]);
  output += 16;
  vst1q_s16(output, out[2]);
  output += 16;
  vst1q_s16(output, out[3]);
  output += 16;
  vst1q_s16(output, out[4]);
  output += 16;
  vst1q_s16(output, out[5]);
  output += 16;
  vst1q_s16(output, out[6]);
  output += 16;
  vst1q_s16(output, out[7]);
  output += 16;
  vst1q_s16(output, out[8]);
  output += 16;
  vst1q_s16(output, out[9]);
  output += 16;
  vst1q_s16(output, out[10]);
  output += 16;
  vst1q_s16(output, out[11]);
  output += 16;
  vst1q_s16(output, out[12]);
  output += 16;
  vst1q_s16(output, out[13]);
  output += 16;
  vst1q_s16(output, out[14]);
  output += 16;
  vst1q_s16(output, out[15]);
}

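// Round-shift one 8-wide row by 6, add it to the 8-bit destination row with
// unsigned saturation, and advance the destination pointer by stride.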
static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
                                    const int stride) {
  uint8x8_t d = vld1_u8(*dest);
  uint16x8_t q;

  res = vrshrq_n_s16(res, 6);
  q = vaddw_u8(vreinterpretq_u16_s16(res), d);
  d = vqmovun_s16(vreinterpretq_s16_u16(q));
  vst1_u8(*dest, d);
  *dest += stride;
}

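// High bit-depth variant: the row has already been round-shifted by the
// caller, so saturating-add it to the destination, clamp to the caller-supplied
// max ((1 << bd) - 1) and clip negatives to zero via vqshluq_n_s16.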
static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max,
                                           uint16_t **dest, const int stride) {
  uint16x8_t d = vld1q_u16(*dest);

  res = vqaddq_s16(res, vreinterpretq_s16_u16(d));
  res = vminq_s16(res, max);
  d = vqshluq_n_s16(res, 0);
  vst1q_u16(*dest, d);
  *dest += stride;
}

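// Variant for 8-bit content decoded through the high bit-depth path: the
// destination samples fit in 8 bits, so round-shift by 6, add, saturate with
// vqmovun_s16 and widen back to 16 bits for storage.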
static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest,
                                               const int stride) {
  uint16x8_t d = vld1q_u16(*dest);

  res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6);
  d = vmovl_u8(vqmovun_s16(res));
  vst1q_u16(*dest, d);
  *dest += stride;
}

static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a,
                                            uint16_t *out, const int b_stride) {
  highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride);
}

static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
                                              uint16_t *dest, const int stride,
                                              const int bd) {
  // Add the result to dest
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  int16x8_t o[16];

  o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6),
                      vrshrn_n_s32(out[0].val[1], 6));
  o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6),
                      vrshrn_n_s32(out[1].val[1], 6));
  o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6),
                      vrshrn_n_s32(out[2].val[1], 6));
  o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6),
                      vrshrn_n_s32(out[3].val[1], 6));
  o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6),
                      vrshrn_n_s32(out[4].val[1], 6));
  o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6),
                      vrshrn_n_s32(out[5].val[1], 6));
  o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6),
                      vrshrn_n_s32(out[6].val[1], 6));
  o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6),
                      vrshrn_n_s32(out[7].val[1], 6));
  o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6),
                      vrshrn_n_s32(out[8].val[1], 6));
  o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6),
                      vrshrn_n_s32(out[9].val[1], 6));
  o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6),
                       vrshrn_n_s32(out[10].val[1], 6));
  o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6),
                       vrshrn_n_s32(out[11].val[1], 6));
  o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6),
                       vrshrn_n_s32(out[12].val[1], 6));
  o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6),
                       vrshrn_n_s32(out[13].val[1], 6));
  o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6),
                       vrshrn_n_s32(out[14].val[1], 6));
  o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6),
                       vrshrn_n_s32(out[15].val[1], 6));
  highbd_idct16x16_add8x1(o[0], max, &dest, stride);
  highbd_idct16x16_add8x1(o[1], max, &dest, stride);
  highbd_idct16x16_add8x1(o[2], max, &dest, stride);
  highbd_idct16x16_add8x1(o[3], max, &dest, stride);
  highbd_idct16x16_add8x1(o[4], max, &dest, stride);
  highbd_idct16x16_add8x1(o[5], max, &dest, stride);
  highbd_idct16x16_add8x1(o[6], max, &dest, stride);
  highbd_idct16x16_add8x1(o[7], max, &dest, stride);
  highbd_idct16x16_add8x1(o[8], max, &dest, stride);
  highbd_idct16x16_add8x1(o[9], max, &dest, stride);
  highbd_idct16x16_add8x1(o[10], max, &dest, stride);
  highbd_idct16x16_add8x1(o[11], max, &dest, stride);
  highbd_idct16x16_add8x1(o[12], max, &dest, stride);
  highbd_idct16x16_add8x1(o[13], max, &dest, stride);
  highbd_idct16x16_add8x1(o[14], max, &dest, stride);
  highbd_idct16x16_add8x1(o[15], max, &dest, stride);
}

void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
                                  void *const dest, const int stride,
                                  const int highbd_flag);

void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
                                 void *const dest, const int stride,
                                 const int highbd_flag);

void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                       int16_t *output);

void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
                                       int16_t *const output, void *const dest,
                                       const int stride, const int highbd_flag);

void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
                        const int stride, const int highbd_flag);

void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output);

void vpx_idct32_16_neon(const int16_t *const input, void *const output,
                        const int stride, const int highbd_flag);

void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output);

void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
                       const int highbd_flag);

#endif  // VPX_DSP_ARM_IDCT_NEON_H_