idct32x32_add_neon.c

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
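
/* 32x32 inverse DCT, 1024-coefficient case, ported from the NEON assembly
 * version.  INLINE is provided by vpx_config.h and the cospi_*_64 constants
 * by vpx_dsp/txfm_common.h.  The |prev| argument of the macros below is
 * unused; it appears to be left over from the register bookkeeping of the
 * assembly original. */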
#define LOAD_FROM_TRANSPOSED(prev, first, second) \
  q14s16 = vld1q_s16(trans_buf + first * 8);      \
  q13s16 = vld1q_s16(trans_buf + second * 8);

#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
  qA = vld1q_s16(out + first * 32);                   \
  qB = vld1q_s16(out + second * 32);

#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
  vst1q_s16(out + first * 32, qA);                   \
  vst1q_s16(out + second * 32, qB);
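
/* The combine-store helpers below read two destination rows on each side,
 * add the rounded residual (rounding shift right by 6), saturate back to
 * 8 bits, and store the result.  The (int16_t *) casts merely move 8 bytes
 * at a time through the uint8_t rows; the lanes are reinterpreted, not
 * converted. */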
#define STORE_COMBINE_CENTER_RESULTS(r10, r9)     \
  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
                                 q6s16, q7s16, q8s16, q9s16);
static INLINE void __STORE_COMBINE_CENTER_RESULTS(
    uint8_t *p1, uint8_t *p2, int stride,
    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16) {
  int16x4_t d8s16, d9s16, d10s16, d11s16;

  d8s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d11s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d9s16 = vld1_s16((int16_t *)p1);
  d10s16 = vld1_s16((int16_t *)p2);

  q7s16 = vrshrq_n_s16(q7s16, 6);
  q8s16 = vrshrq_n_s16(q8s16, 6);
  q9s16 = vrshrq_n_s16(q9s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);

  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d9s16)));
  q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
                                         vreinterpret_u8_s16(d10s16)));
  q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
                                         vreinterpret_u8_s16(d11s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d8s16)));

  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));

  vst1_s16((int16_t *)p1, d9s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d10s16);
  p2 += stride;
  vst1_s16((int16_t *)p1, d8s16);
  vst1_s16((int16_t *)p2, d11s16);
}
#define STORE_COMBINE_EXTREME_RESULTS(r7, r6)      \
  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride,  \
                                  q4s16, q5s16, q6s16, q7s16);
static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
    uint8_t *p1, uint8_t *p2, int stride,
    int16x8_t q4s16, int16x8_t q5s16, int16x8_t q6s16, int16x8_t q7s16) {
  int16x4_t d4s16, d5s16, d6s16, d7s16;

  d4s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d7s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d5s16 = vld1_s16((int16_t *)p1);
  d6s16 = vld1_s16((int16_t *)p2);

  q5s16 = vrshrq_n_s16(q5s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q4s16 = vrshrq_n_s16(q4s16, 6);

  q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
                                         vreinterpret_u8_s16(d5s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d6s16)));
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d7s16)));
  q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
                                         vreinterpret_u8_s16(d4s16)));

  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));

  vst1_s16((int16_t *)p1, d5s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d6s16);
  p2 += stride;
  vst1_s16((int16_t *)p2, d7s16);
  vst1_s16((int16_t *)p1, d4s16);
}
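
/* DO_BUTTERFLY performs the constant rotation used throughout the transform:
 *   *qAs16 = (q14s16 * first_const - q13s16 * second_const + 8192) >> 14
 *   *qBs16 = (q14s16 * second_const + q13s16 * first_const + 8192) >> 14
 * with the products widened to 32 bits; vqrshrn_n_s32 supplies the rounding,
 * the shift, and the saturating narrow back to 16 bits. */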
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
                                int16_t first_const, int16_t second_const,
                                int16x8_t *qAs16, int16x8_t *qBs16) {
  int16x4_t d30s16, d31s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
  int16x4_t dCs16, dDs16, dAs16, dBs16;

  dCs16 = vget_low_s16(q14s16);
  dDs16 = vget_high_s16(q14s16);
  dAs16 = vget_low_s16(q13s16);
  dBs16 = vget_high_s16(q13s16);

  d30s16 = vdup_n_s16(first_const);
  d31s16 = vdup_n_s16(second_const);

  q8s32 = vmull_s16(dCs16, d30s16);
  q10s32 = vmull_s16(dAs16, d31s16);
  q9s32 = vmull_s16(dDs16, d30s16);
  q11s32 = vmull_s16(dBs16, d31s16);
  q12s32 = vmull_s16(dCs16, d31s16);

  q8s32 = vsubq_s32(q8s32, q10s32);
  q9s32 = vsubq_s32(q9s32, q11s32);

  q10s32 = vmull_s16(dDs16, d31s16);
  q11s32 = vmull_s16(dAs16, d30s16);
  q15s32 = vmull_s16(dBs16, d30s16);

  q11s32 = vaddq_s32(q12s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q15s32);

  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
}
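
/* idct32_transpose_pair transposes an 8x32 strip of |input| into |t_buf|
 * (stored as 32 rows of 8), one 8x8 block at a time.  The vcombine_s16 calls
 * swap the high/low d halves of paired rows (the vswp of the assembly
 * version); the vtrnq_s32/vtrnq_s16 pairs finish the 8x8 transpose. */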
static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
  int16_t *in;
  int i;
  const int stride = 32;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  for (i = 0; i < 4; i++, input += 8) {
    in = input;
    q8s16 = vld1q_s16(in);
    in += stride;
    q9s16 = vld1q_s16(in);
    in += stride;
    q10s16 = vld1q_s16(in);
    in += stride;
    q11s16 = vld1q_s16(in);
    in += stride;
    q12s16 = vld1q_s16(in);
    in += stride;
    q13s16 = vld1q_s16(in);
    in += stride;
    q14s16 = vld1q_s16(in);
    in += stride;
    q15s16 = vld1q_s16(in);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

    q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
    q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
    q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
    q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
    q12s16 = vcombine_s16(d17s16, d25s16);
    q13s16 = vcombine_s16(d19s16, d27s16);
    q14s16 = vcombine_s16(d21s16, d29s16);
    q15s16 = vcombine_s16(d23s16, d31s16);

    q0x2s32 =
        vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16));
    q1x2s32 =
        vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
                        vreinterpretq_s32_s16(q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
                        vreinterpretq_s32_s16(q15s16));

    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

    vst1q_s16(t_buf, q0x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q0x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[1]);
    t_buf += 8;
  }
}
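
/* idct32_bands_end_1st_pass finishes stage 7 of the first pass: the usual
 * final butterfly output[i] = step[i] + step[31 - i],
 * output[31 - i] = step[i] - step[31 - i], combining q2-q15 with the rows
 * already staged in |out| and writing all 32 rows of the band back to |out|. */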
static INLINE void idct32_bands_end_1st_pass(
    int16_t *out, int16x8_t q2s16, int16x8_t q3s16, int16x8_t q6s16,
    int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, int16x8_t q10s16,
    int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, int16x8_t q14s16,
    int16x8_t q15s16) {
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);

  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);

  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);

  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);

  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);

  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);

  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);

  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
}
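
/* idct32_bands_end_2nd_pass mirrors the first-pass version, but instead of
 * writing back to |out| it adds the final values to the destination frame
 * via the combine-store helpers: r7/r6 walk inward from rows 0 and 31,
 * r10/r9 from rows 16 and 15. */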
static INLINE void idct32_bands_end_2nd_pass(
    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
    int16x8_t q14s16, int16x8_t q15s16) {
  uint8_t *r6 = dest + 31 * stride;
  uint8_t *r7 = dest /* +  0 * stride */;
  uint8_t *r9 = dest + 15 * stride;
  uint8_t *r10 = dest + 16 * stride;
  int str2 = stride << 1;
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2;
  r9 -= str2;

  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2;
  r6 -= str2;

  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2;
  r9 -= str2;

  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2;
  r6 -= str2;

  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2;
  r9 -= str2;

  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2;
  r6 -= str2;

  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
}
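
/* Full 32x32 inverse transform.  Pass 1 applies the 1-D IDCT to the rows of
 * |input| and stores the result transposed in pass1[]; pass 2 repeats the
 * process on pass1[], which transforms the columns, and adds the final
 * residual to |dest|.  Each pass works on four 8x32 bands, transposing each
 * band into trans_buf first.
 *
 * Usage sketch: with 1024 dequantized coefficients in row-major order,
 *   vpx_idct32x32_1024_add_neon(coeffs, dest, dest_stride);
 */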
void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
  int i, idct32_pass_loop;
  int16_t trans_buf[32 * 8];
  int16_t pass1[32 * 32];
  int16_t pass2[32 * 32];
  int16_t *out;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
       idct32_pass_loop++,
      input = pass1,  // the input of pass 2 is the result of pass 1
       out = pass2) {
    for (i = 0; i < 4; i++, input += 32 * 8, out += 8) {  // idct32_bands_loop
      idct32_transpose_pair(input, trans_buf);
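      // trans_buf now holds the current 8x32 band in transposed order:
      // LOAD_FROM_TRANSPOSED(_, n, m) fetches columns n and m of the band.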
      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      LOAD_FROM_TRANSPOSED(0, 1, 31)
      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(31, 17, 15)
      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
      // part of stage 2
      q4s16 = vaddq_s16(q0s16, q1s16);
      q13s16 = vsubq_s16(q0s16, q1s16);
      q6s16 = vaddq_s16(q2s16, q3s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)

      // generate 18,19,28,29
      // part of stage 1
      LOAD_FROM_TRANSPOSED(15, 9, 23)
      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(23, 25, 7)
      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q3s16, q2s16);
      q3s16 = vaddq_s16(q3s16, q2s16);
      q14s16 = vsubq_s16(q1s16, q0s16);
      q2s16 = vaddq_s16(q1s16, q0s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
      // part of stage 4
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q10s16 = vaddq_s16(q7s16, q1s16);
      q15s16 = vaddq_s16(q6s16, q3s16);
      q13s16 = vsubq_s16(q5s16, q0s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
      // part of stage 4
      q13s16 = vsubq_s16(q4s16, q2s16);
      q14s16 = vsubq_s16(q6s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      LOAD_FROM_TRANSPOSED(7, 5, 27)
      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(27, 21, 11)
      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)

      // generate 22,23,24,25
      // part of stage 1
      LOAD_FROM_TRANSPOSED(11, 13, 19)
      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(19, 29, 3)
      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
      // part of stage 2
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
      // part of stage 4
      q10s16 = vaddq_s16(q7s16, q1s16);
      q11s16 = vaddq_s16(q5s16, q0s16);
      q12s16 = vaddq_s16(q6s16, q2s16);
      q15s16 = vaddq_s16(q4s16, q3s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q11s16);
      q9s16 = vaddq_s16(q13s16, q10s16);
      q13s16 = vsubq_s16(q13s16, q10s16);
      q11s16 = vsubq_s16(q14s16, q11s16);
      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
      q8s16 = vsubq_s16(q9s16, q12s16);
      q10s16 = vaddq_s16(q14s16, q15s16);
      q14s16 = vsubq_s16(q14s16, q15s16);
      q12s16 = vaddq_s16(q9s16, q12s16);
      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
      q13s16 = q11s16;
      q14s16 = q8s16;
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
      // part of stage 4
      q14s16 = vsubq_s16(q5s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q2s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      q13s16 = vsubq_s16(q4s16, q3s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q1s16);
      q9s16 = vaddq_s16(q13s16, q6s16);
      q13s16 = vsubq_s16(q13s16, q6s16);
      q1s16 = vsubq_s16(q14s16, q1s16);
      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
      q14s16 = vsubq_s16(q8s16, q5s16);
      q10s16 = vaddq_s16(q8s16, q5s16);
      q11s16 = vaddq_s16(q9s16, q0s16);
      q0s16 = vsubq_s16(q9s16, q0s16);
      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
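      // Blocks A and B above consume the odd-indexed inputs and produce
      // outputs 16-31; blocks C and D below consume the even-indexed inputs
      // (the embedded 16-point transform) and produce outputs 0-15.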
      // -----------------------------------------
      // BLOCK C: 8-10,11-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      LOAD_FROM_TRANSPOSED(3, 2, 30)
      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(30, 18, 14)
      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
      // part of stage 3
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 4
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)

      // generate 10,11,12,13
      // part of stage 2
      LOAD_FROM_TRANSPOSED(14, 10, 22)
      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(22, 26, 6)
      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
      // part of stage 3
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 4
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
      // part of stage 5
      q8s16 = vaddq_s16(q0s16, q5s16);
      q9s16 = vaddq_s16(q1s16, q7s16);
      q13s16 = vsubq_s16(q1s16, q7s16);
      q14s16 = vsubq_s16(q3s16, q4s16);
      q10s16 = vaddq_s16(q3s16, q4s16);
      q15s16 = vaddq_s16(q2s16, q6s16);
      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
      // part of stage 6
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
      q13s16 = vsubq_s16(q0s16, q5s16);
      q14s16 = vsubq_s16(q2s16, q6s16);
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      LOAD_FROM_TRANSPOSED(6, 4, 28)
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(28, 20, 12)
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
      // part of stage 4
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)

      // generate 0,1,2,3
      // part of stage 4
      LOAD_FROM_TRANSPOSED(12, 0, 16)
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(16, 8, 24)
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
      // part of stage 5
      q4s16 = vaddq_s16(q7s16, q6s16);
      q7s16 = vsubq_s16(q7s16, q6s16);
      q6s16 = vsubq_s16(q5s16, q14s16);
      q5s16 = vaddq_s16(q5s16, q14s16);
      // part of stage 6
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q3s16);
      q10s16 = vaddq_s16(q6s16, q1s16);
      q11s16 = vaddq_s16(q7s16, q0s16);
      q12s16 = vsubq_s16(q7s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q1s16);
      q14s16 = vsubq_s16(q5s16, q3s16);
      q15s16 = vsubq_s16(q4s16, q2s16);
      // part of stage 7
      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
      q2s16 = vaddq_s16(q8s16, q1s16);
      q3s16 = vaddq_s16(q9s16, q0s16);
      q4s16 = vsubq_s16(q9s16, q0s16);
      q5s16 = vsubq_s16(q8s16, q1s16);
      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
      q8s16 = vaddq_s16(q4s16, q1s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q6s16 = vsubq_s16(q5s16, q0s16);
      q7s16 = vsubq_s16(q4s16, q1s16);

      if (idct32_pass_loop == 0) {
        idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16,
                                  q9s16, q10s16, q11s16, q12s16, q13s16,
                                  q14s16, q15s16);
      } else {
        idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16,
                                  q7s16, q8s16, q9s16, q10s16, q11s16,
                                  q12s16, q13s16, q14s16, q15s16);
        dest += 8;
      }
    }
  }
}