idct8x8_add_neon.c
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
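
/* NEON implementation of the VP9 8x8 inverse DCT plus reconstruction
 * add. The cospi_k_64 constants from vpx_dsp/txfm_common.h are
 * round(16384 * cos(k * PI / 64)), i.e. Q14 fixed point, which is why
 * every 32-bit product below is narrowed with a rounding shift by 14:
 * vqrshrn_n_s32(x, 14) is the vector form of dct_const_round_shift(x),
 * with saturation to int16_t.
 */

/* Transpose the 8x8 block held one row per register in q8-q15: swap
 * 64-bit halves between row pairs, then trade 32-bit and 16-bit lanes
 * with vtrnq. The vswp comments mark the equivalent assembly
 * instructions.
 */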
static INLINE void TRANSPOSE8X8(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  // Swap the 64-bit halves between row pairs (q8<->q12, ..., q11<->q15).
  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
  *q12s16 = vcombine_s16(d17s16, d25s16);
  *q13s16 = vcombine_s16(d19s16, d27s16);
  *q14s16 = vcombine_s16(d21s16, d29s16);
  *q15s16 = vcombine_s16(d23s16, d31s16);

  // Transpose 2x2 blocks of 32-bit lanes, then of 16-bit lanes.
  q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                      vreinterpretq_s32_s16(*q10s16));
  q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                      vreinterpretq_s32_s16(*q11s16));
  q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                      vreinterpretq_s32_s16(*q14s16));
  q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                      vreinterpretq_s32_s16(*q15s16));

  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

  *q8s16 = q0x2s16.val[0];
  *q9s16 = q0x2s16.val[1];
  *q10s16 = q1x2s16.val[0];
  *q11s16 = q1x2s16.val[1];
  *q12s16 = q2x2s16.val[0];
  *q13s16 = q2x2s16.val[1];
  *q14s16 = q3x2s16.val[0];
  *q15s16 = q3x2s16.val[1];
  return;
}
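
/* One 1-D pass of the 8-point IDCT, applied to all eight lanes at once;
 * the rows of the (transposed) block arrive one per register in q8-q15.
 * The structure follows the scalar idct8_c() in vpx_dsp/inv_txfm.c; for
 * example, the first vmull/vmlsl/vqrshrn_n_s32 group below computes
 *   step1[4] = dct_const_round_shift(in[1] * cospi_28_64 -
 *                                    in[7] * cospi_4_64)
 * for eight columns in parallel.
 */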
static INLINE void IDCT8x8_1D(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;

  d0s16 = vdup_n_s16(cospi_28_64);
  d1s16 = vdup_n_s16(cospi_4_64);
  d2s16 = vdup_n_s16(cospi_12_64);
  d3s16 = vdup_n_s16(cospi_20_64);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  // stage 1 - odd half: s4..s7 from in[1], in[7], in[5], in[3]
  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d26s16, d2s16);
  q6s32 = vmull_s16(d27s16, d2s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);

  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  q2s32 = vmull_s16(d18s16, d1s16);
  q3s32 = vmull_s16(d19s16, d1s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q13s32 = vmull_s16(d27s16, d3s16);

  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);

  d14s16 = vqrshrn_n_s32(q2s32, 14);
  d15s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q13s32, 14);
  q6s16 = vcombine_s16(d12s16, d13s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  // stage 2 - even half: s0, s1 from in[0] +/- in[4]; s2, s3 from
  // in[2] and in[6]
  d0s16 = vdup_n_s16(cospi_16_64);

  q2s32 = vmull_s16(d16s16, d0s16);
  q3s32 = vmull_s16(d17s16, d0s16);
  q13s32 = vmull_s16(d16s16, d0s16);
  q15s32 = vmull_s16(d17s16, d0s16);

  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);

  d0s16 = vdup_n_s16(cospi_24_64);
  d1s16 = vdup_n_s16(cospi_8_64);

  d18s16 = vqrshrn_n_s32(q2s32, 14);
  d19s16 = vqrshrn_n_s32(q3s32, 14);
  d22s16 = vqrshrn_n_s32(q13s32, 14);
  d23s16 = vqrshrn_n_s32(q15s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  *q11s16 = vcombine_s16(d22s16, d23s16);

  q2s32 = vmull_s16(d20s16, d0s16);
  q3s32 = vmull_s16(d21s16, d0s16);
  q8s32 = vmull_s16(d20s16, d1s16);
  q12s32 = vmull_s16(d21s16, d1s16);

  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);

  d26s16 = vqrshrn_n_s32(q2s32, 14);
  d27s16 = vqrshrn_n_s32(q3s32, 14);
  d30s16 = vqrshrn_n_s32(q8s32, 14);
  d31s16 = vqrshrn_n_s32(q12s32, 14);
  *q13s16 = vcombine_s16(d26s16, d27s16);
  *q15s16 = vcombine_s16(d30s16, d31s16);

  // stage 3 - even half
  q0s16 = vaddq_s16(*q9s16, *q15s16);
  q1s16 = vaddq_s16(*q11s16, *q13s16);
  q2s16 = vsubq_s16(*q11s16, *q13s16);
  q3s16 = vsubq_s16(*q9s16, *q15s16);

  // stage 2 - odd half
  *q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  *q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);

  // stage 3 - odd half: (s6 -/+ s5) * cospi_16_64
  d16s16 = vdup_n_s16(cospi_16_64);

  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);

  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 4 - final butterflies
  *q8s16 = vaddq_s16(q0s16, q7s16);
  *q9s16 = vaddq_s16(q1s16, q6s16);
  *q10s16 = vaddq_s16(q2s16, q5s16);
  *q11s16 = vaddq_s16(q3s16, q4s16);
  *q12s16 = vsubq_s16(q3s16, q4s16);
  *q13s16 = vsubq_s16(q2s16, q5s16);
  *q14s16 = vsubq_s16(q1s16, q6s16);
  *q15s16 = vsubq_s16(q0s16, q7s16);
  return;
}
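
/* Full 8x8 inverse transform and add, for blocks where all 64
 * coefficients may be nonzero: two 1-D passes with a transpose before
 * each, a final (x + 16) >> 5 rounding (vrshrq_n_s16(., 5)), then the
 * residual is added to the eight prediction rows of `dest` and
 * saturated back to 8 bits with vqmovun_s16.
 */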
void vpx_idct8x8_64_add_neon(
    int16_t *input,
    uint8_t *dest,
    int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;

  // Load all 64 coefficients, one row per register.
  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 16);
  q11s16 = vld1q_s16(input + 24);
  q12s16 = vld1q_s16(input + 32);
  q13s16 = vld1q_s16(input + 40);
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);

  // First pass: transform the rows.
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
             &q12s16, &q13s16, &q14s16, &q15s16);

  // Second pass: transform the columns.
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
             &q12s16, &q13s16, &q14s16, &q15s16);

  // Final rounding: (x + 16) >> 5.
  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);

  d1 = d2 = dest;

  // Rows 0-3: add the residual to the prediction, clamp to [0, 255].
  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                   vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                   vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                    vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                    vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;

  // Rows 4-7.
  q8s16 = q12s16;
  q9s16 = q13s16;
  q10s16 = q14s16;
  q11s16 = q15s16;

  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                   vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                   vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                    vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                    vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;
  return;
}
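
/* Usage sketch (hypothetical caller; `coeffs`, `frame`, `stride`, `x`
 * and `y` are illustrative names, not part of this file):
 *
 *   int16_t coeffs[64];  // dequantized coefficients, row-major
 *   uint8_t *recon = frame + y * stride + x;
 *   vpx_idct8x8_64_add_neon(coeffs, recon, stride);
 *
 * A VP9 caller normally dispatches on the end-of-block index, roughly:
 * eob == 1 -> idct8x8_1_add, eob <= 12 -> idct8x8_12_add (below),
 * otherwise idct8x8_64_add.
 */

/* Fast path for sparse blocks (eob <= 12): the nonzero coefficients are
 * confined to the top-left 4x4 quadrant, so the first (row) pass drops
 * every term that reads the all-zero q12-q15 and folds the Q14 rounding
 * into vqrdmulhq_s16 against doubled constants:
 *   vqrdmulhq_s16(a, 2 * c) == saturate((a * c + (1 << 13)) >> 14).
 */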
void vpx_idct8x8_12_add_neon(
    int16_t *input,
    uint8_t *dest,
    int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;
  int32x4_t q9s32, q10s32, q11s32, q12s32;

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 16);
  q11s16 = vld1q_s16(input + 24);
  q12s16 = vld1q_s16(input + 32);
  q13s16 = vld1q_s16(input + 40);
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  // First pass: transform the rows. Only q8-q11 carry data here.
  // stage 1
  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
  q1s16 = vdupq_n_s16(cospi_4_64 * 2);

  q4s16 = vqrdmulhq_s16(q9s16, q0s16);

  q0s16 = vdupq_n_s16(-cospi_20_64 * 2);

  q7s16 = vqrdmulhq_s16(q9s16, q1s16);

  q1s16 = vdupq_n_s16(cospi_12_64 * 2);

  q5s16 = vqrdmulhq_s16(q11s16, q0s16);

  q0s16 = vdupq_n_s16(cospi_16_64 * 2);

  q6s16 = vqrdmulhq_s16(q11s16, q1s16);

  // stage 2 & stage 3 - even half
  q1s16 = vdupq_n_s16(cospi_24_64 * 2);

  q9s16 = vqrdmulhq_s16(q8s16, q0s16);

  q0s16 = vdupq_n_s16(cospi_8_64 * 2);

  q13s16 = vqrdmulhq_s16(q10s16, q1s16);

  q15s16 = vqrdmulhq_s16(q10s16, q0s16);

  // stage 3 - even half
  q0s16 = vaddq_s16(q9s16, q15s16);
  q1s16 = vaddq_s16(q9s16, q13s16);
  q2s16 = vsubq_s16(q9s16, q13s16);
  q3s16 = vsubq_s16(q9s16, q15s16);

  // stage 2 - odd half
  q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  // stage 3 - odd half
  d16s16 = vdup_n_s16(cospi_16_64);

  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);

  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 4
  q8s16 = vaddq_s16(q0s16, q7s16);
  q9s16 = vaddq_s16(q1s16, q6s16);
  q10s16 = vaddq_s16(q2s16, q5s16);
  q11s16 = vaddq_s16(q3s16, q4s16);
  q12s16 = vsubq_s16(q3s16, q4s16);
  q13s16 = vsubq_s16(q2s16, q5s16);
  q14s16 = vsubq_s16(q1s16, q6s16);
  q15s16 = vsubq_s16(q0s16, q7s16);

  // Second pass: transform the columns with the full 1-D IDCT.
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
             &q12s16, &q13s16, &q14s16, &q15s16);

  // Final rounding: (x + 16) >> 5.
  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);

  d1 = d2 = dest;

  // Rows 0-3: add the residual to the prediction, clamp to [0, 255].
  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                   vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                   vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                    vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                    vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;

  // Rows 4-7.
  q8s16 = q12s16;
  q9s16 = q13s16;
  q10s16 = q14s16;
  q11s16 = q15s16;

  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                   vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                   vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                    vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                    vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;
  return;
}