/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
static INLINE void TRANSPOSE8X8(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
  *q12s16 = vcombine_s16(d17s16, d25s16);
  *q13s16 = vcombine_s16(d19s16, d27s16);
  *q14s16 = vcombine_s16(d21s16, d29s16);
  *q15s16 = vcombine_s16(d23s16, d31s16);

  q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                      vreinterpretq_s32_s16(*q10s16));
  q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                      vreinterpretq_s32_s16(*q11s16));
  q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                      vreinterpretq_s32_s16(*q14s16));
  q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                      vreinterpretq_s32_s16(*q15s16));

  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

  *q8s16 = q0x2s16.val[0];
  *q9s16 = q0x2s16.val[1];
  *q10s16 = q1x2s16.val[0];
  *q11s16 = q1x2s16.val[1];
  *q12s16 = q2x2s16.val[0];
  *q13s16 = q2x2s16.val[1];
  *q14s16 = q3x2s16.val[0];
  *q15s16 = q3x2s16.val[1];
  return;
}
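
/* Pass 1 of the 16x16 inverse transform. Eight rows of coefficients are
 * loaded with vld2q_s16, which de-interleaves each row; only .val[0] (the
 * even-indexed elements) is kept. The 8x8 block is transposed and stages
 * 3-6 of the 16-point idct butterfly are applied. The eight intermediate
 * result rows are stored to `out`, `output_stride` bytes apart, for pass 2
 * to combine with the odd half. */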
void vpx_idct16x16_256_add_neon_pass1(
    int16_t *in,
    int16_t *out,
    int output_stride) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
  int16x8x2_t q0x2s16;

  q0x2s16 = vld2q_s16(in);
  q8s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q9s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q10s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q11s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q12s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q13s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q14s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q15s16 = q0x2s16.val[0];

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);
  d30s16 = vget_low_s16(q15s16);
  d31s16 = vget_high_s16(q15s16);

  // stage 3
  d0s16 = vdup_n_s16(cospi_28_64);
  d1s16 = vdup_n_s16(cospi_4_64);

  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d18s16, d1s16);
  q6s32 = vmull_s16(d19s16, d1s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
  q6s32 = vmlal_s16(q6s32, d31s16, d0s16);

  d2s16 = vdup_n_s16(cospi_12_64);
  d3s16 = vdup_n_s16(cospi_20_64);

  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q5s32, 14);
  d15s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  q2s32 = vmull_s16(d26s16, d2s16);
  q3s32 = vmull_s16(d27s16, d2s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q15s32 = vmull_s16(d27s16, d3s16);

  q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
  q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q15s32 = vmlal_s16(q15s32, d23s16, d2s16);

  d10s16 = vqrshrn_n_s32(q2s32, 14);
  d11s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q15s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 4
  d30s16 = vdup_n_s16(cospi_16_64);

  q2s32 = vmull_s16(d16s16, d30s16);
  q11s32 = vmull_s16(d17s16, d30s16);
  q0s32 = vmull_s16(d24s16, d30s16);
  q1s32 = vmull_s16(d25s16, d30s16);

  d30s16 = vdup_n_s16(cospi_24_64);
  d31s16 = vdup_n_s16(cospi_8_64);

  q3s32 = vaddq_s32(q2s32, q0s32);
  q12s32 = vaddq_s32(q11s32, q1s32);
  q13s32 = vsubq_s32(q2s32, q0s32);
  q1s32 = vsubq_s32(q11s32, q1s32);

  d16s16 = vqrshrn_n_s32(q3s32, 14);
  d17s16 = vqrshrn_n_s32(q12s32, 14);
  d18s16 = vqrshrn_n_s32(q13s32, 14);
  d19s16 = vqrshrn_n_s32(q1s32, 14);
  q8s16 = vcombine_s16(d16s16, d17s16);
  q9s16 = vcombine_s16(d18s16, d19s16);

  q0s32 = vmull_s16(d20s16, d31s16);
  q1s32 = vmull_s16(d21s16, d31s16);
  q12s32 = vmull_s16(d20s16, d30s16);
  q13s32 = vmull_s16(d21s16, d30s16);

  q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
  q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
  q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
  q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);

  d22s16 = vqrshrn_n_s32(q0s32, 14);
  d23s16 = vqrshrn_n_s32(q1s32, 14);
  d20s16 = vqrshrn_n_s32(q12s32, 14);
  d21s16 = vqrshrn_n_s32(q13s32, 14);
  q10s16 = vcombine_s16(d20s16, d21s16);
  q11s16 = vcombine_s16(d22s16, d23s16);

  q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q15s16 = vaddq_s16(q6s16, q7s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  // stage 5
  q0s16 = vaddq_s16(q8s16, q11s16);
  q1s16 = vaddq_s16(q9s16, q10s16);
  q2s16 = vsubq_s16(q9s16, q10s16);
  q3s16 = vsubq_s16(q8s16, q11s16);

  d16s16 = vdup_n_s16(cospi_16_64);

  q11s32 = vmull_s16(d26s16, d16s16);
  q12s32 = vmull_s16(d27s16, d16s16);
  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);

  q6s32 = vsubq_s32(q9s32, q11s32);
  q13s32 = vsubq_s32(q10s32, q12s32);
  q9s32 = vaddq_s32(q9s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q12s32);

  d10s16 = vqrshrn_n_s32(q6s32, 14);
  d11s16 = vqrshrn_n_s32(q13s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q10s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 6
  q8s16 = vaddq_s16(q0s16, q15s16);
  q9s16 = vaddq_s16(q1s16, q6s16);
  q10s16 = vaddq_s16(q2s16, q5s16);
  q11s16 = vaddq_s16(q3s16, q4s16);
  q12s16 = vsubq_s16(q3s16, q4s16);
  q13s16 = vsubq_s16(q2s16, q5s16);
  q14s16 = vsubq_s16(q1s16, q6s16);
  q15s16 = vsubq_s16(q0s16, q15s16);

  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

  // store the data
  output_stride >>= 1;  // output_stride / 2, out is int16_t
  vst1_u64((uint64_t *)out, d16u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d17u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d18u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d19u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d20u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d21u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d22u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d23u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d24u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d25u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d26u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d27u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d28u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d29u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d30u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d31u64);
  return;
}
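
/* Pass 2 of the 16x16 inverse transform. Works like pass 1 but on the other
 * half of the coefficients (in the full transform the caller passes `src`
 * offset by one int16_t, so the de-interleaved .val[0] holds the odd-indexed
 * elements). Stage 7 combines the result with the intermediate rows that
 * pass 1 wrote to `pass1Output`. With `skip_adding` non-zero the combined
 * values are rounded by 6 bits, added to the 8-bit prediction in `dest`
 * (stride `dest_stride`) and stored there with saturation; otherwise the
 * raw int16_t results are written to `out` for a later column pass. */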
void vpx_idct16x16_256_add_neon_pass2(
    int16_t *src,
    int16_t *out,
    int16_t *pass1Output,
    int16_t skip_adding,
    uint8_t *dest,
    int dest_stride) {
  uint8_t *d;
  uint8x8_t d12u8, d13u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  uint64x1_t d24u64, d25u64, d26u64, d27u64;
  int64x1_t d12s64, d13s64;
  uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
  uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32;
  int16x8x2_t q0x2s16;

  q0x2s16 = vld2q_s16(src);
  q8s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q9s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q10s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q11s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q12s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q13s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q14s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q15s16 = q0x2s16.val[0];

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);
  d30s16 = vget_low_s16(q15s16);
  d31s16 = vget_high_s16(q15s16);

  // stage 3
  d12s16 = vdup_n_s16(cospi_30_64);
  d13s16 = vdup_n_s16(cospi_2_64);

  q2s32 = vmull_s16(d16s16, d12s16);
  q3s32 = vmull_s16(d17s16, d12s16);
  q1s32 = vmull_s16(d16s16, d13s16);
  q4s32 = vmull_s16(d17s16, d13s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
  q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
  q4s32 = vmlal_s16(q4s32, d31s16, d12s16);

  d0s16 = vqrshrn_n_s32(q2s32, 14);
  d1s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q1s32, 14);
  d15s16 = vqrshrn_n_s32(q4s32, 14);
  q0s16 = vcombine_s16(d0s16, d1s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  d30s16 = vdup_n_s16(cospi_14_64);
  d31s16 = vdup_n_s16(cospi_18_64);

  q2s32 = vmull_s16(d24s16, d30s16);
  q3s32 = vmull_s16(d25s16, d30s16);
  q4s32 = vmull_s16(d24s16, d31s16);
  q5s32 = vmull_s16(d25s16, d31s16);

  q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
  q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
  q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
  q5s32 = vmlal_s16(q5s32, d23s16, d30s16);

  d2s16 = vqrshrn_n_s32(q2s32, 14);
  d3s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q4s32, 14);
  d13s16 = vqrshrn_n_s32(q5s32, 14);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  d30s16 = vdup_n_s16(cospi_22_64);
  d31s16 = vdup_n_s16(cospi_10_64);

  q11s32 = vmull_s16(d20s16, d30s16);
  q12s32 = vmull_s16(d21s16, d30s16);
  q4s32 = vmull_s16(d20s16, d31s16);
  q5s32 = vmull_s16(d21s16, d31s16);

  q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
  q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
  q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
  q5s32 = vmlal_s16(q5s32, d27s16, d30s16);

  d4s16 = vqrshrn_n_s32(q11s32, 14);
  d5s16 = vqrshrn_n_s32(q12s32, 14);
  d11s16 = vqrshrn_n_s32(q5s32, 14);
  d10s16 = vqrshrn_n_s32(q4s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  d30s16 = vdup_n_s16(cospi_6_64);
  d31s16 = vdup_n_s16(cospi_26_64);

  q10s32 = vmull_s16(d28s16, d30s16);
  q11s32 = vmull_s16(d29s16, d30s16);
  q12s32 = vmull_s16(d28s16, d31s16);
  q13s32 = vmull_s16(d29s16, d31s16);

  q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
  q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
  q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
  q13s32 = vmlal_s16(q13s32, d19s16, d30s16);

  d6s16 = vqrshrn_n_s32(q10s32, 14);
  d7s16 = vqrshrn_n_s32(q11s32, 14);
  d8s16 = vqrshrn_n_s32(q12s32, 14);
  d9s16 = vqrshrn_n_s32(q13s32, 14);
  q3s16 = vcombine_s16(d6s16, d7s16);
  q4s16 = vcombine_s16(d8s16, d9s16);

  // stage 3
  q9s16 = vsubq_s16(q0s16, q1s16);
  q0s16 = vaddq_s16(q0s16, q1s16);
  q10s16 = vsubq_s16(q3s16, q2s16);
  q11s16 = vaddq_s16(q2s16, q3s16);
  q12s16 = vaddq_s16(q4s16, q5s16);
  q13s16 = vsubq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q6s16, q7s16);

  // stage 4
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  d30s16 = vdup_n_s16(cospi_8_64);
  d31s16 = vdup_n_s16(cospi_24_64);

  q2s32 = vmull_s16(d18s16, d31s16);
  q3s32 = vmull_s16(d19s16, d31s16);
  q4s32 = vmull_s16(d28s16, d31s16);
  q5s32 = vmull_s16(d29s16, d31s16);

  q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
  q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
  q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
  q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);

  d12s16 = vqrshrn_n_s32(q2s32, 14);
  d13s16 = vqrshrn_n_s32(q3s32, 14);
  d2s16 = vqrshrn_n_s32(q4s32, 14);
  d3s16 = vqrshrn_n_s32(q5s32, 14);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  q3s16 = q11s16;
  q4s16 = q12s16;

  d30s16 = vdup_n_s16(-cospi_8_64);
  q11s32 = vmull_s16(d26s16, d30s16);
  q12s32 = vmull_s16(d27s16, d30s16);
  q8s32 = vmull_s16(d20s16, d30s16);
  q9s32 = vmull_s16(d21s16, d30s16);

  q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
  q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
  q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
  q9s32 = vmlal_s16(q9s32, d27s16, d31s16);

  d4s16 = vqrshrn_n_s32(q11s32, 14);
  d5s16 = vqrshrn_n_s32(q12s32, 14);
  d10s16 = vqrshrn_n_s32(q8s32, 14);
  d11s16 = vqrshrn_n_s32(q9s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  // stage 5
  q8s16 = vaddq_s16(q0s16, q3s16);
  q9s16 = vaddq_s16(q1s16, q2s16);
  q10s16 = vsubq_s16(q1s16, q2s16);
  q11s16 = vsubq_s16(q0s16, q3s16);
  q12s16 = vsubq_s16(q7s16, q4s16);
  q13s16 = vsubq_s16(q6s16, q5s16);
  q14s16 = vaddq_s16(q6s16, q5s16);
  q15s16 = vaddq_s16(q7s16, q4s16);

  // stage 6
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);

  d14s16 = vdup_n_s16(cospi_16_64);

  q3s32 = vmull_s16(d26s16, d14s16);
  q4s32 = vmull_s16(d27s16, d14s16);
  q0s32 = vmull_s16(d20s16, d14s16);
  q1s32 = vmull_s16(d21s16, d14s16);

  q5s32 = vsubq_s32(q3s32, q0s32);
  q6s32 = vsubq_s32(q4s32, q1s32);
  q10s32 = vaddq_s32(q3s32, q0s32);
  q4s32 = vaddq_s32(q4s32, q1s32);

  d4s16 = vqrshrn_n_s32(q5s32, 14);
  d5s16 = vqrshrn_n_s32(q6s32, 14);
  d10s16 = vqrshrn_n_s32(q10s32, 14);
  d11s16 = vqrshrn_n_s32(q4s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  q0s32 = vmull_s16(d22s16, d14s16);
  q1s32 = vmull_s16(d23s16, d14s16);
  q13s32 = vmull_s16(d24s16, d14s16);
  q6s32 = vmull_s16(d25s16, d14s16);

  q10s32 = vsubq_s32(q13s32, q0s32);
  q4s32 = vsubq_s32(q6s32, q1s32);
  q13s32 = vaddq_s32(q13s32, q0s32);
  q6s32 = vaddq_s32(q6s32, q1s32);

  d6s16 = vqrshrn_n_s32(q10s32, 14);
  d7s16 = vqrshrn_n_s32(q4s32, 14);
  d8s16 = vqrshrn_n_s32(q13s32, 14);
  d9s16 = vqrshrn_n_s32(q6s32, 14);
  q3s16 = vcombine_s16(d6s16, d7s16);
  q4s16 = vcombine_s16(d8s16, d9s16);

  // stage 7
  if (skip_adding != 0) {
    d = dest;
    // load the data in pass1
    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    d13s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;

    q12s16 = vaddq_s16(q0s16, q15s16);
    q13s16 = vaddq_s16(q1s16, q14s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
    q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                      vreinterpret_u8_s64(d12s64));
    q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                      vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    d += dest_stride;
    q14s16 = vsubq_s16(q1s16, q14s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    d13s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q12s16 = vaddq_s16(q10s16, q5s16);
    q13s16 = vaddq_s16(q11s16, q4s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
    q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                      vreinterpret_u8_s64(d12s64));
    q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                      vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    d += dest_stride;
    q4s16 = vsubq_s16(q11s16, q4s16);
    q5s16 = vsubq_s16(q10s16, q5s16);

    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    d13s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q12s16 = vaddq_s16(q0s16, q3s16);
    q13s16 = vaddq_s16(q1s16, q2s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
    q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                      vreinterpret_u8_s64(d12s64));
    q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                      vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    d += dest_stride;
    q2s16 = vsubq_s16(q1s16, q2s16);
    q3s16 = vsubq_s16(q0s16, q3s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    d13s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q12s16 = vaddq_s16(q10s16, q9s16);
    q13s16 = vaddq_s16(q11s16, q8s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
    q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                      vreinterpret_u8_s64(d12s64));
    q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                      vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    d += dest_stride;
    q8s16 = vsubq_s16(q11s16, q8s16);
    q9s16 = vsubq_s16(q10s16, q9s16);

    // store the data out 8,9,10,11,12,13,14,15
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q8s16 = vrshrq_n_s16(q8s16, 6);
    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q9s16 = vrshrq_n_s16(q9s16, 6);
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q2s16 = vrshrq_n_s16(q2s16, 6);
    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q3s16 = vrshrq_n_s16(q3s16, 6);
    q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q4s16 = vrshrq_n_s16(q4s16, 6);
    q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q5s16 = vrshrq_n_s16(q5s16, 6);
    q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q14s16 = vrshrq_n_s16(q14s16, 6);
    q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
                      vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    q15s16 = vrshrq_n_s16(q15s16, 6);
    q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
                      vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
  } else {  // skip_adding_dest
    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q0s16, q15s16);
    q13s16 = vaddq_s16(q1s16, q14s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q14s16 = vsubq_s16(q1s16, q14s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q10s16, q5s16);
    q13s16 = vaddq_s16(q11s16, q4s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q4s16 = vsubq_s16(q11s16, q4s16);
    q5s16 = vsubq_s16(q10s16, q5s16);

    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q0s16, q3s16);
    q13s16 = vaddq_s16(q1s16, q2s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q2s16 = vsubq_s16(q1s16, q2s16);
    q3s16 = vsubq_s16(q0s16, q3s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q10s16, q9s16);
    q13s16 = vaddq_s16(q11s16, q8s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q8s16 = vsubq_s16(q11s16, q8s16);
    q9s16 = vsubq_s16(q10s16, q9s16);

    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
  }
  return;
}
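
/* Reduced versions of the two passes, used when only a handful of
 * low-frequency coefficients (the upper-left corner of the block) are
 * non-zero. Only q8/q9 of the transposed input feed the butterfly, so the
 * rotations collapse to single vqrdmulhq multiplies by doubled constants
 * and the remaining terms are treated as zero. */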
void vpx_idct16x16_10_add_neon_pass1(
    int16_t *in,
    int16_t *out,
    int output_stride) {
  int16x4_t d4s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
  int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q6s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q15s32;
  int16x8x2_t q0x2s16;

  q0x2s16 = vld2q_s16(in);
  q8s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q9s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q10s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q11s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q12s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q13s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q14s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q15s16 = q0x2s16.val[0];

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  // stage 3
  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
  q1s16 = vdupq_n_s16(cospi_4_64 * 2);

  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
  q7s16 = vqrdmulhq_s16(q9s16, q1s16);

  // stage 4
  q1s16 = vdupq_n_s16(cospi_16_64 * 2);
  d4s16 = vdup_n_s16(cospi_16_64);

  q8s16 = vqrdmulhq_s16(q8s16, q1s16);

  d8s16 = vget_low_s16(q4s16);
  d9s16 = vget_high_s16(q4s16);
  d14s16 = vget_low_s16(q7s16);
  d15s16 = vget_high_s16(q7s16);
  q9s32 = vmull_s16(d14s16, d4s16);
  q10s32 = vmull_s16(d15s16, d4s16);
  q12s32 = vmull_s16(d9s16, d4s16);
  q11s32 = vmull_s16(d8s16, d4s16);

  q15s32 = vsubq_s32(q10s32, q12s32);
  q6s32 = vsubq_s32(q9s32, q11s32);
  q9s32 = vaddq_s32(q9s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q12s32);

  d11s16 = vqrshrn_n_s32(q15s32, 14);
  d10s16 = vqrshrn_n_s32(q6s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q10s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 6
  q2s16 = vaddq_s16(q8s16, q7s16);
  q9s16 = vaddq_s16(q8s16, q6s16);
  q10s16 = vaddq_s16(q8s16, q5s16);
  q11s16 = vaddq_s16(q8s16, q4s16);
  q12s16 = vsubq_s16(q8s16, q4s16);
  q13s16 = vsubq_s16(q8s16, q5s16);
  q14s16 = vsubq_s16(q8s16, q6s16);
  q15s16 = vsubq_s16(q8s16, q7s16);

  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

  // store the data
  output_stride >>= 1;  // output_stride / 2, out is int16_t
  vst1_u64((uint64_t *)out, d4u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d5u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d18u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d19u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d20u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d21u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d22u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d23u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d24u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d25u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d26u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d27u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d28u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d29u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d30u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d31u64);
  return;
}
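
/* Pass 2 of the reduced transform. The skip_adding/dest/dest_stride
 * parameters are accepted only so the signature matches the full version;
 * they are explicitly ignored below, and the combined result is always
 * written to `out` as int16_t data. */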
void vpx_idct16x16_10_add_neon_pass2(
    int16_t *src,
    int16_t *out,
    int16_t *pass1Output,
    int16_t skip_adding,
    uint8_t *dest,
    int dest_stride) {
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
  uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
  uint64x1_t d16u64, d17u64, d18u64, d19u64;
  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32;
  int16x8x2_t q0x2s16;
  (void)skip_adding;
  (void)dest;
  (void)dest_stride;

  q0x2s16 = vld2q_s16(src);
  q8s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q9s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q10s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q11s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q12s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q13s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q14s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q15s16 = q0x2s16.val[0];

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  // stage 3
  q6s16 = vdupq_n_s16(cospi_30_64 * 2);
  q0s16 = vqrdmulhq_s16(q8s16, q6s16);
  q6s16 = vdupq_n_s16(cospi_2_64 * 2);
  q7s16 = vqrdmulhq_s16(q8s16, q6s16);

  q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
  q14s16 = vdupq_n_s16(cospi_6_64 * 2);
  q3s16 = vqrdmulhq_s16(q9s16, q15s16);
  q4s16 = vqrdmulhq_s16(q9s16, q14s16);

  // stage 4
  d0s16 = vget_low_s16(q0s16);
  d1s16 = vget_high_s16(q0s16);
  d6s16 = vget_low_s16(q3s16);
  d7s16 = vget_high_s16(q3s16);
  d8s16 = vget_low_s16(q4s16);
  d9s16 = vget_high_s16(q4s16);
  d14s16 = vget_low_s16(q7s16);
  d15s16 = vget_high_s16(q7s16);

  d30s16 = vdup_n_s16(cospi_8_64);
  d31s16 = vdup_n_s16(cospi_24_64);

  q12s32 = vmull_s16(d14s16, d31s16);
  q5s32 = vmull_s16(d15s16, d31s16);
  q2s32 = vmull_s16(d0s16, d31s16);
  q11s32 = vmull_s16(d1s16, d31s16);

  q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
  q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
  q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
  q11s32 = vmlal_s16(q11s32, d15s16, d30s16);

  d2s16 = vqrshrn_n_s32(q12s32, 14);
  d3s16 = vqrshrn_n_s32(q5s32, 14);
  d12s16 = vqrshrn_n_s32(q2s32, 14);
  d13s16 = vqrshrn_n_s32(q11s32, 14);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  d30s16 = vdup_n_s16(-cospi_8_64);
  q10s32 = vmull_s16(d8s16, d30s16);
  q13s32 = vmull_s16(d9s16, d30s16);
  q8s32 = vmull_s16(d6s16, d30s16);
  q9s32 = vmull_s16(d7s16, d30s16);

  q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
  q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
  q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
  q9s32 = vmlal_s16(q9s32, d9s16, d31s16);

  d4s16 = vqrshrn_n_s32(q10s32, 14);
  d5s16 = vqrshrn_n_s32(q13s32, 14);
  d10s16 = vqrshrn_n_s32(q8s32, 14);
  d11s16 = vqrshrn_n_s32(q9s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  // stage 5
  q8s16 = vaddq_s16(q0s16, q3s16);
  q9s16 = vaddq_s16(q1s16, q2s16);
  q10s16 = vsubq_s16(q1s16, q2s16);
  q11s16 = vsubq_s16(q0s16, q3s16);
  q12s16 = vsubq_s16(q7s16, q4s16);
  q13s16 = vsubq_s16(q6s16, q5s16);
  q14s16 = vaddq_s16(q6s16, q5s16);
  q15s16 = vaddq_s16(q7s16, q4s16);

  // stage 6
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);

  d14s16 = vdup_n_s16(cospi_16_64);

  q3s32 = vmull_s16(d26s16, d14s16);
  q4s32 = vmull_s16(d27s16, d14s16);
  q0s32 = vmull_s16(d20s16, d14s16);
  q1s32 = vmull_s16(d21s16, d14s16);

  q5s32 = vsubq_s32(q3s32, q0s32);
  q6s32 = vsubq_s32(q4s32, q1s32);
  q0s32 = vaddq_s32(q3s32, q0s32);
  q4s32 = vaddq_s32(q4s32, q1s32);

  d4s16 = vqrshrn_n_s32(q5s32, 14);
  d5s16 = vqrshrn_n_s32(q6s32, 14);
  d10s16 = vqrshrn_n_s32(q0s32, 14);
  d11s16 = vqrshrn_n_s32(q4s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  q0s32 = vmull_s16(d22s16, d14s16);
  q1s32 = vmull_s16(d23s16, d14s16);
  q13s32 = vmull_s16(d24s16, d14s16);
  q6s32 = vmull_s16(d25s16, d14s16);

  q10s32 = vsubq_s32(q13s32, q0s32);
  q4s32 = vsubq_s32(q6s32, q1s32);
  q13s32 = vaddq_s32(q13s32, q0s32);
  q6s32 = vaddq_s32(q6s32, q1s32);

  d6s16 = vqrshrn_n_s32(q10s32, 14);
  d7s16 = vqrshrn_n_s32(q4s32, 14);
  d8s16 = vqrshrn_n_s32(q13s32, 14);
  d9s16 = vqrshrn_n_s32(q6s32, 14);
  q3s16 = vcombine_s16(d6s16, d7s16);
  q4s16 = vcombine_s16(d8s16, d9s16);

  // stage 7
  q0s16 = vld1q_s16(pass1Output);
  pass1Output += 8;
  q1s16 = vld1q_s16(pass1Output);
  pass1Output += 8;
  q12s16 = vaddq_s16(q0s16, q15s16);
  q13s16 = vaddq_s16(q1s16, q14s16);
  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
  vst1_u64((uint64_t *)out, d24u64);
  out += 4;
  vst1_u64((uint64_t *)out, d25u64);
  out += 12;
  vst1_u64((uint64_t *)out, d26u64);
  out += 4;
  vst1_u64((uint64_t *)out, d27u64);
  out += 12;
  q14s16 = vsubq_s16(q1s16, q14s16);
  q15s16 = vsubq_s16(q0s16, q15s16);

  q10s16 = vld1q_s16(pass1Output);
  pass1Output += 8;
  q11s16 = vld1q_s16(pass1Output);
  pass1Output += 8;
  q12s16 = vaddq_s16(q10s16, q5s16);
  q13s16 = vaddq_s16(q11s16, q4s16);
  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
  vst1_u64((uint64_t *)out, d24u64);
  out += 4;
  vst1_u64((uint64_t *)out, d25u64);
  out += 12;
  vst1_u64((uint64_t *)out, d26u64);
  out += 4;
  vst1_u64((uint64_t *)out, d27u64);
  out += 12;
  q4s16 = vsubq_s16(q11s16, q4s16);
  q5s16 = vsubq_s16(q10s16, q5s16);

  q0s16 = vld1q_s16(pass1Output);
  pass1Output += 8;
  q1s16 = vld1q_s16(pass1Output);
  pass1Output += 8;
  q12s16 = vaddq_s16(q0s16, q3s16);
  q13s16 = vaddq_s16(q1s16, q2s16);
  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
  vst1_u64((uint64_t *)out, d24u64);
  out += 4;
  vst1_u64((uint64_t *)out, d25u64);
  out += 12;
  vst1_u64((uint64_t *)out, d26u64);
  out += 4;
  vst1_u64((uint64_t *)out, d27u64);
  out += 12;
  q2s16 = vsubq_s16(q1s16, q2s16);
  q3s16 = vsubq_s16(q0s16, q3s16);

  q10s16 = vld1q_s16(pass1Output);
  pass1Output += 8;
  q11s16 = vld1q_s16(pass1Output);
  q12s16 = vaddq_s16(q10s16, q9s16);
  q13s16 = vaddq_s16(q11s16, q8s16);
  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
  vst1_u64((uint64_t *)out, d24u64);
  out += 4;
  vst1_u64((uint64_t *)out, d25u64);
  out += 12;
  vst1_u64((uint64_t *)out, d26u64);
  out += 4;
  vst1_u64((uint64_t *)out, d27u64);
  out += 12;
  q8s16 = vsubq_s16(q11s16, q8s16);
  q9s16 = vsubq_s16(q10s16, q9s16);

  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
  d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
  d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
  d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
  d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
  d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
  d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

  vst1_u64((uint64_t *)out, d16u64);
  out += 4;
  vst1_u64((uint64_t *)out, d17u64);
  out += 12;
  vst1_u64((uint64_t *)out, d18u64);
  out += 4;
  vst1_u64((uint64_t *)out, d19u64);
  out += 12;
  vst1_u64((uint64_t *)out, d4u64);
  out += 4;
  vst1_u64((uint64_t *)out, d5u64);
  out += 12;
  vst1_u64((uint64_t *)out, d6u64);
  out += 4;
  vst1_u64((uint64_t *)out, d7u64);
  out += 12;
  vst1_u64((uint64_t *)out, d8u64);
  out += 4;
  vst1_u64((uint64_t *)out, d9u64);
  out += 12;
  vst1_u64((uint64_t *)out, d10u64);
  out += 4;
  vst1_u64((uint64_t *)out, d11u64);
  out += 12;
  vst1_u64((uint64_t *)out, d28u64);
  out += 4;
  vst1_u64((uint64_t *)out, d29u64);
  out += 12;
  vst1_u64((uint64_t *)out, d30u64);
  out += 4;
  vst1_u64((uint64_t *)out, d31u64);
  return;
}
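
/* For reference, a caller is expected to compose these passes into the full
 * 2-D transform roughly as follows (an illustrative sketch only -- buffer
 * names and sizes here are hypothetical; the actual driver lives elsewhere
 * in libvpx):
 *
 *   int16_t pass1_out[8 * 8];   // packed 8x8 intermediate (stride 8 bytes)
 *   int16_t row_out[16 * 16];   // row-transform output for the whole block
 *
 *   // Row transform of the upper 8 rows: even coefficients through pass 1,
 *   // odd coefficients through pass 2, which combines the two halves.
 *   // skip_adding == 0, so pass 2 writes int16_t data into row_out.
 *   vpx_idct16x16_256_add_neon_pass1(input, pass1_out, 8);
 *   vpx_idct16x16_256_add_neon_pass2(input + 1, row_out, pass1_out,
 *                                    0, dest, dest_stride);
 *
 *   // The same pair is then run over the lower 8 rows, and finally over the
 *   // columns of row_out with skip_adding != 0 so that the reconstructed
 *   // residual is rounded, added to the prediction in dest and saturated.
 */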