sixtappredict_neon.c

/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "vpx_ports/mem.h"

static const int8_t vp8_sub_pel_filters[8][8] = {
    {0, 0, 128, 0, 0, 0, 0, 0},     /* note that 1/8 pel positions are */
    {0, -6, 123, 12, -1, 0, 0, 0},  /* just as per alpha -0.5 bicubic */
    {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
    {0, -9, 93, 50, -6, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
    {0, -6, 50, 93, -9, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
    {0, -1, 12, 123, -6, 0, 0, 0},
};
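
/*
 * For reference: a minimal scalar sketch of the 6-tap filter that the NEON
 * kernels below vectorize. Each filter row sums to 128, the taps cover
 * src[-2..+3] around the output position, and the result is rounded and
 * clamped exactly as vqrshrun_n_s16(sum, 7) does. This helper is
 * illustrative only; it is not part of the original file or the libvpx API.
 */
static unsigned char sixtap_single_scalar(const unsigned char *src,
                                          int stride,
                                          const int8_t *filter) {
    int k, sum = 0;
    for (k = 0; k < 6; k++)  /* taps 6 and 7 of each row are always 0 */
        sum += filter[k] * src[(k - 2) * stride];
    sum = (sum + 64) >> 7;   /* round to nearest, matching vqrshrun_n(x, 7) */
    if (sum < 0) sum = 0;    /* saturate to the 8-bit pixel range */
    if (sum > 255) sum = 255;
    return (unsigned char)sum;
}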
void vp8_sixtap_predict8x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
    uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
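
    /*
     * Three cases: xoffset == 0 applies only the vertical (second-pass)
     * filter; yoffset == 0 applies only the horizontal (first-pass)
     * filter; otherwise the horizontal pass runs over 9 source rows and
     * the vertical pass reduces them to the 4 output rows.
     */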
    if (xoffset == 0) {  // secondpass_filter8x4_only
        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
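        /*
         * vmull_u8/vmlal_u8/vmlsl_u8 take unsigned operands, so each tap
         * is held as its absolute value and the sign is recovered by
         * choosing vmlsl (taps 1 and 4, which are negative) or vmlal.
         */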
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data
        src = src_ptr - src_pixels_per_line * 2;
        d22u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d27u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d28u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d29u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d30u8 = vld1_u8(src);

        q3u16 = vmull_u8(d22u8, d0u8);
        q4u16 = vmull_u8(d23u8, d0u8);
        q5u16 = vmull_u8(d24u8, d0u8);
        q6u16 = vmull_u8(d25u8, d0u8);
        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
        q7u16 = vmull_u8(d25u8, d3u8);
        q8u16 = vmull_u8(d26u8, d3u8);
        q9u16 = vmull_u8(d27u8, d3u8);
        q10u16 = vmull_u8(d28u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);
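        /*
         * vqrshrun_n_s16(x, 7) computes clamp((x + 64) >> 7, 0, 255):
         * the standard VP8 rounding of the 128-scaled filter sum back
         * to an 8-bit pixel, with saturation.
         */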
        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        vst1_u8(dst_ptr, d6u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d7u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d8u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d9u8);
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    // First pass: output_height lines x output_width columns (9x8)
    if (yoffset == 0)  // firstpass_filter8x4_only
        src = src_ptr - 2;
    else
        src = src_ptr - 2 - (src_pixels_per_line * 2);
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);

    q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    q3u16 = vmull_u8(d28u8, d3u8);
    q4u16 = vmull_u8(d29u8, d3u8);
    q5u16 = vmull_u8(d30u8, d3u8);
    q6u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d22u8 = vqrshrun_n_s16(q7s16, 7);
    d23u8 = vqrshrun_n_s16(q8s16, 7);
    d24u8 = vqrshrun_n_s16(q9s16, 7);
    d25u8 = vqrshrun_n_s16(q10s16, 7);
    if (yoffset == 0) {  // firstpass_filter8x4_only
        vst1_u8(dst_ptr, d22u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d23u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d24u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d25u8);
        return;
    }
    // First pass on the remaining 5 lines of data
    src += src_pixels_per_line;
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q7u8 = vld1q_u8(src);

    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
    q3u16 = vmull_u8(d27u8, d3u8);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);

    q8s16 = vqaddq_s16(q8s16, q3s16);
    q9s16 = vqaddq_s16(q9s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q11s16 = vqaddq_s16(q11s16, q6s16);
    q12s16 = vqaddq_s16(q12s16, q7s16);

    d26u8 = vqrshrun_n_s16(q8s16, 7);
    d27u8 = vqrshrun_n_s16(q9s16, 7);
    d28u8 = vqrshrun_n_s16(q10s16, 7);
    d29u8 = vqrshrun_n_s16(q11s16, 7);
    d30u8 = vqrshrun_n_s16(q12s16, 7);

    // Second pass: 8x4
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    q3u16 = vmull_u8(d22u8, d0u8);
    q4u16 = vmull_u8(d23u8, d0u8);
    q5u16 = vmull_u8(d24u8, d0u8);
    q6u16 = vmull_u8(d25u8, d0u8);
    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
    q7u16 = vmull_u8(d25u8, d3u8);
    q8u16 = vmull_u8(d26u8, d3u8);
    q9u16 = vmull_u8(d27u8, d3u8);
    q10u16 = vmull_u8(d28u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    return;
}
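
/*
 * Example call (illustrative buffer names, not from this file): predict
 * an 8x4 block at 1/8-pel offset (2, 0), i.e. horizontal filtering only:
 *
 *   vp8_sixtap_predict8x4_neon(src + y * stride + x, stride,
 *                              2, 0, dst, dst_stride);
 */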
void vp8_sixtap_predict8x8_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src, *tmpp;
    unsigned char tmp[64];
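    /*
     * tmp holds the first 8 of the 13 intermediate rows the horizontal
     * pass produces (8 rows x 8 bytes = 64); the remaining 5 rows stay
     * in registers (d26u8..d30u8) for the vertical pass.
     */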
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;

    if (xoffset == 0) {  // secondpass_filter8x8_only
        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data
        src = src_ptr - src_pixels_per_line * 2;
        d18u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d19u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d20u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d21u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d22u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d27u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d28u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d29u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d30u8 = vld1_u8(src);
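
        /*
         * Each iteration filters 4 output rows from a 9-row window of
         * source rows, then slides the window down by 4 rows.
         */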
        for (i = 2; i > 0; i--) {
            q3u16 = vmull_u8(d18u8, d0u8);
            q4u16 = vmull_u8(d19u8, d0u8);
            q5u16 = vmull_u8(d20u8, d0u8);
            q6u16 = vmull_u8(d21u8, d0u8);
            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
            q7u16 = vmull_u8(d21u8, d3u8);
            q8u16 = vmull_u8(d22u8, d3u8);
            q9u16 = vmull_u8(d23u8, d3u8);
            q10u16 = vmull_u8(d24u8, d3u8);

            q3s16 = vreinterpretq_s16_u16(q3u16);
            q4s16 = vreinterpretq_s16_u16(q4u16);
            q5s16 = vreinterpretq_s16_u16(q5u16);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);

            q7s16 = vqaddq_s16(q7s16, q3s16);
            q8s16 = vqaddq_s16(q8s16, q4s16);
            q9s16 = vqaddq_s16(q9s16, q5s16);
            q10s16 = vqaddq_s16(q10s16, q6s16);

            d6u8 = vqrshrun_n_s16(q7s16, 7);
            d7u8 = vqrshrun_n_s16(q8s16, 7);
            d8u8 = vqrshrun_n_s16(q9s16, 7);
            d9u8 = vqrshrun_n_s16(q10s16, 7);

            d18u8 = d22u8;
            d19u8 = d23u8;
            d20u8 = d24u8;
            d21u8 = d25u8;
            d22u8 = d26u8;
            d23u8 = d27u8;
            d24u8 = d28u8;
            d25u8 = d29u8;
            d26u8 = d30u8;

            vst1_u8(dst_ptr, d6u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d7u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d8u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d9u8);
            dst_ptr += dst_pitch;
        }
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    // First pass: output_height lines x output_width columns (13x8)
    if (yoffset == 0)  // firstpass_filter8x8_only
        src = src_ptr - 2;
    else
        src = src_ptr - 2 - (src_pixels_per_line * 2);
    tmpp = tmp;
    for (i = 2; i > 0; i--) {
        q3u8 = vld1q_u8(src);
        src += src_pixels_per_line;
        q4u8 = vld1q_u8(src);
        src += src_pixels_per_line;
        q5u8 = vld1q_u8(src);
        src += src_pixels_per_line;
        q6u8 = vld1q_u8(src);
        src += src_pixels_per_line;
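        /* Hint the next source rows into cache while this batch filters. */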
        __builtin_prefetch(src);
        __builtin_prefetch(src + src_pixels_per_line);
        __builtin_prefetch(src + src_pixels_per_line * 2);

        q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
        q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
        q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
        q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
        q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
        q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
        q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
        q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
        q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
        q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
        q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
        q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
        q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
        q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
        q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
        q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
        q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
        q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
        q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
        q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
        q3u16 = vmull_u8(d28u8, d3u8);
        q4u16 = vmull_u8(d29u8, d3u8);
        q5u16 = vmull_u8(d30u8, d3u8);
        q6u16 = vmull_u8(d31u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        d22u8 = vqrshrun_n_s16(q7s16, 7);
        d23u8 = vqrshrun_n_s16(q8s16, 7);
        d24u8 = vqrshrun_n_s16(q9s16, 7);
        d25u8 = vqrshrun_n_s16(q10s16, 7);
        if (yoffset == 0) {  // firstpass_filter8x8_only
            vst1_u8(dst_ptr, d22u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d23u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d24u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d25u8);
            dst_ptr += dst_pitch;
        } else {
            vst1_u8(tmpp, d22u8);
            tmpp += 8;
            vst1_u8(tmpp, d23u8);
            tmpp += 8;
            vst1_u8(tmpp, d24u8);
            tmpp += 8;
            vst1_u8(tmpp, d25u8);
            tmpp += 8;
        }
    }
    if (yoffset == 0)
        return;
    // First pass on the remaining 5 lines of data
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q7u8 = vld1q_u8(src);

    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
    q3u16 = vmull_u8(d27u8, d3u8);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);

    q8s16 = vqaddq_s16(q8s16, q3s16);
    q9s16 = vqaddq_s16(q9s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q11s16 = vqaddq_s16(q11s16, q6s16);
    q12s16 = vqaddq_s16(q12s16, q7s16);

    d26u8 = vqrshrun_n_s16(q8s16, 7);
    d27u8 = vqrshrun_n_s16(q9s16, 7);
    d28u8 = vqrshrun_n_s16(q10s16, 7);
    d29u8 = vqrshrun_n_s16(q11s16, 7);
    d30u8 = vqrshrun_n_s16(q12s16, 7);

    // Second pass: 8x8
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    tmpp = tmp;
    q9u8 = vld1q_u8(tmpp);
    tmpp += 16;
    q10u8 = vld1q_u8(tmpp);
    tmpp += 16;
    q11u8 = vld1q_u8(tmpp);
    tmpp += 16;
    q12u8 = vld1q_u8(tmpp);
    d18u8 = vget_low_u8(q9u8);
    d19u8 = vget_high_u8(q9u8);
    d20u8 = vget_low_u8(q10u8);
    d21u8 = vget_high_u8(q10u8);
    d22u8 = vget_low_u8(q11u8);
    d23u8 = vget_high_u8(q11u8);
    d24u8 = vget_low_u8(q12u8);
    d25u8 = vget_high_u8(q12u8);

    for (i = 2; i > 0; i--) {
        q3u16 = vmull_u8(d18u8, d0u8);
        q4u16 = vmull_u8(d19u8, d0u8);
        q5u16 = vmull_u8(d20u8, d0u8);
        q6u16 = vmull_u8(d21u8, d0u8);
        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
        q7u16 = vmull_u8(d21u8, d3u8);
        q8u16 = vmull_u8(d22u8, d3u8);
        q9u16 = vmull_u8(d23u8, d3u8);
        q10u16 = vmull_u8(d24u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        d18u8 = d22u8;
        d19u8 = d23u8;
        d20u8 = d24u8;
        d21u8 = d25u8;
        d22u8 = d26u8;
        d23u8 = d27u8;
        d24u8 = d28u8;
        d25u8 = d29u8;
        d26u8 = d30u8;

        vst1_u8(dst_ptr, d6u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d7u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d8u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d9u8);
        dst_ptr += dst_pitch;
    }
    return;
}
void vp8_sixtap_predict16x16_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src, *src_tmp, *dst, *tmpp;
    unsigned char tmp[336];
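    /*
     * Intermediate buffer for the two-pass path: the horizontal pass
     * writes 21 rows x 16 columns = 336 bytes (16 output rows plus the
     * 5 extra rows the 6-tap vertical filter needs).
     */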
    int i, j;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
    uint8x8_t d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint8x16_t q3u8, q4u8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
    uint16x8_t q11u16, q12u16, q13u16, q15u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
    int16x8_t q11s16, q12s16, q13s16, q15s16;
    if (xoffset == 0) {  // secondpass_filter16x16_only
        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data
        src_tmp = src_ptr - src_pixels_per_line * 2;
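
        /*
         * The 16-wide block is filtered as two independent 8-wide halves
         * (i = 0, 1); each half slides a 9-row window down the column,
         * producing 4 output rows per inner iteration.
         */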
        for (i = 0; i < 2; i++) {
            src = src_tmp + i * 8;
            dst = dst_ptr + i * 8;
            d18u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d19u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d20u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d21u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d22u8 = vld1_u8(src);
            src += src_pixels_per_line;
            for (j = 0; j < 4; j++) {
                d23u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d24u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d25u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d26u8 = vld1_u8(src);
                src += src_pixels_per_line;

                q3u16 = vmull_u8(d18u8, d0u8);
                q4u16 = vmull_u8(d19u8, d0u8);
                q5u16 = vmull_u8(d20u8, d0u8);
                q6u16 = vmull_u8(d21u8, d0u8);
                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
                q7u16 = vmull_u8(d21u8, d3u8);
                q8u16 = vmull_u8(d22u8, d3u8);
                q9u16 = vmull_u8(d23u8, d3u8);
                q10u16 = vmull_u8(d24u8, d3u8);

                q3s16 = vreinterpretq_s16_u16(q3u16);
                q4s16 = vreinterpretq_s16_u16(q4u16);
                q5s16 = vreinterpretq_s16_u16(q5u16);
                q6s16 = vreinterpretq_s16_u16(q6u16);
                q7s16 = vreinterpretq_s16_u16(q7u16);
                q8s16 = vreinterpretq_s16_u16(q8u16);
                q9s16 = vreinterpretq_s16_u16(q9u16);
                q10s16 = vreinterpretq_s16_u16(q10u16);

                q7s16 = vqaddq_s16(q7s16, q3s16);
                q8s16 = vqaddq_s16(q8s16, q4s16);
                q9s16 = vqaddq_s16(q9s16, q5s16);
                q10s16 = vqaddq_s16(q10s16, q6s16);

                d6u8 = vqrshrun_n_s16(q7s16, 7);
                d7u8 = vqrshrun_n_s16(q8s16, 7);
                d8u8 = vqrshrun_n_s16(q9s16, 7);
                d9u8 = vqrshrun_n_s16(q10s16, 7);

                d18u8 = d22u8;
                d19u8 = d23u8;
                d20u8 = d24u8;
                d21u8 = d25u8;
                d22u8 = d26u8;

                vst1_u8(dst, d6u8);
                dst += dst_pitch;
                vst1_u8(dst, d7u8);
                dst += dst_pitch;
                vst1_u8(dst, d8u8);
                dst += dst_pitch;
                vst1_u8(dst, d9u8);
                dst += dst_pitch;
            }
        }
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    // First pass: output_height lines x output_width columns (21x16)
    if (yoffset == 0) {  // firstpass_filter16x16_only
        src = src_ptr - 2;
        dst = dst_ptr;
        for (i = 0; i < 8; i++) {
            d6u8 = vld1_u8(src);
            d7u8 = vld1_u8(src + 8);
            d8u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;
            d9u8 = vld1_u8(src);
            d10u8 = vld1_u8(src + 8);
            d11u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;

            __builtin_prefetch(src);
            __builtin_prefetch(src + src_pixels_per_line);

            q6u16 = vmull_u8(d6u8, d0u8);
            q7u16 = vmull_u8(d7u8, d0u8);
            q8u16 = vmull_u8(d9u8, d0u8);
            q9u16 = vmull_u8(d10u8, d0u8);

            d20u8 = vext_u8(d6u8, d7u8, 1);
            d21u8 = vext_u8(d9u8, d10u8, 1);
            d22u8 = vext_u8(d7u8, d8u8, 1);
            d23u8 = vext_u8(d10u8, d11u8, 1);
            d24u8 = vext_u8(d6u8, d7u8, 4);
            d25u8 = vext_u8(d9u8, d10u8, 4);
            d26u8 = vext_u8(d7u8, d8u8, 4);
            d27u8 = vext_u8(d10u8, d11u8, 4);
            d28u8 = vext_u8(d6u8, d7u8, 5);
            d29u8 = vext_u8(d9u8, d10u8, 5);

            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

            d20u8 = vext_u8(d7u8, d8u8, 5);
            d21u8 = vext_u8(d10u8, d11u8, 5);
            d22u8 = vext_u8(d6u8, d7u8, 2);
            d23u8 = vext_u8(d9u8, d10u8, 2);
            d24u8 = vext_u8(d7u8, d8u8, 2);
            d25u8 = vext_u8(d10u8, d11u8, 2);
            d26u8 = vext_u8(d6u8, d7u8, 3);
            d27u8 = vext_u8(d9u8, d10u8, 3);
            d28u8 = vext_u8(d7u8, d8u8, 3);
            d29u8 = vext_u8(d10u8, d11u8, 3);

            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

            q10u16 = vmull_u8(d26u8, d3u8);
            q11u16 = vmull_u8(d27u8, d3u8);
            q12u16 = vmull_u8(d28u8, d3u8);
            q15u16 = vmull_u8(d29u8, d3u8);

            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);
            q11s16 = vreinterpretq_s16_u16(q11u16);
            q12s16 = vreinterpretq_s16_u16(q12u16);
            q15s16 = vreinterpretq_s16_u16(q15u16);

            q6s16 = vqaddq_s16(q6s16, q10s16);
            q8s16 = vqaddq_s16(q8s16, q11s16);
            q7s16 = vqaddq_s16(q7s16, q12s16);
            q9s16 = vqaddq_s16(q9s16, q15s16);

            d6u8 = vqrshrun_n_s16(q6s16, 7);
            d7u8 = vqrshrun_n_s16(q7s16, 7);
            d8u8 = vqrshrun_n_s16(q8s16, 7);
            d9u8 = vqrshrun_n_s16(q9s16, 7);

            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);
            vst1q_u8(dst, q3u8);
            dst += dst_pitch;
            vst1q_u8(dst, q4u8);
            dst += dst_pitch;
        }
        return;
    }

    src = src_ptr - 2 - src_pixels_per_line * 2;
    tmpp = tmp;
    for (i = 0; i < 7; i++) {
        d6u8 = vld1_u8(src);
        d7u8 = vld1_u8(src + 8);
        d8u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d9u8 = vld1_u8(src);
        d10u8 = vld1_u8(src + 8);
        d11u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d12u8 = vld1_u8(src);
        d13u8 = vld1_u8(src + 8);
        d14u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;

        __builtin_prefetch(src);
        __builtin_prefetch(src + src_pixels_per_line);
        __builtin_prefetch(src + src_pixels_per_line * 2);

        q8u16 = vmull_u8(d6u8, d0u8);
        q9u16 = vmull_u8(d7u8, d0u8);
        q10u16 = vmull_u8(d9u8, d0u8);
        q11u16 = vmull_u8(d10u8, d0u8);
        q12u16 = vmull_u8(d12u8, d0u8);
        q13u16 = vmull_u8(d13u8, d0u8);

        d28u8 = vext_u8(d6u8, d7u8, 1);
        d29u8 = vext_u8(d9u8, d10u8, 1);
        d30u8 = vext_u8(d12u8, d13u8, 1);
        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
        d28u8 = vext_u8(d7u8, d8u8, 1);
        d29u8 = vext_u8(d10u8, d11u8, 1);
        d30u8 = vext_u8(d13u8, d14u8, 1);
        q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

        d28u8 = vext_u8(d6u8, d7u8, 4);
        d29u8 = vext_u8(d9u8, d10u8, 4);
        d30u8 = vext_u8(d12u8, d13u8, 4);
        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
        d28u8 = vext_u8(d7u8, d8u8, 4);
        d29u8 = vext_u8(d10u8, d11u8, 4);
        d30u8 = vext_u8(d13u8, d14u8, 4);
        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

        d28u8 = vext_u8(d6u8, d7u8, 5);
        d29u8 = vext_u8(d9u8, d10u8, 5);
        d30u8 = vext_u8(d12u8, d13u8, 5);
        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
        d28u8 = vext_u8(d7u8, d8u8, 5);
        d29u8 = vext_u8(d10u8, d11u8, 5);
        d30u8 = vext_u8(d13u8, d14u8, 5);
        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

        d28u8 = vext_u8(d6u8, d7u8, 2);
        d29u8 = vext_u8(d9u8, d10u8, 2);
        d30u8 = vext_u8(d12u8, d13u8, 2);
        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
        d28u8 = vext_u8(d7u8, d8u8, 2);
        d29u8 = vext_u8(d10u8, d11u8, 2);
        d30u8 = vext_u8(d13u8, d14u8, 2);
        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

        d28u8 = vext_u8(d6u8, d7u8, 3);
        d29u8 = vext_u8(d9u8, d10u8, 3);
        d30u8 = vext_u8(d12u8, d13u8, 3);
        d15u8 = vext_u8(d7u8, d8u8, 3);
        d31u8 = vext_u8(d10u8, d11u8, 3);
        d6u8 = vext_u8(d13u8, d14u8, 3);
        q4u16 = vmull_u8(d28u8, d3u8);
        q5u16 = vmull_u8(d29u8, d3u8);
        q6u16 = vmull_u8(d30u8, d3u8);

        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);
        q12s16 = vreinterpretq_s16_u16(q12u16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q10s16 = vqaddq_s16(q10s16, q5s16);
        q12s16 = vqaddq_s16(q12s16, q6s16);

        q6u16 = vmull_u8(d15u8, d3u8);
        q7u16 = vmull_u8(d31u8, d3u8);
        q3u16 = vmull_u8(d6u8, d3u8);
        q3s16 = vreinterpretq_s16_u16(q3u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q11s16 = vreinterpretq_s16_u16(q11u16);
        q13s16 = vreinterpretq_s16_u16(q13u16);
        q9s16 = vqaddq_s16(q9s16, q6s16);
        q11s16 = vqaddq_s16(q11s16, q7s16);
        q13s16 = vqaddq_s16(q13s16, q3s16);

        d6u8 = vqrshrun_n_s16(q8s16, 7);
        d7u8 = vqrshrun_n_s16(q9s16, 7);
        d8u8 = vqrshrun_n_s16(q10s16, 7);
        d9u8 = vqrshrun_n_s16(q11s16, 7);
        d10u8 = vqrshrun_n_s16(q12s16, 7);
        d11u8 = vqrshrun_n_s16(q13s16, 7);

        vst1_u8(tmpp, d6u8);
        tmpp += 8;
        vst1_u8(tmpp, d7u8);
        tmpp += 8;
        vst1_u8(tmpp, d8u8);
        tmpp += 8;
        vst1_u8(tmpp, d9u8);
        tmpp += 8;
        vst1_u8(tmpp, d10u8);
        tmpp += 8;
        vst1_u8(tmpp, d11u8);
        tmpp += 8;
    }
    // Second pass: 16x16
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
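
    /*
     * Vertical pass: again two 8-wide halves; intermediate rows in tmp
     * are 16 bytes apart, hence the row stride of 16 below.
     */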
    for (i = 0; i < 2; i++) {
        dst = dst_ptr + 8 * i;
        tmpp = tmp + 8 * i;
        d18u8 = vld1_u8(tmpp);
        tmpp += 16;
        d19u8 = vld1_u8(tmpp);
        tmpp += 16;
        d20u8 = vld1_u8(tmpp);
        tmpp += 16;
        d21u8 = vld1_u8(tmpp);
        tmpp += 16;
        d22u8 = vld1_u8(tmpp);
        tmpp += 16;
        for (j = 0; j < 4; j++) {
            d23u8 = vld1_u8(tmpp);
            tmpp += 16;
            d24u8 = vld1_u8(tmpp);
            tmpp += 16;
            d25u8 = vld1_u8(tmpp);
            tmpp += 16;
            d26u8 = vld1_u8(tmpp);
            tmpp += 16;

            q3u16 = vmull_u8(d18u8, d0u8);
            q4u16 = vmull_u8(d19u8, d0u8);
            q5u16 = vmull_u8(d20u8, d0u8);
            q6u16 = vmull_u8(d21u8, d0u8);
            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
            q7u16 = vmull_u8(d21u8, d3u8);
            q8u16 = vmull_u8(d22u8, d3u8);
            q9u16 = vmull_u8(d23u8, d3u8);
            q10u16 = vmull_u8(d24u8, d3u8);

            q3s16 = vreinterpretq_s16_u16(q3u16);
            q4s16 = vreinterpretq_s16_u16(q4u16);
            q5s16 = vreinterpretq_s16_u16(q5u16);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);

            q7s16 = vqaddq_s16(q7s16, q3s16);
            q8s16 = vqaddq_s16(q8s16, q4s16);
            q9s16 = vqaddq_s16(q9s16, q5s16);
            q10s16 = vqaddq_s16(q10s16, q6s16);

            d6u8 = vqrshrun_n_s16(q7s16, 7);
            d7u8 = vqrshrun_n_s16(q8s16, 7);
            d8u8 = vqrshrun_n_s16(q9s16, 7);
            d9u8 = vqrshrun_n_s16(q10s16, 7);

            d18u8 = d22u8;
            d19u8 = d23u8;
            d20u8 = d24u8;
            d21u8 = d25u8;
            d22u8 = d26u8;

            vst1_u8(dst, d6u8);
            dst += dst_pitch;
            vst1_u8(dst, d7u8);
            dst += dst_pitch;
            vst1_u8(dst, d8u8);
            dst += dst_pitch;
            vst1_u8(dst, d9u8);
            dst += dst_pitch;
        }
    }
    return;
}
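
/*
 * Note (assumption based on the usual libvpx configuration): these
 * functions are normally reached through run-time CPU dispatch (e.g.
 * vp8_sixtap_predict16x16) rather than called directly. xoffset and
 * yoffset are the 1/8-pel sub-pixel positions (0..7) used to index
 * vp8_sub_pel_filters.
 */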