sixtappredict_neon.c

  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <arm_neon.h>
  11. #include <string.h>
  12. #include "./vpx_config.h"
  13. #include "vpx_dsp/arm/mem_neon.h"
  14. #include "vpx_ports/mem.h"
  15. static const int8_t vp8_sub_pel_filters[8][8] = {
  16. { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
  17. { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */
  18. { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
  19. { 0, -9, 93, 50, -6, 0, 0, 0 },
  20. { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
  21. { 0, -6, 50, 93, -9, 0, 0, 0 },
  22. { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
  23. { 0, -1, 12, 123, -6, 0, 0, 0 },
  24. };
  25. // This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
  26. // Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
  27. // Elements 1 and 4 are either 0 or negative. The code accounts for this with
  28. // multiply/accumulates which either add or subtract as needed. The other
  29. // functions will be updated to use this table later.
  30. // It is also expanded to 8 elements to allow loading into 64 bit neon
  31. // registers.
  32. static const uint8_t abs_filters[8][8] = {
  33. { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 },
  34. { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
  35. { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
  36. { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
  37. };
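// For reference, with f[] = abs_filters[offset], each filtered pixel is
// clamp((f[0]*s[-2] - f[1]*s[-1] + f[2]*s[0] + f[3]*s[1] - f[4]*s[2] + f[5]*s[3] + 64) >> 7, 0, 255),
// which is what the vmlal/vmlsl accumulation followed by vqrshrun_n_s16(x, 7)
// computes throughout this file (taps 1 and 4 are the negative ones).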
  38. static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
  39. return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
  40. }
  41. static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
  42. const uint8x8_t filter, uint16x8_t *c,
  43. uint16x8_t *d) {
  44. const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
  45. vreinterpret_u32_u8(vget_high_u8(a)));
  46. const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
  47. vreinterpret_u32_u8(vget_high_u8(b)));
  48. *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
  49. *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
  50. }
  51. static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
  52. const uint8x8_t filter, uint16x8_t *c,
  53. uint16x8_t *d) {
  54. const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
  55. vreinterpret_u32_u8(vget_high_u8(a)));
  56. const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
  57. vreinterpret_u32_u8(vget_high_u8(b)));
  58. *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
  59. *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
  60. }
  61. static INLINE void yonly4x4(const unsigned char *src, int src_stride,
  62. int filter_offset, unsigned char *dst,
  63. int dst_stride) {
  64. uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
  65. uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
  66. uint16x8_t c0, c1, c2, c3;
  67. int16x8_t d0, d1;
  68. uint8x8_t e0, e1;
  69. const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
  70. const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
  71. const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
  72. const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
  73. const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
  74. const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
  75. const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
  76. src -= src_stride * 2;
  77. // Shift the even rows to allow using 'vext' to combine the vectors. armv8
  78. // has vcopy_lane which would be interesting. This started as just a
  79. // horrible workaround for clang adding alignment hints to 32bit loads:
  80. // https://llvm.org/bugs/show_bug.cgi?id=24421
  81. // But it turns out it is almost identical to casting the loads.
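// load_and_shift() leaves the four useful pixels in lanes 4-7 of the D register
// (lanes 0-3 become zero), so vext_u8(even_row, odd_row, 4) below splices each
// shifted even row with the following odd row.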
  82. a0 = load_and_shift(src);
  83. src += src_stride;
  84. a1 = vld1_u8(src);
  85. src += src_stride;
  86. a2 = load_and_shift(src);
  87. src += src_stride;
  88. a3 = vld1_u8(src);
  89. src += src_stride;
  90. a4 = load_and_shift(src);
  91. src += src_stride;
  92. a5 = vld1_u8(src);
  93. src += src_stride;
  94. a6 = load_and_shift(src);
  95. src += src_stride;
  96. a7 = vld1_u8(src);
  97. src += src_stride;
  98. a8 = vld1_u8(src);
  99. // Combine the rows so we can operate on 8 at a time.
  100. b0 = vext_u8(a0, a1, 4);
  101. b2 = vext_u8(a2, a3, 4);
  102. b4 = vext_u8(a4, a5, 4);
  103. b6 = vext_u8(a6, a7, 4);
  104. b8 = a8;
  105. // To keep with the 8-at-a-time theme, combine *alternate* rows. This
  106. // allows combining the odd rows with the even.
  107. b1 = vext_u8(b0, b2, 4);
  108. b3 = vext_u8(b2, b4, 4);
  109. b5 = vext_u8(b4, b6, 4);
  110. b7 = vext_u8(b6, b8, 4);
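// b0..b7 now each hold two vertically adjacent rows of four pixels:
// b0 = rows 0-1, b1 = rows 1-2, ..., b7 = rows 7-8; b8 holds row 8 on its own.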
  111. // Multiply and expand to 16 bits.
  112. c0 = vmull_u8(b0, filter0);
  113. c1 = vmull_u8(b2, filter0);
  114. c2 = vmull_u8(b5, filter5);
  115. c3 = vmull_u8(b7, filter5);
  116. // Multiply, subtract and accumulate for filters 1 and 4 (the negative
  117. // ones).
  118. c0 = vmlsl_u8(c0, b4, filter4);
  119. c1 = vmlsl_u8(c1, b6, filter4);
  120. c2 = vmlsl_u8(c2, b1, filter1);
  121. c3 = vmlsl_u8(c3, b3, filter1);
  122. // Add more positive ones. vmlal should really return a signed type.
  123. // It's doing signed math internally, as evidenced by the fact we can do
  124. // subtractions followed by more additions. Ideally we could use
  125. // vqmlal/sl but that instruction doesn't exist. Might be able to
  126. // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
  127. c0 = vmlal_u8(c0, b2, filter2);
  128. c1 = vmlal_u8(c1, b4, filter2);
  129. c2 = vmlal_u8(c2, b3, filter3);
  130. c3 = vmlal_u8(c3, b5, filter3);
  131. // Use signed saturation math because vmlsl may have left some negative
  132. // numbers in there.
  133. d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
  134. d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
  135. // Use signed again because numbers like -200 need to be saturated to 0.
  136. e0 = vqrshrun_n_s16(d0, 7);
  137. e1 = vqrshrun_n_s16(d1, 7);
  138. store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
  139. }
  140. void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
  141. int xoffset, int yoffset,
  142. unsigned char *dst_ptr, int dst_pitch) {
  143. uint8x16_t s0, s1, s2, s3, s4;
  144. uint64x2_t s01, s23;
  145. // Variables to hold src[] elements for the given filter[]
  146. uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
  147. uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
  148. uint8x16_t s01_f0, s23_f0;
  149. uint64x2_t s01_f3, s23_f3;
  150. uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
  151. // Accumulator variables.
  152. uint16x8_t d0123, d4567, d89;
  153. uint16x8_t d0123_a, d4567_a, d89_a;
  154. int16x8_t e0123, e4567, e89;
  155. // Second pass intermediates.
  156. uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
  157. uint16x8_t c0, c1, c2, c3;
  158. int16x8_t d0, d1;
  159. uint8x8_t e0, e1;
  160. uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
  161. if (xoffset == 0) { // Second pass only.
  162. yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
  163. return;
  164. }
  165. if (yoffset == 0) { // First pass only.
  166. src_ptr -= 2;
  167. } else { // Add context for the second pass. 2 extra lines on top.
  168. src_ptr -= 2 + (src_pixels_per_line * 2);
  169. }
  170. filter = vld1_u8(abs_filters[xoffset]);
  171. filter0 = vdup_lane_u8(filter, 0);
  172. filter1 = vdup_lane_u8(filter, 1);
  173. filter2 = vdup_lane_u8(filter, 2);
  174. filter3 = vdup_lane_u8(filter, 3);
  175. filter4 = vdup_lane_u8(filter, 4);
  176. filter5 = vdup_lane_u8(filter, 5);
  177. // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
  178. // garbage. So much effort for that last single bit.
  179. // The low values of each pair are for filter0.
  180. s0 = vld1q_u8(src_ptr);
  181. src_ptr += src_pixels_per_line;
  182. s1 = vld1q_u8(src_ptr);
  183. src_ptr += src_pixels_per_line;
  184. s2 = vld1q_u8(src_ptr);
  185. src_ptr += src_pixels_per_line;
  186. s3 = vld1q_u8(src_ptr);
  187. src_ptr += src_pixels_per_line;
  188. // Shift to extract values for filter[5]
  189. // If src[] is 0, this puts:
  190. // 3 4 5 6 7 8 9 10 in s0_f5
  191. // Can't use vshr.u64 because it crosses the double word boundary.
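// In this layout, filter tap j reads load bytes j..j+3, i.e. src[j-2..j+1] for
// the four output pixels. Taps 0-4 are reached below by right-shifting each row
// as a 64-bit lane by j bytes; tap 5 would need byte 8 from the other
// doubleword, hence the vext by 5 below.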
  192. s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
  193. s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
  194. s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
  195. s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
  196. s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
  197. s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
  198. s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
  199. s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
  200. d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
  201. d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
  202. // Keep original src data as 64 bits to simplify shifting and extracting.
  203. s01 = vreinterpretq_u64_u8(s01_f0);
  204. s23 = vreinterpretq_u64_u8(s23_f0);
  205. // -2 -1 0 1 * filter0
  206. filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
  207. // Shift over one to use -1, 0, 1, 2 for filter1
  208. // -1 0 1 2 * filter1
  209. filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
  210. vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
  211. &d0123, &d4567);
  212. // 2 3 4 5 * filter4
  213. filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
  214. vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
  215. &d0123, &d4567);
  216. // 0 1 2 3 * filter2
  217. filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
  218. vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
  219. &d0123, &d4567);
  220. // 1 2 3 4 * filter3
  221. s01_f3 = vshrq_n_u64(s01, 24);
  222. s23_f3 = vshrq_n_u64(s23, 24);
  223. s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
  224. vreinterpret_u32_u64(vget_high_u64(s01_f3)));
  225. s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
  226. vreinterpret_u32_u64(vget_high_u64(s23_f3)));
  227. // Accumulate into different registers so it can use saturated addition.
  228. d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
  229. d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
  230. e0123 =
  231. vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
  232. e4567 =
  233. vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
  234. // Shift and narrow.
  235. b0 = vqrshrun_n_s16(e0123, 7);
  236. b2 = vqrshrun_n_s16(e4567, 7);
  237. if (yoffset == 0) { // firstpass_filter4x4_only
  238. store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
  239. return;
  240. }
  241. // Load additional context when doing both filters.
  242. s0 = vld1q_u8(src_ptr);
  243. src_ptr += src_pixels_per_line;
  244. s1 = vld1q_u8(src_ptr);
  245. src_ptr += src_pixels_per_line;
  246. s2 = vld1q_u8(src_ptr);
  247. src_ptr += src_pixels_per_line;
  248. s3 = vld1q_u8(src_ptr);
  249. src_ptr += src_pixels_per_line;
  250. s4 = vld1q_u8(src_ptr);
  251. s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
  252. s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
  253. s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
  254. s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
  255. s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
  256. // -2 -1 0 1 * filter0
  257. s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
  258. s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
  259. s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
  260. s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
  261. // But this time there are 20 pixels to filter instead of 16, so an extra
  262. // run with a doubleword register is needed.
  263. d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
  264. d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
  265. d89 = vmull_u8(s4_f5, filter5);
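// d89 is that extra doubleword run: it covers the ninth source row needed by
// the vertical pass, and only its low four lanes carry meaningful results.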
  266. // Save a copy as u64 for shifting.
  267. s01 = vreinterpretq_u64_u8(s01_f0);
  268. s23 = vreinterpretq_u64_u8(s23_f0);
  269. filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
  270. d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
  271. filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
  272. vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
  273. &d0123, &d4567);
  274. s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
  275. d89 = vmlsl_u8(d89, s4_f1, filter1);
  276. filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
  277. vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
  278. &d0123, &d4567);
  279. s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
  280. d89 = vmlsl_u8(d89, s4_f4, filter4);
  281. filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
  282. vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
  283. &d0123, &d4567);
  284. s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
  285. d89 = vmlal_u8(d89, s4_f2, filter2);
  286. s01_f3 = vshrq_n_u64(s01, 24);
  287. s23_f3 = vshrq_n_u64(s23, 24);
  288. s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
  289. vreinterpret_u32_u64(vget_high_u64(s01_f3)));
  290. s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
  291. vreinterpret_u32_u64(vget_high_u64(s23_f3)));
  292. s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
  293. d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
  294. d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
  295. d89_a = vmull_u8(s4_f3, filter3);
  296. e0123 =
  297. vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
  298. e4567 =
  299. vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
  300. e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
  301. b4 = vqrshrun_n_s16(e0123, 7);
  302. b6 = vqrshrun_n_s16(e4567, 7);
  303. b8 = vqrshrun_n_s16(e89, 7);
  304. // Second pass: 4x4
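// From here this mirrors yonly4x4(), except the input is the first-pass output
// held in b0..b8 rather than rows loaded from memory.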
  305. filter = vld1_u8(abs_filters[yoffset]);
  306. filter0 = vdup_lane_u8(filter, 0);
  307. filter1 = vdup_lane_u8(filter, 1);
  308. filter2 = vdup_lane_u8(filter, 2);
  309. filter3 = vdup_lane_u8(filter, 3);
  310. filter4 = vdup_lane_u8(filter, 4);
  311. filter5 = vdup_lane_u8(filter, 5);
  312. b1 = vext_u8(b0, b2, 4);
  313. b3 = vext_u8(b2, b4, 4);
  314. b5 = vext_u8(b4, b6, 4);
  315. b7 = vext_u8(b6, b8, 4);
  316. c0 = vmull_u8(b0, filter0);
  317. c1 = vmull_u8(b2, filter0);
  318. c2 = vmull_u8(b5, filter5);
  319. c3 = vmull_u8(b7, filter5);
  320. c0 = vmlsl_u8(c0, b4, filter4);
  321. c1 = vmlsl_u8(c1, b6, filter4);
  322. c2 = vmlsl_u8(c2, b1, filter1);
  323. c3 = vmlsl_u8(c3, b3, filter1);
  324. c0 = vmlal_u8(c0, b2, filter2);
  325. c1 = vmlal_u8(c1, b4, filter2);
  326. c2 = vmlal_u8(c2, b3, filter3);
  327. c3 = vmlal_u8(c3, b5, filter3);
  328. d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
  329. d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
  330. e0 = vqrshrun_n_s16(d0, 7);
  331. e1 = vqrshrun_n_s16(d1, 7);
  332. store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
  333. }
  334. void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
  335. int xoffset, int yoffset,
  336. unsigned char *dst_ptr, int dst_pitch) {
  337. unsigned char *src;
  338. uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  339. uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
  340. uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
  341. int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  342. uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  343. uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  344. int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  345. int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  346. uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
  347. if (xoffset == 0) { // secondpass_filter8x4_only
  348. // load second_pass filter
  349. dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  350. d0s8 = vdup_lane_s8(dtmps8, 0);
  351. d1s8 = vdup_lane_s8(dtmps8, 1);
  352. d2s8 = vdup_lane_s8(dtmps8, 2);
  353. d3s8 = vdup_lane_s8(dtmps8, 3);
  354. d4s8 = vdup_lane_s8(dtmps8, 4);
  355. d5s8 = vdup_lane_s8(dtmps8, 5);
  356. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  357. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  358. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  359. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  360. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  361. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  362. // load src data
  363. src = src_ptr - src_pixels_per_line * 2;
  364. d22u8 = vld1_u8(src);
  365. src += src_pixels_per_line;
  366. d23u8 = vld1_u8(src);
  367. src += src_pixels_per_line;
  368. d24u8 = vld1_u8(src);
  369. src += src_pixels_per_line;
  370. d25u8 = vld1_u8(src);
  371. src += src_pixels_per_line;
  372. d26u8 = vld1_u8(src);
  373. src += src_pixels_per_line;
  374. d27u8 = vld1_u8(src);
  375. src += src_pixels_per_line;
  376. d28u8 = vld1_u8(src);
  377. src += src_pixels_per_line;
  378. d29u8 = vld1_u8(src);
  379. src += src_pixels_per_line;
  380. d30u8 = vld1_u8(src);
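// Vertical 6-tap filter: output row r is the weighted sum of source rows
// r..r+5 in d22u8..d30u8. Taps 1 and 4 are subtracted (vmlsl) because their
// coefficients are negative; tap 3 is accumulated separately so the final
// combine can use saturating addition (vqaddq_s16).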
  381. q3u16 = vmull_u8(d22u8, d0u8);
  382. q4u16 = vmull_u8(d23u8, d0u8);
  383. q5u16 = vmull_u8(d24u8, d0u8);
  384. q6u16 = vmull_u8(d25u8, d0u8);
  385. q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
  386. q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
  387. q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
  388. q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
  389. q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
  390. q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
  391. q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
  392. q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
  393. q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
  394. q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
  395. q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
  396. q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
  397. q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
  398. q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
  399. q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
  400. q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
  401. q7u16 = vmull_u8(d25u8, d3u8);
  402. q8u16 = vmull_u8(d26u8, d3u8);
  403. q9u16 = vmull_u8(d27u8, d3u8);
  404. q10u16 = vmull_u8(d28u8, d3u8);
  405. q3s16 = vreinterpretq_s16_u16(q3u16);
  406. q4s16 = vreinterpretq_s16_u16(q4u16);
  407. q5s16 = vreinterpretq_s16_u16(q5u16);
  408. q6s16 = vreinterpretq_s16_u16(q6u16);
  409. q7s16 = vreinterpretq_s16_u16(q7u16);
  410. q8s16 = vreinterpretq_s16_u16(q8u16);
  411. q9s16 = vreinterpretq_s16_u16(q9u16);
  412. q10s16 = vreinterpretq_s16_u16(q10u16);
  413. q7s16 = vqaddq_s16(q7s16, q3s16);
  414. q8s16 = vqaddq_s16(q8s16, q4s16);
  415. q9s16 = vqaddq_s16(q9s16, q5s16);
  416. q10s16 = vqaddq_s16(q10s16, q6s16);
  417. d6u8 = vqrshrun_n_s16(q7s16, 7);
  418. d7u8 = vqrshrun_n_s16(q8s16, 7);
  419. d8u8 = vqrshrun_n_s16(q9s16, 7);
  420. d9u8 = vqrshrun_n_s16(q10s16, 7);
  421. vst1_u8(dst_ptr, d6u8);
  422. dst_ptr += dst_pitch;
  423. vst1_u8(dst_ptr, d7u8);
  424. dst_ptr += dst_pitch;
  425. vst1_u8(dst_ptr, d8u8);
  426. dst_ptr += dst_pitch;
  427. vst1_u8(dst_ptr, d9u8);
  428. return;
  429. }
  430. // load first_pass filter
  431. dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  432. d0s8 = vdup_lane_s8(dtmps8, 0);
  433. d1s8 = vdup_lane_s8(dtmps8, 1);
  434. d2s8 = vdup_lane_s8(dtmps8, 2);
  435. d3s8 = vdup_lane_s8(dtmps8, 3);
  436. d4s8 = vdup_lane_s8(dtmps8, 4);
  437. d5s8 = vdup_lane_s8(dtmps8, 5);
  438. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  439. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  440. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  441. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  442. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  443. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  444. // First pass: output_height lines x output_width columns (9x4)
  445. if (yoffset == 0) // firstpass_filter8x4_only
  446. src = src_ptr - 2;
  447. else
  448. src = src_ptr - 2 - (src_pixels_per_line * 2);
  449. q3u8 = vld1q_u8(src);
  450. src += src_pixels_per_line;
  451. q4u8 = vld1q_u8(src);
  452. src += src_pixels_per_line;
  453. q5u8 = vld1q_u8(src);
  454. src += src_pixels_per_line;
  455. q6u8 = vld1q_u8(src);
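// Horizontal 6-tap filter: with src starting two columns before the block,
// vext_u8(low, high, j) yields bytes j..j+7, i.e. tap j's input for the eight
// output pixels; taps 1 and 4 are subtracted, and tap 3 goes to the separate
// accumulators q3u16..q6u16 for the saturating combine.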
  456. q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  457. q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  458. q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  459. q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  460. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  461. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  462. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  463. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  464. q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
  465. q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
  466. q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
  467. q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
  468. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  469. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  470. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  471. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  472. q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
  473. q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
  474. q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
  475. q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
  476. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  477. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  478. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  479. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  480. q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
  481. q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
  482. q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
  483. q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
  484. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  485. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  486. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  487. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  488. q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
  489. q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
  490. q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
  491. q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
  492. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  493. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  494. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  495. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  496. q3u16 = vmull_u8(d28u8, d3u8);
  497. q4u16 = vmull_u8(d29u8, d3u8);
  498. q5u16 = vmull_u8(d30u8, d3u8);
  499. q6u16 = vmull_u8(d31u8, d3u8);
  500. q3s16 = vreinterpretq_s16_u16(q3u16);
  501. q4s16 = vreinterpretq_s16_u16(q4u16);
  502. q5s16 = vreinterpretq_s16_u16(q5u16);
  503. q6s16 = vreinterpretq_s16_u16(q6u16);
  504. q7s16 = vreinterpretq_s16_u16(q7u16);
  505. q8s16 = vreinterpretq_s16_u16(q8u16);
  506. q9s16 = vreinterpretq_s16_u16(q9u16);
  507. q10s16 = vreinterpretq_s16_u16(q10u16);
  508. q7s16 = vqaddq_s16(q7s16, q3s16);
  509. q8s16 = vqaddq_s16(q8s16, q4s16);
  510. q9s16 = vqaddq_s16(q9s16, q5s16);
  511. q10s16 = vqaddq_s16(q10s16, q6s16);
  512. d22u8 = vqrshrun_n_s16(q7s16, 7);
  513. d23u8 = vqrshrun_n_s16(q8s16, 7);
  514. d24u8 = vqrshrun_n_s16(q9s16, 7);
  515. d25u8 = vqrshrun_n_s16(q10s16, 7);
  516. if (yoffset == 0) { // firstpass_filter8x4_only
  517. vst1_u8(dst_ptr, d22u8);
  518. dst_ptr += dst_pitch;
  519. vst1_u8(dst_ptr, d23u8);
  520. dst_ptr += dst_pitch;
  521. vst1_u8(dst_ptr, d24u8);
  522. dst_ptr += dst_pitch;
  523. vst1_u8(dst_ptr, d25u8);
  524. return;
  525. }
  526. // First pass on the remaining 5 lines of data
  527. src += src_pixels_per_line;
  528. q3u8 = vld1q_u8(src);
  529. src += src_pixels_per_line;
  530. q4u8 = vld1q_u8(src);
  531. src += src_pixels_per_line;
  532. q5u8 = vld1q_u8(src);
  533. src += src_pixels_per_line;
  534. q6u8 = vld1q_u8(src);
  535. src += src_pixels_per_line;
  536. q7u8 = vld1q_u8(src);
  537. q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  538. q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  539. q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  540. q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  541. q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
  542. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  543. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  544. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  545. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  546. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
  547. q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  548. q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  549. q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  550. q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  551. q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
  552. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  553. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  554. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  555. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  556. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
  557. q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  558. q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  559. q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  560. q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  561. q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
  562. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  563. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  564. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  565. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  566. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
  567. q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  568. q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  569. q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  570. q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  571. q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
  572. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  573. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  574. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  575. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  576. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
  577. q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  578. q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  579. q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  580. q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  581. q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
  582. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  583. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  584. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  585. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  586. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
  587. q3u16 = vmull_u8(d27u8, d3u8);
  588. q4u16 = vmull_u8(d28u8, d3u8);
  589. q5u16 = vmull_u8(d29u8, d3u8);
  590. q6u16 = vmull_u8(d30u8, d3u8);
  591. q7u16 = vmull_u8(d31u8, d3u8);
  592. q3s16 = vreinterpretq_s16_u16(q3u16);
  593. q4s16 = vreinterpretq_s16_u16(q4u16);
  594. q5s16 = vreinterpretq_s16_u16(q5u16);
  595. q6s16 = vreinterpretq_s16_u16(q6u16);
  596. q7s16 = vreinterpretq_s16_u16(q7u16);
  597. q8s16 = vreinterpretq_s16_u16(q8u16);
  598. q9s16 = vreinterpretq_s16_u16(q9u16);
  599. q10s16 = vreinterpretq_s16_u16(q10u16);
  600. q11s16 = vreinterpretq_s16_u16(q11u16);
  601. q12s16 = vreinterpretq_s16_u16(q12u16);
  602. q8s16 = vqaddq_s16(q8s16, q3s16);
  603. q9s16 = vqaddq_s16(q9s16, q4s16);
  604. q10s16 = vqaddq_s16(q10s16, q5s16);
  605. q11s16 = vqaddq_s16(q11s16, q6s16);
  606. q12s16 = vqaddq_s16(q12s16, q7s16);
  607. d26u8 = vqrshrun_n_s16(q8s16, 7);
  608. d27u8 = vqrshrun_n_s16(q9s16, 7);
  609. d28u8 = vqrshrun_n_s16(q10s16, 7);
  610. d29u8 = vqrshrun_n_s16(q11s16, 7);
  611. d30u8 = vqrshrun_n_s16(q12s16, 7);
  612. // Second pass: 8x4
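// The second pass applies the same vertical filter as the xoffset == 0 path
// above, but to the nine first-pass rows now held in d22u8..d30u8.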
  613. dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  614. d0s8 = vdup_lane_s8(dtmps8, 0);
  615. d1s8 = vdup_lane_s8(dtmps8, 1);
  616. d2s8 = vdup_lane_s8(dtmps8, 2);
  617. d3s8 = vdup_lane_s8(dtmps8, 3);
  618. d4s8 = vdup_lane_s8(dtmps8, 4);
  619. d5s8 = vdup_lane_s8(dtmps8, 5);
  620. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  621. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  622. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  623. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  624. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  625. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  626. q3u16 = vmull_u8(d22u8, d0u8);
  627. q4u16 = vmull_u8(d23u8, d0u8);
  628. q5u16 = vmull_u8(d24u8, d0u8);
  629. q6u16 = vmull_u8(d25u8, d0u8);
  630. q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
  631. q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
  632. q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
  633. q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
  634. q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
  635. q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
  636. q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
  637. q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
  638. q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
  639. q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
  640. q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
  641. q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
  642. q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
  643. q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
  644. q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
  645. q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
  646. q7u16 = vmull_u8(d25u8, d3u8);
  647. q8u16 = vmull_u8(d26u8, d3u8);
  648. q9u16 = vmull_u8(d27u8, d3u8);
  649. q10u16 = vmull_u8(d28u8, d3u8);
  650. q3s16 = vreinterpretq_s16_u16(q3u16);
  651. q4s16 = vreinterpretq_s16_u16(q4u16);
  652. q5s16 = vreinterpretq_s16_u16(q5u16);
  653. q6s16 = vreinterpretq_s16_u16(q6u16);
  654. q7s16 = vreinterpretq_s16_u16(q7u16);
  655. q8s16 = vreinterpretq_s16_u16(q8u16);
  656. q9s16 = vreinterpretq_s16_u16(q9u16);
  657. q10s16 = vreinterpretq_s16_u16(q10u16);
  658. q7s16 = vqaddq_s16(q7s16, q3s16);
  659. q8s16 = vqaddq_s16(q8s16, q4s16);
  660. q9s16 = vqaddq_s16(q9s16, q5s16);
  661. q10s16 = vqaddq_s16(q10s16, q6s16);
  662. d6u8 = vqrshrun_n_s16(q7s16, 7);
  663. d7u8 = vqrshrun_n_s16(q8s16, 7);
  664. d8u8 = vqrshrun_n_s16(q9s16, 7);
  665. d9u8 = vqrshrun_n_s16(q10s16, 7);
  666. vst1_u8(dst_ptr, d6u8);
  667. dst_ptr += dst_pitch;
  668. vst1_u8(dst_ptr, d7u8);
  669. dst_ptr += dst_pitch;
  670. vst1_u8(dst_ptr, d8u8);
  671. dst_ptr += dst_pitch;
  672. vst1_u8(dst_ptr, d9u8);
  673. return;
  674. }
  675. void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
  676. int xoffset, int yoffset,
  677. unsigned char *dst_ptr, int dst_pitch) {
  678. unsigned char *src, *tmpp;
  679. unsigned char tmp[64];
  680. int i;
  681. uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  682. uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
  683. uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  684. int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  685. uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  686. uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  687. int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  688. int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  689. uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
  690. if (xoffset == 0) { // secondpass_filter8x8_only
  691. // load second_pass filter
  692. dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  693. d0s8 = vdup_lane_s8(dtmps8, 0);
  694. d1s8 = vdup_lane_s8(dtmps8, 1);
  695. d2s8 = vdup_lane_s8(dtmps8, 2);
  696. d3s8 = vdup_lane_s8(dtmps8, 3);
  697. d4s8 = vdup_lane_s8(dtmps8, 4);
  698. d5s8 = vdup_lane_s8(dtmps8, 5);
  699. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  700. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  701. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  702. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  703. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  704. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  705. // load src data
  706. src = src_ptr - src_pixels_per_line * 2;
  707. d18u8 = vld1_u8(src);
  708. src += src_pixels_per_line;
  709. d19u8 = vld1_u8(src);
  710. src += src_pixels_per_line;
  711. d20u8 = vld1_u8(src);
  712. src += src_pixels_per_line;
  713. d21u8 = vld1_u8(src);
  714. src += src_pixels_per_line;
  715. d22u8 = vld1_u8(src);
  716. src += src_pixels_per_line;
  717. d23u8 = vld1_u8(src);
  718. src += src_pixels_per_line;
  719. d24u8 = vld1_u8(src);
  720. src += src_pixels_per_line;
  721. d25u8 = vld1_u8(src);
  722. src += src_pixels_per_line;
  723. d26u8 = vld1_u8(src);
  724. src += src_pixels_per_line;
  725. d27u8 = vld1_u8(src);
  726. src += src_pixels_per_line;
  727. d28u8 = vld1_u8(src);
  728. src += src_pixels_per_line;
  729. d29u8 = vld1_u8(src);
  730. src += src_pixels_per_line;
  731. d30u8 = vld1_u8(src);
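// Thirteen source rows (d18u8..d30u8) cover the 8 output rows plus 5 rows of
// filter context. Each loop iteration filters four output rows, then slides
// the row window down by four.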
  732. for (i = 2; i > 0; i--) {
  733. q3u16 = vmull_u8(d18u8, d0u8);
  734. q4u16 = vmull_u8(d19u8, d0u8);
  735. q5u16 = vmull_u8(d20u8, d0u8);
  736. q6u16 = vmull_u8(d21u8, d0u8);
  737. q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
  738. q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
  739. q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
  740. q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
  741. q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
  742. q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
  743. q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
  744. q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
  745. q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
  746. q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
  747. q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
  748. q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
  749. q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
  750. q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
  751. q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
  752. q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
  753. q7u16 = vmull_u8(d21u8, d3u8);
  754. q8u16 = vmull_u8(d22u8, d3u8);
  755. q9u16 = vmull_u8(d23u8, d3u8);
  756. q10u16 = vmull_u8(d24u8, d3u8);
  757. q3s16 = vreinterpretq_s16_u16(q3u16);
  758. q4s16 = vreinterpretq_s16_u16(q4u16);
  759. q5s16 = vreinterpretq_s16_u16(q5u16);
  760. q6s16 = vreinterpretq_s16_u16(q6u16);
  761. q7s16 = vreinterpretq_s16_u16(q7u16);
  762. q8s16 = vreinterpretq_s16_u16(q8u16);
  763. q9s16 = vreinterpretq_s16_u16(q9u16);
  764. q10s16 = vreinterpretq_s16_u16(q10u16);
  765. q7s16 = vqaddq_s16(q7s16, q3s16);
  766. q8s16 = vqaddq_s16(q8s16, q4s16);
  767. q9s16 = vqaddq_s16(q9s16, q5s16);
  768. q10s16 = vqaddq_s16(q10s16, q6s16);
  769. d6u8 = vqrshrun_n_s16(q7s16, 7);
  770. d7u8 = vqrshrun_n_s16(q8s16, 7);
  771. d8u8 = vqrshrun_n_s16(q9s16, 7);
  772. d9u8 = vqrshrun_n_s16(q10s16, 7);
  773. d18u8 = d22u8;
  774. d19u8 = d23u8;
  775. d20u8 = d24u8;
  776. d21u8 = d25u8;
  777. d22u8 = d26u8;
  778. d23u8 = d27u8;
  779. d24u8 = d28u8;
  780. d25u8 = d29u8;
  781. d26u8 = d30u8;
  782. vst1_u8(dst_ptr, d6u8);
  783. dst_ptr += dst_pitch;
  784. vst1_u8(dst_ptr, d7u8);
  785. dst_ptr += dst_pitch;
  786. vst1_u8(dst_ptr, d8u8);
  787. dst_ptr += dst_pitch;
  788. vst1_u8(dst_ptr, d9u8);
  789. dst_ptr += dst_pitch;
  790. }
  791. return;
  792. }
  793. // load first_pass filter
  794. dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  795. d0s8 = vdup_lane_s8(dtmps8, 0);
  796. d1s8 = vdup_lane_s8(dtmps8, 1);
  797. d2s8 = vdup_lane_s8(dtmps8, 2);
  798. d3s8 = vdup_lane_s8(dtmps8, 3);
  799. d4s8 = vdup_lane_s8(dtmps8, 4);
  800. d5s8 = vdup_lane_s8(dtmps8, 5);
  801. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  802. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  803. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  804. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  805. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  806. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  807. // First pass: output_height lines x output_width columns (9x4)
  808. if (yoffset == 0) // firstpass_filter8x8_only
  809. src = src_ptr - 2;
  810. else
  811. src = src_ptr - 2 - (src_pixels_per_line * 2);
  812. tmpp = tmp;
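// When both passes run, the first-pass results (8 rows of 8 pixels) are staged
// in tmp[]; when yoffset == 0 they are written straight to dst_ptr inside the
// loop instead.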
  813. for (i = 2; i > 0; i--) {
  814. q3u8 = vld1q_u8(src);
  815. src += src_pixels_per_line;
  816. q4u8 = vld1q_u8(src);
  817. src += src_pixels_per_line;
  818. q5u8 = vld1q_u8(src);
  819. src += src_pixels_per_line;
  820. q6u8 = vld1q_u8(src);
  821. src += src_pixels_per_line;
  822. __builtin_prefetch(src);
  823. __builtin_prefetch(src + src_pixels_per_line);
  824. __builtin_prefetch(src + src_pixels_per_line * 2);
  825. q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  826. q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  827. q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  828. q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  829. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  830. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  831. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  832. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  833. q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
  834. q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
  835. q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
  836. q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
  837. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  838. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  839. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  840. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  841. q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
  842. q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
  843. q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
  844. q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
  845. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  846. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  847. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  848. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  849. q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
  850. q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
  851. q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
  852. q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
  853. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  854. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  855. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  856. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  857. q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
  858. q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
  859. q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
  860. q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
  861. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  862. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  863. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  864. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  865. q3u16 = vmull_u8(d28u8, d3u8);
  866. q4u16 = vmull_u8(d29u8, d3u8);
  867. q5u16 = vmull_u8(d30u8, d3u8);
  868. q6u16 = vmull_u8(d31u8, d3u8);
  869. q3s16 = vreinterpretq_s16_u16(q3u16);
  870. q4s16 = vreinterpretq_s16_u16(q4u16);
  871. q5s16 = vreinterpretq_s16_u16(q5u16);
  872. q6s16 = vreinterpretq_s16_u16(q6u16);
  873. q7s16 = vreinterpretq_s16_u16(q7u16);
  874. q8s16 = vreinterpretq_s16_u16(q8u16);
  875. q9s16 = vreinterpretq_s16_u16(q9u16);
  876. q10s16 = vreinterpretq_s16_u16(q10u16);
  877. q7s16 = vqaddq_s16(q7s16, q3s16);
  878. q8s16 = vqaddq_s16(q8s16, q4s16);
  879. q9s16 = vqaddq_s16(q9s16, q5s16);
  880. q10s16 = vqaddq_s16(q10s16, q6s16);
  881. d22u8 = vqrshrun_n_s16(q7s16, 7);
  882. d23u8 = vqrshrun_n_s16(q8s16, 7);
  883. d24u8 = vqrshrun_n_s16(q9s16, 7);
  884. d25u8 = vqrshrun_n_s16(q10s16, 7);
  885. if (yoffset == 0) { // firstpass_filter8x8_only
  886. vst1_u8(dst_ptr, d22u8);
  887. dst_ptr += dst_pitch;
  888. vst1_u8(dst_ptr, d23u8);
  889. dst_ptr += dst_pitch;
  890. vst1_u8(dst_ptr, d24u8);
  891. dst_ptr += dst_pitch;
  892. vst1_u8(dst_ptr, d25u8);
  893. dst_ptr += dst_pitch;
  894. } else {
  895. vst1_u8(tmpp, d22u8);
  896. tmpp += 8;
  897. vst1_u8(tmpp, d23u8);
  898. tmpp += 8;
  899. vst1_u8(tmpp, d24u8);
  900. tmpp += 8;
  901. vst1_u8(tmpp, d25u8);
  902. tmpp += 8;
  903. }
  904. }
  905. if (yoffset == 0) return;
  906. // First pass on the remaining 5 lines of data
  907. q3u8 = vld1q_u8(src);
  908. src += src_pixels_per_line;
  909. q4u8 = vld1q_u8(src);
  910. src += src_pixels_per_line;
  911. q5u8 = vld1q_u8(src);
  912. src += src_pixels_per_line;
  913. q6u8 = vld1q_u8(src);
  914. src += src_pixels_per_line;
  915. q7u8 = vld1q_u8(src);
  916. q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  917. q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  918. q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  919. q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  920. q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
  921. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  922. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  923. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  924. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  925. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
  926. q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  927. q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  928. q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  929. q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  930. q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
  931. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  932. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  933. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  934. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  935. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
  936. q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  937. q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  938. q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  939. q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  940. q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
  941. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  942. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  943. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  944. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  945. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
  946. q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  947. q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  948. q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  949. q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  950. q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
  951. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  952. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  953. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  954. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  955. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
  956. q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  957. q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  958. q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  959. q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  960. q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
  961. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  962. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  963. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  964. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  965. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
  966. q3u16 = vmull_u8(d27u8, d3u8);
  967. q4u16 = vmull_u8(d28u8, d3u8);
  968. q5u16 = vmull_u8(d29u8, d3u8);
  969. q6u16 = vmull_u8(d30u8, d3u8);
  970. q7u16 = vmull_u8(d31u8, d3u8);
  971. q3s16 = vreinterpretq_s16_u16(q3u16);
  972. q4s16 = vreinterpretq_s16_u16(q4u16);
  973. q5s16 = vreinterpretq_s16_u16(q5u16);
  974. q6s16 = vreinterpretq_s16_u16(q6u16);
  975. q7s16 = vreinterpretq_s16_u16(q7u16);
  976. q8s16 = vreinterpretq_s16_u16(q8u16);
  977. q9s16 = vreinterpretq_s16_u16(q9u16);
  978. q10s16 = vreinterpretq_s16_u16(q10u16);
  979. q11s16 = vreinterpretq_s16_u16(q11u16);
  980. q12s16 = vreinterpretq_s16_u16(q12u16);
  981. q8s16 = vqaddq_s16(q8s16, q3s16);
  982. q9s16 = vqaddq_s16(q9s16, q4s16);
  983. q10s16 = vqaddq_s16(q10s16, q5s16);
  984. q11s16 = vqaddq_s16(q11s16, q6s16);
  985. q12s16 = vqaddq_s16(q12s16, q7s16);
  986. d26u8 = vqrshrun_n_s16(q8s16, 7);
  987. d27u8 = vqrshrun_n_s16(q9s16, 7);
  988. d28u8 = vqrshrun_n_s16(q10s16, 7);
  989. d29u8 = vqrshrun_n_s16(q11s16, 7);
  990. d30u8 = vqrshrun_n_s16(q12s16, 7);
  991. // Second pass: 8x8
  992. dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  993. d0s8 = vdup_lane_s8(dtmps8, 0);
  994. d1s8 = vdup_lane_s8(dtmps8, 1);
  995. d2s8 = vdup_lane_s8(dtmps8, 2);
  996. d3s8 = vdup_lane_s8(dtmps8, 3);
  997. d4s8 = vdup_lane_s8(dtmps8, 4);
  998. d5s8 = vdup_lane_s8(dtmps8, 5);
  999. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  1000. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  1001. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  1002. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  1003. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  1004. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  1005. tmpp = tmp;
  1006. q9u8 = vld1q_u8(tmpp);
  1007. tmpp += 16;
  1008. q10u8 = vld1q_u8(tmpp);
  1009. tmpp += 16;
  1010. q11u8 = vld1q_u8(tmpp);
  1011. tmpp += 16;
  1012. q12u8 = vld1q_u8(tmpp);
  1013. d18u8 = vget_low_u8(q9u8);
  1014. d19u8 = vget_high_u8(q9u8);
  1015. d20u8 = vget_low_u8(q10u8);
  1016. d21u8 = vget_high_u8(q10u8);
  1017. d22u8 = vget_low_u8(q11u8);
  1018. d23u8 = vget_high_u8(q11u8);
  1019. d24u8 = vget_low_u8(q12u8);
  1020. d25u8 = vget_high_u8(q12u8);
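// d18u8..d25u8 reload the eight rows staged in tmp[]; d26u8..d30u8 still hold
// the five extra first-pass rows, completing the 13 rows the vertical pass
// needs.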
  1021. for (i = 2; i > 0; i--) {
  1022. q3u16 = vmull_u8(d18u8, d0u8);
  1023. q4u16 = vmull_u8(d19u8, d0u8);
  1024. q5u16 = vmull_u8(d20u8, d0u8);
  1025. q6u16 = vmull_u8(d21u8, d0u8);
  1026. q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
  1027. q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
  1028. q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
  1029. q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
  1030. q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
  1031. q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
  1032. q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
  1033. q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
  1034. q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
  1035. q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
  1036. q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
  1037. q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
  1038. q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
  1039. q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
  1040. q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
  1041. q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
  1042. q7u16 = vmull_u8(d21u8, d3u8);
  1043. q8u16 = vmull_u8(d22u8, d3u8);
  1044. q9u16 = vmull_u8(d23u8, d3u8);
  1045. q10u16 = vmull_u8(d24u8, d3u8);
  1046. q3s16 = vreinterpretq_s16_u16(q3u16);
  1047. q4s16 = vreinterpretq_s16_u16(q4u16);
  1048. q5s16 = vreinterpretq_s16_u16(q5u16);
  1049. q6s16 = vreinterpretq_s16_u16(q6u16);
  1050. q7s16 = vreinterpretq_s16_u16(q7u16);
  1051. q8s16 = vreinterpretq_s16_u16(q8u16);
  1052. q9s16 = vreinterpretq_s16_u16(q9u16);
  1053. q10s16 = vreinterpretq_s16_u16(q10u16);
  1054. q7s16 = vqaddq_s16(q7s16, q3s16);
  1055. q8s16 = vqaddq_s16(q8s16, q4s16);
  1056. q9s16 = vqaddq_s16(q9s16, q5s16);
  1057. q10s16 = vqaddq_s16(q10s16, q6s16);
  1058. d6u8 = vqrshrun_n_s16(q7s16, 7);
  1059. d7u8 = vqrshrun_n_s16(q8s16, 7);
  1060. d8u8 = vqrshrun_n_s16(q9s16, 7);
  1061. d9u8 = vqrshrun_n_s16(q10s16, 7);
  1062. d18u8 = d22u8;
  1063. d19u8 = d23u8;
  1064. d20u8 = d24u8;
  1065. d21u8 = d25u8;
  1066. d22u8 = d26u8;
  1067. d23u8 = d27u8;
  1068. d24u8 = d28u8;
  1069. d25u8 = d29u8;
  1070. d26u8 = d30u8;
  1071. vst1_u8(dst_ptr, d6u8);
  1072. dst_ptr += dst_pitch;
  1073. vst1_u8(dst_ptr, d7u8);
  1074. dst_ptr += dst_pitch;
  1075. vst1_u8(dst_ptr, d8u8);
  1076. dst_ptr += dst_pitch;
  1077. vst1_u8(dst_ptr, d9u8);
  1078. dst_ptr += dst_pitch;
  1079. }
  1080. return;
  1081. }
  1082. void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
  1083. int src_pixels_per_line, int xoffset,
  1084. int yoffset, unsigned char *dst_ptr,
  1085. int dst_pitch) {
  1086. unsigned char *src, *src_tmp, *dst, *tmpp;
  1087. unsigned char tmp[336];
  1088. int i, j;
  1089. uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  1090. uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
  1091. uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
  1092. uint8x8_t d28u8, d29u8, d30u8, d31u8;
  1093. int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  1094. uint8x16_t q3u8, q4u8;
  1095. uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
  1096. uint16x8_t q11u16, q12u16, q13u16, q15u16;
  1097. int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
  1098. int16x8_t q11s16, q12s16, q13s16, q15s16;
  1099. if (xoffset == 0) { // secondpass_filter16x16_only
  1100. // load second_pass filter
  1101. dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  1102. d0s8 = vdup_lane_s8(dtmps8, 0);
  1103. d1s8 = vdup_lane_s8(dtmps8, 1);
  1104. d2s8 = vdup_lane_s8(dtmps8, 2);
  1105. d3s8 = vdup_lane_s8(dtmps8, 3);
  1106. d4s8 = vdup_lane_s8(dtmps8, 4);
  1107. d5s8 = vdup_lane_s8(dtmps8, 5);
  1108. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  1109. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  1110. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  1111. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  1112. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  1113. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  1114. // load src data
  1115. src_tmp = src_ptr - src_pixels_per_line * 2;
  1116. for (i = 0; i < 2; ++i) {
  1117. src = src_tmp + i * 8;
  1118. dst = dst_ptr + i * 8;
  1119. d18u8 = vld1_u8(src);
  1120. src += src_pixels_per_line;
  1121. d19u8 = vld1_u8(src);
  1122. src += src_pixels_per_line;
  1123. d20u8 = vld1_u8(src);
  1124. src += src_pixels_per_line;
  1125. d21u8 = vld1_u8(src);
  1126. src += src_pixels_per_line;
  1127. d22u8 = vld1_u8(src);
  1128. src += src_pixels_per_line;
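// The 16-wide block is filtered as two 8-wide halves (outer i loop). The five
// rows loaded above provide the vertical context; each j iteration loads four
// more rows, writes four output rows, and slides the row window down by four.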
  1129. for (j = 0; j < 4; ++j) {
  1130. d23u8 = vld1_u8(src);
  1131. src += src_pixels_per_line;
  1132. d24u8 = vld1_u8(src);
  1133. src += src_pixels_per_line;
  1134. d25u8 = vld1_u8(src);
  1135. src += src_pixels_per_line;
  1136. d26u8 = vld1_u8(src);
  1137. src += src_pixels_per_line;
  1138. q3u16 = vmull_u8(d18u8, d0u8);
  1139. q4u16 = vmull_u8(d19u8, d0u8);
  1140. q5u16 = vmull_u8(d20u8, d0u8);
  1141. q6u16 = vmull_u8(d21u8, d0u8);
  1142. q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
  1143. q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
  1144. q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
  1145. q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
  1146. q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
  1147. q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
  1148. q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
  1149. q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
  1150. q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
  1151. q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
  1152. q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
  1153. q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
  1154. q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
  1155. q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
  1156. q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
  1157. q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
  1158. q7u16 = vmull_u8(d21u8, d3u8);
  1159. q8u16 = vmull_u8(d22u8, d3u8);
  1160. q9u16 = vmull_u8(d23u8, d3u8);
  1161. q10u16 = vmull_u8(d24u8, d3u8);
  1162. q3s16 = vreinterpretq_s16_u16(q3u16);
  1163. q4s16 = vreinterpretq_s16_u16(q4u16);
  1164. q5s16 = vreinterpretq_s16_u16(q5u16);
  1165. q6s16 = vreinterpretq_s16_u16(q6u16);
  1166. q7s16 = vreinterpretq_s16_u16(q7u16);
  1167. q8s16 = vreinterpretq_s16_u16(q8u16);
  1168. q9s16 = vreinterpretq_s16_u16(q9u16);
  1169. q10s16 = vreinterpretq_s16_u16(q10u16);
  1170. q7s16 = vqaddq_s16(q7s16, q3s16);
  1171. q8s16 = vqaddq_s16(q8s16, q4s16);
  1172. q9s16 = vqaddq_s16(q9s16, q5s16);
  1173. q10s16 = vqaddq_s16(q10s16, q6s16);
  1174. d6u8 = vqrshrun_n_s16(q7s16, 7);
  1175. d7u8 = vqrshrun_n_s16(q8s16, 7);
  1176. d8u8 = vqrshrun_n_s16(q9s16, 7);
  1177. d9u8 = vqrshrun_n_s16(q10s16, 7);
  1178. d18u8 = d22u8;
  1179. d19u8 = d23u8;
  1180. d20u8 = d24u8;
  1181. d21u8 = d25u8;
  1182. d22u8 = d26u8;
  1183. vst1_u8(dst, d6u8);
  1184. dst += dst_pitch;
  1185. vst1_u8(dst, d7u8);
  1186. dst += dst_pitch;
  1187. vst1_u8(dst, d8u8);
  1188. dst += dst_pitch;
  1189. vst1_u8(dst, d9u8);
  1190. dst += dst_pitch;
  1191. }
  1192. }
  1193. return;
  1194. }
  1195. // load first_pass filter
  1196. dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  1197. d0s8 = vdup_lane_s8(dtmps8, 0);
  1198. d1s8 = vdup_lane_s8(dtmps8, 1);
  1199. d2s8 = vdup_lane_s8(dtmps8, 2);
  1200. d3s8 = vdup_lane_s8(dtmps8, 3);
  1201. d4s8 = vdup_lane_s8(dtmps8, 4);
  1202. d5s8 = vdup_lane_s8(dtmps8, 5);
  1203. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  1204. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  1205. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  1206. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  1207. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  1208. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  if (yoffset == 0) {  // firstpass_filter16x16_only: 16 lines x 16 columns
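    // Horizontal-only filtering, two output rows per iteration.  Each row is
    // read as three 8-byte loads (24 bytes starting at x - 2) and vext_u8
    // builds the shifted pixel vectors at offsets 1..5 that the six-tap
    // window needs.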
    src = src_ptr - 2;
    dst = dst_ptr;
    for (i = 0; i < 8; ++i) {
      d6u8 = vld1_u8(src);
      d7u8 = vld1_u8(src + 8);
      d8u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;
      d9u8 = vld1_u8(src);
      d10u8 = vld1_u8(src + 8);
      d11u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;
      __builtin_prefetch(src);
      __builtin_prefetch(src + src_pixels_per_line);
      q6u16 = vmull_u8(d6u8, d0u8);
      q7u16 = vmull_u8(d7u8, d0u8);
      q8u16 = vmull_u8(d9u8, d0u8);
      q9u16 = vmull_u8(d10u8, d0u8);
      d20u8 = vext_u8(d6u8, d7u8, 1);
      d21u8 = vext_u8(d9u8, d10u8, 1);
      d22u8 = vext_u8(d7u8, d8u8, 1);
      d23u8 = vext_u8(d10u8, d11u8, 1);
      d24u8 = vext_u8(d6u8, d7u8, 4);
      d25u8 = vext_u8(d9u8, d10u8, 4);
      d26u8 = vext_u8(d7u8, d8u8, 4);
      d27u8 = vext_u8(d10u8, d11u8, 4);
      d28u8 = vext_u8(d6u8, d7u8, 5);
      d29u8 = vext_u8(d9u8, d10u8, 5);
      q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
      q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
      q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
      q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
      q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
      q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
      q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
      q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
      q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
      d20u8 = vext_u8(d7u8, d8u8, 5);
      d21u8 = vext_u8(d10u8, d11u8, 5);
      d22u8 = vext_u8(d6u8, d7u8, 2);
      d23u8 = vext_u8(d9u8, d10u8, 2);
      d24u8 = vext_u8(d7u8, d8u8, 2);
      d25u8 = vext_u8(d10u8, d11u8, 2);
      d26u8 = vext_u8(d6u8, d7u8, 3);
      d27u8 = vext_u8(d9u8, d10u8, 3);
      d28u8 = vext_u8(d7u8, d8u8, 3);
      d29u8 = vext_u8(d10u8, d11u8, 3);
      q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
      q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
      q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
      q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
      q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
      q10u16 = vmull_u8(d26u8, d3u8);
      q11u16 = vmull_u8(d27u8, d3u8);
      q12u16 = vmull_u8(d28u8, d3u8);
      q15u16 = vmull_u8(d29u8, d3u8);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);
      q11s16 = vreinterpretq_s16_u16(q11u16);
      q12s16 = vreinterpretq_s16_u16(q12u16);
      q15s16 = vreinterpretq_s16_u16(q15u16);
      q6s16 = vqaddq_s16(q6s16, q10s16);
      q8s16 = vqaddq_s16(q8s16, q11s16);
      q7s16 = vqaddq_s16(q7s16, q12s16);
      q9s16 = vqaddq_s16(q9s16, q15s16);
      d6u8 = vqrshrun_n_s16(q6s16, 7);
      d7u8 = vqrshrun_n_s16(q7s16, 7);
      d8u8 = vqrshrun_n_s16(q8s16, 7);
      d9u8 = vqrshrun_n_s16(q9s16, 7);
      q3u8 = vcombine_u8(d6u8, d7u8);
      q4u8 = vcombine_u8(d8u8, d9u8);
      vst1q_u8(dst, q3u8);
      dst += dst_pitch;
      vst1q_u8(dst, q4u8);
      dst += dst_pitch;
    }
    return;
  }
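
  // First pass: filter horizontally into tmp, 21 rows x 16 columns (the 16
  // output rows plus 5 extra rows for the vertical second pass).  Each
  // iteration below consumes three source rows with the same three-load/vext
  // scheme as above and writes 48 bytes to tmp.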
  src = src_ptr - 2 - src_pixels_per_line * 2;
  tmpp = tmp;
  for (i = 0; i < 7; ++i) {
    d6u8 = vld1_u8(src);
    d7u8 = vld1_u8(src + 8);
    d8u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d9u8 = vld1_u8(src);
    d10u8 = vld1_u8(src + 8);
    d11u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d12u8 = vld1_u8(src);
    d13u8 = vld1_u8(src + 8);
    d14u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);
    q8u16 = vmull_u8(d6u8, d0u8);
    q9u16 = vmull_u8(d7u8, d0u8);
    q10u16 = vmull_u8(d9u8, d0u8);
    q11u16 = vmull_u8(d10u8, d0u8);
    q12u16 = vmull_u8(d12u8, d0u8);
    q13u16 = vmull_u8(d13u8, d0u8);
    d28u8 = vext_u8(d6u8, d7u8, 1);
    d29u8 = vext_u8(d9u8, d10u8, 1);
    d30u8 = vext_u8(d12u8, d13u8, 1);
    q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
    d28u8 = vext_u8(d7u8, d8u8, 1);
    d29u8 = vext_u8(d10u8, d11u8, 1);
    d30u8 = vext_u8(d13u8, d14u8, 1);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
    d28u8 = vext_u8(d6u8, d7u8, 4);
    d29u8 = vext_u8(d9u8, d10u8, 4);
    d30u8 = vext_u8(d12u8, d13u8, 4);
    q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
    d28u8 = vext_u8(d7u8, d8u8, 4);
    d29u8 = vext_u8(d10u8, d11u8, 4);
    d30u8 = vext_u8(d13u8, d14u8, 4);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
    d28u8 = vext_u8(d6u8, d7u8, 5);
    d29u8 = vext_u8(d9u8, d10u8, 5);
    d30u8 = vext_u8(d12u8, d13u8, 5);
    q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
    d28u8 = vext_u8(d7u8, d8u8, 5);
    d29u8 = vext_u8(d10u8, d11u8, 5);
    d30u8 = vext_u8(d13u8, d14u8, 5);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
    d28u8 = vext_u8(d6u8, d7u8, 2);
    d29u8 = vext_u8(d9u8, d10u8, 2);
    d30u8 = vext_u8(d12u8, d13u8, 2);
    q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
    d28u8 = vext_u8(d7u8, d8u8, 2);
    d29u8 = vext_u8(d10u8, d11u8, 2);
    d30u8 = vext_u8(d13u8, d14u8, 2);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
    d28u8 = vext_u8(d6u8, d7u8, 3);
    d29u8 = vext_u8(d9u8, d10u8, 3);
    d30u8 = vext_u8(d12u8, d13u8, 3);
    d15u8 = vext_u8(d7u8, d8u8, 3);
    d31u8 = vext_u8(d10u8, d11u8, 3);
    d6u8 = vext_u8(d13u8, d14u8, 3);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q12s16 = vqaddq_s16(q12s16, q6s16);
    q6u16 = vmull_u8(d15u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);
    q3u16 = vmull_u8(d6u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q13s16 = vreinterpretq_s16_u16(q13u16);
    q9s16 = vqaddq_s16(q9s16, q6s16);
    q11s16 = vqaddq_s16(q11s16, q7s16);
    q13s16 = vqaddq_s16(q13s16, q3s16);
    d6u8 = vqrshrun_n_s16(q8s16, 7);
    d7u8 = vqrshrun_n_s16(q9s16, 7);
    d8u8 = vqrshrun_n_s16(q10s16, 7);
    d9u8 = vqrshrun_n_s16(q11s16, 7);
    d10u8 = vqrshrun_n_s16(q12s16, 7);
    d11u8 = vqrshrun_n_s16(q13s16, 7);
    vst1_u8(tmpp, d6u8);
    tmpp += 8;
    vst1_u8(tmpp, d7u8);
    tmpp += 8;
    vst1_u8(tmpp, d8u8);
    tmpp += 8;
    vst1_u8(tmpp, d9u8);
    tmpp += 8;
    vst1_u8(tmpp, d10u8);
    tmpp += 8;
    vst1_u8(tmpp, d11u8);
    tmpp += 8;
  }
  // Second pass: 16x16
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
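
  // Vertical filtering from tmp (row stride 16) into dst, again as two
  // 8-wide strips with a five-row primed window and four output rows per
  // j iteration.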
  for (i = 0; i < 2; ++i) {
    dst = dst_ptr + 8 * i;
    tmpp = tmp + 8 * i;
    d18u8 = vld1_u8(tmpp);
    tmpp += 16;
    d19u8 = vld1_u8(tmpp);
    tmpp += 16;
    d20u8 = vld1_u8(tmpp);
    tmpp += 16;
    d21u8 = vld1_u8(tmpp);
    tmpp += 16;
    d22u8 = vld1_u8(tmpp);
    tmpp += 16;
    for (j = 0; j < 4; ++j) {
      d23u8 = vld1_u8(tmpp);
      tmpp += 16;
      d24u8 = vld1_u8(tmpp);
      tmpp += 16;
      d25u8 = vld1_u8(tmpp);
      tmpp += 16;
      d26u8 = vld1_u8(tmpp);
      tmpp += 16;
      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);
      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);
      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);
      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);
      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);
      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;
      vst1_u8(dst, d6u8);
      dst += dst_pitch;
      vst1_u8(dst, d7u8);
      dst += dst_pitch;
      vst1_u8(dst, d8u8);
      dst += dst_pitch;
      vst1_u8(dst, d9u8);
      dst += dst_pitch;
    }
  }
  return;
}