/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
  10. #include <assert.h>
  11. #include "./vpx_dsp_rtcd.h"
  12. #include "vpx_dsp/mips/vpx_convolve_msa.h"
/* Byte shuffle-mask table for the MSA horizontal filters.  Each 16-byte row
 * is loaded as a vector shuffle control that gathers overlapping pixel pairs
 * for the per-output multiply/accumulate; rows 1 and 2 serve the 4-wide
 * kernels (which pack two picture rows per vector), row 0 the 8-wide ones. */
const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
/* 8-tap horizontal followed by 8-tap vertical convolution, 4 pixels wide.
 * Two picture rows are packed into each vector (4-width masks), and the main
 * loop produces 4 output rows per iteration, so height is expected to be a
 * multiple of 4.  filter_horiz/filter_vert each point at 8 signed 8-bit taps. */
static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]); /* 4-width shuffle row */
  /* step back 3 pixels and 3 rows so the 8-tap window is centered */
  src -= (3 + 3 * src_stride);

  /* rearranging filter: splat the 4 tap pairs into separate vectors */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* prologue: horizontally filter the first 7 rows (vertical-filter history) */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  /* derive the odd rows by sliding the even-row results */
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* interleave adjacent rows to form the vertical-filter operand pairs */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    /* round, saturate to 8-bit range, pack, and store 4 rows of 4 pixels */
    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* slide the vertical-filter window down by the 4 rows just consumed */
    hz_out5 = hz_out9;
    out0 = out2;
    out1 = out3;
    out2 = out4;
  }
}
/* 8-tap horizontal followed by 8-tap vertical convolution, 8 pixels wide.
 * One picture row per vector (8-width mask row); the main loop emits 4
 * output rows per iteration, so height is expected to be a multiple of 4. */
static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

  mask0 = LD_UB(&mc_filt_mask_arr[0]); /* 8-width shuffle row */
  /* step back 3 pixels and 3 rows so the 8-tap window is centered */
  src -= (3 + 3 * src_stride);

  /* rearranging filter: splat the 4 tap pairs into separate vectors */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* prologue: horizontally filter the first 7 rows (vertical-filter history) */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* interleave adjacent rows; two independent operand chains (even/odd rows) */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    XORI_B4_128_SB(src7, src8, src9, src10);

    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* round, saturate, pack, and store 4 rows of 8 pixels */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(vec0, vec1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* slide the vertical-filter window down by the 4 rows just consumed */
    hz_out6 = hz_out10;
    out0 = out2;
    out1 = out3;
    out2 = out8;
    out4 = out6;
    out5 = out7;
    out6 = out9;
  }
}
  164. static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
  165. uint8_t *dst, int32_t dst_stride,
  166. int8_t *filter_horiz, int8_t *filter_vert,
  167. int32_t height) {
  168. int32_t multiple8_cnt;
  169. for (multiple8_cnt = 2; multiple8_cnt--;) {
  170. common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
  171. filter_vert, height);
  172. src += 8;
  173. dst += 8;
  174. }
  175. }
  176. static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
  177. uint8_t *dst, int32_t dst_stride,
  178. int8_t *filter_horiz, int8_t *filter_vert,
  179. int32_t height) {
  180. int32_t multiple8_cnt;
  181. for (multiple8_cnt = 4; multiple8_cnt--;) {
  182. common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
  183. filter_vert, height);
  184. src += 8;
  185. dst += 8;
  186. }
  187. }
  188. static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
  189. uint8_t *dst, int32_t dst_stride,
  190. int8_t *filter_horiz, int8_t *filter_vert,
  191. int32_t height) {
  192. int32_t multiple8_cnt;
  193. for (multiple8_cnt = 8; multiple8_cnt--;) {
  194. common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
  195. filter_vert, height);
  196. src += 8;
  197. dst += 8;
  198. }
  199. }
/* 2-tap (bilinear) horizontal + vertical convolution for a 4x4 block.
 * filter_horiz/filter_vert each point at the two active taps; two picture
 * rows are packed per vector (4-width mask row). */
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

  mask = LD_SB(&mc_filt_mask_arr[16]); /* 4-width shuffle row */

  /* rearranging filter */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* 5 input rows produce 4 output rows for the 2-tap vertical filter */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  /* build the odd-row vectors by sliding/packing the even-row results */
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); /* round to 8-bit precision */
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
/* 2-tap (bilinear) horizontal + vertical convolution for a 4x8 block.
 * Same scheme as the 4x4 kernel but processes all 8 output rows in one
 * pass (9 input rows). */
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16i8 res0, res1, res2, res3;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]); /* 4-width shuffle row */

  /* rearranging filter */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* 9 input rows produce 8 output rows for the 2-tap vertical filter */
  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
  /* build the odd-row vectors by sliding/packing the even-row results */
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
              vec5, vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); /* round to 8-bit */
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
  262. static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
  263. uint8_t *dst, int32_t dst_stride,
  264. int8_t *filter_horiz, int8_t *filter_vert,
  265. int32_t height) {
  266. if (4 == height) {
  267. common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
  268. filter_vert);
  269. } else if (8 == height) {
  270. common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
  271. filter_vert);
  272. }
  273. }
/* 2-tap (bilinear) horizontal + vertical convolution for an 8x4 block.
 * One picture row per vector; hz_out0/hz_out1 ping-pong as the 2-row
 * vertical-filter window. */
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]); /* 8-width shuffle row */

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* 5 input rows produce 4 output rows for the 2-tap vertical filter */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); /* round to 8-bit */
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}
/* 2-tap (bilinear) horizontal + vertical convolution, 8 wide, for heights
 * that are multiples of 8 (loop runs height >> 3, emitting 8 rows per
 * iteration).  hz_out0/hz_out1 ping-pong as the 2-row vertical window. */
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          int8_t *filter_horiz,
                                          int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]); /* 8-width shuffle row */

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* prologue: first row seeds the vertical-filter history */
  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    /* first 4 of the 8 rows */
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    /* preload the next 4 rows while the current results are finished */
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp4 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* second 4 of the 8 rows */
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp5 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp6 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp7 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp8 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
  365. static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
  366. uint8_t *dst, int32_t dst_stride,
  367. int8_t *filter_horiz, int8_t *filter_vert,
  368. int32_t height) {
  369. if (4 == height) {
  370. common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
  371. filter_vert);
  372. } else {
  373. common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
  374. filter_horiz, filter_vert, height);
  375. }
  376. }
/* 2-tap (bilinear) horizontal + vertical convolution, 16 pixels wide.
 * Each picture row is handled as two 8-byte halves (srcN even = left half,
 * odd = right half); hz_out0..3 carry the 2-row vertical window for both
 * halves.  Emits 4 output rows per loop iteration (height multiple of 4). */
static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]); /* 8-width shuffle row */

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* prologue: first row (both halves) seeds the vertical-filter history */
  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;
  }
}
  430. static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
  431. uint8_t *dst, int32_t dst_stride,
  432. int8_t *filter_horiz, int8_t *filter_vert,
  433. int32_t height) {
  434. int32_t multiple8_cnt;
  435. for (multiple8_cnt = 2; multiple8_cnt--;) {
  436. common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
  437. filter_vert, height);
  438. src += 16;
  439. dst += 16;
  440. }
  441. }
  442. static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
  443. uint8_t *dst, int32_t dst_stride,
  444. int8_t *filter_horiz, int8_t *filter_vert,
  445. int32_t height) {
  446. int32_t multiple8_cnt;
  447. for (multiple8_cnt = 4; multiple8_cnt--;) {
  448. common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
  449. filter_vert, height);
  450. src += 16;
  451. dst += 16;
  452. }
  453. }
/* MSA entry point for the 8x8-kernel sub-pixel convolution.
 * Picks the cheap 2-tap (bilinear) path when both kernels have their first
 * tap pair zero, the full 8-tap H+V path when neither does, and falls back
 * to the C implementation for mixed 2-tap/8-tap kernels or unsupported
 * widths.  Only unit step (x_step_q4 == y_step_q4 == 16, i.e. no scaling)
 * is supported. */
void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int32_t x_step_q4, int y0_q4,
                       int32_t y_step_q4, int32_t w, int32_t h) {
  const int16_t *const filter_x = filter[x0_q4];
  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_hor[8], filt_ver[8];

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* NOTE(review): the int16 tap arrays are read through (const int32_t *),
   * a strict-aliasing violation in portable C, to compare a tap pair in one
   * load.  0x800000 corresponds to taps {0, 128}, i.e. an identity-style
   * kernel this path does not handle — TODO confirm against upstream. */
  assert(((const int32_t *)filter_x)[1] != 0x800000);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  /* narrow the 16-bit taps to the int8 form the MSA kernels consume */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
    filt_ver[cnt] = filter_y[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0 &&
      ((const int32_t *)filter_y)[0] == 0) {
    /* both kernels have taps 0..1 zero: bilinear path; &filt_hor[3] /
     * &filt_ver[3] point at the active tap pair */
    switch (w) {
      case 4:
        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 8:
        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 16:
        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 32:
        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 64:
        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else if (((const int32_t *)filter_x)[0] == 0 ||
             ((const int32_t *)filter_y)[0] == 0) {
    /* mixed 2-tap/8-tap combination: no dedicated MSA kernel, use C */
    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                    y0_q4, y_step_q4, w, h);
  } else {
    /* full 8-tap horizontal + vertical path */
    switch (w) {
      case 4:
        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 8:
        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 16:
        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 32:
        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 64:
        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}