
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
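
/* Returns the rounded average of the 64 pixels in the 8x8 block at src:
 * rows are summed with horizontal adds, reduced across lanes, then divided
 * by 64 with rounding (__msa_srari_w by 6). */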
uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
  v4u32 sum = { 0 };

  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
  sum0 += sum4;

  sum = __msa_hadd_u_w(sum0, sum0);
  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
  sum = __msa_hadd_u_w(sum0, sum0);
  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
  sum_out = __msa_copy_u_w((v4i32)sum, 0);

  return sum_out;
}
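
/* Returns the rounded average of the 16 pixels in the 4x4 block at src
 * (sum divided by 16 with rounding). */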
uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  uint32_t src0, src1, src2, src3;
  v16u8 vec = { 0 };
  v8u16 sum0;
  v4u32 sum1;
  v2u64 sum2;

  LW4(src, src_stride, src0, src1, src2, src3);
  INSERT_W4_UB(src0, src1, src2, src3, vec);

  sum0 = __msa_hadd_u_h(vec, vec);
  sum1 = __msa_hadd_u_w(sum0, sum0);
  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
  sum1 = __msa_hadd_u_w(sum0, sum0);
  sum2 = __msa_hadd_u_d(sum1, sum1);
  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
  sum_out = __msa_copy_u_w((v4i32)sum1, 0);

  return sum_out;
}
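
/* 8x8 Hadamard transform of the int16_t block at src (stride in elements),
 * written to the 64-element dst: three butterfly stages, a transpose, three
 * more stages, and a final transpose. */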
void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) {
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0,
                     src1, src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0,
                     src1, src2, src3, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
}
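
/* 16x16 Hadamard transform: each of the four 8x8 sub-blocks is transformed
 * into its own 64-coefficient slice of dst, then a final butterfly stage
 * (with a >> 1 normalization) combines the four slices in place. */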
void vpx_hadamard_16x16_msa(const int16_t *src, int src_stride, int16_t *dst) {
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;

  /* Rows 0..7: srcN holds the left 8 columns, src(N+8) the right 8. */
  LD_SH2(src, 8, src0, src8);
  src += src_stride;
  LD_SH2(src, 8, src1, src9);
  src += src_stride;
  LD_SH2(src, 8, src2, src10);
  src += src_stride;
  LD_SH2(src, 8, src3, src11);
  src += src_stride;
  LD_SH2(src, 8, src4, src12);
  src += src_stride;
  LD_SH2(src, 8, src5, src13);
  src += src_stride;
  LD_SH2(src, 8, src6, src14);
  src += src_stride;
  LD_SH2(src, 8, src7, src15);
  src += src_stride;

  /* Stage-1 butterflies for both top sub-blocks, then finish the top-left
     8x8 transform. */
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8,
              tmp10, tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0,
                     src1, src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  /* Note: src11 serves only as scratch for this transpose output; the next
     butterfly overwrites it before it is read again. */
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0,
                     src1, src2, src11, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8);
  /* Finish the top-right 8x8 transform (stage 1 was done above). */
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8,
              tmp15, tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15,
                     src8, src9, src10, src11, src12, src13, src14, src15);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8,
              tmp10, tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8,
              tmp15, tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15,
                     res0, res1, res2, res3, res4, res5, res6, res7);

  /* Rows 8..15, with the top-right result stored between the loads. */
  LD_SH2(src, 8, src0, src8);
  src += src_stride;
  LD_SH2(src, 8, src1, src9);
  src += src_stride;
  LD_SH2(src, 8, src2, src10);
  src += src_stride;
  LD_SH2(src, 8, src3, src11);
  src += src_stride;
  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8);
  LD_SH2(src, 8, src4, src12);
  src += src_stride;
  LD_SH2(src, 8, src5, src13);
  src += src_stride;
  LD_SH2(src, 8, src6, src14);
  src += src_stride;
  LD_SH2(src, 8, src7, src15);
  src += src_stride;

  /* Transform the two bottom 8x8 sub-blocks the same way. */
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8,
              tmp10, tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0,
                     src1, src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0,
                     src1, src2, src3, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8);
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8,
              tmp15, tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15,
                     src8, src9, src10, src11, src12, src13, src14, src15);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8,
              tmp10, tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8,
              tmp15, tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15,
                     res0, res1, res2, res3, res4, res5, res6, res7);
  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8);

  /* Final stage: combine the four 64-coefficient slices with one more
     butterfly pass, normalized by >> 1 (unrolled over four column groups). */
  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
}
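
/* Sum of absolute values of `length` transform coefficients. Lengths of 16,
 * 64, 256 and 1024 (4x4 through 32x32 blocks) take the vector paths; any
 * other length falls back to a scalar loop. */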
int vpx_satd_msa(const int16_t *data, int length) {
  int i, satd;
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
  v8i16 src8, src9, src10, src11, src12, src13, src14, src15;
  v8i16 zero = { 0 };
  v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h;
  v4u32 tmp0_w = { 0 };

  if (16 == length) {
    LD_SH2(data, 8, src0, src1);
    tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
    tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
    tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
    tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
    satd = HADD_UW_U32(tmp0_w);
  } else if (64 == length) {
    LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
    tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
    tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
    tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
    tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
    tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
    tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
    tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
    tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
    tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
    tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
    tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
    tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
    tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
    tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
    tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
    tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
    satd = HADD_UW_U32(tmp0_w);
  } else if (256 == length) {
    for (i = 0; i < 2; ++i) {
      LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
      data += 8 * 8;
      LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
      data += 8 * 8;
      tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
      tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
      tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
      tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
      tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
      tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
      tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
      tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
      tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
      tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
      tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
      tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
      tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
      tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
      tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
      tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
    }

    satd = HADD_UW_U32(tmp0_w);
  } else if (1024 == length) {
    for (i = 0; i < 8; ++i) {
      LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
      data += 8 * 8;
      LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
      data += 8 * 8;
      tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
      tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
      tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
      tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
      tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
      tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
      tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
      tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
      tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
      tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
      tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
      tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
      tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
      tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
      tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
      tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
    }

    satd = HADD_UW_U32(tmp0_w);
  } else {
    satd = 0;

    for (i = 0; i < length; ++i) {
      satd += abs(data[i]);
    }
  }

  return satd;
}
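
/* Fills hbuf[16] with normalized per-column sums over `height` rows of ref:
 * each entry is the column sum divided by height / 2. Heights other than
 * 16/32/64 take the scalar fallback. */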
void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
                         const int ref_stride, const int height) {
  int i;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v8i16 hbuf_r = { 0 };
  v8i16 hbuf_l = { 0 };
  v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
  v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;

  if (16 == height) {
    for (i = 2; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    SRA_2V(hbuf_r, hbuf_l, 3);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else if (32 == height) {
    for (i = 2; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    SRA_2V(hbuf_r, hbuf_l, 4);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else if (64 == height) {
    for (i = 4; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    SRA_2V(hbuf_r, hbuf_l, 5);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else {
    const int norm_factor = height >> 1;
    int cnt;

    for (cnt = 0; cnt < 16; cnt++) {
      hbuf[cnt] = 0;
    }

    for (i = 0; i < height; ++i) {
      for (cnt = 0; cnt < 16; cnt++) {
        hbuf[cnt] += ref[cnt];
      }

      ref += ref_stride;
    }

    for (cnt = 0; cnt < 16; cnt++) {
      hbuf[cnt] /= norm_factor;
    }
  }
}
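
/* Returns the sum of the first `width` pixels of the row at ref. */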
int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
  int16_t sum;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 ref0_h;

  if (16 == width) {
    ref0 = LD_UB(ref);
    ref0_h = __msa_hadd_u_h(ref0, ref0);
    sum = HADD_UH_U32(ref0_h);
  } else if (32 == width) {
    LD_UB2(ref, 16, ref0, ref1);
    ref0_h = __msa_hadd_u_h(ref0, ref0);
    ref0_h += __msa_hadd_u_h(ref1, ref1);
    sum = HADD_UH_U32(ref0_h);
  } else if (64 == width) {
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref0_h = __msa_hadd_u_h(ref0, ref0);
    ref0_h += __msa_hadd_u_h(ref1, ref1);
    ref0_h += __msa_hadd_u_h(ref2, ref2);
    ref0_h += __msa_hadd_u_h(ref3, ref3);
    sum = HADD_UH_U32(ref0_h);
  } else {
    int idx;

    sum = 0;
    for (idx = 0; idx < width; ++idx) {
      sum += ref[idx];
    }
  }

  return sum;
}
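
/* Unnormalized variance of the pointwise difference between two vectors of
 * 4 << bwl int16_t elements: sse - sum^2 / length, with the division folded
 * into the >> (bwl + 2) shift. */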
int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) {
  int sse, mean, var;
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2;
  v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m;
  v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m;
  v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m;
  v4i32 res_l7_m, mean_v;
  v2i64 sse_v;

  if (2 == bwl) {
    LD_SH2(src, 8, src0, src1);
    LD_SH2(ref, 8, ref0, ref1);

    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;

    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);
    mean = HADD_SW_S32(mean_v);
  } else if (3 == bwl) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    LD_SH4(ref, 8, ref0, ref1, ref2, ref3);

    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);
    mean = HADD_SW_S32(mean_v);
  } else if (4 == bwl) {
    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);

    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v += res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);
    mean = HADD_SW_S32(mean_v);
  } else {
    int i;
    const int width = 4 << bwl;

    sse = 0;
    mean = 0;

    for (i = 0; i < width; ++i) {
      const int diff = ref[i] - src[i];

      mean += diff;
      sse += diff * diff;
    }
  }

  var = sse - ((mean * mean) >> (bwl + 2));

  return var;
}
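
/* Computes the minimum and maximum absolute difference between the 8x8
 * blocks at s (stride p) and d (stride dp); the 64 byte differences are
 * reduced with successive min/max steps over element shifts of 8/4/2/1. */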
void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp,
                        int *min, int *max) {
  v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7;
  v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1;

  LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
  LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7);
  PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3);
  PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3);

  diff0 = __msa_asub_u_b(s0, d0);
  diff1 = __msa_asub_u_b(s1, d1);
  diff2 = __msa_asub_u_b(s2, d2);
  diff3 = __msa_asub_u_b(s3, d3);

  min0 = __msa_min_u_b(diff0, diff1);
  min1 = __msa_min_u_b(diff2, diff3);
  min0 = __msa_min_u_b(min0, min1);

  max0 = __msa_max_u_b(diff0, diff1);
  max1 = __msa_max_u_b(diff2, diff3);
  max0 = __msa_max_u_b(max0, max1);

  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8);
  min0 = __msa_min_u_b(min0, min1);
  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8);
  max0 = __msa_max_u_b(max0, max1);
  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4);
  min0 = __msa_min_u_b(min0, min1);
  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4);
  max0 = __msa_max_u_b(max0, max1);
  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2);
  min0 = __msa_min_u_b(min0, min1);
  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2);
  max0 = __msa_max_u_b(max0, max1);
  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1);
  min0 = __msa_min_u_b(min0, min1);
  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1);
  max0 = __msa_max_u_b(max0, max1);

  *min = min0[0];
  *max = max0[0];
}