sub_pixel_variance_msa.c

/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"

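/* 2-tap bilinear filter coefficients for the eight 1/8-pel offsets; each
 * pair sums to 128 (1 << FILTER_BITS), so filtered values are normalized
 * with a rounding shift by FILTER_BITS. */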
static const uint8_t bilinear_filters_msa[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

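/* Accumulates the squared error between the bytes of 'src' and 'ref' into
 * the v4i32 accumulator 'var' and the signed pixel differences into the
 * v8i16 accumulator 'sub'. */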
#define CALC_MSE_AVG_B(src, ref, var, sub) \
  { \
    v16u8 src_l0_m, src_l1_m; \
    v8i16 res_l0_m, res_l1_m; \
 \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
 \
    sub += res_l0_m + res_l1_m; \
  }

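/* variance = sse - (sum * sum) / (width * height); 'shift' is
 * log2(width * height). The LARGE variant widens the squared sum to 64 bits
 * so large blocks cannot overflow 32-bit arithmetic. */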
#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

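/* Full-pel helpers for the compound (second-predictor averaging) variance:
 * the source block is averaged with 'sec_pred' and the SSE plus the sum of
 * differences against the reference block are accumulated. */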
static uint32_t avg_sse_diff_4width_msa(
    const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr,
    int32_t ref_stride, const uint8_t *sec_pred, int32_t height,
    int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_8width_msa(
    const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr,
    int32_t ref_stride, const uint8_t *sec_pred, int32_t height,
    int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_16width_msa(
    const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr,
    int32_t ref_stride, const uint8_t *sec_pred, int32_t height,
    int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32width_msa(
    const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr,
    int32_t ref_stride, const uint8_t *sec_pred, int32_t height,
    int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32x64_msa(
    const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr,
    int32_t ref_stride, const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };
  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }
  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x32_msa(
    const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr,
    int32_t ref_stride, const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };
  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }
  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x64_msa(
    const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr,
    int32_t ref_stride, const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };
  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }
  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

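/* Horizontal-only sub-pixel paths: every row is filtered with the 2-tap
 * bilinear filter loaded from 'filter' and rounded by FILTER_BITS before
 * the SSE and difference sums against 'dst' are accumulated. The 32- and
 * 64-wide versions run the 16-wide kernel across 16-pixel column tiles. */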
static uint32_t sub_pixel_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];
  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(
        src, src_stride, dst, dst_stride, filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];
  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(
        src, src_stride, dst, dst_stride, filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

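/* Vertical-only sub-pixel paths: consecutive rows are blended with the
 * 2-tap bilinear filter. One extra source row is read before the loop so
 * each iteration can produce four filtered rows. */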
static uint32_t sub_pixel_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  src0 = LD_UB(src);
  src += src_stride;
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  src0 = LD_UB(src);
  src += src_stride;
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  src0 = LD_UB(src);
  src += src_stride;
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
    src0 = src4;
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];
  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(
        src, src_stride, dst, dst_stride, filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];
  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(
        src, src_stride, dst, dst_stride, filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

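/* Combined horizontal/vertical sub-pixel paths: rows are first filtered
 * horizontally into 16-bit intermediates (hz_out*), then adjacent
 * intermediates are blended vertically with the second bilinear filter. */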
static uint32_t sub_pixel_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);
  src0 = LD_UB(src);
  src += src_stride;
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);
  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);
  LD_UB2(src, 8, src0, src1);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];
  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height,
        &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];
  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height,
        &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

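/* The sub_pixel_avg_* variants below repeat the three filtering paths but
 * additionally average the filtered block with 'sec_pred' (compound
 * prediction) before the SSE and difference sums are computed. */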
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

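/* 16-wide horizontal kernel shared by the 16/32/64-wide wrappers; 'width'
 * is the stride of the second-prediction buffer so the same code can walk
 * one 16-pixel column tile of a wider block. */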
static uint32_t subpel_avg_ssediff_16w_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
                tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
                tmp2, tmp3);
    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_h_msa(
      src, src_stride, dst, dst_stride, sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];
  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_h_msa(
        src, src_stride, dst, dst_stride, sec_pred, filter, height,
        &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];
  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_h_msa(
        src, src_stride, dst, dst_stride, sec_pred, filter, height,
        &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

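/* Vertical sub-pixel filtering combined with second-predictor averaging. */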
static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 out, pred, ref = { 0 };
  v16u8 src2110, src4332, filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  src0 = LD_UB(src);
  src += src_stride;
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, filt0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  src0 = LD_UB(src);
  src += src_stride;
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

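/* 16-wide vertical kernel; as with the horizontal version, 'width' is the
 * sec_pred stride so the 32- and 64-wide wrappers can reuse it for each
 * 16-pixel column tile. */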
static uint32_t subpel_avg_ssediff_16w_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);
  src0 = LD_UB(src);
  src += src_stride;
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);
    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_v_msa(
      src, src_stride, dst, dst_stride, sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];
  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_v_msa(
        src, src_stride, dst, dst_stride, sec_pred, filter, height,
        &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];
  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_v_msa(
        src, src_stride, dst, dst_stride, sec_pred, filter, height,
        &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

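/* Horizontal + vertical sub-pixel filtering combined with second-predictor
 * averaging; the 16-wide kernel with a configurable sec_pred stride follows
 * the 4- and 8-wide cases. */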
  1152. static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
  1153. const uint8_t *src, int32_t src_stride, const uint8_t *dst,
  1154. int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
  1155. const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  1156. int16_t filtval;
  1157. uint32_t loop_cnt;
  1158. uint32_t ref0, ref1, ref2, ref3;
  1159. v16u8 src0, src1, src2, src3, src4;
  1160. v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  1161. v16u8 filt_hz, filt_vt, vec0, vec1;
  1162. v16u8 out, pred, ref = { 0 };
  1163. v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  1164. v8i16 avg = { 0 };
  1165. v4i32 vec, var = { 0 };
  1166. filtval = LH(filter_horiz);
  1167. filt_hz = (v16u8)__msa_fill_h(filtval);
  1168. filtval = LH(filter_vert);
  1169. filt_vt = (v16u8)__msa_fill_h(filtval);
  1170. src0 = LD_UB(src);
  1171. src += src_stride;
  1172. for (loop_cnt = (height >> 2); loop_cnt--;) {
  1173. LD_UB4(src, src_stride, src1, src2, src3, src4);
  1174. src += (4 * src_stride);
  1175. pred = LD_UB(sec_pred);
  1176. sec_pred += 16;
  1177. LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
  1178. dst += (4 * dst_stride);
  1179. INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
  1180. hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  1181. hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  1182. hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  1183. hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  1184. hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
  1185. ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  1186. DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  1187. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  1188. out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  1189. out = __msa_aver_u_b(out, pred);
  1190. CALC_MSE_AVG_B(out, ref, var, avg);
  1191. src0 = src4;
  1192. }
  1193. vec = __msa_hadd_s_w(avg, avg);
  1194. *diff = HADD_SW_S32(vec);
  1195. return HADD_SW_S32(var);
  1196. }
static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);

    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
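/* Width-parameterised hv worker for one 16-pixel column; the 16-, 32- and
 * 64-wide wrappers below pass the full block width so sec_pred advances by
 * the stride of the second predictor. */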
static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
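/* Thin wrappers over the 16-wide hv worker for 16-, 32- and 64-wide blocks. */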
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
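/* The third argument to VARIANCE_WxH / VARIANCE_LARGE_WxH (defined earlier in
 * this file) is log2(wd * ht), i.e. the block's pixel count as a power of
 * two. */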
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
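/* Emits vpx_sub_pixel_variance<wd>x<ht>_msa(): dispatches on xoffset/yoffset
 * to the h-, v- or hv-filtered SSE/diff kernels, or falls back to plain
 * vpx_variance<wd>x<ht>_msa() when both offsets are zero. */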
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \
      const uint8_t *src, int32_t src_stride, int32_t xoffset, \
      int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \
      uint32_t *sse) { \
    int32_t diff; \
    uint32_t var; \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
 \
    if (yoffset) { \
      if (xoffset) { \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
      } \
 \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
    } else { \
      if (xoffset) { \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
 \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
      } else { \
        var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse); \
      } \
    } \
 \
    return var; \
  }
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
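/* Same dispatch as above, but every path first averages the filtered
 * prediction with the second predictor (sec_pred) before measuring
 * variance. */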
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
  uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
      uint32_t *sse, const uint8_t *sec_pred) { \
    int32_t diff; \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
 \
    if (yoffset) { \
      if (xoffset) { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
            v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff); \
      } \
    } else { \
      if (xoffset) { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff); \
      } else { \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
                                            ref_stride, sec_pred, ht, &diff); \
      } \
    } \
 \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
  }
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
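/* 32x64 is spelled out rather than generated by the macro above: its
 * zero-offset branch uses the dedicated avg_sse_diff_32x64_msa() helper
 * instead of the generic avg_sse_diff_32width_msa() dispatch. */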
uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t xoffset, int32_t yoffset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride, uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];

  if (yoffset) {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
          v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  v_filter, 64, &diff);
    }
  } else {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  h_filter, 64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}
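/* 64-wide blocks likewise get their own macro so the zero-offset branch can
 * call the dedicated avg_sse_diff_64x<ht>_msa() helpers. */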
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
      uint32_t *sse, const uint8_t *sec_pred) { \
    int32_t diff; \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
 \
    if (yoffset) { \
      if (xoffset) { \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
            v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff); \
      } \
    } else { \
      if (xoffset) { \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff); \
      } else { \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
                                          ref_stride, sec_pred, &diff); \
      } \
    } \
 \
    return VARIANCE_64Wx##ht##H(*sse, diff); \
  }
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);