variance_mmi.c

  1. /*
  2. * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "./vpx_dsp_rtcd.h"
  11. #include "vpx_dsp/variance.h"
  12. #include "vpx_ports/mem.h"
  13. #include "vpx/vpx_integer.h"
  14. #include "vpx_ports/asmdefs_mmi.h"
  15. static const uint8_t bilinear_filters[8][2] = {
  16. { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  17. { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
  18. };
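/* Note: each row's two taps sum to 128, and the row index selects the
   sub-pixel position used by the sub-pixel variance functions below. */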
  19. /* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and
  20. vpx_variance32x64; VARIANCE_SSE_SUM_8 would overflow the sum accumulation there. */
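/* Rough arithmetic behind that: VARIANCE_SSE_SUM_8 keeps the running pixel
   sums in 16-bit halfword lanes (paddh). For a 64x64 block each lane
   accumulates 16 pixels per row over 64 rows, i.e. up to 64 * 16 * 255 =
   261,120 (the 64x32 and 32x64 cases are similar), which is far outside
   16-bit range, so this W64 variant accumulates the sums in 32-bit words. */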
  21. #define VARIANCE_SSE_SUM_8_FOR_W64 \
  22. "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  23. "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  24. "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
  25. "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
  26. \
  27. /* sum */ \
  28. "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
  29. "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \
  30. "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
  31. "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
  32. "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \
  33. "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \
  34. "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \
  35. "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \
  36. "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
  37. "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
  38. "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \
  39. "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \
  40. "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \
  41. "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \
  42. "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \
  43. "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" \
  44. \
  45. /* *sse */ \
  46. "pmuluw %[ftmp1], %[ftmp3], %[ftmp3] \n\t" \
  47. "pmuluw %[ftmp2], %[ftmp5], %[ftmp5] \n\t" \
  48. "pmuluw %[ftmp7], %[ftmp4], %[ftmp4] \n\t" \
  49. "pmuluw %[ftmp8], %[ftmp6], %[ftmp6] \n\t" \
  50. "paddw %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
  51. "paddw %[ftmp10], %[ftmp10], %[ftmp2] \n\t" \
  52. "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
  53. "paddw %[ftmp10], %[ftmp10], %[ftmp8] \n\t" \
  54. "dsrl %[ftmp3], %[ftmp3], %[ftmp11] \n\t" \
  55. "dsrl %[ftmp5], %[ftmp5], %[ftmp11] \n\t" \
  56. "dsrl %[ftmp4], %[ftmp4], %[ftmp11] \n\t" \
  57. "dsrl %[ftmp6], %[ftmp6], %[ftmp11] \n\t" \
  58. "pmuluw %[ftmp1], %[ftmp3], %[ftmp3] \n\t" \
  59. "pmuluw %[ftmp2], %[ftmp5], %[ftmp5] \n\t" \
  60. "pmuluw %[ftmp7], %[ftmp4], %[ftmp4] \n\t" \
  61. "pmuluw %[ftmp8], %[ftmp6], %[ftmp6] \n\t" \
  62. "paddw %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
  63. "paddw %[ftmp10], %[ftmp10], %[ftmp2] \n\t" \
  64. "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
  65. "paddw %[ftmp10], %[ftmp10], %[ftmp8] \n\t"
  66. #define VARIANCE_SSE_SUM_4 \
  67. /* sse */ \
  68. "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  69. "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  70. "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \
  71. "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \
  72. \
  73. /* sum */ \
  74. "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  75. "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
  76. "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \
  77. "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
  78. #define VARIANCE_SSE_SUM_8 \
  79. /* sse */ \
  80. "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  81. "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  82. "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
  83. "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
  84. "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
  85. "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
  86. "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \
  87. \
  88. /* sum */ \
  89. "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  90. "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  91. "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
  92. "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
  93. "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \
  94. "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \
  95. "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \
  96. "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t"
  97. #define VARIANCE_SSE_8 \
  98. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
  99. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
  100. "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \
  101. "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \
  102. "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  103. "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  104. "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
  105. "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
  106. "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
  107. "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
  108. "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  109. #define VARIANCE_SSE_16 \
  110. VARIANCE_SSE_8 \
  111. "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
  112. "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
  113. "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \
  114. "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \
  115. "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  116. "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  117. "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
  118. "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
  119. "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
  120. "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
  121. "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
  122. #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \
  123. /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \
  124. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
  125. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
  126. "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
  127. "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
  128. "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
  129. "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  130. "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
  131. "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
  132. "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \
  133. "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
  134. "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  135. #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \
  136. /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \
  137. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
  138. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
  139. "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  140. "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
  141. "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
  142. "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
  143. "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
  144. "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
  145. "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
  146. "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
  147. "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
  148. #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \
  149. /* calculate: temp2[0] ~ temp2[3] */ \
  150. "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
  151. "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
  152. "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
  153. "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
  154. "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
  155. \
  156. /* store: temp2[0] ~ temp2[3] */ \
  157. "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
  158. "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
  159. "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
  160. #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \
  161. /* calculate: temp2[0] ~ temp2[3] */ \
  162. "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
  163. "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
  164. "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
  165. "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
  166. "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
  167. \
  168. /* store: temp2[0] ~ temp2[3] */ \
  169. "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
  170. "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
  171. "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t"
  172. #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
  173. /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
  174. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
  175. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
  176. "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
  177. "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  178. "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
  179. "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
  180. "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  181. "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
  182. "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
  183. "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \
  184. "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
  185. "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
  186. "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \
  187. "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
  188. "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
  189. "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
  190. "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
  191. "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t"
  192. #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
  193. /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
  194. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
  195. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
  196. "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \
  197. "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \
  198. "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
  199. "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
  200. "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
  201. "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
  202. "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \
  203. "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \
  204. "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
  205. "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
  206. "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \
  207. "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \
  208. "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \
  209. "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \
  210. "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
  211. "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t"
  212. #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
  213. /* calculate: temp2[0] ~ temp2[3] */ \
  214. "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
  215. "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
  216. "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \
  217. "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
  218. "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
  219. \
  220. /* calculate: temp2[4] ~ temp2[7] */ \
  221. "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \
  222. "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
  223. "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \
  224. "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
  225. "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \
  226. \
  227. /* store: temp2[0] ~ temp2[7] */ \
  228. "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
  229. "and %[ftmp3], %[ftmp3], %[mask] \n\t" \
  230. "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
  231. "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \
  232. "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
  233. #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
  234. /* calculate: temp2[0] ~ temp2[3] */ \
  235. "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \
  236. "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
  237. "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
  238. "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \
  239. "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
  240. \
  241. /* calculate: temp2[4] ~ temp2[7] */ \
  242. "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \
  243. "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
  244. "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \
  245. "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \
  246. "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \
  247. \
  248. /* store: temp2[0] ~ temp2[7] */ \
  249. "and %[ftmp8], %[ftmp8], %[mask] \n\t" \
  250. "and %[ftmp9], %[ftmp9], %[mask] \n\t" \
  251. "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
  252. "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \
  253. "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t"
  254. #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \
  255. /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
  256. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
  257. \
  258. /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \
  259. "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
  260. "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
  261. "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  262. "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
  263. "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \
  264. "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \
  265. "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
  266. "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \
  267. "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
  268. "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \
  269. "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
  270. "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
  271. "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \
  272. "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \
  273. "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
  274. "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
  275. "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
  276. "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t"
  277. #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \
  278. /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
  279. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
  280. \
  281. /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \
  282. "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
  283. "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
  284. "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
  285. "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
  286. "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \
  287. "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \
  288. "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \
  289. "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \
  290. "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \
  291. "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \
  292. "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
  293. "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
  294. "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \
  295. "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \
  296. "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \
  297. "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \
  298. "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
  299. "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t"
  300. #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \
  301. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
  302. \
  303. /* calculate: temp2[8] ~ temp2[11] */ \
  304. "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
  305. "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
  306. "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \
  307. "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
  308. "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
  309. \
  310. /* calculate: temp2[12] ~ temp2[15] */ \
  311. "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \
  312. "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
  313. "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \
  314. "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
  315. "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \
  316. \
  317. /* store: temp2[8] ~ temp2[15] */ \
  318. "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
  319. "and %[ftmp5], %[ftmp5], %[mask] \n\t" \
  320. "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
  321. "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \
  322. "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t"
  323. #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \
  324. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
  325. \
  326. /* calculate: temp2[8] ~ temp2[11] */ \
  327. "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \
  328. "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
  329. "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
  330. "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
  331. "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
  332. \
  333. /* calculate: temp2[12] ~ temp2[15] */ \
  334. "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \
  335. "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
  336. "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \
  337. "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \
  338. "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \
  339. \
  340. /* store: temp2[8] ~ temp2[15] */ \
  341. "and %[ftmp10], %[ftmp10], %[mask] \n\t" \
  342. "and %[ftmp11], %[ftmp11], %[mask] \n\t" \
  343. "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
  344. "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \
  345. "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t"
  346. // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
  347. // or vertical direction to produce the filtered output block. Used to implement
  348. // the first-pass of 2-D separable filter.
  349. //
  350. // Produces int16_t output to retain precision for the next pass. Two filter
  351. // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
  352. // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
  353. // It defines the offset required to move from one input to the next.
  354. static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
  355. unsigned int src_pixels_per_line,
  356. int pixel_step,
  357. unsigned int output_height,
  358. unsigned int output_width,
  359. const uint8_t *filter) {
  360. unsigned int i, j;
  361. for (i = 0; i < output_height; ++i) {
  362. for (j = 0; j < output_width; ++j) {
  363. b[j] = ROUND_POWER_OF_TWO(
  364. (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
  365. ++a;
  366. }
  367. a += src_pixels_per_line - output_width;
  368. b += output_width;
  369. }
  370. }
  371. // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
  372. // or vertical direction to produce the filtered output block. Used to implement
  373. // the second pass of a 2-D separable filter.
  374. //
  375. // Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. Two
  376. // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
  377. // filter is applied horizontally (pixel_step = 1) or vertically
  378. // (pixel_step = stride). It defines the offset required to move from one input
  379. // to the next. Output is 8-bit.
  380. static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
  381. unsigned int src_pixels_per_line,
  382. unsigned int pixel_step,
  383. unsigned int output_height,
  384. unsigned int output_width,
  385. const uint8_t *filter) {
  386. unsigned int i, j;
  387. for (i = 0; i < output_height; ++i) {
  388. for (j = 0; j < output_width; ++j) {
  389. b[j] = ROUND_POWER_OF_TWO(
  390. (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
  391. ++a;
  392. }
  393. a += src_pixels_per_line - output_width;
  394. b += output_width;
  395. }
  396. }
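/* Worked example of the filter step shared by both passes above, assuming
   ROUND_POWER_OF_TWO(x, n) == (x + (1 << (n - 1))) >> n and FILTER_BITS == 7
   (the assembly macros use the matching constants: a rounding value of 0x40
   per lane and a right shift by 7): with the half-pel taps { 64, 64 } and
   neighboring samples 10 and 20, the output is
   (10 * 64 + 20 * 64 + 64) >> 7 = 1984 >> 7 = 15. */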
  397. static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride,
  398. const uint8_t *b, int b_stride,
  399. uint32_t *sse, int high) {
  400. int sum;
  401. double ftmp[12];
  402. uint32_t tmp[3];
  403. *sse = 0;
  404. __asm__ volatile (
  405. "li %[tmp0], 0x20 \n\t"
  406. "mtc1 %[tmp0], %[ftmp11] \n\t"
  407. MMI_L(%[tmp0], %[high], 0x00)
  408. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  409. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  410. "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
  411. "1: \n\t"
  412. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
  413. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
  414. "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
  415. "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
  416. VARIANCE_SSE_SUM_8_FOR_W64
  417. "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
  418. "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
  419. "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
  420. "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
  421. VARIANCE_SSE_SUM_8_FOR_W64
  422. "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
  423. "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
  424. "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
  425. "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
  426. VARIANCE_SSE_SUM_8_FOR_W64
  427. "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
  428. "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
  429. "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
  430. "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
  431. VARIANCE_SSE_SUM_8_FOR_W64
  432. "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t"
  433. "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t"
  434. "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t"
  435. "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t"
  436. VARIANCE_SSE_SUM_8_FOR_W64
  437. "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t"
  438. "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t"
  439. "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t"
  440. "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t"
  441. VARIANCE_SSE_SUM_8_FOR_W64
  442. "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t"
  443. "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t"
  444. "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t"
  445. "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t"
  446. VARIANCE_SSE_SUM_8_FOR_W64
  447. "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t"
  448. "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t"
  449. "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t"
  450. "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t"
  451. VARIANCE_SSE_SUM_8_FOR_W64
  452. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  453. MMI_ADDU(%[a], %[a], %[a_stride])
  454. MMI_ADDU(%[b], %[b], %[b_stride])
  455. "bnez %[tmp0], 1b \n\t"
  456. "mfc1 %[tmp1], %[ftmp9] \n\t"
  457. "mfhc1 %[tmp2], %[ftmp9] \n\t"
  458. "addu %[sum], %[tmp1], %[tmp2] \n\t"
  459. "swc1 %[ftmp10], 0x00(%[sse]) \n\t"
  460. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  461. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  462. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  463. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  464. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  465. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  466. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  467. [tmp2]"=&r"(tmp[2]),
  468. [a]"+&r"(a), [b]"+&r"(b),
  469. [sum]"=&r"(sum)
  470. : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
  471. [high]"r"(&high), [sse]"r"(sse)
  472. : "memory"
  473. );
  474. return *sse - (((int64_t)sum * sum) / (64 * high));
  475. }
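/* For reference, a minimal scalar sketch of what the MMI variance kernels in
   this file compute (a hypothetical helper, wrapped in #if 0 so it is
   illustration only): accumulate the signed sum of differences and the sum of
   squared differences, then return sse - sum * sum / (w * h). */
#if 0
static uint32_t scalar_variance_sketch(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride, int w,
                                       int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      sse64 += (uint64_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)sse64;
  return (uint32_t)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}
#endif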
  476. #define VPX_VARIANCE64XN(n) \
  477. uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \
  478. const uint8_t *b, int b_stride, \
  479. uint32_t *sse) { \
  480. return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \
  481. }
  482. VPX_VARIANCE64XN(64)
  483. VPX_VARIANCE64XN(32)
  484. uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b,
  485. int b_stride, uint32_t *sse) {
  486. int sum;
  487. double ftmp[12];
  488. uint32_t tmp[3];
  489. *sse = 0;
  490. __asm__ volatile (
  491. "li %[tmp0], 0x20 \n\t"
  492. "mtc1 %[tmp0], %[ftmp11] \n\t"
  493. "li %[tmp0], 0x40 \n\t"
  494. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  495. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  496. "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
  497. "1: \n\t"
  498. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
  499. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
  500. "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
  501. "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
  502. VARIANCE_SSE_SUM_8_FOR_W64
  503. "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
  504. "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
  505. "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
  506. "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
  507. VARIANCE_SSE_SUM_8_FOR_W64
  508. "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
  509. "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
  510. "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
  511. "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
  512. VARIANCE_SSE_SUM_8_FOR_W64
  513. "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
  514. "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
  515. "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
  516. "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
  517. VARIANCE_SSE_SUM_8_FOR_W64
  518. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  519. MMI_ADDU(%[a], %[a], %[a_stride])
  520. MMI_ADDU(%[b], %[b], %[b_stride])
  521. "bnez %[tmp0], 1b \n\t"
  522. "mfc1 %[tmp1], %[ftmp9] \n\t"
  523. "mfhc1 %[tmp2], %[ftmp9] \n\t"
  524. "addu %[sum], %[tmp1], %[tmp2] \n\t"
  525. "swc1 %[ftmp10], 0x00(%[sse]) \n\t"
  526. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  527. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  528. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  529. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  530. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  531. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  532. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  533. [tmp2]"=&r"(tmp[2]),
  534. [a]"+&r"(a), [b]"+&r"(b),
  535. [sum]"=&r"(sum)
  536. : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
  537. [sse]"r"(sse)
  538. : "memory"
  539. );
  540. return *sse - (((int64_t)sum * sum) / 2048);
  541. }
  542. static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride,
  543. const uint8_t *b, int b_stride,
  544. uint32_t *sse, int high) {
  545. int sum;
  546. double ftmp[13];
  547. uint32_t tmp[3];
  548. *sse = 0;
  549. __asm__ volatile (
  550. "li %[tmp0], 0x20 \n\t"
  551. "mtc1 %[tmp0], %[ftmp11] \n\t"
  552. MMI_L(%[tmp0], %[high], 0x00)
  553. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  554. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  555. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  556. "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
  557. "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
  558. "1: \n\t"
  559. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
  560. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
  561. "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
  562. "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
  563. VARIANCE_SSE_SUM_8
  564. "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
  565. "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
  566. "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
  567. "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
  568. VARIANCE_SSE_SUM_8
  569. "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
  570. "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
  571. "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
  572. "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
  573. VARIANCE_SSE_SUM_8
  574. "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
  575. "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
  576. "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
  577. "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
  578. VARIANCE_SSE_SUM_8
  579. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  580. MMI_ADDU(%[a], %[a], %[a_stride])
  581. MMI_ADDU(%[b], %[b], %[b_stride])
  582. "bnez %[tmp0], 1b \n\t"
  583. "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
  584. "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
  585. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  586. "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
  587. "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
  588. "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
  589. "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
  590. "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
  591. "paddw %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
  592. "paddw %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
  593. "psubw %[ftmp10], %[ftmp10], %[ftmp5] \n\t"
  594. "psubw %[ftmp10], %[ftmp10], %[ftmp6] \n\t"
  595. "dsrl %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
  596. "paddw %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
  597. "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
  598. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  599. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  600. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  601. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  602. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  603. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  604. [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
  605. [a]"+&r"(a), [b]"+&r"(b)
  606. : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
  607. [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
  608. : "memory"
  609. );
  610. return *sse - (((int64_t)sum * sum) / (32 * high));
  611. }
  612. #define VPX_VARIANCE32XN(n) \
  613. uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \
  614. const uint8_t *b, int b_stride, \
  615. uint32_t *sse) { \
  616. return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \
  617. }
  618. VPX_VARIANCE32XN(32)
  619. VPX_VARIANCE32XN(16)
  620. static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride,
  621. const uint8_t *b, int b_stride,
  622. uint32_t *sse, int high) {
  623. int sum;
  624. double ftmp[13];
  625. uint32_t tmp[3];
  626. *sse = 0;
  627. __asm__ volatile (
  628. "li %[tmp0], 0x20 \n\t"
  629. "mtc1 %[tmp0], %[ftmp11] \n\t"
  630. MMI_L(%[tmp0], %[high], 0x00)
  631. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  632. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  633. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  634. "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
  635. "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
  636. "1: \n\t"
  637. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
  638. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
  639. "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
  640. "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
  641. VARIANCE_SSE_SUM_8
  642. "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
  643. "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
  644. "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
  645. "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
  646. VARIANCE_SSE_SUM_8
  647. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  648. MMI_ADDU(%[a], %[a], %[a_stride])
  649. MMI_ADDU(%[b], %[b], %[b_stride])
  650. "bnez %[tmp0], 1b \n\t"
  651. "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
  652. "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
  653. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  654. "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
  655. "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
  656. "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
  657. "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
  658. "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
  659. "paddw %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
  660. "paddw %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
  661. "psubw %[ftmp10], %[ftmp10], %[ftmp5] \n\t"
  662. "psubw %[ftmp10], %[ftmp10], %[ftmp6] \n\t"
  663. "dsrl %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
  664. "paddw %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
  665. "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
  666. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  667. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  668. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  669. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  670. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  671. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  672. [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
  673. [a]"+&r"(a), [b]"+&r"(b)
  674. : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
  675. [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
  676. : "memory"
  677. );
  678. return *sse - (((int64_t)sum * sum) / (16 * high));
  679. }
  680. #define VPX_VARIANCE16XN(n) \
  681. uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \
  682. const uint8_t *b, int b_stride, \
  683. uint32_t *sse) { \
  684. return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \
  685. }
  686. VPX_VARIANCE16XN(32)
  687. VPX_VARIANCE16XN(16)
  688. VPX_VARIANCE16XN(8)
  689. static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride,
  690. const uint8_t *b, int b_stride,
  691. uint32_t *sse, int high) {
  692. int sum;
  693. double ftmp[13];
  694. uint32_t tmp[3];
  695. *sse = 0;
  696. __asm__ volatile (
  697. "li %[tmp0], 0x20 \n\t"
  698. "mtc1 %[tmp0], %[ftmp11] \n\t"
  699. MMI_L(%[tmp0], %[high], 0x00)
  700. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  701. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  702. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  703. "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
  704. "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
  705. "1: \n\t"
  706. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
  707. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
  708. "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
  709. "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
  710. VARIANCE_SSE_SUM_8
  711. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  712. MMI_ADDU(%[a], %[a], %[a_stride])
  713. MMI_ADDU(%[b], %[b], %[b_stride])
  714. "bnez %[tmp0], 1b \n\t"
  715. "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
  716. "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
  717. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  718. "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
  719. "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
  720. "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
  721. "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
  722. "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
  723. "paddw %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
  724. "paddw %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
  725. "psubw %[ftmp10], %[ftmp10], %[ftmp5] \n\t"
  726. "psubw %[ftmp10], %[ftmp10], %[ftmp6] \n\t"
  727. "dsrl %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
  728. "paddw %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
  729. "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
  730. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  731. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  732. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  733. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  734. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  735. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  736. [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
  737. [a]"+&r"(a), [b]"+&r"(b)
  738. : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
  739. [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
  740. : "memory"
  741. );
  742. return *sse - (((int64_t)sum * sum) / (8 * high));
  743. }
  744. #define VPX_VARIANCE8XN(n) \
  745. uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \
  746. const uint8_t *b, int b_stride, \
  747. uint32_t *sse) { \
  748. return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \
  749. }
  750. VPX_VARIANCE8XN(16)
  751. VPX_VARIANCE8XN(8)
  752. VPX_VARIANCE8XN(4)
  753. static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride,
  754. const uint8_t *b, int b_stride,
  755. uint32_t *sse, int high) {
  756. int sum;
  757. double ftmp[12];
  758. uint32_t tmp[3];
  759. *sse = 0;
  760. __asm__ volatile (
  761. "li %[tmp0], 0x20 \n\t"
  762. "mtc1 %[tmp0], %[ftmp10] \n\t"
  763. MMI_L(%[tmp0], %[high], 0x00)
  764. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  765. "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  766. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  767. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  768. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  769. "1: \n\t"
  770. "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
  771. "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
  772. "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
  773. "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
  774. VARIANCE_SSE_SUM_4
  775. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  776. MMI_ADDU(%[a], %[a], %[a_stride])
  777. MMI_ADDU(%[b], %[b], %[b_stride])
  778. "bnez %[tmp0], 1b \n\t"
  779. "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
  780. "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
  781. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  782. "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t"
  783. "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t"
  784. "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
  785. "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
  786. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  787. "paddw %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
  788. "paddw %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
  789. "psubw %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
  790. "psubw %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
  791. "dsrl %[ftmp0], %[ftmp7], %[ftmp10] \n\t"
  792. "paddw %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
  793. "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
  794. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  795. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  796. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  797. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  798. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  799. [ftmp10]"=&f"(ftmp[10]),
  800. [tmp0]"=&r"(tmp[0]),
  801. [a]"+&r"(a), [b]"+&r"(b)
  802. : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
  803. [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
  804. : "memory"
  805. );
  806. return *sse - (((int64_t)sum * sum) / (4 * high));
  807. }
  808. #define VPX_VARIANCE4XN(n) \
  809. uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \
  810. const uint8_t *b, int b_stride, \
  811. uint32_t *sse) { \
  812. return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \
  813. }
  814. VPX_VARIANCE4XN(8)
  815. VPX_VARIANCE4XN(4)
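/* Hypothetical usage sketch (not part of this file, wrapped in #if 0): calling
   one of the generated variance kernels directly for a 16x16 block. */
#if 0
static uint32_t example_variance16x16(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride) {
  uint32_t sse;
  return vpx_variance16x16_mmi(src, src_stride, ref, ref_stride, &sse);
}
#endif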
  816. static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride,
  817. const uint8_t *b, int b_stride, uint32_t *sse,
  818. uint64_t high) {
  819. double ftmp[12];
  820. uint32_t tmp[1];
  821. *sse = 0;
  822. __asm__ volatile (
  823. "li %[tmp0], 0x20 \n\t"
  824. "mtc1 %[tmp0], %[ftmp11] \n\t"
  825. MMI_L(%[tmp0], %[high], 0x00)
  826. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  827. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  828. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  829. "1: \n\t"
  830. VARIANCE_SSE_16
  831. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  832. MMI_ADDU(%[a], %[a], %[a_stride])
  833. MMI_ADDU(%[b], %[b], %[b_stride])
  834. "bnez %[tmp0], 1b \n\t"
  835. "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
  836. "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
  837. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  838. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  839. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  840. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  841. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  842. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  843. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  844. [tmp0]"=&r"(tmp[0]),
  845. [a]"+&r"(a), [b]"+&r"(b)
  846. : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
  847. [high]"r"(&high), [sse]"r"(sse)
  848. : "memory"
  849. );
  850. return *sse;
  851. }
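/* Unlike the variance kernels above, the MSE kernels accumulate only the
   squared differences, so they return *sse directly with no sum term
   subtracted. */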
  852. #define vpx_mse16xN(n) \
  853. uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride, \
  854. const uint8_t *b, int b_stride, \
  855. uint32_t *sse) { \
  856. return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \
  857. }
  858. vpx_mse16xN(16);
  859. vpx_mse16xN(8);
  860. static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride,
  861. const uint8_t *b, int b_stride, uint32_t *sse,
  862. uint64_t high) {
  863. double ftmp[12];
  864. uint32_t tmp[1];
  865. *sse = 0;
  866. __asm__ volatile (
  867. "li %[tmp0], 0x20 \n\t"
  868. "mtc1 %[tmp0], %[ftmp11] \n\t"
  869. MMI_L(%[tmp0], %[high], 0x00)
  870. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  871. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  872. "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
  873. "1: \n\t"
  874. VARIANCE_SSE_8
  875. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  876. MMI_ADDU(%[a], %[a], %[a_stride])
  877. MMI_ADDU(%[b], %[b], %[b_stride])
  878. "bnez %[tmp0], 1b \n\t"
  879. "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
  880. "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
  881. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  882. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  883. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  884. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  885. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  886. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  887. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  888. [tmp0]"=&r"(tmp[0]),
  889. [a]"+&r"(a), [b]"+&r"(b)
  890. : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
  891. [high]"r"(&high), [sse]"r"(sse)
  892. : "memory"
  893. );
  894. return *sse;
  895. }
  896. #define vpx_mse8xN(n) \
  897. uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \
  898. const uint8_t *b, int b_stride, uint32_t *sse) { \
  899. return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \
  900. }
  901. vpx_mse8xN(16);
  902. vpx_mse8xN(8);
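/* For the sub-pixel variants below, xoffset and yoffset are eighth-pel offsets
   in [0, 7] used to index bilinear_filters for the horizontal and vertical
   passes respectively. */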
  903. #define SUBPIX_VAR(W, H) \
  904. uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \
  905. const uint8_t *a, int a_stride, int xoffset, int yoffset, \
  906. const uint8_t *b, int b_stride, uint32_t *sse) { \
  907. uint16_t fdata3[(H + 1) * W]; \
  908. uint8_t temp2[H * W]; \
  909. \
  910. var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
  911. bilinear_filters[xoffset]); \
  912. var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
  913. bilinear_filters[yoffset]); \
  914. \
  915. return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \
  916. }
  917. SUBPIX_VAR(64, 64)
  918. SUBPIX_VAR(64, 32)
  919. SUBPIX_VAR(32, 64)
  920. SUBPIX_VAR(32, 32)
  921. SUBPIX_VAR(32, 16)
  922. SUBPIX_VAR(16, 32)
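/* Hypothetical usage sketch (not part of this file, wrapped in #if 0): 32x32
   sub-pixel variance at a half-pel horizontal (xoffset = 4) and quarter-pel
   vertical (yoffset = 2) position. */
#if 0
static uint32_t example_subpel_variance32x32(const uint8_t *src,
                                             int src_stride,
                                             const uint8_t *ref,
                                             int ref_stride) {
  uint32_t sse;
  return vpx_sub_pixel_variance32x32_mmi(src, src_stride, 4, 2, ref,
                                         ref_stride, &sse);
}
#endif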
  923. static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride,
  924. int xoffset, int yoffset,
  925. uint8_t *temp2, int counter) {
  926. uint8_t *temp2_ptr = temp2;
  927. mips_reg l_counter = counter;
  928. double ftmp[15];
  929. mips_reg tmp[2];
  930. DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  931. DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
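/* ff_ph_40 replicates the bilinear rounding constant 0x40 (= 64) into each
   16-bit lane; mask keeps only the low byte of each lane before packushb. */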
  932. const uint8_t *filter_x = bilinear_filters[xoffset];
  933. const uint8_t *filter_y = bilinear_filters[yoffset];
  934. __asm__ volatile (
  935. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  936. MMI_LI(%[tmp0], 0x07)
  937. MMI_MTC1(%[tmp0], %[ftmp14])
  938. "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
  939. "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
  940. "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
  941. "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
  942. // fdata3: fdata3[0] ~ fdata3[15]
  943. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
  944. // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15]
  945. MMI_ADDU(%[a], %[a], %[a_stride])
  946. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
  947. // temp2: temp2[0] ~ temp2[15]
  948. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
  949. // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15]
  950. MMI_ADDU(%[a], %[a], %[a_stride])
  951. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
  952. // temp2+16*1: temp2[0] ~ temp2[15]
  953. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
  954. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
  955. "1: \n\t"
  956. MMI_ADDU(%[a], %[a], %[a_stride])
  957. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
  958. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
  959. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
  960. MMI_ADDU(%[a], %[a], %[a_stride])
  961. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
  962. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
  963. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
  964. "addiu %[counter], %[counter], -0x01 \n\t"
  965. "bnez %[counter], 1b \n\t"
  966. : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
  967. [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
  968. [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
  969. [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
  970. [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
  971. [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
  972. [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),
  973. [counter]"+&r"(l_counter)
  974. : [filter_x0] "f"((uint64_t)filter_x[0]),
  975. [filter_x1] "f"((uint64_t)filter_x[1]),
  976. [filter_y0] "f"((uint64_t)filter_y[0]),
  977. [filter_y1] "f"((uint64_t)filter_y[1]),
  978. [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
  979. [mask] "f"(mask)
  980. : "memory"
  981. );
  982. }
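/* Note on the loop structure above: the prologue filters and stores the first
   two output rows, and each loop iteration produces two more, so callers pass
   counter = (H - 2) / 2. The 8- and 4-wide helpers below follow the same
   pattern. */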
  983. #define SUBPIX_VAR16XN(H) \
  984. uint32_t vpx_sub_pixel_variance16x##H##_mmi( \
  985. const uint8_t *a, int a_stride, int xoffset, int yoffset, \
  986. const uint8_t *b, int b_stride, uint32_t *sse) { \
  987. uint8_t temp2[16 * H]; \
  988. var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \
  989. (H - 2) / 2); \
  990. \
  991. return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \
  992. }
  993. SUBPIX_VAR16XN(16)
  994. SUBPIX_VAR16XN(8)
  995. static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride,
  996. int xoffset, int yoffset,
  997. uint8_t *temp2, int counter) {
  998. uint8_t *temp2_ptr = temp2;
  999. mips_reg l_counter = counter;
  1000. double ftmp[15];
  1001. mips_reg tmp[2];
  1002. DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  1003. DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
  1004. const uint8_t *filter_x = bilinear_filters[xoffset];
  1005. const uint8_t *filter_y = bilinear_filters[yoffset];
  1006. __asm__ volatile (
  1007. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1008. MMI_LI(%[tmp0], 0x07)
  1009. MMI_MTC1(%[tmp0], %[ftmp14])
  1010. "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
  1011. "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
  1012. "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
  1013. "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
  1014. // fdata3: fdata3[0] ~ fdata3[7]
  1015. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
  1016. // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7]
  1017. MMI_ADDU(%[a], %[a], %[a_stride])
  1018. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
  1019. // temp2: temp2[0] ~ temp2[7]
  1020. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
  1021. // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7]
  1022. MMI_ADDU(%[a], %[a], %[a_stride])
  1023. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
  1024. // temp2+8*1: temp2[0] ~ temp2[7]
  1025. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
  1026. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
  1027. "1: \n\t"
  1028. MMI_ADDU(%[a], %[a], %[a_stride])
  1029. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
  1030. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
  1031. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
  1032. MMI_ADDU(%[a], %[a], %[a_stride])
  1033. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
  1034. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
  1035. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
  1036. "addiu %[counter], %[counter], -0x01 \n\t"
  1037. "bnez %[counter], 1b \n\t"
  1038. : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
  1039. [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
  1040. [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
  1041. [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
  1042. [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
  1043. [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
  1044. [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),
  1045. [counter]"+&r"(l_counter)
  1046. : [filter_x0] "f"((uint64_t)filter_x[0]),
  1047. [filter_x1] "f"((uint64_t)filter_x[1]),
  1048. [filter_y0] "f"((uint64_t)filter_y[0]),
  1049. [filter_y1] "f"((uint64_t)filter_y[1]),
  1050. [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
  1051. [mask] "f"(mask)
  1052. : "memory"
  1053. );
  1054. }
  1055. #define SUBPIX_VAR8XN(H) \
  1056. uint32_t vpx_sub_pixel_variance8x##H##_mmi( \
  1057. const uint8_t *a, int a_stride, int xoffset, int yoffset, \
  1058. const uint8_t *b, int b_stride, uint32_t *sse) { \
  1059. uint8_t temp2[8 * H]; \
  1060. var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \
  1061. (H - 2) / 2); \
  1062. \
  1063. return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \
  1064. }
  1065. SUBPIX_VAR8XN(16)
  1066. SUBPIX_VAR8XN(8)
  1067. SUBPIX_VAR8XN(4)
  1068. static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride,
  1069. int xoffset, int yoffset,
  1070. uint8_t *temp2, int counter) {
  1071. uint8_t *temp2_ptr = temp2;
  1072. mips_reg l_counter = counter;
  1073. double ftmp[7];
  1074. mips_reg tmp[2];
  1075. DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  1076. DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
  1077. const uint8_t *filter_x = bilinear_filters[xoffset];
  1078. const uint8_t *filter_y = bilinear_filters[yoffset];
  1079. __asm__ volatile (
  1080. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1081. MMI_LI(%[tmp0], 0x07)
  1082. MMI_MTC1(%[tmp0], %[ftmp6])
  1083. "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
  1084. "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
  1085. "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
  1086. "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
  1087. // fdata3: fdata3[0] ~ fdata3[3]
  1088. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
  1089. // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3]
  1090. MMI_ADDU(%[a], %[a], %[a_stride])
  1091. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
  1092. // temp2: temp2[0] ~ temp2[7]
  1093. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
  1094. // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3]
  1095. MMI_ADDU(%[a], %[a], %[a_stride])
  1096. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
  1097. // temp2+4*1: temp2[0] ~ temp2[7]
  1098. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
  1099. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
  1100. "1: \n\t"
  1101. MMI_ADDU(%[a], %[a], %[a_stride])
  1102. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
  1103. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
  1104. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
  1105. MMI_ADDU(%[a], %[a], %[a_stride])
  1106. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
  1107. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
  1108. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
  1109. "addiu %[counter], %[counter], -0x01 \n\t"
  1110. "bnez %[counter], 1b \n\t"
  1111. : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
  1112. [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
  1113. [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a),
  1114. [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
  1115. : [filter_x0] "f"((uint64_t)filter_x[0]),
  1116. [filter_x1] "f"((uint64_t)filter_x[1]),
  1117. [filter_y0] "f"((uint64_t)filter_y[0]),
  1118. [filter_y1] "f"((uint64_t)filter_y[1]),
  1119. [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
  1120. [mask] "f"(mask)
  1121. : "memory"
  1122. );
  1123. }
  1124. #define SUBPIX_VAR4XN(H) \
  1125. uint32_t vpx_sub_pixel_variance4x##H##_mmi( \
  1126. const uint8_t *a, int a_stride, int xoffset, int yoffset, \
  1127. const uint8_t *b, int b_stride, uint32_t *sse) { \
  1128. uint8_t temp2[4 * H]; \
  1129. var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \
  1130. (H - 2) / 2); \
  1131. \
  1132. return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \
  1133. }
  1134. SUBPIX_VAR4XN(8)
  1135. SUBPIX_VAR4XN(4)
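/* The *_avg_* variants below run the same two filter passes, then average the
   filtered block with second_pred (via vpx_comp_avg_pred_c) before computing
   the variance against b. */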
  1136. #define SUBPIX_AVG_VAR(W, H) \
  1137. uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \
  1138. const uint8_t *a, int a_stride, int xoffset, int yoffset, \
  1139. const uint8_t *b, int b_stride, uint32_t *sse, \
  1140. const uint8_t *second_pred) { \
  1141. uint16_t fdata3[(H + 1) * W]; \
  1142. uint8_t temp2[H * W]; \
  1143. DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
  1144. \
  1145. var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
  1146. bilinear_filters[xoffset]); \
  1147. var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
  1148. bilinear_filters[yoffset]); \
  1149. \
  1150. vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
  1151. \
  1152. return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \
  1153. }
  1154. SUBPIX_AVG_VAR(64, 64)
  1155. SUBPIX_AVG_VAR(64, 32)
  1156. SUBPIX_AVG_VAR(32, 64)
  1157. SUBPIX_AVG_VAR(32, 32)
  1158. SUBPIX_AVG_VAR(32, 16)
  1159. SUBPIX_AVG_VAR(16, 32)
  1160. SUBPIX_AVG_VAR(16, 16)
  1161. SUBPIX_AVG_VAR(16, 8)
  1162. SUBPIX_AVG_VAR(8, 16)
  1163. SUBPIX_AVG_VAR(8, 8)
  1164. SUBPIX_AVG_VAR(8, 4)
  1165. SUBPIX_AVG_VAR(4, 8)
  1166. SUBPIX_AVG_VAR(4, 4)