/* vpx_dsp/x86/highbd_variance_sse2.c */
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"

#include "vpx_ports/mem.h"

/* Signature of a fixed-size block-variance kernel on 16-bit (high bit-depth)
 * pixels: writes the sum of squared differences to *sse and the sum of
 * differences to *sum for one block. */
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       uint32_t *sse, int *sum);

/* SSE2 kernels (implemented elsewhere) computing sse/sum for a single
 * 8x8 or 16x16 high bit-depth block. */
uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);
  21. static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
  22. const uint16_t *ref, int ref_stride, int w,
  23. int h, uint32_t *sse, int *sum,
  24. high_variance_fn_t var_fn, int block_size) {
  25. int i, j;
  26. *sse = 0;
  27. *sum = 0;
  28. for (i = 0; i < h; i += block_size) {
  29. for (j = 0; j < w; j += block_size) {
  30. unsigned int sse0;
  31. int sum0;
  32. var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
  33. ref_stride, &sse0, &sum0);
  34. *sse += sse0;
  35. *sum += sum0;
  36. }
  37. }
  38. }
  39. static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
  40. const uint16_t *ref, int ref_stride, int w,
  41. int h, uint32_t *sse, int *sum,
  42. high_variance_fn_t var_fn, int block_size) {
  43. int i, j;
  44. uint64_t sse_long = 0;
  45. int32_t sum_long = 0;
  46. for (i = 0; i < h; i += block_size) {
  47. for (j = 0; j < w; j += block_size) {
  48. unsigned int sse0;
  49. int sum0;
  50. var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
  51. ref_stride, &sse0, &sum0);
  52. sse_long += sse0;
  53. sum_long += sum0;
  54. }
  55. }
  56. *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  57. *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  58. }
  59. static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
  60. const uint16_t *ref, int ref_stride, int w,
  61. int h, uint32_t *sse, int *sum,
  62. high_variance_fn_t var_fn, int block_size) {
  63. int i, j;
  64. uint64_t sse_long = 0;
  65. int32_t sum_long = 0;
  66. for (i = 0; i < h; i += block_size) {
  67. for (j = 0; j < w; j += block_size) {
  68. unsigned int sse0;
  69. int sum0;
  70. var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
  71. ref_stride, &sse0, &sum0);
  72. sse_long += sse0;
  73. sum_long += sum0;
  74. }
  75. }
  76. *sum = ROUND_POWER_OF_TWO(sum_long, 4);
  77. *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  78. }
/* Emits the public get-variance entry points for an S x S block at 8-, 10-
 * and 12-bit depth.  All three call the same SSE2 kernel; the 10- and 12-bit
 * variants then scale sum/sse down to an 8-bit-equivalent range with
 * rounding (>> 2 / >> 4 and >> 4 / >> 8 respectively). */
#define HIGH_GET_VAR(S)                                                       \
  void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                         const uint8_t *ref8, int ref_stride, \
                                         uint32_t *sse, int *sum) {           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
  }                                                                           \
                                                                              \
  void vpx_highbd_10_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 2);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                       \
  }                                                                           \
                                                                              \
  void vpx_highbd_12_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 4);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
  }

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR
/* Emits vpx_highbd_{8,10,12}_variance<w>x<h>_sse2 for one block size.
 * Each tiles the block with block_size x block_size kernel calls via the
 * bit-depth-specific accumulator above, then subtracts the mean-square
 * term (sum^2 >> shift, where shift == log2(w * h)).  The 10/12-bit
 * versions clamp a negative intermediate result to 0. */
#define VAR_FN(w, h, block_size, shift)                                    \
  uint32_t vpx_highbd_8_variance##w##x##h##_sse2(                          \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_8_variance_sse2(                                                \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
  }                                                                        \
                                                                           \
  uint32_t vpx_highbd_10_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_10_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  uint32_t vpx_highbd_12_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_12_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

/* shift argument == log2(w) + log2(h) for each instantiation. */
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN
  164. unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
  165. const uint8_t *ref8, int ref_stride,
  166. unsigned int *sse) {
  167. int sum;
  168. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  169. uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  170. highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
  171. vpx_highbd_calc16x16var_sse2, 16);
  172. return *sse;
  173. }
  174. unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
  175. const uint8_t *ref8, int ref_stride,
  176. unsigned int *sse) {
  177. int sum;
  178. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  179. uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  180. highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
  181. vpx_highbd_calc16x16var_sse2, 16);
  182. return *sse;
  183. }
  184. unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
  185. const uint8_t *ref8, int ref_stride,
  186. unsigned int *sse) {
  187. int sum;
  188. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  189. uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  190. highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
  191. vpx_highbd_calc16x16var_sse2, 16);
  192. return *sse;
  193. }
  194. unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
  195. const uint8_t *ref8, int ref_stride,
  196. unsigned int *sse) {
  197. int sum;
  198. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  199. uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  200. highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
  201. vpx_highbd_calc8x8var_sse2, 8);
  202. return *sse;
  203. }
  204. unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
  205. const uint8_t *ref8, int ref_stride,
  206. unsigned int *sse) {
  207. int sum;
  208. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  209. uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  210. highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
  211. vpx_highbd_calc8x8var_sse2, 8);
  212. return *sse;
  213. }
  214. unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
  215. const uint8_t *ref8, int ref_stride,
  216. unsigned int *sse) {
  217. int sum;
  218. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  219. uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  220. highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
  221. vpx_highbd_calc8x8var_sse2, 8);
  222. return *sse;
  223. }
// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in
// highbd_subpel_variance_impl_sse2.asm
/* Prototype for the asm sub-pixel variance kernel over a w-wide strip of
 * `height` rows; returns the sum of differences and writes sse via *sse. */
#define DECL(w, opt)                                                         \
  int vpx_highbd_sub_pixel_variance##w##xh_##opt(                            \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
      unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
  DECL(8, opt);    \
  DECL(16, opt)

DECLS(sse2);
#undef DECLS
#undef DECL
/* Emits vpx_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt> for one block
 * size.  The asm kernel covers a wf-wide column strip; wider blocks repeat
 * it at x offsets 16/32/48.  wlog2 + hlog2 == log2(w * h) is the mean-square
 * correction shift.  The 12-bit variant additionally processes at most 16
 * rows per kernel call and accumulates sse in 64 bits, because 12-bit
 * squared differences can overflow the kernel's 32-bit accumulator over a
 * full-height block. */
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                      \
        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
        NULL);                                                                 \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                      \
        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
        NULL);                                                                 \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    /* Scale 10-bit totals to 8-bit-equivalent range before the variance    */ \
    /* formula.                                                             */ \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int start_row;                                                             \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    int64_t var;                                                               \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    /* Chunk by <= 16 rows so each kernel call's 32-bit sse cannot overflow */ \
    /* with 12-bit data; accumulate the chunk results in 64 bits.           */ \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL,     \
          NULL);                                                               \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
              height, &sse2, NULL, NULL);                                      \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
              height, &sse2, NULL, NULL);                                      \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
/* Instantiate every supported block size.  Arguments are
 * (w, h, wf, log2(w), log2(h), opt, cast); (wlog2 + hlog2) == log2(w * h). */
#define FNS(opt)                        \
  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
  FN(8, 4, 8, 3, 2, opt, (int64_t));

FNS(sse2);

#undef FNS
#undef FN
// The 2 unused parameters are place holders for PIC enabled build.
/* Prototype for the asm sub-pixel averaging-variance kernel: like the plain
 * kernel but also averages the prediction with `sec` before differencing. */
#define DECL(w, opt)                                                         \
  int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
      void *unused);
#define DECLS(opt1) \
  DECL(16, opt1)    \
  DECL(8, opt1)

DECLS(sse2);
#undef DECL
#undef DECLS
/* Emits vpx_highbd_{8,10,12}_sub_pixel_avg_variance<w>x<h>_<opt> for one
 * block size.  Mirrors the plain sub-pixel variance generator above but the
 * kernel also averages with the second prediction `sec` (stride w).  The
 * 12-bit variant again chunks by <= 16 rows and accumulates sse in 64 bits
 * to avoid 32-bit overflow. */
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    /* Scale 10-bit totals to 8-bit-equivalent range.                       */ \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int start_row;                                                             \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    /* Chunk by <= 16 rows; `sec` advances by start_row * w (stride w).     */ \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
          w, height, &sse2, NULL, NULL);                                       \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, dst + 16 + (start_row * dst_stride), dst_stride,         \
            sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
              sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
              sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
  510. #define FNS(opt1) \
  511. FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
  512. FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
  513. FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
  514. FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
  515. FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
  516. FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
  517. FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
  518. FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
  519. FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
  520. FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
  521. FN(8, 4, 8, 3, 2, opt1, (int64_t));
  522. FNS(sse2);
  523. #undef FNS
  524. #undef FN