variance_sse2.c

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"

typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse, int *sum);
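
// Sum of squares of 256 consecutive int16_t values: 32 iterations of 8
// values each, accumulated in 32-bit lanes and then reduced horizontally.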
unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}
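
// READ64 interleaves the bytes of two 4-pixel rows (rows i and i + 1) into
// the low 8 bytes of an XMM register. The same interleaving is applied to
// src and ref, so the sums and squared differences below are unaffected.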
#define READ64(p, stride, i)                                    \
  _mm_unpacklo_epi8(                                            \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)),   \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))

static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum =
      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}

void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
                        int ref_stride, unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum =
      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
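
// Tiles a w x h region into block_size x block_size sub-blocks and
// accumulates the per-block sse/sum results produced by var_fn.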
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride, int w,
                          int h, unsigned int *sse, int *sum,
                          getNxMvar_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
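
// Each wrapper below returns variance = sse - (sum * sum) / (w * h); the
// right shift equals log2(w * h) for the block size, and sum * sum is
// widened to int64_t for the sizes where the product can exceed a 32-bit int.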
unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - ((sum * sum) >> 4);
}

unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
                get4x4var_sse2, 4);
  return *sse - ((sum * sum) >> 5);
}

unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
                get4x4var_sse2, 4);
  return *sse - ((sum * sum) >> 5);
}

unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - ((sum * sum) >> 6);
}

unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
                vpx_get8x8var_sse2, 8);
  return *sse - ((sum * sum) >> 7);
}

unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
                vpx_get8x8var_sse2, 8);
  return *sse - ((sum * sum) >> 7);
}

unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}

unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
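
// The MSE variants return the raw sum of squared errors; the
// (sum * sum) / (w * h) term is not subtracted.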
unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in subpel_variance.asm
#define DECL(w, opt)                                                           \
  int vpx_sub_pixel_variance##w##xh_##opt(                                     \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
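
// FN wraps the fixed-width (wf-pixel) assembly kernel into a full w x h
// sub-pixel variance function: blocks wider than wf are handled as extra
// 16-pixel column strips (offsets +16, +32, +48) whose se/sse results are
// accumulated before the final sse - se * se / (w * h) step (shift by
// wlog2 + hlog2). cast_prod/cast pick integer widths wide enough for se * se.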
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
  unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(                        \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {             \
    unsigned int sse;                                                          \
    int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset,   \
                                                  y_offset, dst, dst_stride,   \
                                                  h, &sse, NULL, NULL);        \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                          \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                            \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                            \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
  }

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
  FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))

FNS(sse2, sse2);
FNS(ssse3, ssse3);
#undef FNS
#undef FN

// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt)                                                        \
  int vpx_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(                    \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sseptr,                \
      const uint8_t *sec) {                                                    \
    unsigned int sse;                                                          \
    int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(                         \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                      \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                        \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                        \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sseptr = sse;                                                             \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
  }

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
  FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN