vp8_asm_stubs.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. /*
  2. * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "vpx_config.h"
  11. #include "vp8_rtcd.h"
  12. #include "vpx_ports/mem.h"
  13. #include "filter_x86.h"
  14. extern const short vp8_six_tap_x86[8][6 * 8];
  15. extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
  16. unsigned short *output_ptr,
  17. unsigned int src_pixels_per_line,
  18. unsigned int pixel_step,
  19. unsigned int output_height,
  20. unsigned int output_width,
  21. const short *vp8_filter);
  22. extern void vp8_filter_block1dc_v6_mmx(
  23. unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
  24. unsigned int pixels_per_line, unsigned int pixel_step,
  25. unsigned int output_height, unsigned int output_width,
  26. const short *vp8_filter);
  27. extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
  28. unsigned short *output_ptr,
  29. unsigned int src_pixels_per_line,
  30. unsigned int pixel_step,
  31. unsigned int output_height,
  32. unsigned int output_width,
  33. const short *vp8_filter);
  34. extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
  35. unsigned short *output_ptr,
  36. unsigned int src_pixels_per_line,
  37. unsigned int pixel_step,
  38. unsigned int output_height,
  39. unsigned int output_width,
  40. const short *vp8_filter);
  41. extern void vp8_filter_block1d8_v6_sse2(
  42. unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
  43. unsigned int pixels_per_line, unsigned int pixel_step,
  44. unsigned int output_height, unsigned int output_width,
  45. const short *vp8_filter);
  46. extern void vp8_filter_block1d16_v6_sse2(
  47. unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
  48. unsigned int pixels_per_line, unsigned int pixel_step,
  49. unsigned int output_height, unsigned int output_width,
  50. const short *vp8_filter);
  51. extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
  52. unsigned short *output_ptr,
  53. unsigned int src_pixels_per_line,
  54. unsigned int output_height,
  55. unsigned int output_width);
  56. extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
  57. unsigned int src_pixels_per_line,
  58. unsigned char *output_ptr,
  59. int dst_ptich,
  60. unsigned int output_height,
  61. const short *vp8_filter);
  62. extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
  63. unsigned int src_pixels_per_line,
  64. unsigned char *output_ptr,
  65. int dst_ptich,
  66. unsigned int output_height,
  67. const short *vp8_filter);
  68. extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
  69. unsigned int src_pixels_per_line,
  70. unsigned char *output_ptr,
  71. int dst_ptich,
  72. unsigned int output_height,
  73. const short *vp8_filter);
  74. #if HAVE_MMX
  75. void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
  76. int xoffset, int yoffset, unsigned char *dst_ptr,
  77. int dst_pitch) {
  78. DECLARE_ALIGNED(16, unsigned short,
  79. FData2[16 * 16]); /* Temp data bufffer used in filtering */
  80. const short *HFilter, *VFilter;
  81. HFilter = vp8_six_tap_x86[xoffset];
  82. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
  83. src_pixels_per_line, 1, 9, 8, HFilter);
  84. VFilter = vp8_six_tap_x86[yoffset];
  85. vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
  86. VFilter);
  87. }
  88. #endif
  89. #if HAVE_SSE2
  90. void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
  91. int src_pixels_per_line, int xoffset,
  92. int yoffset, unsigned char *dst_ptr,
  93. int dst_pitch
  94. ) {
  95. DECLARE_ALIGNED(16, unsigned short,
  96. FData2[24 * 24]); /* Temp data bufffer used in filtering */
  97. const short *HFilter, *VFilter;
  98. if (xoffset) {
  99. if (yoffset) {
  100. HFilter = vp8_six_tap_x86[xoffset];
  101. vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  102. src_pixels_per_line, 1, 21, 32, HFilter);
  103. VFilter = vp8_six_tap_x86[yoffset];
  104. vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
  105. dst_pitch, VFilter);
  106. } else {
  107. /* First-pass only */
  108. HFilter = vp8_six_tap_x86[xoffset];
  109. vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  110. dst_pitch, 16, HFilter);
  111. }
  112. } else {
  113. /* Second-pass only */
  114. VFilter = vp8_six_tap_x86[yoffset];
  115. vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  116. src_pixels_per_line, 21, 32);
  117. vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
  118. dst_pitch, VFilter);
  119. }
  120. }
  121. void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
  122. int xoffset, int yoffset,
  123. unsigned char *dst_ptr, int dst_pitch) {
  124. DECLARE_ALIGNED(16, unsigned short,
  125. FData2[256]); /* Temp data bufffer used in filtering */
  126. const short *HFilter, *VFilter;
  127. if (xoffset) {
  128. if (yoffset) {
  129. HFilter = vp8_six_tap_x86[xoffset];
  130. vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  131. src_pixels_per_line, 1, 13, 16, HFilter);
  132. VFilter = vp8_six_tap_x86[yoffset];
  133. vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
  134. dst_pitch, VFilter);
  135. } else {
  136. /* First-pass only */
  137. HFilter = vp8_six_tap_x86[xoffset];
  138. vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  139. dst_pitch, 8, HFilter);
  140. }
  141. } else {
  142. /* Second-pass only */
  143. VFilter = vp8_six_tap_x86[yoffset];
  144. vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
  145. src_pixels_per_line, dst_ptr, dst_pitch, 8,
  146. VFilter);
  147. }
  148. }
  149. void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
  150. int xoffset, int yoffset,
  151. unsigned char *dst_ptr, int dst_pitch) {
  152. DECLARE_ALIGNED(16, unsigned short,
  153. FData2[256]); /* Temp data bufffer used in filtering */
  154. const short *HFilter, *VFilter;
  155. if (xoffset) {
  156. if (yoffset) {
  157. HFilter = vp8_six_tap_x86[xoffset];
  158. vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  159. src_pixels_per_line, 1, 9, 16, HFilter);
  160. VFilter = vp8_six_tap_x86[yoffset];
  161. vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
  162. dst_pitch, VFilter);
  163. } else {
  164. /* First-pass only */
  165. HFilter = vp8_six_tap_x86[xoffset];
  166. vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  167. dst_pitch, 4, HFilter);
  168. }
  169. } else {
  170. /* Second-pass only */
  171. VFilter = vp8_six_tap_x86[yoffset];
  172. vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
  173. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  174. VFilter);
  175. }
  176. }
  177. #endif
  178. #if HAVE_SSSE3
  179. extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
  180. unsigned int src_pixels_per_line,
  181. unsigned char *output_ptr,
  182. unsigned int output_pitch,
  183. unsigned int output_height,
  184. unsigned int vp8_filter_index);
  185. extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
  186. unsigned int src_pixels_per_line,
  187. unsigned char *output_ptr,
  188. unsigned int output_pitch,
  189. unsigned int output_height,
  190. unsigned int vp8_filter_index);
  191. extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
  192. unsigned int src_pitch,
  193. unsigned char *output_ptr,
  194. unsigned int out_pitch,
  195. unsigned int output_height,
  196. unsigned int vp8_filter_index);
  197. extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
  198. unsigned int src_pitch,
  199. unsigned char *output_ptr,
  200. unsigned int out_pitch,
  201. unsigned int output_height,
  202. unsigned int vp8_filter_index);
  203. extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
  204. unsigned int src_pixels_per_line,
  205. unsigned char *output_ptr,
  206. unsigned int output_pitch,
  207. unsigned int output_height,
  208. unsigned int vp8_filter_index);
  209. extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
  210. unsigned int src_pitch,
  211. unsigned char *output_ptr,
  212. unsigned int out_pitch,
  213. unsigned int output_height,
  214. unsigned int vp8_filter_index);
  215. void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
  216. int src_pixels_per_line, int xoffset,
  217. int yoffset, unsigned char *dst_ptr,
  218. int dst_pitch
  219. ) {
  220. DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
  221. if (xoffset) {
  222. if (yoffset) {
  223. vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  224. src_pixels_per_line, FData2, 16, 21,
  225. xoffset);
  226. vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
  227. yoffset);
  228. } else {
  229. /* First-pass only */
  230. vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  231. dst_pitch, 16, xoffset);
  232. }
  233. } else {
  234. if (yoffset) {
  235. /* Second-pass only */
  236. vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  237. src_pixels_per_line, dst_ptr, dst_pitch, 16,
  238. yoffset);
  239. } else {
  240. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  241. * yoffset==0) case correctly. Add copy function here to guarantee
  242. * six-tap function handles all possible offsets. */
  243. vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  244. }
  245. }
  246. }
  247. void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
  248. int src_pixels_per_line, int xoffset,
  249. int yoffset, unsigned char *dst_ptr,
  250. int dst_pitch) {
  251. DECLARE_ALIGNED(16, unsigned char, FData2[256]);
  252. if (xoffset) {
  253. if (yoffset) {
  254. vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  255. src_pixels_per_line, FData2, 8, 13, xoffset);
  256. vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
  257. } else {
  258. vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  259. dst_pitch, 8, xoffset);
  260. }
  261. } else {
  262. if (yoffset) {
  263. /* Second-pass only */
  264. vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  265. src_pixels_per_line, dst_ptr, dst_pitch, 8,
  266. yoffset);
  267. } else {
  268. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  269. * yoffset==0) case correctly. Add copy function here to guarantee
  270. * six-tap function handles all possible offsets. */
  271. vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  272. }
  273. }
  274. }
  275. void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
  276. int src_pixels_per_line, int xoffset,
  277. int yoffset, unsigned char *dst_ptr,
  278. int dst_pitch) {
  279. DECLARE_ALIGNED(16, unsigned char, FData2[256]);
  280. if (xoffset) {
  281. if (yoffset) {
  282. vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  283. src_pixels_per_line, FData2, 8, 9, xoffset);
  284. vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
  285. } else {
  286. /* First-pass only */
  287. vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  288. dst_pitch, 4, xoffset);
  289. }
  290. } else {
  291. if (yoffset) {
  292. /* Second-pass only */
  293. vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  294. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  295. yoffset);
  296. } else {
  297. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  298. * yoffset==0) case correctly. Add copy function here to guarantee
  299. * six-tap function handles all possible offsets. */
  300. vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  301. }
  302. }
  303. }
  304. void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
  305. int src_pixels_per_line, int xoffset,
  306. int yoffset, unsigned char *dst_ptr,
  307. int dst_pitch) {
  308. DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
  309. if (xoffset) {
  310. if (yoffset) {
  311. vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  312. src_pixels_per_line, FData2, 4, 9, xoffset);
  313. vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
  314. } else {
  315. vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  316. dst_pitch, 4, xoffset);
  317. }
  318. } else {
  319. if (yoffset) {
  320. vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  321. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  322. yoffset);
  323. } else {
  324. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  325. * yoffset==0) case correctly. Add copy function here to guarantee
  326. * six-tap function handles all possible offsets. */
  327. int r;
  328. for (r = 0; r < 4; ++r) {
  329. dst_ptr[0] = src_ptr[0];
  330. dst_ptr[1] = src_ptr[1];
  331. dst_ptr[2] = src_ptr[2];
  332. dst_ptr[3] = src_ptr[3];
  333. dst_ptr += dst_pitch;
  334. src_ptr += src_pixels_per_line;
  335. }
  336. }
  337. }
  338. }
  339. #endif