; sad_ssse3.asm -- SSSE3 three-offset SAD (sum of absolute differences) kernels
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"
  11. %macro PROCESS_16X2X3 1
  12. %if %1
  13. movdqa xmm0, XMMWORD PTR [rsi]
  14. lddqu xmm5, XMMWORD PTR [rdi]
  15. lddqu xmm6, XMMWORD PTR [rdi+1]
  16. lddqu xmm7, XMMWORD PTR [rdi+2]
  17. psadbw xmm5, xmm0
  18. psadbw xmm6, xmm0
  19. psadbw xmm7, xmm0
  20. %else
  21. movdqa xmm0, XMMWORD PTR [rsi]
  22. lddqu xmm1, XMMWORD PTR [rdi]
  23. lddqu xmm2, XMMWORD PTR [rdi+1]
  24. lddqu xmm3, XMMWORD PTR [rdi+2]
  25. psadbw xmm1, xmm0
  26. psadbw xmm2, xmm0
  27. psadbw xmm3, xmm0
  28. paddw xmm5, xmm1
  29. paddw xmm6, xmm2
  30. paddw xmm7, xmm3
  31. %endif
  32. movdqa xmm0, XMMWORD PTR [rsi+rax]
  33. lddqu xmm1, XMMWORD PTR [rdi+rdx]
  34. lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
  35. lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
  36. lea rsi, [rsi+rax*2]
  37. lea rdi, [rdi+rdx*2]
  38. psadbw xmm1, xmm0
  39. psadbw xmm2, xmm0
  40. psadbw xmm3, xmm0
  41. paddw xmm5, xmm1
  42. paddw xmm6, xmm2
  43. paddw xmm7, xmm3
  44. %endmacro
  45. %macro PROCESS_16X2X3_OFFSET 2
  46. %if %1
  47. movdqa xmm0, XMMWORD PTR [rsi]
  48. movdqa xmm4, XMMWORD PTR [rdi]
  49. movdqa xmm7, XMMWORD PTR [rdi+16]
  50. movdqa xmm5, xmm7
  51. palignr xmm5, xmm4, %2
  52. movdqa xmm6, xmm7
  53. palignr xmm6, xmm4, (%2+1)
  54. palignr xmm7, xmm4, (%2+2)
  55. psadbw xmm5, xmm0
  56. psadbw xmm6, xmm0
  57. psadbw xmm7, xmm0
  58. %else
  59. movdqa xmm0, XMMWORD PTR [rsi]
  60. movdqa xmm4, XMMWORD PTR [rdi]
  61. movdqa xmm3, XMMWORD PTR [rdi+16]
  62. movdqa xmm1, xmm3
  63. palignr xmm1, xmm4, %2
  64. movdqa xmm2, xmm3
  65. palignr xmm2, xmm4, (%2+1)
  66. palignr xmm3, xmm4, (%2+2)
  67. psadbw xmm1, xmm0
  68. psadbw xmm2, xmm0
  69. psadbw xmm3, xmm0
  70. paddw xmm5, xmm1
  71. paddw xmm6, xmm2
  72. paddw xmm7, xmm3
  73. %endif
  74. movdqa xmm0, XMMWORD PTR [rsi+rax]
  75. movdqa xmm4, XMMWORD PTR [rdi+rdx]
  76. movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
  77. movdqa xmm1, xmm3
  78. palignr xmm1, xmm4, %2
  79. movdqa xmm2, xmm3
  80. palignr xmm2, xmm4, (%2+1)
  81. palignr xmm3, xmm4, (%2+2)
  82. lea rsi, [rsi+rax*2]
  83. lea rdi, [rdi+rdx*2]
  84. psadbw xmm1, xmm0
  85. psadbw xmm2, xmm0
  86. psadbw xmm3, xmm0
  87. paddw xmm5, xmm1
  88. paddw xmm6, xmm2
  89. paddw xmm7, xmm3
  90. %endmacro
  91. %macro PROCESS_16X16X3_OFFSET 2
  92. %2_aligned_by_%1:
  93. sub rdi, %1
  94. PROCESS_16X2X3_OFFSET 1, %1
  95. PROCESS_16X2X3_OFFSET 0, %1
  96. PROCESS_16X2X3_OFFSET 0, %1
  97. PROCESS_16X2X3_OFFSET 0, %1
  98. PROCESS_16X2X3_OFFSET 0, %1
  99. PROCESS_16X2X3_OFFSET 0, %1
  100. PROCESS_16X2X3_OFFSET 0, %1
  101. PROCESS_16X2X3_OFFSET 0, %1
  102. jmp %2_store_off
  103. %endmacro
  104. %macro PROCESS_16X8X3_OFFSET 2
  105. %2_aligned_by_%1:
  106. sub rdi, %1
  107. PROCESS_16X2X3_OFFSET 1, %1
  108. PROCESS_16X2X3_OFFSET 0, %1
  109. PROCESS_16X2X3_OFFSET 0, %1
  110. PROCESS_16X2X3_OFFSET 0, %1
  111. jmp %2_store_off
  112. %endmacro
  113. ;void int vpx_sad16x16x3_ssse3(
  114. ; unsigned char *src_ptr,
  115. ; int src_stride,
  116. ; unsigned char *ref_ptr,
  117. ; int ref_stride,
  118. ; int *results)
  119. global sym(vpx_sad16x16x3_ssse3) PRIVATE
  120. sym(vpx_sad16x16x3_ssse3):
  121. push rbp
  122. mov rbp, rsp
  123. SHADOW_ARGS_TO_STACK 5
  124. SAVE_XMM 7
  125. push rsi
  126. push rdi
  127. push rcx
  128. ; end prolog
  129. mov rsi, arg(0) ;src_ptr
  130. mov rdi, arg(2) ;ref_ptr
  131. mov rdx, 0xf
  132. and rdx, rdi
  133. jmp .vpx_sad16x16x3_ssse3_skiptable
  134. .vpx_sad16x16x3_ssse3_jumptable:
  135. dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
  136. dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
  137. dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
  138. dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
  139. dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
  140. dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
  141. dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
  142. dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
  143. dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
  144. dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
  145. dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
  146. dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
  147. dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
  148. dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
  149. dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
  150. dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
  151. .vpx_sad16x16x3_ssse3_skiptable:
  152. call .vpx_sad16x16x3_ssse3_do_jump
  153. .vpx_sad16x16x3_ssse3_do_jump:
  154. pop rcx ; get the address of do_jump
  155. mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
  156. add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
  157. movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
  158. add rcx, rax
  159. movsxd rax, dword ptr arg(1) ;src_stride
  160. movsxd rdx, dword ptr arg(3) ;ref_stride
  161. jmp rcx
  162. PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
  163. PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
  164. PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
  165. PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
  166. PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
  167. PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
  168. PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
  169. PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
  170. PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
  171. PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
  172. PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
  173. PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
  174. PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
  175. PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
  176. PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
  177. .vpx_sad16x16x3_ssse3_aligned_by_15:
  178. PROCESS_16X2X3 1
  179. PROCESS_16X2X3 0
  180. PROCESS_16X2X3 0
  181. PROCESS_16X2X3 0
  182. PROCESS_16X2X3 0
  183. PROCESS_16X2X3 0
  184. PROCESS_16X2X3 0
  185. PROCESS_16X2X3 0
  186. .vpx_sad16x16x3_ssse3_store_off:
  187. mov rdi, arg(4) ;Results
  188. movq xmm0, xmm5
  189. psrldq xmm5, 8
  190. paddw xmm0, xmm5
  191. movd [rdi], xmm0
  192. ;-
  193. movq xmm0, xmm6
  194. psrldq xmm6, 8
  195. paddw xmm0, xmm6
  196. movd [rdi+4], xmm0
  197. ;-
  198. movq xmm0, xmm7
  199. psrldq xmm7, 8
  200. paddw xmm0, xmm7
  201. movd [rdi+8], xmm0
  202. ; begin epilog
  203. pop rcx
  204. pop rdi
  205. pop rsi
  206. RESTORE_XMM
  207. UNSHADOW_ARGS
  208. pop rbp
  209. ret
  210. ;void int vpx_sad16x8x3_ssse3(
  211. ; unsigned char *src_ptr,
  212. ; int src_stride,
  213. ; unsigned char *ref_ptr,
  214. ; int ref_stride,
  215. ; int *results)
  216. global sym(vpx_sad16x8x3_ssse3) PRIVATE
  217. sym(vpx_sad16x8x3_ssse3):
  218. push rbp
  219. mov rbp, rsp
  220. SHADOW_ARGS_TO_STACK 5
  221. SAVE_XMM 7
  222. push rsi
  223. push rdi
  224. push rcx
  225. ; end prolog
  226. mov rsi, arg(0) ;src_ptr
  227. mov rdi, arg(2) ;ref_ptr
  228. mov rdx, 0xf
  229. and rdx, rdi
  230. jmp .vpx_sad16x8x3_ssse3_skiptable
  231. .vpx_sad16x8x3_ssse3_jumptable:
  232. dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
  233. dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
  234. dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
  235. dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
  236. dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
  237. dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
  238. dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
  239. dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
  240. dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
  241. dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
  242. dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
  243. dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
  244. dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
  245. dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
  246. dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
  247. dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
  248. .vpx_sad16x8x3_ssse3_skiptable:
  249. call .vpx_sad16x8x3_ssse3_do_jump
  250. .vpx_sad16x8x3_ssse3_do_jump:
  251. pop rcx ; get the address of do_jump
  252. mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
  253. add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
  254. movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
  255. add rcx, rax
  256. movsxd rax, dword ptr arg(1) ;src_stride
  257. movsxd rdx, dword ptr arg(3) ;ref_stride
  258. jmp rcx
  259. PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
  260. PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
  261. PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
  262. PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
  263. PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
  264. PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
  265. PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
  266. PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
  267. PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
  268. PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
  269. PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
  270. PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
  271. PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
  272. PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
  273. PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
  274. .vpx_sad16x8x3_ssse3_aligned_by_15:
  275. PROCESS_16X2X3 1
  276. PROCESS_16X2X3 0
  277. PROCESS_16X2X3 0
  278. PROCESS_16X2X3 0
  279. .vpx_sad16x8x3_ssse3_store_off:
  280. mov rdi, arg(4) ;Results
  281. movq xmm0, xmm5
  282. psrldq xmm5, 8
  283. paddw xmm0, xmm5
  284. movd [rdi], xmm0
  285. ;-
  286. movq xmm0, xmm6
  287. psrldq xmm6, 8
  288. paddw xmm0, xmm6
  289. movd [rdi+4], xmm0
  290. ;-
  291. movq xmm0, xmm7
  292. psrldq xmm7, 8
  293. paddw xmm0, xmm7
  294. movd [rdi+8], xmm0
  295. ; begin epilog
  296. pop rcx
  297. pop rdi
  298. pop rsi
  299. RESTORE_XMM
  300. UNSHADOW_ARGS
  301. pop rbp
  302. ret