; highbd_variance_impl_sse2.asm
  ;
  ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  ;
  ; Use of this source code is governed by a BSD-style license
  ; that can be found in the LICENSE file in the root of the source
  ; tree. An additional intellectual property rights grant can be found
  ; in the file PATENTS. All contributing project authors may
  ; be found in the AUTHORS file in the root of the source tree.
  ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; unsigned int vpx_highbd_calc16x16var_sse2
  ; (
  ;     unsigned char *src_ptr,
  ;     int            source_stride,
  ;     unsigned char *ref_ptr,
  ;     int            recon_stride,
  ;     unsigned int  *SSE,
  ;     int           *Sum
  ; )
  ;
  ; Walks 16 rows of 16 samples each in src_ptr and ref_ptr, treating the
  ; data as 16-bit words (each row is read as two 16-byte loads and
  ; differenced with psubw), and stores two 32-bit results:
  ;     *SSE = sum of (src - ref)^2
  ;     *Sum = sum of (src - ref)
  ;
  ; Both strides are doubled before use, i.e. they are taken as counts of
  ; 2-byte samples and converted to byte offsets.
  ;
  ; Register roles:
  ;     rsi / rdi   current src / ref row pointer
  ;     rax / rdx   src / ref stride in bytes
  ;     rbx         scratch address for prefetching
  ;     rcx         rows still to process (two rows per iteration)
  ;     xmm0        all zero
  ;     xmm5        16-bit difference sums for the current iteration
  ;     xmm6        squared-difference accumulator (4 x dword)
  ;     xmm7        difference accumulator (4 x dword)
  ;
  ; NOTE(review): no result is placed in rax; at `ret` it still holds the
  ; Sum pointer loaded for the final store. Presumably callers ignore the
  ; return value - confirm.
  ; NOTE(review): four differences are added per 16-bit lane before being
  ; widened, and squares are accumulated in 32-bit lanes; presumably the
  ; sample bit depth keeps both in range - confirm against callers.
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
  sym(vpx_highbd_calc16x16var_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rbx
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                     ; src_ptr
      mov         rdi, arg(2)                     ; ref_ptr

      movsxd      rax, DWORD PTR arg(1)           ; source_stride
      movsxd      rdx, DWORD PTR arg(3)           ; recon_stride
      add         rax, rax                        ; source stride in bytes
      add         rdx, rdx                        ; recon stride in bytes

      ; Prefetch the first four rows (both 16-byte halves) of each block.
      prefetcht0  [rsi]
      prefetcht0  [rsi+16]
      prefetcht0  [rsi+rax]
      prefetcht0  [rsi+rax+16]
      lea         rbx, [rsi+rax*2]
      prefetcht0  [rbx]
      prefetcht0  [rbx+16]
      prefetcht0  [rbx+rax]
      prefetcht0  [rbx+rax+16]

      prefetcht0  [rdi]
      prefetcht0  [rdi+16]
      prefetcht0  [rdi+rdx]
      prefetcht0  [rdi+rdx+16]
      lea         rbx, [rdi+rdx*2]
      prefetcht0  [rbx]
      prefetcht0  [rbx+16]
      prefetcht0  [rbx+rdx]
      prefetcht0  [rbx+rdx+16]

      pxor        xmm0, xmm0                      ; zero constant
      pxor        xmm7, xmm7                      ; difference accumulator
      pxor        xmm6, xmm6                      ; squared-difference accumulator

      mov         rcx, 16                         ; rows remaining

  .var16loop:
      movdqu      xmm1, XMMWORD PTR [rsi]
      movdqu      xmm2, XMMWORD PTR [rdi]

      ; Prefetch the two rows that the next iteration will read.
      lea         rbx, [rsi+rax*2]
      prefetcht0  [rbx]
      prefetcht0  [rbx+16]
      prefetcht0  [rbx+rax]
      prefetcht0  [rbx+rax+16]
      lea         rbx, [rdi+rdx*2]
      prefetcht0  [rbx]
      prefetcht0  [rbx+16]
      prefetcht0  [rbx+rdx]
      prefetcht0  [rbx+rdx+16]

      pxor        xmm5, xmm5                      ; restart the 16-bit partial sums

      ; row 0, samples 0..7
      psubw       xmm1, xmm2                      ; src - ref, per word
      movdqu      xmm3, XMMWORD PTR [rsi+16]
      paddw       xmm5, xmm1
      pmaddwd     xmm1, xmm1                      ; squares, adjacent pairs added
      movdqu      xmm2, XMMWORD PTR [rdi+16]
      paddd       xmm6, xmm1

      ; row 0, samples 8..15
      psubw       xmm3, xmm2
      movdqu      xmm1, XMMWORD PTR [rsi+rax]
      paddw       xmm5, xmm3
      pmaddwd     xmm3, xmm3
      movdqu      xmm2, XMMWORD PTR [rdi+rdx]
      paddd       xmm6, xmm3

      ; row 1, samples 0..7
      psubw       xmm1, xmm2
      movdqu      xmm3, XMMWORD PTR [rsi+rax+16]
      paddw       xmm5, xmm1
      pmaddwd     xmm1, xmm1
      movdqu      xmm2, XMMWORD PTR [rdi+rdx+16]
      paddd       xmm6, xmm1

      ; row 1, samples 8..15
      psubw       xmm3, xmm2
      paddw       xmm5, xmm3
      pmaddwd     xmm3, xmm3
      paddd       xmm6, xmm3

      ; Sign-extend the eight 16-bit partial sums to dwords and add them
      ; to xmm7. A signed compare of zero against the value yields 0xffff
      ; exactly for the negative words, which is the high half needed.
      movdqa      xmm1, xmm0
      pcmpgtw     xmm1, xmm5                      ; (0 > sum) ? 0xffff : 0
      movdqa      xmm2, xmm5
      punpcklwd   xmm5, xmm1                      ; low four words -> dwords
      punpckhwd   xmm2, xmm1                      ; high four words -> dwords
      paddd       xmm7, xmm5
      paddd       xmm7, xmm2

      lea         rsi, [rsi + 2*rax]              ; advance two rows
      lea         rdi, [rdi + 2*rdx]
      sub         rcx, 2
      jnz         .var16loop

      ; Fold the four dword lanes of each accumulator into lane 0. The adds
      ; are 32-bit, and only the low dword is stored below.
      movdqa      xmm4, xmm6
      punpckldq   xmm6, xmm0
      punpckhdq   xmm4, xmm0
      movdqa      xmm5, xmm7
      paddd       xmm6, xmm4
      punpckldq   xmm7, xmm0
      punpckhdq   xmm5, xmm0
      paddd       xmm7, xmm5

      movdqa      xmm4, xmm6
      movdqa      xmm5, xmm7
      psrldq      xmm4, 8
      psrldq      xmm5, 8
      paddd       xmm6, xmm4
      paddd       xmm7, xmm5

      mov         rdi, arg(4)                     ; SSE
      mov         rax, arg(5)                     ; Sum
      movd        DWORD PTR [rdi], xmm6
      movd        DWORD PTR [rax], xmm7

      ; begin epilog
      pop         rdi
      pop         rsi
      pop         rbx
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; unsigned int vpx_highbd_calc8x8var_sse2
  ; (
  ;     unsigned char *src_ptr,
  ;     int            source_stride,
  ;     unsigned char *ref_ptr,
  ;     int            recon_stride,
  ;     unsigned int  *SSE,
  ;     int           *Sum
  ; )
  ;
  ; Walks 8 rows of 8 samples each in src_ptr and ref_ptr, treating the
  ; data as 16-bit words (each row is one 16-byte load, differenced with
  ; psubw), and stores two 32-bit results:
  ;     *SSE = sum of (src - ref)^2
  ;     *Sum = sum of (src - ref)
  ;
  ; Both strides are doubled before use, i.e. they are taken as counts of
  ; 2-byte samples and converted to byte offsets.
  ;
  ; Register roles:
  ;     rsi / rdi   current src / ref row pointer
  ;     rax / rdx   src / ref stride in bytes
  ;     rbx         scratch address for prefetching
  ;     rcx         rows still to process (four rows per iteration)
  ;     xmm0        all zero
  ;     xmm5        16-bit difference sums for the current iteration
  ;     xmm6        squared-difference accumulator (4 x dword)
  ;     xmm7        difference accumulator (4 x dword)
  ;
  ; NOTE(review): no result is placed in rax; at `ret` it still holds the
  ; Sum pointer loaded for the final store. Presumably callers ignore the
  ; return value - confirm.
  ; NOTE(review): four differences are added per 16-bit lane before being
  ; widened, and squares are accumulated in 32-bit lanes; presumably the
  ; sample bit depth keeps both in range - confirm against callers.
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
  sym(vpx_highbd_calc8x8var_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rbx
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                     ; src_ptr
      mov         rdi, arg(2)                     ; ref_ptr

      movsxd      rax, DWORD PTR arg(1)           ; source_stride
      movsxd      rdx, DWORD PTR arg(3)           ; recon_stride
      add         rax, rax                        ; source stride in bytes
      add         rdx, rdx                        ; recon stride in bytes

      ; Prefetch the first four rows of each block.
      prefetcht0  [rsi]
      prefetcht0  [rsi+rax]
      lea         rbx, [rsi+rax*2]
      prefetcht0  [rbx]
      prefetcht0  [rbx+rax]

      prefetcht0  [rdi]
      prefetcht0  [rdi+rdx]
      lea         rbx, [rdi+rdx*2]
      prefetcht0  [rbx]
      prefetcht0  [rbx+rdx]

      pxor        xmm0, xmm0                      ; zero constant
      pxor        xmm7, xmm7                      ; difference accumulator
      pxor        xmm6, xmm6                      ; squared-difference accumulator

      mov         rcx, 8                          ; rows remaining

  .var8loop:
      movdqu      xmm1, XMMWORD PTR [rsi]
      movdqu      xmm2, XMMWORD PTR [rdi]

      ; Prefetch rows 4..7 ahead of the current row pointers, i.e. the four
      ; rows that the next iteration will read.
      lea         rbx, [rsi+rax*4]
      prefetcht0  [rbx]
      prefetcht0  [rbx+rax]
      lea         rbx, [rbx+rax*2]
      prefetcht0  [rbx]
      prefetcht0  [rbx+rax]
      lea         rbx, [rdi+rdx*4]
      prefetcht0  [rbx]
      prefetcht0  [rbx+rdx]
      lea         rbx, [rbx+rdx*2]
      prefetcht0  [rbx]
      prefetcht0  [rbx+rdx]

      pxor        xmm5, xmm5                      ; restart the 16-bit partial sums

      ; row 0
      psubw       xmm1, xmm2                      ; src - ref, per word
      movdqu      xmm3, XMMWORD PTR [rsi+rax]
      paddw       xmm5, xmm1
      pmaddwd     xmm1, xmm1                      ; squares, adjacent pairs added
      movdqu      xmm2, XMMWORD PTR [rdi+rdx]
      paddd       xmm6, xmm1

      lea         rsi, [rsi + 2*rax]              ; step to rows 2 and 3
      lea         rdi, [rdi + 2*rdx]

      ; row 1 (loaded before the pointers moved)
      psubw       xmm3, xmm2
      movdqu      xmm1, XMMWORD PTR [rsi]
      paddw       xmm5, xmm3
      pmaddwd     xmm3, xmm3
      movdqu      xmm2, XMMWORD PTR [rdi]
      paddd       xmm6, xmm3

      ; row 2
      psubw       xmm1, xmm2
      movdqu      xmm3, XMMWORD PTR [rsi+rax]
      paddw       xmm5, xmm1
      pmaddwd     xmm1, xmm1
      movdqu      xmm2, XMMWORD PTR [rdi+rdx]
      paddd       xmm6, xmm1

      ; row 3
      psubw       xmm3, xmm2
      paddw       xmm5, xmm3
      pmaddwd     xmm3, xmm3
      paddd       xmm6, xmm3

      ; Sign-extend the eight 16-bit partial sums to dwords and add them
      ; to xmm7. A signed compare of zero against the value yields 0xffff
      ; exactly for the negative words, which is the high half needed.
      movdqa      xmm1, xmm0
      pcmpgtw     xmm1, xmm5                      ; (0 > sum) ? 0xffff : 0
      movdqa      xmm2, xmm5
      punpcklwd   xmm5, xmm1                      ; low four words -> dwords
      punpckhwd   xmm2, xmm1                      ; high four words -> dwords
      paddd       xmm7, xmm5
      paddd       xmm7, xmm2

      lea         rsi, [rsi + 2*rax]              ; step past rows 2 and 3
      lea         rdi, [rdi + 2*rdx]
      sub         rcx, 4
      jnz         .var8loop

      ; Fold the four dword lanes of each accumulator into lane 0. The adds
      ; are 32-bit, and only the low dword is stored below.
      movdqa      xmm4, xmm6
      punpckldq   xmm6, xmm0
      punpckhdq   xmm4, xmm0
      movdqa      xmm5, xmm7
      paddd       xmm6, xmm4
      punpckldq   xmm7, xmm0
      punpckhdq   xmm5, xmm0
      paddd       xmm7, xmm5

      movdqa      xmm4, xmm6
      movdqa      xmm5, xmm7
      psrldq      xmm4, 8
      psrldq      xmm5, 8
      paddd       xmm6, xmm4
      paddd       xmm7, xmm5

      mov         rdi, arg(4)                     ; SSE
      mov         rax, arg(5)                     ; Sum
      movd        DWORD PTR [rdi], xmm6
      movd        DWORD PTR [rax], xmm7

      ; begin epilog
      pop         rdi
      pop         rsi
      pop         rbx
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret