; deblock_sse2.asm
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. ;macro in deblock functions
  12. %macro FIRST_2_ROWS 0
  13. movdqa xmm4, xmm0
  14. movdqa xmm6, xmm0
  15. movdqa xmm5, xmm1
  16. pavgb xmm5, xmm3
  17. ;calculate absolute value
  18. psubusb xmm4, xmm1
  19. psubusb xmm1, xmm0
  20. psubusb xmm6, xmm3
  21. psubusb xmm3, xmm0
  22. paddusb xmm4, xmm1
  23. paddusb xmm6, xmm3
  24. ;get threshold
  25. movdqa xmm2, flimit
  26. pxor xmm1, xmm1
  27. movdqa xmm7, xmm2
  28. ;get mask
  29. psubusb xmm2, xmm4
  30. psubusb xmm7, xmm6
  31. pcmpeqb xmm2, xmm1
  32. pcmpeqb xmm7, xmm1
  33. por xmm7, xmm2
  34. %endmacro
  35. %macro SECOND_2_ROWS 0
  36. movdqa xmm6, xmm0
  37. movdqa xmm4, xmm0
  38. movdqa xmm2, xmm1
  39. pavgb xmm1, xmm3
  40. ;calculate absolute value
  41. psubusb xmm6, xmm2
  42. psubusb xmm2, xmm0
  43. psubusb xmm4, xmm3
  44. psubusb xmm3, xmm0
  45. paddusb xmm6, xmm2
  46. paddusb xmm4, xmm3
  47. pavgb xmm5, xmm1
  48. ;get threshold
  49. movdqa xmm2, flimit
  50. pxor xmm1, xmm1
  51. movdqa xmm3, xmm2
  52. ;get mask
  53. psubusb xmm2, xmm6
  54. psubusb xmm3, xmm4
  55. pcmpeqb xmm2, xmm1
  56. pcmpeqb xmm3, xmm1
  57. por xmm7, xmm2
  58. por xmm7, xmm3
  59. pavgb xmm5, xmm0
  60. ;decide if or not to use filtered value
  61. pand xmm0, xmm7
  62. pandn xmm7, xmm5
  63. paddusb xmm0, xmm7
  64. %endmacro
  65. %macro UPDATE_FLIMIT 0
  66. movdqu xmm2, XMMWORD PTR [rbx]
  67. movdqu [rsp], xmm2
  68. add rbx, 16
  69. %endmacro
  70. ;void vpx_post_proc_down_and_across_mb_row_sse2
  71. ;(
  72. ; unsigned char *src_ptr,
  73. ; unsigned char *dst_ptr,
  74. ; int src_pixels_per_line,
  75. ; int dst_pixels_per_line,
  76. ; int cols,
  77. ; int *flimits,
  78. ; int size
  79. ;)
  80. global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
  81. sym(vpx_post_proc_down_and_across_mb_row_sse2):
  82. push rbp
  83. mov rbp, rsp
  84. SHADOW_ARGS_TO_STACK 7
  85. SAVE_XMM 7
  86. push rbx
  87. push rsi
  88. push rdi
  89. ; end prolog
  90. ALIGN_STACK 16, rax
  91. sub rsp, 16
  92. ; put flimit on stack
  93. mov rbx, arg(5) ;flimits ptr
  94. UPDATE_FLIMIT
  95. %define flimit [rsp]
  96. mov rsi, arg(0) ;src_ptr
  97. mov rdi, arg(1) ;dst_ptr
  98. movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
  99. movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
  100. .nextrow:
  101. xor rdx, rdx ;col
  102. .nextcol:
  103. ;load current and next 2 rows
  104. movdqu xmm0, XMMWORD PTR [rsi]
  105. movdqu xmm1, XMMWORD PTR [rsi + rax]
  106. movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
  107. FIRST_2_ROWS
  108. ;load above 2 rows
  109. neg rax
  110. movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
  111. movdqu xmm3, XMMWORD PTR [rsi + rax]
  112. SECOND_2_ROWS
  113. movdqu XMMWORD PTR [rdi], xmm0
  114. neg rax ; positive stride
  115. add rsi, 16
  116. add rdi, 16
  117. add rdx, 16
  118. cmp edx, dword arg(4) ;cols
  119. jge .downdone
  120. UPDATE_FLIMIT
  121. jmp .nextcol
  122. .downdone:
  123. ; done with the all cols, start the across filtering in place
  124. sub rsi, rdx
  125. sub rdi, rdx
  126. mov rbx, arg(5) ; flimits
  127. UPDATE_FLIMIT
  128. ; dup the first byte into the left border 8 times
  129. movq mm1, [rdi]
  130. punpcklbw mm1, mm1
  131. punpcklwd mm1, mm1
  132. punpckldq mm1, mm1
  133. mov rdx, -8
  134. movq [rdi+rdx], mm1
  135. ; dup the last byte into the right border
  136. movsxd rdx, dword arg(4)
  137. movq mm1, [rdi + rdx + -1]
  138. punpcklbw mm1, mm1
  139. punpcklwd mm1, mm1
  140. punpckldq mm1, mm1
  141. movq [rdi+rdx], mm1
  142. xor rdx, rdx
  143. movq mm0, QWORD PTR [rdi-16];
  144. movq mm1, QWORD PTR [rdi-8];
  145. .acrossnextcol:
  146. movdqu xmm0, XMMWORD PTR [rdi + rdx]
  147. movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
  148. movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
  149. FIRST_2_ROWS
  150. movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
  151. movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
  152. SECOND_2_ROWS
  153. movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
  154. movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
  155. movdq2q mm0, xmm0
  156. psrldq xmm0, 8
  157. movdq2q mm1, xmm0
  158. add rdx, 16
  159. cmp edx, dword arg(4) ;cols
  160. jge .acrossdone
  161. UPDATE_FLIMIT
  162. jmp .acrossnextcol
  163. .acrossdone:
  164. ; last 16 pixels
  165. movq QWORD PTR [rdi+rdx-16], mm0
  166. cmp edx, dword arg(4)
  167. jne .throw_last_8
  168. movq QWORD PTR [rdi+rdx-8], mm1
  169. .throw_last_8:
  170. ; done with this rwo
  171. add rsi,rax ;next src line
  172. mov eax, dword arg(3) ;dst_pixels_per_line
  173. add rdi,rax ;next destination
  174. mov eax, dword arg(2) ;src_pixels_per_line
  175. mov rbx, arg(5) ;flimits
  176. UPDATE_FLIMIT
  177. dec rcx ;decrement count
  178. jnz .nextrow ;next row
  179. add rsp, 16
  180. pop rsp
  181. ; begin epilog
  182. pop rdi
  183. pop rsi
  184. pop rbx
  185. RESTORE_XMM
  186. UNSHADOW_ARGS
  187. pop rbp
  188. ret
  189. %undef flimit
  190. ;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
  191. ; int pitch, int rows, int cols,int flimit)
  192. extern sym(vpx_rv)
  193. global sym(vpx_mbpost_proc_down_sse2) PRIVATE
  194. sym(vpx_mbpost_proc_down_sse2):
  195. push rbp
  196. mov rbp, rsp
  197. SHADOW_ARGS_TO_STACK 5
  198. SAVE_XMM 7
  199. GET_GOT rbx
  200. push rsi
  201. push rdi
  202. ; end prolog
  203. ALIGN_STACK 16, rax
  204. sub rsp, 128+16
  205. ; unsigned char d[16][8] at [rsp]
  206. ; create flimit2 at [rsp+128]
  207. mov eax, dword ptr arg(4) ;flimit
  208. mov [rsp+128], eax
  209. mov [rsp+128+4], eax
  210. mov [rsp+128+8], eax
  211. mov [rsp+128+12], eax
  212. %define flimit4 [rsp+128]
  213. %if ABI_IS_32BIT=0
  214. lea r8, [GLOBAL(sym(vpx_rv))]
  215. %endif
  216. ;rows +=8;
  217. add dword arg(2), 8
  218. ;for(c=0; c<cols; c+=8)
  219. .loop_col:
  220. mov rsi, arg(0) ; s
  221. pxor xmm0, xmm0 ;
  222. movsxd rax, dword ptr arg(1) ;pitch ;
  223. ; this copies the last row down into the border 8 rows
  224. mov rdi, rsi
  225. mov rdx, arg(2)
  226. sub rdx, 9
  227. imul rdx, rax
  228. lea rdi, [rdi+rdx]
  229. movq xmm1, QWORD ptr[rdi] ; first row
  230. mov rcx, 8
  231. .init_borderd: ; initialize borders
  232. lea rdi, [rdi + rax]
  233. movq [rdi], xmm1
  234. dec rcx
  235. jne .init_borderd
  236. neg rax ; rax = -pitch
  237. ; this copies the first row up into the border 8 rows
  238. mov rdi, rsi
  239. movq xmm1, QWORD ptr[rdi] ; first row
  240. mov rcx, 8
  241. .init_border: ; initialize borders
  242. lea rdi, [rdi + rax]
  243. movq [rdi], xmm1
  244. dec rcx
  245. jne .init_border
  246. lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
  247. neg rax
  248. pxor xmm5, xmm5
  249. pxor xmm6, xmm6 ;
  250. pxor xmm7, xmm7 ;
  251. mov rdi, rsi
  252. mov rcx, 15 ;
  253. .loop_initvar:
  254. movq xmm1, QWORD PTR [rdi];
  255. punpcklbw xmm1, xmm0 ;
  256. paddw xmm5, xmm1 ;
  257. pmullw xmm1, xmm1 ;
  258. movdqa xmm2, xmm1 ;
  259. punpcklwd xmm1, xmm0 ;
  260. punpckhwd xmm2, xmm0 ;
  261. paddd xmm6, xmm1 ;
  262. paddd xmm7, xmm2 ;
  263. lea rdi, [rdi+rax] ;
  264. dec rcx
  265. jne .loop_initvar
  266. ;save the var and sum
  267. xor rdx, rdx
  268. .loop_row:
  269. movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
  270. movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
  271. punpcklbw xmm1, xmm0
  272. punpcklbw xmm2, xmm0
  273. paddw xmm5, xmm2
  274. psubw xmm5, xmm1
  275. pmullw xmm2, xmm2
  276. movdqa xmm4, xmm2
  277. punpcklwd xmm2, xmm0
  278. punpckhwd xmm4, xmm0
  279. paddd xmm6, xmm2
  280. paddd xmm7, xmm4
  281. pmullw xmm1, xmm1
  282. movdqa xmm2, xmm1
  283. punpcklwd xmm1, xmm0
  284. psubd xmm6, xmm1
  285. punpckhwd xmm2, xmm0
  286. psubd xmm7, xmm2
  287. movdqa xmm3, xmm6
  288. pslld xmm3, 4
  289. psubd xmm3, xmm6
  290. movdqa xmm1, xmm5
  291. movdqa xmm4, xmm5
  292. pmullw xmm1, xmm1
  293. pmulhw xmm4, xmm4
  294. movdqa xmm2, xmm1
  295. punpcklwd xmm1, xmm4
  296. punpckhwd xmm2, xmm4
  297. movdqa xmm4, xmm7
  298. pslld xmm4, 4
  299. psubd xmm4, xmm7
  300. psubd xmm3, xmm1
  301. psubd xmm4, xmm2
  302. psubd xmm3, flimit4
  303. psubd xmm4, flimit4
  304. psrad xmm3, 31
  305. psrad xmm4, 31
  306. packssdw xmm3, xmm4
  307. packsswb xmm3, xmm0
  308. movq xmm1, QWORD PTR [rsi+rax*8]
  309. movq xmm2, xmm1
  310. punpcklbw xmm1, xmm0
  311. paddw xmm1, xmm5
  312. mov rcx, rdx
  313. and rcx, 127
  314. %if ABI_IS_32BIT=1 && CONFIG_PIC=1
  315. push rax
  316. lea rax, [GLOBAL(sym(vpx_rv))]
  317. movdqu xmm4, [rax + rcx*2] ;vpx_rv[rcx*2]
  318. pop rax
  319. %elif ABI_IS_32BIT=0
  320. movdqu xmm4, [r8 + rcx*2] ;vpx_rv[rcx*2]
  321. %else
  322. movdqu xmm4, [sym(vpx_rv) + rcx*2]
  323. %endif
  324. paddw xmm1, xmm4
  325. ;paddw xmm1, eight8s
  326. psraw xmm1, 4
  327. packuswb xmm1, xmm0
  328. pand xmm1, xmm3
  329. pandn xmm3, xmm2
  330. por xmm1, xmm3
  331. and rcx, 15
  332. movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
  333. cmp edx, 8
  334. jl .skip_assignment
  335. mov rcx, rdx
  336. sub rcx, 8
  337. and rcx, 15
  338. movq mm0, [rsp + rcx*8] ;d[rcx*8]
  339. movq [rsi], mm0
  340. .skip_assignment:
  341. lea rsi, [rsi+rax]
  342. lea rdi, [rdi+rax]
  343. add rdx, 1
  344. cmp edx, dword arg(2) ;rows
  345. jl .loop_row
  346. add dword arg(0), 8 ; s += 8
  347. sub dword arg(3), 8 ; cols -= 8
  348. cmp dword arg(3), 0
  349. jg .loop_col
  350. add rsp, 128+16
  351. pop rsp
  352. ; begin epilog
  353. pop rdi
  354. pop rsi
  355. RESTORE_GOT
  356. RESTORE_XMM
  357. UNSHADOW_ARGS
  358. pop rbp
  359. ret
  360. %undef flimit4
  361. ;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
  362. ; int pitch, int rows, int cols,int flimit)
  363. global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
  364. sym(vpx_mbpost_proc_across_ip_sse2):
  365. push rbp
  366. mov rbp, rsp
  367. SHADOW_ARGS_TO_STACK 5
  368. SAVE_XMM 7
  369. GET_GOT rbx
  370. push rsi
  371. push rdi
  372. ; end prolog
  373. ALIGN_STACK 16, rax
  374. sub rsp, 16
  375. ; create flimit4 at [rsp]
  376. mov eax, dword ptr arg(4) ;flimit
  377. mov [rsp], eax
  378. mov [rsp+4], eax
  379. mov [rsp+8], eax
  380. mov [rsp+12], eax
  381. %define flimit4 [rsp]
  382. ;for(r=0;r<rows;r++)
  383. .ip_row_loop:
  384. xor rdx, rdx ;sumsq=0;
  385. xor rcx, rcx ;sum=0;
  386. mov rsi, arg(0); s
  387. ; dup the first byte into the left border 8 times
  388. movq mm1, [rsi]
  389. punpcklbw mm1, mm1
  390. punpcklwd mm1, mm1
  391. punpckldq mm1, mm1
  392. mov rdi, -8
  393. movq [rsi+rdi], mm1
  394. ; dup the last byte into the right border
  395. movsxd rdx, dword arg(3)
  396. movq mm1, [rsi + rdx + -1]
  397. punpcklbw mm1, mm1
  398. punpcklwd mm1, mm1
  399. punpckldq mm1, mm1
  400. movq [rsi+rdx], mm1
  401. .ip_var_loop:
  402. ;for(i=-8;i<=6;i++)
  403. ;{
  404. ; sumsq += s[i]*s[i];
  405. ; sum += s[i];
  406. ;}
  407. movzx eax, byte [rsi+rdi]
  408. add ecx, eax
  409. mul al
  410. add edx, eax
  411. add rdi, 1
  412. cmp rdi, 6
  413. jle .ip_var_loop
  414. ;mov rax, sumsq
  415. ;movd xmm7, rax
  416. movd xmm7, edx
  417. ;mov rax, sum
  418. ;movd xmm6, rax
  419. movd xmm6, ecx
  420. mov rsi, arg(0) ;s
  421. xor rcx, rcx
  422. movsxd rdx, dword arg(3) ;cols
  423. add rdx, 8
  424. pxor mm0, mm0
  425. pxor mm1, mm1
  426. pxor xmm0, xmm0
  427. .nextcol4:
  428. movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
  429. movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
  430. punpcklbw xmm1, xmm0 ; expanding
  431. punpcklbw xmm2, xmm0 ; expanding
  432. punpcklwd xmm1, xmm0 ; expanding to dwords
  433. punpcklwd xmm2, xmm0 ; expanding to dwords
  434. psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
  435. paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
  436. paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
  437. pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
  438. paddd xmm6, xmm2
  439. paddd xmm7, xmm1
  440. pshufd xmm6, xmm6, 0 ; duplicate the last ones
  441. pshufd xmm7, xmm7, 0 ; duplicate the last ones
  442. psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
  443. psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
  444. pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
  445. pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
  446. paddd xmm6, xmm4
  447. paddd xmm7, xmm3
  448. pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
  449. pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
  450. paddd xmm7, xmm3
  451. paddd xmm6, xmm4
  452. pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
  453. pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
  454. paddd xmm7, xmm3
  455. paddd xmm6, xmm4
  456. movdqa xmm3, xmm6
  457. pmaddwd xmm3, xmm3
  458. movdqa xmm5, xmm7
  459. pslld xmm5, 4
  460. psubd xmm5, xmm7
  461. psubd xmm5, xmm3
  462. psubd xmm5, flimit4
  463. psrad xmm5, 31
  464. packssdw xmm5, xmm0
  465. packsswb xmm5, xmm0
  466. movd xmm1, DWORD PTR [rsi+rcx]
  467. movq xmm2, xmm1
  468. punpcklbw xmm1, xmm0
  469. punpcklwd xmm1, xmm0
  470. paddd xmm1, xmm6
  471. paddd xmm1, [GLOBAL(four8s)]
  472. psrad xmm1, 4
  473. packssdw xmm1, xmm0
  474. packuswb xmm1, xmm0
  475. pand xmm1, xmm5
  476. pandn xmm5, xmm2
  477. por xmm5, xmm1
  478. movd [rsi+rcx-8], mm0
  479. movq mm0, mm1
  480. movdq2q mm1, xmm5
  481. psrldq xmm7, 12
  482. psrldq xmm6, 12
  483. add rcx, 4
  484. cmp rcx, rdx
  485. jl .nextcol4
  486. ;s+=pitch;
  487. movsxd rax, dword arg(1)
  488. add arg(0), rax
  489. sub dword arg(2), 1 ;rows-=1
  490. cmp dword arg(2), 0
  491. jg .ip_row_loop
  492. add rsp, 16
  493. pop rsp
  494. ; begin epilog
  495. pop rdi
  496. pop rsi
  497. RESTORE_GOT
  498. RESTORE_XMM
  499. UNSHADOW_ARGS
  500. pop rbp
  501. ret
  502. %undef flimit4
  503. SECTION_RODATA
  504. align 16
  505. four8s:
  506. times 4 dd 8