;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
pw_8: times 8 dw 8
bilin_filter_m_sse2: times 8 dw 16
times 8 dw 0
times 8 dw 14
times 8 dw 2
times 8 dw 12
times 8 dw 4
times 8 dw 10
times 8 dw 6
times 16 dw 8
times 8 dw 6
times 8 dw 10
times 8 dw 4
times 8 dw 12
times 8 dw 2
times 8 dw 14
bilin_filter_m_ssse3: times 8 db 16, 0
times 8 db 14, 2
times 8 db 12, 4
times 8 db 10, 6
times 16 db 8
times 8 db 6, 10
times 8 db 4, 12
times 8 db 2, 14
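; The tables above hold the eight bilinear filter phases: row k (k = 0..7)
; is the weight pair (16-2k, 2k). The sse2 table stores each phase as two
; 8-word vectors (32 bytes, matching the filter_idx_shift of 5 below),
; while the ssse3 table interleaves the weights as bytes for pmaddubsw
; (16 bytes per phase, filter_idx_shift of 4). For example, an offset of 2
; selects the pair (12, 4).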
SECTION .text
; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the sum of the differences (SE) as its return value
; and stores the sum of squared errors (SSE) through the given pointer.
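; As a rough scalar model of this contract (an illustrative sketch, not
; code from the project; bilin() stands for the bilinear interpolation
; selected by x_offset/y_offset):
;
;   int se = 0;
;   unsigned int sse_acc = 0;
;   for (i = 0; i < height; i++)
;     for (j = 0; j < N; j++) {
;       int diff = bilin(src, i, j, x_offset, y_offset) -
;                  dst[i * dst_stride + j];
;       se += diff;
;       sse_acc += diff * diff;
;     }
;   *sse = sse_acc;
;   return se;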
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
psubw %3, %4
psubw %1, %2
paddw %5, %3
pmaddwd %3, %3
paddw %5, %1
pmaddwd %1, %1
paddd %6, %3
paddd %6, %1
%endmacro
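; Per 16-bit lane, SUM_SSE computes (a sketch of the lane arithmetic):
;   d0 = src1 - dst1;  d1 = src2 - dst2;   ; psubw
;   sum += d0 + d1;                        ; paddw, 16-bit lanes
;   sse += d0*d0 + d1*d1;                  ; pmaddwd + paddd, 32-bit lanes
; pmaddwd squares each word and adds adjacent pairs, so the sse accumulator
; holds dwords while the sum accumulator stays in words.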
%macro STORE_AND_RET 1
%if %1 > 4
; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
; We have to sign-extend it before adding the words within the register
; and outputting to a dword.
pcmpgtw m5, m6 ; mask for 0 > x
movhlps m3, m7
punpcklwd m4, m6, m5
punpckhwd m6, m5 ; sign-extend m6 word->dword
paddd m7, m3
paddd m6, m4
pshufd m3, m7, 0x1
movhlps m4, m6
paddd m7, m3
paddd m6, m4
mov r1, ssem ; r1 = unsigned int *sse
pshufd m4, m6, 0x1
movd [r1], m7 ; store sse
paddd m6, m4
movd raxd, m6 ; store sum as return value
%else ; 4xh
pshuflw m4, m6, 0xe
pshuflw m3, m7, 0xe
paddw m6, m4
paddd m7, m3
pcmpgtw m5, m6 ; mask for 0 > x
mov r1, ssem ; r1 = unsigned int *sse
punpcklwd m6, m5 ; sign-extend m6 word->dword
movd [r1], m7 ; store sse
pshuflw m4, m6, 0xe
paddd m6, m4
movd raxd, m6 ; store sum as return value
%endif
RET
%endmacro
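; Worked bound for the comment above: with W=16, each word lane of m6
; accumulates two 9-bit signed differences per row (one from the low and
; one from the high eight pixels), so after H=64 rows a lane holds at most
; 2 * 64 * 255 = 32640 in magnitude, which just fits a signed 16-bit word.
; Larger blocks would overflow, hence the ASSERT in SUBPEL_VARIANCE below.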
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
add srcq, src_stridemp
%else
add srcq, src_strideq
%endif
%endmacro
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
%ifdef PIC ; 64bit PIC
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse
%endif
%define block_height heightd
%define bilin_filter sseq
%else
%if ARCH_X86=1 && CONFIG_PIC=1
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, \
height, sse, g_bilin_filter, g_pw_8
%define block_height dword heightm
%define sec_str sec_stridemp
; Store the bilin_filter and pw_8 addresses on the stack
%if GET_GOT_DEFINED == 1
GET_GOT eax
add esp, 4 ; restore esp
%endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
LOAD_IF_USED 0, 1 ; load eax, ecx back
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse, \
g_bilin_filter, g_pw_8
%define block_height heightd
; Store the bilin_filter and pw_8 addresses on the stack
%if GET_GOT_DEFINED == 1
GET_GOT eax
add esp, 4 ; restore esp
%endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
LOAD_IF_USED 0, 1 ; load eax, ecx back
%endif
%else
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
7 + 2 * ARCH_X86_64, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, \
height, sse
%if ARCH_X86_64
%define block_height heightd
%define sec_str sec_strideq
%else
%define block_height dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse
%define block_height heightd
%endif
%define bilin_filter bilin_filter_m
%endif
%endif
%if %1 == 4
%define movx movd
%else
%define movx movh
%endif
ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
; could perhaps use it for something more productive then
pxor m5, m5 ; dedicated zero register
%if %1 < 16
sar block_height, 1
%if %2 == 1 ; avg
shl sec_str, 1
%endif
%endif
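; For W < 16, the loops below process two rows per iteration, which is why
; block_height was halved (and sec_str doubled) above.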
; FIXME(rbultje) replace by jumptable?
test x_offsetd, x_offsetd
jnz .x_nonzero
; x_offset == 0
test y_offsetd, y_offsetd
jnz .x_zero_y_nonzero
; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
mova m1, [dstq]
%if %2 == 1 ; avg
pavgb m0, [secq]
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
%if %2 == 1 ; avg
%if %1 > 4
movhps m0, [srcq+src_strideq]
%else ; 4xh
movx m1, [srcq+src_strideq]
punpckldq m0, m1
%endif
%else ; !avg
movx m2, [srcq+src_strideq]
%endif
movx m1, [dstq]
movx m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
%if %1 > 4
pavgb m0, [secq]
%else
movh m2, [secq]
pavgb m0, m2
%endif
punpcklbw m3, m5
punpcklbw m1, m5
%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; 4xh
punpcklbw m0, m5
movhlps m2, m0
%endif
%else ; !avg
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_zero_y_zero_loop
STORE_AND_RET %1
.x_zero_y_nonzero:
cmp y_offsetd, 4
jne .x_zero_y_nonhalf
; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+src_strideq]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
movx m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if %1 > 4
movhps m2, [srcq+src_strideq*2]
%else ; 4xh
movx m1, [srcq+src_strideq*2]
punpckldq m2, m1
%endif
movx m1, [dstq]
%if %1 > 4
movlhps m0, m2
%else ; 4xh
punpckldq m0, m2
%endif
movx m3, [dstq+dst_strideq]
pavgb m0, m2
punpcklbw m1, m5
%if %1 > 4
pavgb m0, [secq]
punpcklbw m3, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; 4xh
movh m4, [secq]
pavgb m0, m4
punpcklbw m3, m5
punpcklbw m0, m5
movhlps m2, m0
%endif
%else ; !avg
movx m4, [srcq+src_strideq*2]
movx m1, [dstq]
pavgb m0, m2
movx m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_zero_y_half_loop
STORE_AND_RET %1
.x_zero_y_nonhalf:
; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
.x_zero_y_other_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+src_strideq]
mova m1, [dstq]
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
punpcklbw m0, m5
punpcklbw m4, m5
; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
; instructions is the same (5), but it is 1 mul instead of 2, so might be
; slightly faster because of pmullw latency. It would also cut our rodata
; tables in half for this function, and save 1-2 registers on x86-64.
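; Worked check with num=16, x=4 (weights 12 and 4):
;   (12*in1 + 4*in2 + 8) >> 4 == in1 + ((4*(in2-in1) + 8) >> 4)
; since 12*in1 + 4*in2 = 16*in1 + 4*(in2-in1) and the shift is arithmetic.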
pmullw m2, filter_y_a
pmullw m3, filter_y_b
paddw m2, filter_rnd
pmullw m0, filter_y_a
pmullw m4, filter_y_b
paddw m0, filter_rnd
paddw m2, m3
paddw m0, m4
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
movx m2, [srcq+src_strideq]
movx m4, [srcq+src_strideq*2]
movx m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
movx m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_y_a
pmullw m1, m2, filter_y_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_y_a
pmullw m4, filter_y_b
paddw m0, m1
paddw m2, filter_rnd
movx m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; 4xh
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET %1
.x_nonzero:
cmp x_offsetd, 4
jne .x_nonhalf
; x_offset == 0.5
test y_offsetd, y_offsetd
jnz .x_half_y_nonzero
; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+1]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
movx m4, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
movhps m0, [srcq+src_strideq]
movhps m4, [srcq+src_strideq+1]
%else ; 4xh
movx m1, [srcq+src_strideq]
punpckldq m0, m1
movx m2, [srcq+src_strideq+1]
punpckldq m4, m2
%endif
movx m1, [dstq]
movx m3, [dstq+dst_strideq]
pavgb m0, m4
punpcklbw m3, m5
%if %1 > 4
pavgb m0, [secq]
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; 4xh
movh m2, [secq]
pavgb m0, m2
punpcklbw m1, m5
punpcklbw m0, m5
movhlps m2, m0
%endif
%else ; !avg
movx m2, [srcq+src_strideq]
movx m1, [dstq]
pavgb m0, m4
movx m4, [srcq+src_strideq+1]
movx m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_half_y_zero_loop
STORE_AND_RET %1
.x_half_y_nonzero:
cmp y_offsetd, 4
jne .x_half_y_nonhalf
; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_half_loop:
movu m4, [srcq]
movu m3, [srcq+1]
mova m1, [dstq]
pavgb m4, m3
punpckhbw m3, m1, m5
pavgb m0, m4
%if %2 == 1 ; avg
punpcklbw m1, m5
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
movx m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_half_loop:
movx m2, [srcq]
movx m3, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
movx m1, [srcq+src_strideq]
punpckldq m2, m1
movx m1, [srcq+src_strideq+1]
punpckldq m3, m1
%endif
pavgb m2, m3
%if %1 > 4
movlhps m0, m2
movhlps m4, m2
%else ; 4xh
punpckldq m0, m2
pshuflw m4, m2, 0xe
%endif
movx m1, [dstq]
pavgb m0, m2
movx m3, [dstq+dst_strideq]
%if %1 > 4
pavgb m0, [secq]
%else
movh m2, [secq]
pavgb m0, m2
%endif
punpcklbw m3, m5
punpcklbw m1, m5
%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpcklbw m0, m5
movhlps m2, m0
%endif
%else ; !avg
movx m4, [srcq+src_strideq]
movx m1, [srcq+src_strideq+1]
pavgb m2, m3
pavgb m4, m1
pavgb m0, m2
pavgb m2, m4
movx m1, [dstq]
movx m3, [dstq+dst_strideq]
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_half_y_half_loop
STORE_AND_RET %1
.x_half_y_nonhalf:
; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_other_loop:
movu m4, [srcq]
movu m2, [srcq+1]
mova m1, [dstq]
pavgb m4, m2
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
paddw m2, filter_rnd
paddw m0, filter_rnd
psraw m2, 4
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
pmullw m2, filter_y_a
pmullw m3, filter_y_b
paddw m2, filter_rnd
punpcklbw m0, m5
paddw m2, m3
punpcklbw m3, m4, m5
pmullw m0, filter_y_a
pmullw m3, filter_y_b
paddw m0, filter_rnd
psraw m2, 4
paddw m0, m3
%endif
punpckhbw m3, m1, m5
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
movx m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
%if notcpuflag(ssse3)
punpcklbw m0, m5
%endif
.x_half_y_other_loop:
movx m2, [srcq]
movx m1, [srcq+1]
movx m4, [srcq+src_strideq]
movx m3, [srcq+src_strideq+1]
pavgb m2, m1
pavgb m4, m3
movx m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
movx m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
%else
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_y_a
pmullw m1, m2, filter_y_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_y_a
paddw m0, m1
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m2, m1
movx m1, [dstq]
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET %1
.x_nonhalf:
test y_offsetd, y_offsetd
jnz .x_nonhalf_y_nonzero
; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
.x_other_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+1]
mova m1, [dstq]
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
punpcklbw m0, m5
punpcklbw m4, m5
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
pmullw m0, filter_x_a
pmullw m4, filter_x_b
paddw m0, filter_rnd
paddw m2, m3
paddw m0, m4
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
movx m1, [srcq+1]
movx m2, [srcq+src_strideq]
movx m4, [srcq+src_strideq+1]
movx m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
punpcklbw m0, m1
movx m1, [dstq]
punpcklbw m2, m4
pmaddubsw m0, filter_x_a
pmaddubsw m2, filter_x_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m4, filter_x_b
paddw m0, m1
paddw m2, filter_rnd
movx m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET %1
.x_nonhalf_y_nonzero:
cmp y_offsetd, 4
jne .x_nonhalf_y_nonhalf
; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
%if cpuflag(ssse3)
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
paddw m0, m1
paddw m2, m3
%endif
psraw m0, 4
psraw m2, 4
add srcq, src_strideq
packuswb m0, m2
.x_other_y_half_loop:
movu m4, [srcq]
movu m3, [srcq+1]
%if cpuflag(ssse3)
mova m1, [dstq]
punpckhbw m2, m4, m3
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m4, m2
pavgb m0, m4
punpckhbw m3, m1, m5
punpcklbw m1, m5
%else
punpckhbw m2, m4, m5
punpckhbw m1, m3, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
paddw m4, m3
paddw m2, m1
mova m1, [dstq]
psraw m4, 4
psraw m2, 4
punpckhbw m3, m1, m5
; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
; have a 1-register shortage to be able to store the backup of the bilin
; filtered second line as words as cache for the next line. Packing into
; a byte costs 1 pack and 2 unpacks, but saves a register.
packuswb m4, m2
punpcklbw m1, m5
pavgb m0, m4
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
pavgb m0, [secq]
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
movx m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
paddw m0, m1
%endif
add srcq, src_strideq
psraw m0, 4
.x_other_y_half_loop:
movx m2, [srcq]
movx m1, [srcq+1]
movx m4, [srcq+src_strideq]
movx m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
movx m1, [dstq]
movx m3, [dstq+dst_strideq]
paddw m2, filter_rnd
paddw m4, filter_rnd
%else
punpcklbw m2, m5
punpcklbw m1, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
movx m1, [dstq]
paddw m4, m3
movx m3, [dstq+dst_strideq]
%endif
psraw m2, 4
psraw m4, 4
pavgw m0, m2
pavgw m2, m4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET %1
.x_nonhalf_y_nonhalf:
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m11, [bilin_filter+y_offsetq+16]
%endif
mova m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is no unused register. We reuse the src_stride
; register; src_stride then has to be reloaded from the stack whenever
; it is needed.
%define tempq src_strideq
mov tempq, g_bilin_filterm
add x_offsetq, tempq
add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
; x_offset == bilin interpolation && y_offset == bilin interpolation
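; In scalar terms, this case computes per output pixel (a sketch; fa/fb
; denote the weight pair selected by each offset):
;   h0 = (fa_x*s[i][j]   + fb_x*s[i][j+1]   + 8) >> 4   ; horizontal pass
;   h1 = (fa_x*s[i+1][j] + fb_x*s[i+1][j+1] + 8) >> 4
;   out = (fa_y*h0 + fb_y*h1 + 8) >> 4                  ; vertical pass
; m0 carries the previous row's horizontal result across loop iterations,
; so each source row is filtered horizontally only once.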
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
%if cpuflag(ssse3)
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
paddw m0, m1
paddw m2, m3
%endif
psraw m0, 4
psraw m2, 4
INC_SRC_BY_SRC_STRIDE
packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
movu m4, [srcq]
movu m3, [srcq+1]
mova m1, [dstq]
punpckhbw m2, m4, m3
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
punpckhbw m3, m1, m5
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m4, m2
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
punpcklbw m1, m5
paddw m2, filter_rnd
paddw m0, filter_rnd
psraw m2, 4
psraw m0, 4
%else
movu m3, [srcq]
movu m4, [srcq+1]
punpckhbw m1, m3, m5
punpckhbw m2, m4, m5
punpcklbw m3, m5
punpcklbw m4, m5
pmullw m3, filter_x_a
pmullw m4, filter_x_b
paddw m3, filter_rnd
pmullw m1, filter_x_a
pmullw m2, filter_x_b
paddw m1, filter_rnd
paddw m3, m4
paddw m1, m2
psraw m3, 4
psraw m1, 4
packuswb m4, m3, m1
punpckhbw m2, m0, m5
punpcklbw m0, m5
pmullw m2, filter_y_a
pmullw m1, filter_y_b
paddw m2, filter_rnd
pmullw m0, filter_y_a
pmullw m3, filter_y_b
paddw m2, m1
mova m1, [dstq]
paddw m0, filter_rnd
psraw m2, 4
paddw m0, m3
punpckhbw m3, m1, m5
psraw m0, 4
punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
INC_SRC_BY_SRC_STRIDE
add dstq, dst_strideq
%else ; %1 < 16
movx m0, [srcq]
movx m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
paddw m0, m1
%endif
psraw m0, 4
%if cpuflag(ssse3)
packuswb m0, m0
%endif
INC_SRC_BY_SRC_STRIDE
.x_other_y_other_loop:
movx m2, [srcq]
movx m1, [srcq+1]
INC_SRC_BY_SRC_STRIDE
movx m4, [srcq]
movx m3, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
movx m3, [dstq+dst_strideq]
movx m1, [dstq]
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m2, m2
packuswb m4, m4
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
psraw m0, 4
psraw m2, 4
punpcklbw m1, m5
%else
punpcklbw m2, m5
punpcklbw m1, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
paddw m4, m3
psraw m2, 4
psraw m4, 4
pmullw m0, filter_y_a
pmullw m3, m2, filter_y_b
paddw m0, filter_rnd
pmullw m2, filter_y_a
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m0, m3
movx m3, [dstq+dst_strideq]
paddw m2, m1
movx m1, [dstq]
psraw m0, 4
psraw m2, 4
punpcklbw m3, m5
punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
INC_SRC_BY_SRC_STRIDE
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec block_height
jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
%undef movx
STORE_AND_RET %1
%endmacro
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are
; identical between the ssse3 and non-ssse3 versions. It may make sense to
; merge their code in the sense that the ssse3 version would jump to the
; appropriate location in the sse2 version, rather than duplicating that
; code in the binary.
INIT_XMM sse2
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
INIT_XMM ssse3
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
INIT_XMM sse2
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
INIT_XMM ssse3
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
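; Each INIT_XMM/SUBPEL_VARIANCE pair above emits one function per block
; width (the second macro argument selects the *_avg_* variant); through
; x86inc's cglobal name mangling these become symbols along the lines of
; vpx_sub_pixel_variance16xh_sse2 or vpx_sub_pixel_avg_variance8xh_ssse3
; (the exact prefix depends on the x86inc configuration).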