GR32_BlendSSE2.pas 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645
  1. unit GR32_BlendSSE2;
  2. (* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1 or LGPL 2.1 with linking exception
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * Alternatively, the contents of this file may be used under the terms of the
  16. * Free Pascal modified version of the GNU Lesser General Public License
  17. * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
  18. * of this license are applicable instead of those above.
  19. * Please see the file LICENSE.txt for additional information concerning this
  20. * license.
  21. *
  22. * The Original Code is Graphics32
  23. *
  24. * The Initial Developer of the Original Code is
  25. * Alex A. Denisov
  26. *
  27. * Portions created by the Initial Developer are Copyright (C) 2000-2009
  28. * the Initial Developer. All Rights Reserved.
  29. *
  30. * Contributor(s):
  31. * Christian-W. Budde
  32. * - 2019/04/01 - Refactoring
  33. *
  34. * ***** END LICENSE BLOCK ***** *)
  35. interface
  36. {$I GR32.inc}
  37. uses
  38. GR32;
  // SSE2 implementations of the blend, combine and color-arithmetic primitives
  // exported by this unit. Under FPC every routine is declared `assembler`.
  // The blend/combine routines take TColor32 values or PColor32 pointers; the
  // line variants additionally take a pixel Count.
  39. function BlendReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  40. procedure BlendMem_SSE2(F: TColor32; var B: TColor32); {$IFDEF FPC} assembler; {$ENDIF}
  41. procedure BlendMems_SSE2(F: TColor32; B: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF}
  42. function BlendRegEx_SSE2(F, B: TColor32; M: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  43. procedure BlendMemEx_SSE2(F: TColor32; var B:TColor32; M: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  44. function BlendRegRGB_SSE2(F, B: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  45. procedure BlendMemRGB_SSE2(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  46. procedure BlendLine_SSE2(Src, Dst: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF}
  47. procedure BlendLineEx_SSE2(Src, Dst: PColor32; Count: Integer; M: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  48. function CombineReg_SSE2(X, Y: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  49. procedure CombineMem_SSE2(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  50. procedure CombineLine_SSE2(Src, Dst: PColor32; Count: Integer; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  51. function MergeReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  52. procedure EMMS_SSE2; {$IFDEF FPC} assembler; {$ENDIF}
  53. function LightenReg_SSE2(C: TColor32; Amount: Integer): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  54. function ColorAdd_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  55. function ColorSub_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  56. function ColorModulate_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  57. function ColorMax_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  58. function ColorMin_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  59. function ColorDifference_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  60. function ColorExclusion_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  61. function ColorScale_SSE2(C: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  62. implementation
  63. uses
  64. GR32_Blend,
  65. GR32_LowLevel,
  66. GR32_System;
  67. const
  // NOTE(review): not referenced in the visible part of this unit; presumably
  // the priority under which the SSE2 routines are registered with the blend
  // function registry -- confirm at the use site.
  68. BlendRegistryPrioritySSE2 = -768;
  69. { SSE2 versions }
  // Blends the foreground color F over the background color B, using the alpha
  // byte of F as the weight, and returns the result with its alpha byte forced
  // to $FF. Per 16-bit channel: Q := (Fa * (F - B) + (B shl 8) + bias) shr 8.
  // bias_ptr is declared outside this routine; it is dereferenced as a 128-bit
  // PADDW operand, so it presumably addresses a 16-byte-aligned vector of
  // rounding words -- TODO confirm against its declaration.
  70. function BlendReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  71. asm
  72. // blend foreground color (F) to a background color (B),
  73. // using alpha channel value of F
  74. // x86: EAX <- F, x64: ECX <- F
  75. // EDX <- B (both targets)
  76. // Result := Fa * (Fargb - Bargb) + Bargb
  77. {$IFDEF TARGET_x86}
  78. MOVD XMM0,EAX // XMM0 <- 00 00 00 00 00 00 00 00 00 00 00 00 Fa Fr Fg Fb
  79. PXOR XMM3,XMM3 // XMM3 <- 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  80. MOVD XMM2,EDX // XMM2 <- 00 00 00 00 00 00 00 00 00 00 00 00 Ba Br Bg Bb
  81. PUNPCKLBW XMM0,XMM3 // XMM0 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fr 00 Fg 00 Fb
  82. MOV ECX,bias_ptr // ECX <- Pointer to Bias
  83. PUNPCKLBW XMM2,XMM3 // XMM2 <- 00 00 00 00 00 00 00 00 00 Ba 00 Br 00 Bg 00 Bb
  84. MOVQ XMM1,XMM0 // XMM1 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fr 00 Fg 00 Fb
  85. PSHUFLW XMM1,XMM1,$FF // XMM1 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fa 00 Fa 00 Fa
  86. PSUBW XMM0,XMM2 // XMM0 <- 00 00 00 00 00 00 00 00 00 Da 00 Dr 00 Dg 00 Db
  87. PSLLW XMM2,8 // XMM2 <- 00 00 00 00 00 00 00 00 Ba 00 Br 00 Bg 00 Bb 00
  88. PMULLW XMM0,XMM1 // XMM0 <- 00 00 00 00 00 00 00 00 Pa ** Pr ** Pg ** Pb **
  89. PADDW XMM2,[ECX] // add bias
  90. PADDW XMM2,XMM0 // XMM2 <- 00 00 00 00 00 00 00 00 Qa ** Qr ** Qg ** Qb **
  91. PSRLW XMM2,8 // XMM2 <- 00 00 00 00 00 00 00 00 00 Qa 00 Qr 00 Qg 00 Qb
  92. PACKUSWB XMM2,XMM3 // XMM2 <- 00 00 00 00 00 00 00 00 00 00 00 00 Qa Qr Qg Qb
  93. MOVD EAX,XMM2 // EAX <- Za Zr Zg Zb
  94. OR EAX,$FF000000 // EAX <- FF Zr Zg Zb
  95. {$ENDIF}
  96. {$IFDEF TARGET_x64}
  97. MOVD XMM0,ECX // XMM0 <- 00 00 00 00 00 00 00 00 00 00 00 00 Fa Fr Fg Fb
  98. PXOR XMM3,XMM3 // XMM3 <- 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  99. MOVD XMM2,EDX // XMM2 <- 00 00 00 00 00 00 00 00 00 00 00 00 Ba Br Bg Bb
  100. PUNPCKLBW XMM0,XMM3 // XMM0 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fr 00 Fg 00 Fb
  101. {$IFNDEF FPC}
  102. MOV RAX,bias_ptr // RAX <- Pointer to Bias
  103. {$ELSE}
  104. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  105. {$ENDIF}
  106. PUNPCKLBW XMM2,XMM3 // XMM2 <- 00 00 00 00 00 00 00 00 00 Ba 00 Br 00 Bg 00 Bb
  107. MOVQ XMM1,XMM0 // XMM1 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fr 00 Fg 00 Fb
  108. PSHUFLW XMM1,XMM1,$FF // XMM1 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fa 00 Fa 00 Fa
  109. PSUBW XMM0,XMM2 // XMM0 <- 00 00 00 00 00 00 00 00 00 Da 00 Dr 00 Dg 00 Db
  110. PSLLW XMM2,8 // XMM2 <- 00 00 00 00 00 00 00 00 Ba 00 Br 00 Bg 00 Bb 00
  111. PMULLW XMM0,XMM1 // XMM0 <- 00 00 00 00 00 00 00 00 Pa ** Pr ** Pg ** Pb **
  112. PADDW XMM2,[RAX] // add bias
  113. PADDW XMM2,XMM0 // XMM2 <- 00 00 00 00 00 00 00 00 Qa ** Qr ** Qg ** Qb **
  114. PSRLW XMM2,8 // XMM2 <- 00 00 00 00 00 00 00 00 00 Qa 00 Qr 00 Qg 00 Qb
  115. PACKUSWB XMM2,XMM3 // XMM2 <- 00 00 00 00 00 00 00 00 00 00 00 00 Qa Qr Qg Qb
  116. MOVD EAX,XMM2 // EAX <- Za Zr Zg Zb
  117. OR EAX,$FF000000 // EAX <- FF Zr Zg Zb
  118. {$ENDIF}
  119. end;
  // Blends the foreground color F onto the background pixel B in place, using
  // the alpha byte of F as the weight.
  //   Fa = 0   : B is left untouched.
  //   Fa = $FF : F is stored over B unchanged.
  //   otherwise: per channel B := (Fa * (F - B) + (B shl 8) + bias) shr 8.
  // Unlike BlendReg_SSE2, the alpha byte written back is the blended value; it
  // is not forced to $FF. bias_ptr is declared outside this routine.
  120. procedure BlendMem_SSE2(F: TColor32; var B: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  121. asm
  122. {$IFDEF TARGET_x86}
  123. // EAX <- F (foreground)
  124. // [EDX] <- B (background, updated in place)
  125. // [EDX] := Fa * (F - B) + B
  126. TEST EAX,$FF000000 // any alpha bits set in F?
  127. JZ @1 // Fa = 0: nothing to do
  128. CMP EAX,$FF000000
  129. JNC @2 // F >= $FF000000 (unsigned), i.e. Fa = $FF: plain store
  130. PXOR XMM3,XMM3 // XMM3 <- zero, used for unpacking and packing
  131. MOVD XMM0,EAX
  132. MOVD XMM2,[EDX]
  133. PUNPCKLBW XMM0,XMM3 // F bytes -> words
  134. MOV ECX,bias_ptr
  135. PUNPCKLBW XMM2,XMM3 // B bytes -> words
  136. MOVQ XMM1,XMM0
  137. PSHUFLW XMM1,XMM1,$FF // broadcast Fa to the four low words
  138. PSUBW XMM0,XMM2 // F - B
  139. PSLLW XMM2,8 // B shl 8
  140. PMULLW XMM0,XMM1 // Fa * (F - B)
  141. PADDW XMM2,[ECX] // add bias
  142. PADDW XMM2,XMM0
  143. PSRLW XMM2,8
  144. PACKUSWB XMM2,XMM3 // words -> bytes with unsigned saturation
  145. MOVD [EDX],XMM2
  146. @1: RET
  147. @2: MOV [EDX], EAX // opaque foreground replaces the background
  148. {$ENDIF}
  149. {$IFDEF TARGET_x64}
  150. // ECX <- F (foreground)
  151. // [RDX] <- B (background, updated in place)
  152. // [RDX] := Fa * (F - B) + B
  153. TEST ECX,$FF000000 // any alpha bits set in F?
  154. JZ @1 // Fa = 0: nothing to do
  155. CMP ECX,$FF000000
  156. JNC @2 // Fa = $FF: plain store
  157. PXOR XMM3,XMM3
  158. MOVD XMM0,ECX
  159. MOVD XMM2,[RDX]
  160. PUNPCKLBW XMM0,XMM3
  161. {$IFNDEF FPC}
  162. MOV RAX,bias_ptr
  163. {$ELSE}
  164. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  165. {$ENDIF}
  166. PUNPCKLBW XMM2,XMM3
  167. MOVQ XMM1,XMM0
  168. PSHUFLW XMM1,XMM1,$FF // broadcast Fa to the four low words
  169. PSUBW XMM0,XMM2
  170. PSLLW XMM2,8
  171. PMULLW XMM0,XMM1
  172. PADDW XMM2,[RAX] // add bias
  173. PADDW XMM2,XMM0
  174. PSRLW XMM2,8
  175. PACKUSWB XMM2,XMM3
  176. MOVD [RDX],XMM2
  177. @1: RET
  178. @2: MOV [RDX], ECX // opaque foreground replaces the background
  179. {$ENDIF}
  180. end;
  // Blends the single foreground color F onto Count consecutive pixels starting
  // at B, using the alpha byte of F as the weight.
  //   Count <= 0 or Fa = 0 : returns without touching memory.
  //   Fa = $FF             : every pixel is overwritten with F.
  //   otherwise            : per channel B := (Fa * (F - B) + (B shl 8) + bias) shr 8.
  // The Count guard uses JLE rather than JZ: with JZ a negative Count entered
  // the DEC/JNZ loop and ran until the counter wrapped, writing far beyond the
  // buffer. bias_ptr is declared outside this routine.
  181. procedure BlendMems_SSE2(F: TColor32; B: PColor32; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  182. asm
  183. {$IFDEF TARGET_x86}
  // EAX <- F
  // EDX <- B
  // ECX <- Count
  184. TEST ECX,ECX
  185. JLE @2 // Count <= 0: nothing to do (taken before EBX is pushed)
  186. TEST EAX,$FF000000
  187. JZ @2 // Fa = 0: nothing to do
  188. PUSH EBX
  189. MOV EBX,EAX
  190. SHR EBX,24 // EBX <- Fa
  191. CMP EBX,$FF
  192. JZ @3 // opaque: plain fill
  193. MOVD XMM4,EAX
  194. PXOR XMM3,XMM3
  195. PUNPCKLBW XMM4,XMM3 // XMM4 <- F as four words, kept for the whole loop
  196. MOV EBX,bias_ptr
  197. @1:
  198. MOVD XMM2,[EDX]
  199. PUNPCKLBW XMM2,XMM3 // B bytes -> words
  200. MOVQ XMM1,XMM4
  // the next three unpacks (together with PUNPCKHDQ below) leave Fa in each of
  // the four low words of XMM1
  201. PUNPCKLBW XMM1,XMM3
  202. PUNPCKHWD XMM1,XMM1
  203. MOVQ XMM0,XMM4
  204. PSUBW XMM0,XMM2 // F - B
  205. PUNPCKHDQ XMM1,XMM1
  206. PSLLW XMM2,8 // B shl 8
  207. PMULLW XMM0,XMM1 // Fa * (F - B)
  208. PADDW XMM2,[EBX] // add bias
  209. PADDW XMM2,XMM0
  210. PSRLW XMM2,8
  211. PACKUSWB XMM2,XMM3
  212. MOVD [EDX],XMM2
  213. ADD EDX,4
  214. DEC ECX
  215. JNZ @1
  216. POP EBX
  217. @2:
  218. RET
  219. @3:
  220. MOV [EDX],EAX // opaque fill
  221. ADD EDX,4
  222. DEC ECX
  223. JNZ @3
  224. POP EBX
  225. {$ENDIF}
  226. {$IFDEF TARGET_x64}
  // ECX <- F
  // RDX <- B
  // R8D <- Count
  227. TEST R8D,R8D
  228. JLE @2 // Count <= 0: nothing to do
  229. TEST ECX,$FF000000
  230. JZ @2 // Fa = 0: nothing to do
  231. MOV RAX,RCX
  232. SHR EAX,24 // EAX <- Fa
  233. CMP EAX,$FF
  234. JZ @3 // opaque: plain fill
  235. MOVD XMM4,ECX
  236. PXOR XMM3,XMM3
  237. PUNPCKLBW XMM4,XMM3 // XMM4 <- F as four words, kept for the whole loop
  238. {$IFNDEF FPC}
  239. MOV RAX,bias_ptr // RAX <- Pointer to Bias
  240. {$ELSE}
  241. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  242. {$ENDIF}
  243. @1:
  244. MOVD XMM2,[RDX]
  245. PUNPCKLBW XMM2,XMM3
  246. MOVQ XMM1,XMM4
  247. PUNPCKLBW XMM1,XMM3
  248. PUNPCKHWD XMM1,XMM1
  249. MOVQ XMM0,XMM4
  250. PSUBW XMM0,XMM2
  251. PUNPCKHDQ XMM1,XMM1 // XMM1 low words <- Fa Fa Fa Fa
  252. PSLLW XMM2,8
  253. PMULLW XMM0,XMM1
  254. PADDW XMM2,[RAX] // add bias
  255. PADDW XMM2,XMM0
  256. PSRLW XMM2,8
  257. PACKUSWB XMM2,XMM3
  258. MOVD [RDX], XMM2
  259. ADD RDX,4
  260. DEC R8D
  261. JNZ @1
  262. @2:
  263. RET
  264. @3:
  265. MOV [RDX],ECX // opaque fill
  266. ADD RDX,4
  267. DEC R8D
  268. JNZ @3
  269. {$ENDIF}
  270. end;
  // Blends the foreground color F over the background color B with the alpha
  // of F scaled by the master alpha M, and returns the result.
  // The effective weight is W := ((M + 1) * Fa) shr 8. If W = 0 the background
  // B is returned unchanged; otherwise per channel
  //   Result := ((F - B) * alpha_table[W] + (B shl 8) + bias) shr 8.
  // alpha_ptr and bias_ptr are declared outside this routine. alpha_ptr is
  // indexed in 16-byte steps (W shl 4), so it presumably addresses a table of
  // 128-bit entries holding the weight replicated per word -- TODO confirm.
  271. function BlendRegEx_SSE2(F, B: TColor32; M: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  272. asm
  273. // blend foreground color (F) to a background color (B),
  274. // using alpha channel value of F
  275. // Result := M * Fa * (Fargb - Bargb) + Bargb
  276. {$IFDEF TARGET_x86}
  277. // EAX <- F
  278. // EDX <- B
  279. // ECX <- M
  280. PUSH EBX // EBX is used as scratch and restored on both exits
  281. MOV EBX,EAX
  282. SHR EBX,24 // EBX <- Fa
  283. INC ECX // 255:256 range bias
  284. IMUL ECX,EBX
  285. SHR ECX,8 // ECX <- W
  286. JZ @1 // W = 0: return B
  287. PXOR XMM0,XMM0
  288. MOVD XMM1,EAX
  289. SHL ECX,4 // W * 16: byte offset into the alpha table
  290. MOVD XMM2,EDX
  291. PUNPCKLBW XMM1,XMM0
  292. PUNPCKLBW XMM2,XMM0
  293. ADD ECX,alpha_ptr
  294. PSUBW XMM1,XMM2 // F - B
  295. PMULLW XMM1,[ECX] // * table entry for W
  296. PSLLW XMM2,8 // B shl 8
  297. MOV ECX,bias_ptr
  298. PADDW XMM2,[ECX] // add bias
  299. PADDW XMM1,XMM2
  300. PSRLW XMM1,8
  301. PACKUSWB XMM1,XMM0
  302. MOVD EAX,XMM1
  303. POP EBX
  304. RET
  305. @1: MOV EAX,EDX // fully transparent: result is the background
  306. POP EBX
  307. {$ENDIF}
  308. {$IFDEF TARGET_x64}
  309. // ECX <- F
  310. // EDX <- B
  311. // R8D <- M
  312. MOV EAX,ECX
  313. SHR EAX,24 // EAX <- Fa
  314. INC R8D // 255:256 range bias
  315. IMUL R8D,EAX
  316. SHR R8D,8 // R8D <- W
  317. JZ @1 // W = 0: return B
  318. PXOR XMM0,XMM0
  319. MOVD XMM1,ECX
  320. SHL R8D,4 // W * 16: byte offset into the alpha table
  321. MOVD XMM2,EDX
  322. PUNPCKLBW XMM1,XMM0
  323. PUNPCKLBW XMM2,XMM0
  324. {$IFNDEF FPC}
  325. ADD R8,alpha_ptr
  326. {$ELSE}
  327. ADD R8,[RIP+alpha_ptr]
  328. {$ENDIF}
  329. PSUBW XMM1,XMM2 // F - B
  330. PMULLW XMM1,[R8] // * table entry for W
  331. PSLLW XMM2,8 // B shl 8
  332. {$IFNDEF FPC}
  333. MOV R8,bias_ptr
  334. {$ELSE}
  335. MOV R8,[RIP+bias_ptr]
  336. {$ENDIF}
  337. PADDW XMM2,[R8] // add bias
  338. PADDW XMM1,XMM2
  339. PSRLW XMM1,8
  340. PACKUSWB XMM1,XMM0
  341. MOVD EAX,XMM1
  342. RET
  343. @1: MOV EAX,EDX // fully transparent: result is the background
  344. {$ENDIF}
  345. end;
  // In-place variant of BlendRegEx_SSE2: blends the foreground color F onto the
  // background pixel B with the alpha of F scaled by the master alpha M.
  // B is left untouched when Fa = 0 or when the effective weight
  // W := ((M + 1) * Fa) shr 8 is zero; otherwise per channel
  //   B := ((F - B) * alpha_table[W] + (B shl 8) + bias) shr 8.
  // alpha_ptr and bias_ptr are declared outside this routine.
  346. procedure BlendMemEx_SSE2(F: TColor32; var B:TColor32; M: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  347. asm
  348. {$IFDEF TARGET_x86}
  349. // blend foreground color (F) to a background color (B),
  350. // using alpha channel value of F
  351. // EAX <- F
  352. // [EDX] <- B
  353. // ECX <- M
  354. // Result := M * Fa * (Fargb - Bargb) + Bargb
  355. TEST EAX,$FF000000
  356. JZ @2 // Fa = 0: nothing to do (taken before EBX is pushed)
  357. PUSH EBX
  358. MOV EBX,EAX // EBX <- Fa Fr Fg Fb
  359. SHR EBX,24 // EBX <- 00 00 00 Fa
  360. INC ECX // 255:256 range bias
  361. IMUL ECX,EBX // ECX <- 00 00 W **
  362. SHR ECX,8 // ECX <- 00 00 00 W
  363. JZ @1 // W = 0: nothing to do
  364. PXOR XMM0,XMM0 // XMM0 <- 00 00 00 00 00 00 00 00
  365. MOVD XMM1,EAX // XMM1 <- 00 00 00 00 Fa Fr Fg Fb
  366. SHL ECX,4 // W * 16: byte offset into the alpha table
  367. MOVD XMM2,[EDX] // XMM2 <- 00 00 00 00 Ba Br Bg Bb
  368. PUNPCKLBW XMM1,XMM0
  369. PUNPCKLBW XMM2,XMM0
  370. ADD ECX,alpha_ptr
  371. PSUBW XMM1,XMM2 // F - B
  372. PMULLW XMM1,[ECX] // * table entry for W
  373. PSLLW XMM2,8 // B shl 8
  374. MOV ECX,bias_ptr
  375. PADDW XMM2,[ECX] // add bias
  376. PADDW XMM1,XMM2
  377. PSRLW XMM1,8
  378. PACKUSWB XMM1,XMM0
  379. MOVD [EDX],XMM1
  380. @1:
  381. POP EBX
  382. @2:
  383. {$ENDIF}
  384. {$IFDEF TARGET_x64}
  385. // blend foreground color (F) to a background color (B),
  386. // using alpha channel value of F
  387. // RCX <- F
  388. // [RDX] <- B
  389. // R8 <- M
  390. // Result := M * Fa * (Fargb - Bargb) + Bargb
  391. TEST ECX,$FF000000
  392. JZ @1 // Fa = 0: nothing to do
  393. MOV R9D,ECX
  394. SHR R9D,24 // R9D <- Fa
  395. INC R8D // 255:256 range bias
  396. IMUL R8D,R9D
  397. SHR R8D,8 // R8D <- W
  398. JZ @1 // W = 0: nothing to do
  399. PXOR XMM0,XMM0
  400. MOVD XMM1,ECX
  401. SHL R8D,4 // W * 16: byte offset into the alpha table
  402. MOVD XMM2,[RDX]
  403. PUNPCKLBW XMM1,XMM0
  404. PUNPCKLBW XMM2,XMM0
  405. {$IFNDEF FPC}
  406. ADD R8,alpha_ptr
  407. {$ELSE}
  408. ADD R8,[RIP+alpha_ptr]
  409. {$ENDIF}
  410. PSUBW XMM1,XMM2 // F - B
  411. PMULLW XMM1,[R8] // * table entry for W
  412. PSLLW XMM2,8 // B shl 8
  413. {$IFNDEF FPC}
  414. MOV R8,bias_ptr
  415. {$ELSE}
  416. MOV R8,[RIP+bias_ptr]
  417. {$ENDIF}
  418. PADDW XMM2,[R8] // add bias
  419. PADDW XMM1,XMM2
  420. PSRLW XMM1,8
  421. PACKUSWB XMM1,XMM0
  422. MOVD DWORD PTR [RDX],XMM1
  423. @1:
  424. {$ENDIF}
  425. end;
  // Blends F over B with a separate 8-bit weight per channel taken from W and
  // returns the result: per channel
  //   Result := ((F - B) * w + (B shl 8) + bias) shr 8.
  // W is byte-swapped before it is unpacked, so byte k of W (counting from the
  // least significant byte) weights byte 3-k of the colors; e.g. the lowest
  // byte of W weights the highest (alpha) byte of F and B.
  // bias_ptr is declared outside this routine.
  426. function BlendRegRGB_SSE2(F, B: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  427. asm
  428. {$IFDEF TARGET_x86}
  // EAX <- F
  // EDX <- B
  // ECX <- W
  429. PXOR XMM2,XMM2 // XMM2 <- zero, used for unpacking and packing
  430. MOVD XMM0,EAX
  431. PUNPCKLBW XMM0,XMM2 // F bytes -> words
  432. MOVD XMM1,EDX
  433. PUNPCKLBW XMM1,XMM2 // B bytes -> words
  434. BSWAP ECX // reverse the byte order of the weights
  435. PSUBW XMM0,XMM1 // F - B
  436. MOVD XMM3,ECX
  437. PUNPCKLBW XMM3,XMM2 // weight bytes -> words
  438. PMULLW XMM0,XMM3 // (F - B) * w
  439. MOV EAX,bias_ptr
  440. PSLLW XMM1,8 // B shl 8
  441. PADDW XMM1,[EAX] // add bias
  442. PADDW XMM1,XMM0
  443. PSRLW XMM1,8
  444. PACKUSWB XMM1,XMM2
  445. MOVD EAX,XMM1
  446. {$ENDIF}
  447. {$IFDEF TARGET_x64}
  // ECX <- F
  // EDX <- B
  // R8D <- W
  448. PXOR XMM2,XMM2
  449. MOVD XMM0,ECX
  450. PUNPCKLBW XMM0,XMM2
  451. MOVD XMM1,EDX
  452. PUNPCKLBW XMM1,XMM2
  453. BSWAP R8D // reverse the byte order of the weights
  454. PSUBW XMM0,XMM1 // F - B
  455. MOVD XMM3,R8D
  456. PUNPCKLBW XMM3,XMM2
  457. PMULLW XMM0,XMM3 // (F - B) * w
  458. {$IFNDEF FPC}
  459. MOV RAX,bias_ptr
  460. {$ELSE}
  461. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  462. {$ENDIF}
  463. PSLLW XMM1,8 // B shl 8
  464. PADDW XMM1,[RAX] // add bias
  465. PADDW XMM1,XMM0
  466. PSRLW XMM1,8
  467. PACKUSWB XMM1,XMM2
  468. MOVD EAX,XMM1
  469. {$ENDIF}
  470. end;
  // In-place per-channel weighted blend of F onto the pixel B; W supplies one
  // 8-bit weight per channel, applied in reversed byte order (byte k of W
  // weights byte 3-k of the colors).
  // The two targets use different arithmetic:
  //   x86: B := ((F - B) * w + (B shl 8) + bias) shr 8
  //   x64: B := ((F * w + bias) shr 8) + B - ((B * w + bias) shr 8)
  // NOTE(review): the two forms round differently, so results may differ by
  // one unit between targets -- confirm whether that is intended.
  // bias_ptr is declared outside this routine.
  471. procedure BlendMemRGB_SSE2(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  472. asm
  473. {$IFDEF TARGET_x86}
  // EAX <- F
  // [EDX] <- B
  // ECX <- W
  474. PXOR XMM2,XMM2
  475. MOVD XMM0,EAX
  476. PUNPCKLBW XMM0,XMM2
  477. MOVD XMM1,[EDX]
  478. PUNPCKLBW XMM1,XMM2
  479. BSWAP ECX // reverse the byte order of the weights
  480. PSUBW XMM0,XMM1 // F - B
  481. MOVD XMM3,ECX
  482. PUNPCKLBW XMM3,XMM2
  483. PMULLW XMM0,XMM3 // (F - B) * w
  484. MOV EAX,bias_ptr
  485. PSLLW XMM1,8 // B shl 8
  486. PADDW XMM1,[EAX] // add bias
  487. PADDW XMM1,XMM0
  488. PSRLW XMM1,8
  489. PACKUSWB XMM1,XMM2
  490. MOVD [EDX],XMM1
  491. {$ENDIF}
  492. {$IFDEF TARGET_x64}
  // ECX <- F
  // [RDX] <- B
  // R8D <- W
  493. MOVD XMM1,R8D
  494. PXOR XMM4,XMM4
  495. {$IFNDEF FPC}
  496. MOV RAX,bias_ptr
  497. {$ELSE}
  498. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  499. {$ENDIF}
  500. MOVQ XMM5,[RAX] // XMM5 <- low 64 bits of the bias vector
  501. MOVD XMM0,ECX
  502. MOVD XMM2,[RDX]
  503. PUNPCKLBW XMM0,XMM4 // F bytes -> words
  504. PUNPCKLBW XMM1,XMM4 // W bytes -> words
  505. PUNPCKLBW XMM2,XMM4 // B bytes -> words
  506. PSHUFLW XMM1,XMM1,$1B // reverse the order of the four weight words
  507. // C = wA + B - wB
  508. PMULLW XMM0,XMM1 // w * F
  509. PADDW XMM0,XMM5
  510. PSRLW XMM0,8
  511. PADDW XMM0,XMM2 // + B
  512. PMULLW XMM2,XMM1 // w * B
  513. PADDW XMM2,XMM5
  514. PSRLW XMM2,8
  515. PSUBW XMM0,XMM2 // - wB
  516. PACKUSWB XMM0,XMM4
  517. MOVD [RDX],XMM0
  518. {$ENDIF}
  519. end;
  // Experimental routine, compiled only when TEST_BLENDMEMRGB128SSE4 is defined.
  // Applies the per-channel weighted blend C = wA + B - wB to the two adjacent
  // pixels stored at B (64-bit load and store). The same foreground F is used
  // for both pixels (duplicated into the second dword with PINSRD, an SSE4.1
  // instruction); the 64-bit W carries one 8-bit weight per channel for each
  // of the two pixels, with the word order reversed inside each pixel.
  // NOTE(review): the x86 branch reads the stack parameter W by name while the
  // FPC declaration is `nostackframe`, and the x64 branch uses the RIP-relative
  // bias_ptr form unconditionally -- verify before enabling the define.
  520. {$IFDEF TEST_BLENDMEMRGB128SSE4}
  521. procedure BlendMemRGB128_SSE4(F: TColor32; var B: TColor32; W: UInt64); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  522. asm
  523. {$IFDEF TARGET_x86}
  524. MOVQ XMM1,W
  525. PXOR XMM4,XMM4
  526. MOV ECX,[bias_ptr]
  527. MOVDQA XMM5,[ECX] // aligned 128-bit load of the bias vector
  528. MOVD XMM0,EAX
  529. PINSRD XMM0,EAX,1 // duplicate F into the second dword
  530. MOVQ XMM2,[EDX].QWORD // two background pixels
  531. PUNPCKLBW XMM0,XMM4
  532. PUNPCKLBW XMM1,XMM4
  533. PUNPCKLBW XMM2,XMM4
  534. PSHUFLW XMM1,XMM1,$1B // reverse weight words of the first pixel
  535. PSHUFHW XMM1,XMM1,$1B // reverse weight words of the second pixel
  536. // C = wA + B - wB
  537. PMULLW XMM0,XMM1
  538. PADDW XMM0,XMM5
  539. PSRLW XMM0,8
  540. PADDW XMM0,XMM2
  541. PMULLW XMM2,XMM1
  542. PADDW XMM2,XMM5
  543. PSRLW XMM2,8
  544. PSUBW XMM0,XMM2
  545. PACKUSWB XMM0,XMM4
  546. MOVQ [EDX].QWORD,XMM0
  547. {$ENDIF}
  548. {$IFDEF TARGET_x64}
  549. MOVQ XMM1,R8
  550. PXOR XMM4,XMM4
  551. MOV RAX,[RIP+bias_ptr]
  552. MOVDQA XMM5,[RAX] // aligned 128-bit load of the bias vector
  553. MOVD XMM0,ECX
  554. PINSRD XMM0,ECX,1 // duplicate F into the second dword
  555. MOVQ XMM2,[RDX].QWORD // two background pixels
  556. PUNPCKLBW XMM0,XMM4
  557. PUNPCKLBW XMM1,XMM4
  558. PUNPCKLBW XMM2,XMM4
  559. PSHUFLW XMM1,XMM1,$1B // reverse weight words of the first pixel
  560. PSHUFHW XMM1,XMM1,$1B // reverse weight words of the second pixel
  561. // C = wA + B - wB
  562. PMULLW XMM0,XMM1
  563. PADDW XMM0,XMM5
  564. PSRLW XMM0,8
  565. PADDW XMM0,XMM2
  566. PMULLW XMM2,XMM1
  567. PADDW XMM2,XMM5
  568. PSRLW XMM2,8
  569. PSUBW XMM0,XMM2
  570. PACKUSWB XMM0,XMM4
  571. MOVQ [RDX].QWORD,XMM0
  572. {$ENDIF}
  573. end;
  574. {$ENDIF}
  // Blends Count pixels from Src onto Dst in place, each source pixel weighted
  // by its own alpha. Returns immediately when Count <= 0.
  // Per pixel, with a = source alpha:
  //   A' := source with its color channels multiplied by a ((c * a + bias) shr 8),
  //         alpha word kept as a
  //   Dst := A' + Dst - ((a * Dst + bias) shr 8)
  // Pixels are processed two at a time. The x86 code handles an odd pixel
  // first and then walks the pairs with a negative index; the x64 code walks
  // the pairs first and handles a trailing odd pixel last.
  // On FPC x64 only, a pair whose two alphas are both 0 is skipped and a pair
  // whose two alphas are both $FF is copied without blending.
  // bias_ptr is declared outside this routine and is read with MOVDQA.
  575. procedure BlendLine_SSE2(Src, Dst: PColor32; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  576. {$IFDEF FPC}
  577. const
  578. COpaque: QWORD = $FF000000FF000000;
  579. {$ENDIF}
  580. asm
  581. {$IFDEF TARGET_X86}
  582. // EAX <- Src
  583. // EDX <- Dst
  584. // ECX <- Count
  585. TEST ECX,ECX
  586. JLE @3 // Count <= 0: nothing to do
  587. PUSH EBX
  588. PXOR XMM4,XMM4 // XMM4 <- zero, used for unpacking and packing
  589. MOV EBX,[bias_ptr]
  590. MOVDQA XMM5,[EBX] // XMM5 <- bias, kept for the whole line
  591. POP EBX
  592. TEST ECX, 1
  593. JZ @2 // even count: go straight to the pair loop
  // odd count: blend the first pixel on its own
  594. MOVD XMM0,[EAX]
  595. MOVD XMM2,[EDX]
  596. PUNPCKLBW XMM0,XMM4
  597. PUNPCKLBW XMM2,XMM4
  598. PSHUFLW XMM1,XMM0,$FF // XMM1 <- source alpha in the four low words
  599. // premultiply source pixel by its alpha
  600. MOVQ XMM3,XMM1
  601. PSRLQ XMM3,16 // clear the alpha lane of the multiplier
  602. PMULLW XMM0,XMM3
  603. PADDW XMM0,XMM5
  604. PSRLW XMM0,8
  605. PSLLQ XMM3,48 // move a into the alpha lane
  606. POR XMM0,XMM3 // premultiplied pixel keeps alpha = a
  607. // C' = A' + B' - aB'
  608. PMULLW XMM1,XMM2
  609. PADDW XMM1,XMM5
  610. PSRLW XMM1,8
  611. PADDW XMM0,XMM2
  612. PSUBW XMM0,XMM1
  613. PACKUSWB XMM0,XMM4
  614. MOVD [EDX], XMM0
  615. @2:
  616. LEA EAX, [EAX + ECX * 4] // EAX <- one past the last source pixel
  617. LEA EDX, [EDX + ECX * 4] // EDX <- one past the last destination pixel
  618. SHR ECX,1 // number of pixel pairs
  619. JZ @3
  620. NEG ECX // negative pair index counting up to zero
  621. @1:
  622. MOVQ XMM0,[EAX + ECX * 8].QWORD
  623. MOVQ XMM2,[EDX + ECX * 8].QWORD
  624. PUNPCKLBW XMM0,XMM4
  625. PUNPCKLBW XMM2,XMM4
  626. PSHUFLW XMM1,XMM0,$FF // alpha of the first pixel -> low four words
  627. PSHUFHW XMM1,XMM1,$FF // alpha of the second pixel -> high four words
  628. // premultiply source pixel by its alpha
  629. MOVDQA XMM3,XMM1
  630. PSRLQ XMM3,16
  631. PMULLW XMM0,XMM3
  632. PADDW XMM0,XMM5
  633. PSRLW XMM0,8
  634. PSLLQ XMM3,48
  635. POR XMM0,XMM3
  636. // C' = A' + B' - aB'
  637. PMULLW XMM1,XMM2
  638. PADDW XMM1,XMM5
  639. PSRLW XMM1,8
  640. PADDW XMM0,XMM2
  641. PSUBW XMM0,XMM1
  642. PACKUSWB XMM0,XMM4
  643. MOVQ [EDX + ECX * 8].QWORD,XMM0
  644. ADD ECX,1
  645. JS @1 // loop while the pair index is still negative
  646. @3:
  647. {$ENDIF}
  648. {$IFDEF TARGET_X64}
  // RCX <- Src
  // RDX <- Dst
  // R8D <- Count
  649. TEST R8D,R8D
  650. JLE @3 // Count <= 0: nothing to do
  651. PXOR XMM4,XMM4
  652. {$IFNDEF FPC}
  653. MOV RAX,bias_ptr
  654. {$ELSE}
  655. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  656. {$ENDIF}
  657. MOVDQA XMM5,[RAX] // XMM5 <- bias, kept for the whole line
  658. MOV R9D, R8D
  659. SHR R9D, 1 // R9D <- number of pixel pairs
  660. TEST R9D, R9D
  661. JZ @2
  662. @1:
  663. MOVQ XMM0,[RCX].QWORD
  664. MOVQ RAX,XMM0
  665. {$IFDEF FPC}
  666. AND RAX,[RIP+COpaque] // keep only the two alpha bytes
  667. JZ @1b // both pixels fully transparent: skip the pair
  668. CMP RAX,[RIP+COpaque]
  669. JZ @1a // both pixels fully opaque: store the source pair as is
  670. {$ENDIF}
  671. MOVQ XMM2,[RDX].QWORD
  672. PUNPCKLBW XMM0,XMM4
  673. PUNPCKLBW XMM2,XMM4
  674. PSHUFLW XMM1,XMM0,$FF
  675. PSHUFHW XMM1,XMM1,$FF
  676. // premultiply source pixel by its alpha
  677. MOVDQA XMM3,XMM1
  678. PSRLQ XMM3,16
  679. PMULLW XMM0,XMM3
  680. PADDW XMM0,XMM5
  681. PSRLW XMM0,8
  682. PSLLQ XMM3,48
  683. POR XMM0,XMM3
  684. // C' = A' + B' - aB'
  685. PMULLW XMM1,XMM2
  686. PADDW XMM1,XMM5
  687. PSRLW XMM1,8
  688. PADDW XMM0,XMM2
  689. PSUBW XMM0,XMM1
  690. PACKUSWB XMM0,XMM4
  691. @1a: MOVQ [RDX].QWORD,XMM0
  692. @1b: ADD RCX,8
  693. ADD RDX,8
  694. SUB R9D,1
  695. JNZ @1
  696. @2:
  697. AND R8D, 1
  698. JZ @3 // even count: no trailing pixel
  // odd count: blend the last pixel on its own
  699. MOVD XMM0,[RCX]
  700. MOVD XMM2,[RDX]
  701. PUNPCKLBW XMM0,XMM4
  702. PUNPCKLBW XMM2,XMM4
  703. PSHUFLW XMM1,XMM0,$FF
  704. // premultiply source pixel by its alpha
  705. MOVQ XMM3,XMM1
  706. PSRLQ XMM3,16
  707. PMULLW XMM0,XMM3
  708. PADDW XMM0,XMM5
  709. PSRLW XMM0,8
  710. PSLLQ XMM3,48
  711. POR XMM0,XMM3
  712. // C' = A' + B' - aB'
  713. PMULLW XMM1,XMM2
  714. PADDW XMM1,XMM5
  715. PSRLW XMM1,8
  716. PADDW XMM0,XMM2
  717. PSUBW XMM0,XMM1
  718. PACKUSWB XMM0,XMM4
  719. MOVD [RDX], XMM0
  720. @3:
  721. {$ENDIF}
  722. end;
  // Blends Count pixels from Src onto Dst in place, with every source alpha
  // scaled by the master alpha M. Returns immediately when Count <= 0 (and, on
  // x64, when M = 0).
  // Per pixel the effective weight is W := ((Fa + 1) * M) shr 8; pixels with
  // Fa = 0 or W = 0 leave the destination untouched, otherwise per channel
  //   Dst := ((Src - Dst) * alpha_table[W] + (Dst shl 8) + bias) shr 8.
  // The Count guard uses JLE rather than JS: with JS a Count of zero entered
  // the loop, DEC produced -1 and JNZ kept looping until the counter wrapped,
  // reading and writing far beyond both buffers.
  // alpha_ptr and bias_ptr are declared outside this routine.
  723. procedure BlendLineEx_SSE2(Src, Dst: PColor32; Count: Integer; M: Cardinal); {$IFDEF FPC} assembler; {$IFDEF TARGET_X64}nostackframe;{$ENDIF} {$ENDIF}
  724. asm
  725. {$IFDEF TARGET_X86}
  726. // EAX <- Src
  727. // EDX <- Dst
  728. // ECX <- Count
  729. // test the counter for zero or negativity
  730. TEST ECX,ECX
  731. JLE @4 // Count <= 0: nothing to do (taken before any register is pushed)
  732. PUSH ESI
  733. PUSH EDI
  734. PUSH EBX
  735. MOV ESI,EAX // ESI <- Src
  736. MOV EDI,EDX // EDI <- Dst
  737. MOV EDX,M // EDX <- Master Alpha
  738. // loop start
  739. @1: MOV EAX,[ESI]
  740. TEST EAX,$FF000000
  741. JZ @3 // complete transparency, proceed to next point
  742. MOV EBX,EAX
  743. SHR EBX,24 // EBX <- Fa
  744. INC EBX // 255:256 range bias
  745. IMUL EBX,EDX
  746. SHR EBX,8 // EBX <- W
  747. JZ @3 // complete transparency, proceed to next point
  748. // blend
  749. PXOR XMM0,XMM0
  750. MOVD XMM1,EAX
  751. SHL EBX,4 // W * 16: byte offset into the alpha table
  752. MOVD XMM2,[EDI]
  753. PUNPCKLBW XMM1,XMM0
  754. PUNPCKLBW XMM2,XMM0
  755. ADD EBX,alpha_ptr
  756. PSUBW XMM1,XMM2 // Src - Dst
  757. PMULLW XMM1,[EBX] // * table entry for W
  758. PSLLW XMM2,8 // Dst shl 8
  759. MOV EBX,bias_ptr
  760. PADDW XMM2,[EBX] // add bias
  761. PADDW XMM1,XMM2
  762. PSRLW XMM1,8
  763. PACKUSWB XMM1,XMM0
  764. MOVD EAX,XMM1
  765. @2: MOV [EDI],EAX
  766. @3: ADD ESI,4
  767. ADD EDI,4
  768. // loop end
  769. DEC ECX
  770. JNZ @1
  771. POP EBX
  772. POP EDI
  773. POP ESI
  774. @4:
  775. {$ENDIF}
  776. {$IFDEF TARGET_X64}
  777. // ECX <- Src
  778. // EDX <- Dst
  779. // R8D <- Count
  780. // R9D <- M
  781. // test the counter for zero or negativity
  782. TEST R8D,R8D
  783. JLE @4 // Count <= 0: nothing to do
  784. TEST R9D,R9D
  785. JZ @4 // M = 0: every pixel would be fully transparent
  786. MOV R10,RCX // R10 <- Src
  787. // loop start
  788. @1: MOV ECX,[R10]
  789. TEST ECX,$FF000000
  790. JZ @3 // complete transparency, proceed to next point
  791. MOV EAX,ECX
  792. SHR EAX,24 // EAX <- Fa
  793. INC EAX // 255:256 range bias
  794. IMUL EAX,R9D
  795. SHR EAX,8 // EAX <- W
  796. JZ @3 // complete transparency, proceed to next point
  797. // blend
  798. PXOR XMM0,XMM0
  799. MOVD XMM1,ECX
  800. SHL EAX,4 // W * 16: byte offset into the alpha table
  801. MOVD XMM2,[RDX]
  802. PUNPCKLBW XMM1,XMM0
  803. PUNPCKLBW XMM2,XMM0
  804. {$IFNDEF FPC}
  805. ADD RAX,alpha_ptr
  806. {$ELSE}
  807. ADD RAX,[RIP+alpha_ptr]
  808. {$ENDIF}
  809. PSUBW XMM1,XMM2 // Src - Dst
  810. PMULLW XMM1,[RAX] // * table entry for W
  811. PSLLW XMM2,8 // Dst shl 8
  812. {$IFNDEF FPC}
  813. MOV RAX,bias_ptr
  814. {$ELSE}
  815. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  816. {$ENDIF}
  817. PADDW XMM2,[RAX] // add bias
  818. PADDW XMM1,XMM2
  819. PSRLW XMM1,8
  820. PACKUSWB XMM1,XMM0
  821. MOVD ECX,XMM1
  822. @2: MOV [RDX],ECX
  823. @3: ADD R10,4
  824. ADD RDX,4
  825. // loop end
  826. DEC R8D
  827. JNZ @1
  828. @4:
  829. {$ENDIF}
  830. end;
  // Linearly combines the colors X and Y with weight W applied to X and
  // returns the result: per channel
  //   Result := ((X - Y) * alpha_table[W] + (Y shl 8) + bias) shr 8.
  // W is used directly as a table index (W shl 4 bytes from alpha_ptr); the
  // routine does not range-check it, so callers must keep it in [0..255] as
  // the in-code comments state.
  // alpha_ptr and bias_ptr are declared outside this routine.
  831. function CombineReg_SSE2(X, Y: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  832. asm
  833. {$IFDEF TARGET_X86}
  834. // EAX - Color X
  835. // EDX - Color Y
  836. // ECX - Weight of X [0..255]
  837. // Result := W * (X - Y) + Y
  838. MOVD XMM1,EAX
  839. PXOR XMM0,XMM0 // XMM0 <- zero, used for unpacking and packing
  840. SHL ECX,4 // W * 16: byte offset into the alpha table
  841. MOVD XMM2,EDX
  842. PUNPCKLBW XMM1,XMM0 // X bytes -> words
  843. PUNPCKLBW XMM2,XMM0 // Y bytes -> words
  844. ADD ECX,alpha_ptr
  845. PSUBW XMM1,XMM2 // X - Y
  846. PMULLW XMM1,[ECX] // * table entry for W
  847. PSLLW XMM2,8 // Y shl 8
  848. MOV ECX,bias_ptr
  849. PADDW XMM2,[ECX] // add bias
  850. PADDW XMM1,XMM2
  851. PSRLW XMM1,8
  852. PACKUSWB XMM1,XMM0
  853. MOVD EAX,XMM1
  854. {$ENDIF}
  855. {$IFDEF TARGET_X64}
  856. // ECX - Color X
  857. // EDX - Color Y
  858. // R8D - Weight of X [0..255]
  859. // Result := W * (X - Y) + Y
  860. MOVD XMM1,ECX
  861. PXOR XMM0,XMM0
  862. SHL R8D,4 // W * 16: byte offset into the alpha table
  863. MOVD XMM2,EDX
  864. PUNPCKLBW XMM1,XMM0
  865. PUNPCKLBW XMM2,XMM0
  866. {$IFNDEF FPC}
  867. ADD R8,alpha_ptr
  868. {$ELSE}
  869. ADD R8,[RIP+alpha_ptr]
  870. {$ENDIF}
  871. PSUBW XMM1,XMM2 // X - Y
  872. PMULLW XMM1,[R8] // * table entry for W
  873. PSLLW XMM2,8 // Y shl 8
  874. {$IFNDEF FPC}
  875. MOV R8,bias_ptr
  876. {$ELSE}
  877. MOV R8,[RIP+bias_ptr]
  878. {$ENDIF}
  879. PADDW XMM2,[R8] // add bias
  880. PADDW XMM1,XMM2
  881. PSRLW XMM1,8
  882. PACKUSWB XMM1,XMM0
  883. MOVD EAX,XMM1
  884. {$ENDIF}
  885. end;
  // In-place variant of CombineReg_SSE2: combines the color F into the pixel B
  // with weight W applied to F.
  //   W = 0   : B is left untouched.
  //   W = $FF : F is stored over B unchanged.
  //   otherwise: per channel B := ((F - B) * alpha_table[W] + (B shl 8) + bias) shr 8.
  // W is used directly as a table index and is not range-checked beyond the
  // two special cases. alpha_ptr and bias_ptr are declared outside this routine.
  886. procedure CombineMem_SSE2(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  887. asm
  888. {$IFDEF TARGET_X86}
  889. // EAX - Color X
  890. // [EDX] - Color Y
  891. // ECX - Weight of X [0..255]
  892. // Result := W * (X - Y) + Y
  // NOTE(review): the JCXZ mnemonic names the 16-bit CX register; that matches
  // a test of ECX only while W <= $FFFF -- confirm how the assembler encodes it.
  893. JCXZ @1 // W = 0: leave [EDX] unchanged
  894. CMP ECX,$FF
  895. JZ @2 // W = $FF: plain store
  896. MOVD XMM1,EAX
  897. PXOR XMM0,XMM0
  898. SHL ECX,4 // W * 16: byte offset into the alpha table
  899. MOVD XMM2,[EDX]
  900. PUNPCKLBW XMM1,XMM0
  901. PUNPCKLBW XMM2,XMM0
  902. ADD ECX,alpha_ptr
  903. PSUBW XMM1,XMM2 // X - Y
  904. PMULLW XMM1,[ECX] // * table entry for W
  905. PSLLW XMM2,8 // Y shl 8
  906. MOV ECX,bias_ptr
  907. PADDW XMM2,[ECX] // add bias
  908. PADDW XMM1,XMM2
  909. PSRLW XMM1,8
  910. PACKUSWB XMM1,XMM0
  911. MOVD [EDX],XMM1
  912. @1: RET
  913. @2: MOV [EDX],EAX // full weight: X replaces Y
  914. {$ENDIF}
  915. {$IFDEF TARGET_X64}
  916. // ECX - Color X
  917. // [RDX] - Color Y
  918. // R8D - Weight of X [0..255]
  919. // Result := W * (X - Y) + Y
  920. TEST R8D,R8D // Set flags for R8
  921. JZ @1 // W = 0: leave [RDX] unchanged
  922. CMP R8D,$FF
  923. JZ @2 // W = $FF: plain store
  924. MOVD XMM1,ECX
  925. PXOR XMM0,XMM0
  926. SHL R8D,4 // W * 16: byte offset into the alpha table
  927. MOVD XMM2,[RDX]
  928. PUNPCKLBW XMM1,XMM0
  929. PUNPCKLBW XMM2,XMM0
  930. {$IFNDEF FPC}
  931. ADD R8,alpha_ptr
  932. {$ELSE}
  933. ADD R8,[RIP+alpha_ptr]
  934. {$ENDIF}
  935. PSUBW XMM1,XMM2 // X - Y
  936. PMULLW XMM1,[R8] // * table entry for W
  937. PSLLW XMM2,8 // Y shl 8
  938. {$IFNDEF FPC}
  939. MOV RAX,bias_ptr
  940. {$ELSE}
  941. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  942. {$ENDIF}
  943. PADDW XMM2,[RAX] // add bias
  944. PADDW XMM1,XMM2
  945. PSRLW XMM1,8
  946. PACKUSWB XMM1,XMM0
  947. MOVD [RDX],XMM1
  948. @1: RET
  949. @2: MOV [RDX],ECX // full weight: X replaces Y
  950. {$ENDIF}
  951. end;
  // Blends Count pixels from Src into Dst in place, with W as the weight of the
  // source. Per the formula quoted in the body, each destination pixel becomes
  // W * (Src - Dst) + Dst per 8-bit channel.
  // Count = 0 and W = 0 return without writing Dst; W = $FF hands the whole run
  // to Move as a Count * 4 byte copy. For other weights the alpha and bias table
  // entries are loaded once into XMM3/XMM4 and reused for every pixel.
  952. procedure CombineLine_SSE2(Src, Dst: PColor32; Count: Integer; W: Cardinal); {$IFDEF FPC} assembler; {$IFDEF TARGET_X64}nostackframe;{$ENDIF} {$ENDIF}
  953. asm
  954. {$IFDEF TARGET_X86}
  955. // EAX <- Src
  956. // EDX <- Dst
  957. // ECX <- Count
  // W is read through its parameter name (it is not in a register on this target)
  958. // Result := W * (X - Y) + Y
  959. TEST ECX,ECX
  960. JZ @3 // Count = 0 => nothing to do
  961. PUSH EBX // EBX is scratch below and is popped again on every exit that follows
  962. MOV EBX,W
  963. TEST EBX,EBX
  964. JZ @2 // W = 0 => Dst stays as it is
  965. CMP EBX,$FF
  966. JZ @4 // W = $FF => plain copy
  967. SHL EBX,4 // EBX <- W * 16, byte offset of a 16-byte table entry
  968. ADD EBX,alpha_ptr
  969. MOVQ XMM3,[EBX] // XMM3 <- low four words of the alpha entry (one word per channel)
  970. MOV EBX,bias_ptr
  971. MOVQ XMM4,[EBX] // XMM4 <- low four words of the bias entry
  972. PXOR XMM0,XMM0 // XMM0 <- 0, the zero operand for the unpack/pack steps
  973. @1: MOVD XMM1,[EAX] // loop head: XMM1 <- source pixel
  974. MOVD XMM2,[EDX] // XMM2 <- destination pixel
  975. PUNPCKLBW XMM1,XMM0 // widen both pixels to 16-bit channels
  976. PUNPCKLBW XMM2,XMM0
  977. PSUBW XMM1,XMM2 // XMM1 <- Src - Dst
  978. PMULLW XMM1,XMM3 // multiply by the alpha entry
  979. PSLLW XMM2,8 // XMM2 <- Dst * 256
  980. PADDW XMM2,XMM4 // add the bias entry
  981. PADDW XMM1,XMM2
  982. PSRLW XMM1,8 // divide by 256
  983. PACKUSWB XMM1,XMM0 // narrow back to bytes with unsigned saturation
  984. MOVD [EDX],XMM1 // store the blended pixel
  985. ADD EAX,4 // advance both pointers by one 32-bit pixel
  986. ADD EDX,4
  987. DEC ECX
  988. JNZ @1 // repeat until Count pixels are done
  989. @2: POP EBX
  990. POP EBP // NOTE(review): no PUSH EBP appears in this code, so this presumably pairs with a compiler-generated prologue - confirm
  991. @3: RET $0004 // return and release 4 bytes of stack arguments (presumably W). NOTE(review): the Count = 0 jump lands here without the POP EBP above - confirm the stack is balanced on that path
  992. @4: SHL ECX,2 // Count pixels -> Count * 4 bytes
  993. CALL Move // EAX, EDX and ECX still hold Src, Dst and the byte count
  994. POP EBX // restore EBX, then run off the end of the asm block
  995. {$ENDIF}
  996. {$IFDEF TARGET_X64}
  997. // RCX <- Src
  998. // RDX <- Dst
  999. // R8D <- Count
  // R9D <- W
  1000. // Result := W * (X - Y) + Y
  1001. TEST R8D,R8D
  1002. JZ @2 // Count = 0 => nothing to do
  1003. TEST R9D,R9D
  1004. JZ @2 // W = 0 => Dst stays as it is
  1005. CMP R9D,$FF
  1006. JZ @3 // W = $FF => plain copy
  1007. SHL R9D,4 // R9 <- W * 16 (a 32-bit write clears the upper half of R9)
  1008. {$IFNDEF FPC}
  1009. ADD R9,alpha_ptr
  1010. {$ELSE}
  1011. ADD R9,[RIP+alpha_ptr]
  1012. {$ENDIF}
  1013. MOVQ XMM3,[R9] // XMM3 <- low four words of the alpha entry
  1014. {$IFNDEF FPC}
  1015. MOV R9,bias_ptr
  1016. {$ELSE}
  1017. MOV R9,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  1018. {$ENDIF}
  1019. MOVQ XMM4,[R9] // XMM4 <- low four words of the bias entry
  1020. PXOR XMM0,XMM0 // XMM0 <- 0
  1021. @1: MOVD XMM1,[RCX] // loop head: XMM1 <- source pixel
  1022. MOVD XMM2,[RDX] // XMM2 <- destination pixel
  1023. PUNPCKLBW XMM1,XMM0
  1024. PUNPCKLBW XMM2,XMM0
  1025. PSUBW XMM1,XMM2 // XMM1 <- Src - Dst
  1026. PMULLW XMM1,XMM3 // multiply by the alpha entry
  1027. PSLLW XMM2,8 // XMM2 <- Dst * 256
  1028. PADDW XMM2,XMM4 // add the bias entry
  1029. PADDW XMM1,XMM2
  1030. PSRLW XMM1,8 // divide by 256
  1031. PACKUSWB XMM1,XMM0
  1032. MOVD [RDX],XMM1 // store the blended pixel
  1033. ADD RCX,4 // advance both pointers by one 32-bit pixel
  1034. ADD RDX,4
  1035. DEC R8D
  1036. JNZ @1
  1037. @2: RET
  1038. @3: SHL R8D,2 // Count pixels -> Count * 4 bytes
  1039. CALL Move // RCX, RDX and R8 hold Src, Dst and the byte count. NOTE(review): no shadow space or stack alignment is set up before this CALL - confirm that is acceptable for the target ABI
  1040. {$ENDIF}
  1041. end;
  // Merges foreground F over background B, where both colours carry their own
  // alpha in the top byte, and returns the merged colour in EAX. The formula is
  // given in the block comment inside the body.
  // Fast paths: a fully transparent F returns B; a fully opaque F or a fully
  // transparent B returns F.
  // NOTE(review): the x64 path uses XMM7, which is callee-saved under the
  // Windows x64 convention, and no save/restore is visible here - confirm the
  // compiler handles it or that it does not matter for the supported targets.
  1042. function MergeReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1043. asm
  1044. { This is an implementation of the merge formula, as described
  1045. in a paper by Bruce Wallace in 1981. Merging is associative,
  1046. that is, A over (B over C) = (A over B) over C. The formula is,
  1047. Ra = Fa + Ba * (1 - Fa)
  1048. Rc = (Fa * (Fc - Bc * Ba) + Bc * Ba) / Ra
  1049. where
  1050. Rc is the resultant color,
  1051. Ra is the resultant alpha,
  1052. Fc is the foreground color,
  1053. Fa is the foreground alpha,
  1054. Bc is the background color,
  1055. Ba is the background alpha.
  1056. Implementation:
  1057. Ra := 1 - (1 - Fa) * (1 - Ba);
  1058. Wa := Fa / Ra;
  1059. Rc := Bc + Wa * (Fc - Bc);
  1060. (1 - Fa) * (1 - Ba) = 1 - Fa - Ba + Fa * Ba = (1 - Ra)
  1061. }
  1062. {$IFDEF TARGET_X86}
  // EAX <- F (foreground), EDX <- B (background); result is returned in EAX
  1063. TEST EAX,$FF000000 // foreground completely transparent =>
  1064. JZ @1 // result = background
  1065. CMP EAX,$FF000000 // foreground completely opaque =>
  1066. JNC @2 // result = foreground
  1067. TEST EDX,$FF000000 // background completely transparent =>
  1068. JZ @2 // result = foreground
  1069. PXOR XMM7,XMM7 // XMM7 <- 00
  1070. MOVD XMM0,EAX // XMM0 <- Fa Fr Fg Fb
  1071. SHR EAX,24 // EAX <- Fa
  1072. ROR EDX,24 // EDX <- Br Bg Bb Ba (alpha rotated into DL)
  1073. MOVZX ECX,DL // ECX <- Ba
  1074. PUNPCKLBW XMM0,XMM7 // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
  1075. SUB EAX,$FF // EAX <- (Fa - 1)
  1076. XOR ECX,$FF // ECX <- (1 - Ba)
  1077. IMUL ECX,EAX // ECX <- (Fa - 1) * (1 - Ba) = Ra - 1
  1078. IMUL ECX,$8081 // ECX <- Xa 00 00 00
  1079. ADD ECX,$8081*$FF*$FF // offset by the scaled value of 1.0 ($FF * $FF) so the sum represents Ra
  1080. SHR ECX,15 // ECX <- Ra
  1081. MOV DL,CH // EDX <- Br Bg Bb Ra
  1082. ROR EDX,8 // EDX <- Ra Br Bg Bb
  1083. MOVD XMM1,EDX // XMM1 <- Ra Br Bg Bb
  1084. PUNPCKLBW XMM1,XMM7 // XMM1 <- 00 Ra 00 Br 00 Bg 00 Bb
  1085. SHL EAX,20 // EAX <- (Fa - $FF) shl 20
  1086. PSUBW XMM0,XMM1 // XMM0 <- ** Da ** Dr ** Dg ** Db
  1087. ADD EAX,$0FF01000 // adds back $FF shl 20 (undoing the SUB above) plus $1000: EAX <- Fa shl 20 + $1000
  1088. PSLLW XMM0,4 // scale the differences by 16 ahead of the high-word multiply
  1089. XOR EDX,EDX // EDX <- 00
  1090. DIV ECX // EAX <- Fa / Ra = Wa
  1091. MOVD XMM4,EAX // XMM4 <- Wa
  1092. PSHUFLW XMM4,XMM4,$C0 // XMM4 <- 00 00 ** Wa ** Wa ** Wa
  1093. PMULHW XMM0,XMM4 // XMM0 <- 00 00 ** Pr ** Pg ** Pb
  1094. PADDW XMM0,XMM1 // XMM0 <- 00 Ra 00 Rr 00 Rg 00 Rb
  1095. PACKUSWB XMM0,XMM7 // XMM0 <- Ra Rr Rg Rb
  1096. MOVD EAX,XMM0
  1097. RET
  1098. @1: MOV EAX,EDX // result = background
  1099. @2: // result = foreground: EAX still holds F; execution runs off the end of the asm block
  1100. {$ENDIF}
  1101. {$IFDEF TARGET_X64}
  // ECX <- F (foreground), EDX <- B (background); result is returned in EAX
  1102. TEST ECX,$FF000000 // foreground completely transparent =>
  1103. JZ @1 // result = background
  1104. MOV EAX,ECX // EAX <- F (whole colour: Fa Fr Fg Fb)
  1105. CMP EAX,$FF000000 // foreground completely opaque =>
  1106. JNC @2 // result = foreground
  1107. TEST EDX,$FF000000 // background completely transparent =>
  1108. JZ @2 // result = foreground
  1109. PXOR XMM7,XMM7 // XMM7 <- 00
  1110. MOVD XMM0,EAX // XMM0 <- Fa Fr Fg Fb
  1111. SHR EAX,24 // EAX <- Fa
  1112. ROR EDX,24 // EDX <- Br Bg Bb Ba (alpha rotated into DL)
  1113. MOVZX ECX,DL // ECX <- Ba
  1114. PUNPCKLBW XMM0,XMM7 // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
  1115. SUB EAX,$FF // EAX <- (Fa - 1)
  1116. XOR ECX,$FF // ECX <- (1 - Ba)
  1117. IMUL ECX,EAX // ECX <- (Fa - 1) * (1 - Ba) = Ra - 1
  1118. IMUL ECX,$8081 // ECX <- Xa 00 00 00
  1119. ADD ECX,$8081*$FF*$FF // offset by the scaled value of 1.0 ($FF * $FF) so the sum represents Ra
  1120. SHR ECX,15 // ECX <- Ra
  1121. MOV DL,CH // EDX <- Br Bg Bb Ra
  1122. ROR EDX,8 // EDX <- Ra Br Bg Bb
  1123. MOVD XMM1,EDX // XMM1 <- Ra Br Bg Bb
  1124. PUNPCKLBW XMM1,XMM7 // XMM1 <- 00 Ra 00 Br 00 Bg 00 Bb
  1125. SHL EAX,20 // EAX <- (Fa - $FF) shl 20
  1126. PSUBW XMM0,XMM1 // XMM0 <- ** Da ** Dr ** Dg ** Db
  1127. ADD EAX,$0FF01000 // adds back $FF shl 20 (undoing the SUB above) plus $1000: EAX <- Fa shl 20 + $1000
  1128. PSLLW XMM0,4 // scale the differences by 16 ahead of the high-word multiply
  1129. XOR EDX,EDX // EDX <- 00
  1130. DIV ECX // EAX <- Fa / Ra = Wa
  1131. MOVD XMM4,EAX // XMM4 <- Wa
  1132. PSHUFLW XMM4,XMM4,$C0 // XMM4 <- 00 00 ** Wa ** Wa ** Wa
  1133. PMULHW XMM0,XMM4 // XMM0 <- 00 00 ** Pr ** Pg ** Pb
  1134. PADDW XMM0,XMM1 // XMM0 <- 00 Ra 00 Rr 00 Rg 00 Rb
  1135. PACKUSWB XMM0,XMM7 // XMM0 <- Ra Rr Rg Rb
  1136. MOVD EAX,XMM0
  1137. RET
  1138. @1: MOV EAX,EDX // result = background
  1139. @2: // result = foreground: EAX holds F (copied from ECX above); execution runs off the end of the asm block
  1140. {$ENDIF}
  1141. end;
  // Deliberately empty routine: the asm body contains no instructions.
  // RegisterBindingFunctions binds it to FID_EMMS for the SSE2 function set.
  // Presumably no EMMS is needed because the SSE2 routines here work only in
  // XMM registers and never touch the MMX/x87 state - confirm against the
  // other implementations of FID_EMMS.
  1142. procedure EMMS_SSE2; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1143. asm
  1144. end;
  // Lightens or darkens C and returns the result in EAX.
  // For Amount >= 0, Amount is added to each of the three low bytes of C with
  // unsigned saturation; for Amount < 0, -Amount is subtracted with unsigned
  // saturation. The multiply by $010101 copies the amount into the three low
  // bytes only, so the top byte of C receives a zero operand.
  // NOTE(review): assumes |Amount| <= 255; larger values would carry between
  // bytes in the multiply - confirm against callers.
  1145. function LightenReg_SSE2(C: TColor32; Amount: Integer): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1146. asm
  1147. {$IFDEF TARGET_X86}
  // EAX <- C, EDX <- Amount
  1148. MOVD XMM0,EAX // XMM0 <- C
  1149. TEST EDX,EDX
  1150. JL @1 // Amount < 0 => darken
  1151. IMUL EDX,$010101 // EDX <- 00 Am Am Am
  1152. MOVD XMM1,EDX
  1153. PADDUSB XMM0,XMM1 // per-byte add, saturating at $FF
  1154. MOVD EAX,XMM0
  1155. RET
  1156. @1: NEG EDX // EDX <- -Amount (now positive)
  1157. IMUL EDX,$010101 // EDX <- 00 Am Am Am
  1158. MOVD XMM1,EDX
  1159. PSUBUSB XMM0,XMM1 // per-byte subtract, saturating at 0
  1160. MOVD EAX,XMM0
  1161. {$ENDIF}
  1162. {$IFDEF TARGET_X64}
  // ECX <- C, EDX <- Amount
  1163. MOVD XMM0,ECX // XMM0 <- C
  1164. TEST EDX,EDX
  1165. JL @1 // Amount < 0 => darken
  1166. IMUL EDX,$010101 // EDX <- 00 Am Am Am
  1167. MOVD XMM1,EDX
  1168. PADDUSB XMM0,XMM1 // per-byte add, saturating at $FF
  1169. MOVD EAX,XMM0
  1170. RET
  1171. @1: NEG EDX // EDX <- -Amount (now positive)
  1172. IMUL EDX,$010101 // EDX <- 00 Am Am Am
  1173. MOVD XMM1,EDX
  1174. PSUBUSB XMM0,XMM1 // per-byte subtract, saturating at 0
  1175. MOVD EAX,XMM0
  1176. {$ENDIF}
  1177. end;
  1178. { SSE2 Color algebra}
  // Returns the per-byte sum of C1 and C2 over all four bytes, each byte
  // saturating at $FF instead of wrapping.
  1179. function ColorAdd_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1180. asm
  1181. {$IFDEF TARGET_X86}
  // EAX <- C1, EDX <- C2
  1182. MOVD XMM0,EAX
  1183. MOVD XMM1,EDX
  1184. PADDUSB XMM0,XMM1 // unsigned saturating byte add
  1185. MOVD EAX,XMM0 // result in EAX
  1186. {$ENDIF}
  1187. {$IFDEF TARGET_X64}
  // ECX <- C1, EDX <- C2
  1188. MOVD XMM0,ECX
  1189. MOVD XMM1,EDX
  1190. PADDUSB XMM0,XMM1 // unsigned saturating byte add
  1191. MOVD EAX,XMM0 // result in EAX
  1192. {$ENDIF}
  1193. end;
  // Returns C1 - C2 computed per byte over all four bytes, each byte
  // saturating at 0 instead of wrapping.
  1194. function ColorSub_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1195. asm
  1196. {$IFDEF TARGET_X86}
  // EAX <- C1, EDX <- C2
  1197. MOVD XMM0,EAX
  1198. MOVD XMM1,EDX
  1199. PSUBUSB XMM0,XMM1 // unsigned saturating byte subtract
  1200. MOVD EAX,XMM0 // result in EAX
  1201. {$ENDIF}
  1202. {$IFDEF TARGET_X64}
  // ECX <- C1, EDX <- C2
  1203. MOVD XMM0,ECX
  1204. MOVD XMM1,EDX
  1205. PSUBUSB XMM0,XMM1 // unsigned saturating byte subtract
  1206. MOVD EAX,XMM0 // result in EAX
  1207. {$ENDIF}
  1208. end;
  // Returns the per-channel product of C1 and C2 scaled back to 8 bits:
  // each result byte is (C1 byte * C2 byte) shr 8, over all four bytes.
  1209. function ColorModulate_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1210. asm
  1211. {$IFDEF TARGET_X86}
  // EAX <- C1, EDX <- C2
  1212. PXOR XMM2,XMM2 // XMM2 <- 0, the zero operand for the unpack/pack steps
  1213. MOVD XMM0,EAX
  1214. PUNPCKLBW XMM0,XMM2 // widen the C1 channels to 16-bit words
  1215. MOVD XMM1,EDX
  1216. PUNPCKLBW XMM1,XMM2 // widen the C2 channels to 16-bit words
  1217. PMULLW XMM0,XMM1 // 16-bit product per channel
  1218. PSRLW XMM0,8 // divide by 256
  1219. PACKUSWB XMM0,XMM2 // narrow back to bytes
  1220. MOVD EAX,XMM0 // result in EAX
  1221. {$ENDIF}
  1222. {$IFDEF TARGET_X64}
  // ECX <- C1, EDX <- C2
  1223. PXOR XMM2,XMM2 // XMM2 <- 0
  1224. MOVD XMM0,ECX
  1225. PUNPCKLBW XMM0,XMM2 // widen the C1 channels to 16-bit words
  1226. MOVD XMM1,EDX
  1227. PUNPCKLBW XMM1,XMM2 // widen the C2 channels to 16-bit words
  1228. PMULLW XMM0,XMM1 // 16-bit product per channel
  1229. PSRLW XMM0,8 // divide by 256
  1230. PACKUSWB XMM0,XMM2 // narrow back to bytes
  1231. MOVD EAX,XMM0 // result in EAX
  1232. {$ENDIF}
  1233. end;
  // Returns, for each of the four bytes, the larger of the corresponding
  // bytes of C1 and C2 (unsigned comparison).
  1234. function ColorMax_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1235. asm
  1236. {$IFDEF TARGET_X86}
  // EAX <- C1, EDX <- C2
  1237. MOVD XMM0,EAX
  1238. MOVD XMM1,EDX
  1239. PMAXUB XMM0,XMM1 // per-byte unsigned maximum
  1240. MOVD EAX,XMM0 // result in EAX
  1241. {$ENDIF}
  1242. {$IFDEF TARGET_X64}
  // ECX <- C1, EDX <- C2
  1243. MOVD XMM0,ECX
  1244. MOVD XMM1,EDX
  1245. PMAXUB XMM0,XMM1 // per-byte unsigned maximum
  1246. MOVD EAX,XMM0 // result in EAX
  1247. {$ENDIF}
  1248. end;
  // Returns, for each of the four bytes, the smaller of the corresponding
  // bytes of C1 and C2 (unsigned comparison).
  1249. function ColorMin_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1250. asm
  1251. {$IFDEF TARGET_X86}
  // EAX <- C1, EDX <- C2
  1252. MOVD XMM0,EAX
  1253. MOVD XMM1,EDX
  1254. PMINUB XMM0,XMM1 // per-byte unsigned minimum
  1255. MOVD EAX,XMM0 // result in EAX
  1256. {$ENDIF}
  1257. {$IFDEF TARGET_X64}
  // ECX <- C1, EDX <- C2
  1258. MOVD XMM0,ECX
  1259. MOVD XMM1,EDX
  1260. PMINUB XMM0,XMM1 // per-byte unsigned minimum
  1261. MOVD EAX,XMM0 // result in EAX
  1262. {$ENDIF}
  1263. end;
  // Returns the per-byte absolute difference |C1 - C2| over all four bytes.
  // It computes both saturating differences (C1 - C2 and C2 - C1); for each
  // byte one of them is zero, so OR-ing them yields the absolute value.
  1264. function ColorDifference_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1265. asm
  1266. {$IFDEF TARGET_X86}
  // EAX <- C1, EDX <- C2
  1267. MOVD XMM0,EAX
  1268. MOVD XMM1,EDX
  1269. MOVQ XMM2,XMM0 // keep a copy of C1 for the reverse subtraction
  1270. PSUBUSB XMM0,XMM1 // XMM0 <- C1 - C2, saturating at 0
  1271. PSUBUSB XMM1,XMM2 // XMM1 <- C2 - C1, saturating at 0
  1272. POR XMM0,XMM1 // combine: whichever difference is non-zero
  1273. MOVD EAX,XMM0 // result in EAX
  1274. {$ENDIF}
  1275. {$IFDEF TARGET_X64}
  // ECX <- C1, EDX <- C2
  1276. MOVD XMM0,ECX
  1277. MOVD XMM1,EDX
  1278. MOVQ XMM2,XMM0 // keep a copy of C1 for the reverse subtraction
  1279. PSUBUSB XMM0,XMM1 // XMM0 <- C1 - C2, saturating at 0
  1280. PSUBUSB XMM1,XMM2 // XMM1 <- C2 - C1, saturating at 0
  1281. POR XMM0,XMM1 // combine: whichever difference is non-zero
  1282. MOVD EAX,XMM0 // result in EAX
  1283. {$ENDIF}
  1284. end;
  // Returns, per channel over all four bytes,
  //   C1 + C2 - ((C1 * C2) shr 7)
  // The subtraction saturates at 0 and the final pack saturates at $FF.
  1285. function ColorExclusion_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1286. asm
  1287. {$IFDEF TARGET_X86}
  // EAX <- C1, EDX <- C2
  1288. PXOR XMM2,XMM2 // XMM2 <- 0, the zero operand for the unpack/pack steps
  1289. MOVD XMM0,EAX
  1290. PUNPCKLBW XMM0,XMM2 // widen the C1 channels to 16-bit words
  1291. MOVD XMM1,EDX
  1292. PUNPCKLBW XMM1,XMM2 // widen the C2 channels to 16-bit words
  1293. MOVQ XMM3,XMM0 // keep a copy of the widened C1 for the product
  1294. PADDW XMM0,XMM1 // XMM0 <- C1 + C2
  1295. PMULLW XMM1,XMM3 // XMM1 <- C1 * C2
  1296. PSRLW XMM1,7 // XMM1 <- (C1 * C2) shr 7
  1297. PSUBUSW XMM0,XMM1 // subtract with unsigned saturation at 0
  1298. PACKUSWB XMM0,XMM2 // narrow to bytes, saturating at $FF
  1299. MOVD EAX,XMM0 // result in EAX
  1300. {$ENDIF}
  1301. {$IFDEF TARGET_X64}
  // ECX <- C1, EDX <- C2
  1302. PXOR XMM2,XMM2 // XMM2 <- 0
  1303. MOVD XMM0,ECX
  1304. PUNPCKLBW XMM0,XMM2 // widen the C1 channels to 16-bit words
  1305. MOVD XMM1,EDX
  1306. PUNPCKLBW XMM1,XMM2 // widen the C2 channels to 16-bit words
  1307. MOVQ XMM3,XMM0 // keep a copy of the widened C1 for the product
  1308. PADDW XMM0,XMM1 // XMM0 <- C1 + C2
  1309. PMULLW XMM1,XMM3 // XMM1 <- C1 * C2
  1310. PSRLW XMM1,7 // XMM1 <- (C1 * C2) shr 7
  1311. PSUBUSW XMM0,XMM1 // subtract with unsigned saturation at 0
  1312. PACKUSWB XMM0,XMM2 // narrow to bytes, saturating at $FF
  1313. MOVD EAX,XMM0 // result in EAX
  1314. {$ENDIF}
  1315. end;
  // Scales every 8-bit channel of C by the weight W and returns the result in
  // EAX: each channel is multiplied by the alpha table entry selected by W
  // (byte offset W * 16 from alpha_ptr, declared outside this routine) and then
  // shifted right by 8.
  // Presumably the table entry holds W in every word lane, giving
  // (channel * W) shr 8 - confirm where the table is built.
  1316. function ColorScale_SSE2(C: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1317. asm
  1318. {$IFDEF TARGET_X86}
  // EAX <- C, EDX <- W
  1319. PXOR XMM2,XMM2 // XMM2 <- 0, the zero operand for the unpack/pack steps
  1320. SHL EDX,4 // EDX <- W * 16, byte offset of a 16-byte table entry
  1321. MOVD XMM0,EAX
  1322. PUNPCKLBW XMM0,XMM2 // widen the channels of C to 16-bit words
  1323. ADD EDX,alpha_ptr // EDX <- address of the alpha table entry for W
  1324. PMULLW XMM0,[EDX] // multiply each channel by the table entry
  1325. PSRLW XMM0,8 // divide by 256
  1326. PACKUSWB XMM0,XMM2 // narrow back to bytes
  1327. MOVD EAX,XMM0 // result in EAX
  1328. {$ENDIF}
  1329. {$IFDEF TARGET_X64}
  // ECX <- C, EDX <- W
  1330. PXOR XMM2,XMM2 // XMM2 <- 0
  // W is a 32-bit Cardinal, so only EDX is defined on entry. Shifting the
  // 32-bit register zero-extends into RDX and discards whatever the caller left
  // in the upper half; a 64-bit shift would fold those bits into the address.
  // This matches the SHL R8D,4 / SHL R9D,4 used by the Combine routines.
  1331. SHL EDX,4 // RDX <- W * 16 with the upper 32 bits cleared
  1332. MOVD XMM0,ECX
  1333. PUNPCKLBW XMM0,XMM2 // widen the channels of C to 16-bit words
  1334. {$IFNDEF FPC}
  1335. ADD RDX,alpha_ptr
  1336. {$ELSE}
  1337. ADD RDX,[RIP+alpha_ptr]
  1338. {$ENDIF}
  1339. PMULLW XMM0,[RDX] // multiply each channel by the table entry
  1340. PSRLW XMM0,8 // divide by 256
  1341. PACKUSWB XMM0,XMM2 // narrow back to bytes
  1342. MOVD EAX,XMM0 // result in EAX
  1343. {$ENDIF}
  1344. end;
  // Registers every SSE2 routine of this unit with BlendRegistry under its
  // function id, passing the CPU feature set given in the third argument and
  // BlendRegistryPrioritySSE2 as the priority. Nothing is registered when
  // PUREPASCAL or OMIT_SSE2 is defined.
  1345. procedure RegisterBindingFunctions;
  1346. begin
  1347. {$IFNDEF PUREPASCAL}
  1348. {$IFNDEF OMIT_SSE2}
  1349. BlendRegistry.Add(FID_EMMS, @EMMS_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1350. BlendRegistry.Add(FID_MERGEREG, @MergeReg_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1351. BlendRegistry.Add(FID_COMBINEREG, @CombineReg_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1352. BlendRegistry.Add(FID_COMBINEMEM, @CombineMem_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1353. BlendRegistry.Add(FID_COMBINELINE, @CombineLine_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1354. BlendRegistry.Add(FID_BLENDREG, @BlendReg_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1355. BlendRegistry.Add(FID_BLENDMEM, @BlendMem_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1356. BlendRegistry.Add(FID_BLENDMEMS, @BlendMems_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1357. BlendRegistry.Add(FID_BLENDMEMEX, @BlendMemEx_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1358. BlendRegistry.Add(FID_BLENDLINE, @BlendLine_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1359. BlendRegistry.Add(FID_BLENDLINEEX, @BlendLineEx_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1360. BlendRegistry.Add(FID_BLENDREGEX, @BlendRegEx_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1361. BlendRegistry.Add(FID_COLORMAX, @ColorMax_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1362. BlendRegistry.Add(FID_COLORMIN, @ColorMin_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1363. BlendRegistry.Add(FID_COLORADD, @ColorAdd_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1364. BlendRegistry.Add(FID_COLORSUB, @ColorSub_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1365. BlendRegistry.Add(FID_COLORMODULATE, @ColorModulate_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1366. BlendRegistry.Add(FID_COLORDIFFERENCE, @ColorDifference_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1367. BlendRegistry.Add(FID_COLOREXCLUSION, @ColorExclusion_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1368. BlendRegistry.Add(FID_COLORSCALE, @ColorScale_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  // LightenReg_SSE2 uses the XMM forms of MOVD, PADDUSB and PSUBUSB, which are
  // SSE2 instructions, so it must require ciSSE2 like every other routine here.
  1369. BlendRegistry.Add(FID_LIGHTEN, @LightenReg_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1370. BlendRegistry.Add(FID_BLENDREGRGB, @BlendRegRGB_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1371. BlendRegistry.Add(FID_BLENDMEMRGB, @BlendMemRGB_SSE2, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1372. {$IFDEF TEST_BLENDMEMRGB128SSE4}
  // NOTE(review): a routine named ..._SSE4 is registered with only ciSSE2 as
  // its requirement; this is a test-only binding - confirm it is intended.
  1373. BlendRegistry.Add(FID_BLENDMEMRGB128, @BlendMemRGB128_SSE4, [ciSSE2], 0, BlendRegistryPrioritySSE2);
  1374. {$ENDIF}
  1375. {$ENDIF}
  1376. {$ENDIF}
  1377. end;
  // Unit initialization section: calls RegisterBindingFunctions so the SSE2
  // routines are added to BlendRegistry.
  1378. initialization
  1379. RegisterBindingFunctions;
  1380. end.