GR32.Blend.SSE2.pas 80 KB


  1. unit GR32.Blend.SSE2;
  2. (* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1 or LGPL 2.1 with linking exception
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * Alternatively, the contents of this file may be used under the terms of the
  16. * Free Pascal modified version of the GNU Lesser General Public License
  17. * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
  18. * of this license are applicable instead of those above.
  19. * Please see the file LICENSE.txt for additional information concerning this
  20. * license.
  21. *
  22. * The Original Code is Graphics32
  23. *
  24. * The Initial Developer of the Original Code is
  25. * Alex A. Denisov
  26. *
  27. * Portions created by the Initial Developer are Copyright (C) 2000-2009
  28. * the Initial Developer. All Rights Reserved.
  29. *
  30. * ***** END LICENSE BLOCK ***** *)
  31. interface
  32. {$include GR32.inc}
  33. // Define GR32_SCALEMEMS_FAST to use the faster, but not very precise version of ScaleMems.
  34. // The fast version uses a "shr 8" as a substitute for "div 255" which is also what
  35. // ColorScale_Pas does.
  36. {$define GR32_SCALEMEMS_FAST}
  37. uses
  38. GR32;
  39. {$if not defined(PUREPASCAL)}
  40. //------------------------------------------------------------------------------
  41. //
  42. // SSE SIMD blend implementations
  43. //
  44. //------------------------------------------------------------------------------
  45. //------------------------------------------------------------------------------
  46. // Blend
  47. //------------------------------------------------------------------------------
  48. function BlendReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  49. procedure BlendMem_SSE2(F: TColor32; var B: TColor32); {$IFDEF FPC} assembler; {$ENDIF}
  50. procedure BlendMems_SSE2(F: TColor32; B: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF}
  51. function BlendRegEx_SSE2(F, B: TColor32; M: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  52. procedure BlendMemEx_SSE2(F: TColor32; var B:TColor32; M: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  53. function BlendRegRGB_SSE2(F, B: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  54. procedure BlendMemRGB_SSE2(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  55. procedure BlendLine_SSE2(Src, Dst: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF}
  56. procedure BlendLineEx_SSE2(Src, Dst: PColor32; Count: Integer; M: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  57. //------------------------------------------------------------------------------
  58. // Merge
  59. //------------------------------------------------------------------------------
  60. function MergeReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  61. //------------------------------------------------------------------------------
  62. // Combine
  63. //------------------------------------------------------------------------------
  64. function CombineReg_SSE2(X, Y: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  65. procedure CombineMem_SSE2_Table(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  66. procedure CombineMem_SSE2_128(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  67. procedure CombineMem_SSE41_8081(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  68. procedure CombineMem_SSE41_Kadaif(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  69. procedure CombineLine_SSE2(Src, Dst: PColor32; Count: Integer; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  70. //------------------------------------------------------------------------------
  71. // Color algebra
  72. //------------------------------------------------------------------------------
  73. function ColorAdd_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  74. function ColorSub_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  75. function ColorModulate_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  76. function ColorMax_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  77. function ColorMin_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  78. function ColorDifference_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  79. function ColorExclusion_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  80. function ColorScale_SSE2(C: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  81. //------------------------------------------------------------------------------
  82. // Misc
  83. //------------------------------------------------------------------------------
  84. function LightenReg_SSE2(C: TColor32; Amount: Integer): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
  85. procedure ScaleMems_SSE41(Dst: PColor32; Count: Integer; Weight: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  86. procedure FastScaleMems_SSE41(Dst: PColor32; Count: Integer; Weight: Cardinal); {$IFDEF FPC} assembler; {$ENDIF}
  87. {$ifend}
  88. //------------------------------------------------------------------------------
  89. //------------------------------------------------------------------------------
  90. //------------------------------------------------------------------------------
  91. implementation
  92. {$if not defined(PUREPASCAL)}
  93. uses
  94. GR32_Blend,
  95. GR32_LowLevel,
  96. GR32_Bindings,
  97. GR32.Types.SIMD;
  98. //------------------------------------------------------------------------------
  99. //
  100. // Blend
  101. //
  102. //------------------------------------------------------------------------------
  103. //------------------------------------------------------------------------------
  104. // BlendReg
  105. //------------------------------------------------------------------------------
  106. function BlendReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  107. asm
  108. // Blend a foreground color (F) onto a background color (B),
  109. // using the alpha channel value of F as the blend weight.
  110. // x86: EAX <- F
  111. // x86: EDX <- B
  112. // Result := (Fa * (Fargb - Bargb) + (Bargb shl 8) + bias) shr 8, per 16-bit channel
  // x64: ECX <- F, EDX <- B (see the TARGET_x64 branch below).
  // The final OR forces the alpha byte of the returned color to $FF.
  // The bias constant is read through bias_ptr, which is declared outside this routine.
  113. {$IFDEF TARGET_x86}
  114. MOVD XMM0,EAX // XMM0 <- 00 00 00 00 00 00 00 00 00 00 00 00 Fa Fr Fg Fb
  115. PXOR XMM3,XMM3 // XMM3 <- 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  116. MOVD XMM2,EDX // XMM2 <- 00 00 00 00 00 00 00 00 00 00 00 00 Ba Br Bg Bb
  117. PUNPCKLBW XMM0,XMM3 // XMM0 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fr 00 Fg 00 Fb
  118. MOV ECX,bias_ptr // ECX <- Pointer to Bias
  119. PUNPCKLBW XMM2,XMM3 // XMM2 <- 00 00 00 00 00 00 00 00 00 Ba 00 Br 00 Bg 00 Bb
  120. MOVQ XMM1,XMM0 // XMM1 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fr 00 Fg 00 Fb
  121. PSHUFLW XMM1,XMM1,$FF // XMM1 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fa 00 Fa 00 Fa
  122. PSUBW XMM0,XMM2 // XMM0 <- 00 00 00 00 00 00 00 00 Da Da Dr Dr Dg Dg Db Db (16-bit differences F - B)
  123. PSLLW XMM2,8 // XMM2 <- 00 00 00 00 00 00 00 00 Ba 00 Br 00 Bg 00 Bb 00
  124. PMULLW XMM0,XMM1 // XMM0 <- 00 00 00 00 00 00 00 00 Pa ** Pr ** Pg ** Pb ** (low 16 bits of Fa * D)
  125. PADDW XMM2,[ECX] // add bias
  126. PADDW XMM2,XMM0 // XMM2 <- 00 00 00 00 00 00 00 00 Qa ** Qr ** Qg ** Qb **
  127. PSRLW XMM2,8 // XMM2 <- 00 00 00 00 00 00 00 00 00 Qa 00 Qr 00 Qg 00 Qb
  128. PACKUSWB XMM2,XMM3 // XMM2 <- 00 00 00 00 00 00 00 00 00 00 00 00 Qa Qr Qg Qb
  129. MOVD EAX,XMM2 // EAX <- Qa Qr Qg Qb
  130. OR EAX,$FF000000 // EAX <- FF Qr Qg Qb (result alpha is always opaque)
  131. {$ENDIF}
  132. {$IFDEF TARGET_x64}
  // x64: ECX <- F, EDX <- B; RAX is used as scratch for the bias pointer.
  133. MOVD XMM0,ECX // XMM0 <- 00 00 00 00 00 00 00 00 00 00 00 00 Fa Fr Fg Fb
  134. PXOR XMM3,XMM3 // XMM3 <- 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  135. MOVD XMM2,EDX // XMM2 <- 00 00 00 00 00 00 00 00 00 00 00 00 Ba Br Bg Bb
  136. PUNPCKLBW XMM0,XMM3 // XMM0 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fr 00 Fg 00 Fb
  137. {$IFNDEF FPC}
  138. MOV RAX,bias_ptr // RAX <- Pointer to Bias
  139. {$ELSE}
  140. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  141. {$ENDIF}
  142. PUNPCKLBW XMM2,XMM3 // XMM2 <- 00 00 00 00 00 00 00 00 00 Ba 00 Br 00 Bg 00 Bb
  143. MOVQ XMM1,XMM0 // XMM1 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fr 00 Fg 00 Fb
  144. PSHUFLW XMM1,XMM1,$FF // XMM1 <- 00 00 00 00 00 00 00 00 00 Fa 00 Fa 00 Fa 00 Fa
  145. PSUBW XMM0,XMM2 // XMM0 <- 00 00 00 00 00 00 00 00 Da Da Dr Dr Dg Dg Db Db (16-bit differences F - B)
  146. PSLLW XMM2,8 // XMM2 <- 00 00 00 00 00 00 00 00 Ba 00 Br 00 Bg 00 Bb 00
  147. PMULLW XMM0,XMM1 // XMM0 <- 00 00 00 00 00 00 00 00 Pa ** Pr ** Pg ** Pb ** (low 16 bits of Fa * D)
  148. PADDW XMM2,[RAX] // add bias
  149. PADDW XMM2,XMM0 // XMM2 <- 00 00 00 00 00 00 00 00 Qa ** Qr ** Qg ** Qb **
  150. PSRLW XMM2,8 // XMM2 <- 00 00 00 00 00 00 00 00 00 Qa 00 Qr 00 Qg 00 Qb
  151. PACKUSWB XMM2,XMM3 // XMM2 <- 00 00 00 00 00 00 00 00 00 00 00 00 Qa Qr Qg Qb
  152. MOVD EAX,XMM2 // EAX <- Qa Qr Qg Qb
  153. OR EAX,$FF000000 // EAX <- FF Qr Qg Qb (result alpha is always opaque)
  154. {$ENDIF}
  155. end;
  156. //------------------------------------------------------------------------------
  157. // BlendMem
  158. //------------------------------------------------------------------------------
  159. procedure BlendMem_SSE2(F: TColor32; var B: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  160. asm
  // Blend foreground color F onto the background color stored at B, in place,
  // using the alpha channel of F as the blend weight.
  //   Fa = 0   : B is left untouched.
  //   Fa = $FF : F is copied to B unchanged.
  //   otherwise: B := (Fa * (F - B) + (B shl 8) + bias) shr 8, per 16-bit channel.
  // Unlike BlendReg_SSE2, the alpha byte of the blended result is not forced to $FF.
  161. {$IFDEF TARGET_x86}
  162. // EAX <- F (foreground color)
  163. // [EDX] <- B (background color, updated in place)
  164. // [EDX] := Fa * (F - B) + B
  165. TEST EAX,$FF000000 // alpha of F = 0 ?
  166. JZ @1 // yes: fully transparent, nothing to do
  167. CMP EAX,$FF000000 // unsigned F >= $FF000000, i.e. alpha of F = $FF ?
  168. JNC @2 // yes: fully opaque, just store F
  169. PXOR XMM3,XMM3 // XMM3 <- 0, used for unpacking and packing
  170. MOVD XMM0,EAX // XMM0 <- F
  171. MOVD XMM2,[EDX] // XMM2 <- B
  172. PUNPCKLBW XMM0,XMM3 // widen F channels to 16-bit words
  173. MOV ECX,bias_ptr // ECX <- pointer to bias
  174. PUNPCKLBW XMM2,XMM3 // widen B channels to 16-bit words
  175. MOVQ XMM1,XMM0
  176. PSHUFLW XMM1,XMM1,$FF // XMM1 <- Fa replicated in the four low words
  177. PSUBW XMM0,XMM2 // XMM0 <- F - B
  178. PSLLW XMM2,8 // XMM2 <- B shl 8
  179. PMULLW XMM0,XMM1 // XMM0 <- Fa * (F - B), low 16 bits
  180. PADDW XMM2,[ECX] // add bias
  181. PADDW XMM2,XMM0 // XMM2 <- Fa * (F - B) + (B shl 8) + bias
  182. PSRLW XMM2,8 // divide by 256
  183. PACKUSWB XMM2,XMM3 // pack words back to bytes
  184. MOVD [EDX],XMM2 // store the blended color
  185. @1: RET
  186. @2: MOV [EDX], EAX // opaque foreground: B := F, then fall through to the end of the routine
  187. {$ENDIF}
  188. {$IFDEF TARGET_x64}
  189. // ECX <- F (foreground color)
  190. // [RDX] <- B (background color, updated in place)
  191. // [RDX] := Fa * (F - B) + B
  192. TEST ECX,$FF000000 // alpha of F = 0 ?
  193. JZ @1 // yes: fully transparent, nothing to do
  194. CMP ECX,$FF000000 // unsigned F >= $FF000000, i.e. alpha of F = $FF ?
  195. JNC @2 // yes: fully opaque, just store F
  196. PXOR XMM3,XMM3 // XMM3 <- 0, used for unpacking and packing
  197. MOVD XMM0,ECX // XMM0 <- F
  198. MOVD XMM2,[RDX] // XMM2 <- B
  199. PUNPCKLBW XMM0,XMM3 // widen F channels to 16-bit words
  200. {$IFNDEF FPC}
  201. MOV RAX,bias_ptr // RAX <- pointer to bias
  202. {$ELSE}
  203. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  204. {$ENDIF}
  205. PUNPCKLBW XMM2,XMM3 // widen B channels to 16-bit words
  206. MOVQ XMM1,XMM0
  207. PSHUFLW XMM1,XMM1,$FF // XMM1 <- Fa replicated in the four low words
  208. PSUBW XMM0,XMM2 // XMM0 <- F - B
  209. PSLLW XMM2,8 // XMM2 <- B shl 8
  210. PMULLW XMM0,XMM1 // XMM0 <- Fa * (F - B), low 16 bits
  211. PADDW XMM2,[RAX] // add bias
  212. PADDW XMM2,XMM0 // XMM2 <- Fa * (F - B) + (B shl 8) + bias
  213. PSRLW XMM2,8 // divide by 256
  214. PACKUSWB XMM2,XMM3 // pack words back to bytes
  215. MOVD [RDX],XMM2 // store the blended color
  216. @1: RET
  217. @2: MOV [RDX], ECX // opaque foreground: B := F, then fall through to the end of the routine
  218. {$ENDIF}
  219. end;
  220. //------------------------------------------------------------------------------
  221. // BlendRegEx
  222. //------------------------------------------------------------------------------
  223. function BlendRegEx_SSE2(F, B: TColor32; M: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  224. asm
  225. // Blend a foreground color (F) onto a background color (B),
  226. // using the alpha channel value of F scaled by the master alpha M.
  227. // W := (Fa * (M + 1)) shr 8
  // If W = 0 the background color B is returned unchanged; otherwise
  // Result := (T[W] * (Fargb - Bargb) + (Bargb shl 8) + bias) shr 8, per 16-bit channel,
  // where T[W] is the 16-byte entry at alpha_ptr + W * 16.
  // NOTE(review): the table behind alpha_ptr is declared outside this routine;
  // presumably entry W holds W replicated in every word - confirm.
  // The alpha byte of the result is not forced to $FF here.
  228. {$IFDEF TARGET_x86}
  229. // EAX <- F
  230. // EDX <- B
  231. // ECX <- M
  232. PUSH EBX // EBX is used as scratch and restored on both exit paths
  233. MOV EBX,EAX
  234. SHR EBX,24 // EBX <- Fa
  235. INC ECX // 255:256 range bias
  236. IMUL ECX,EBX // ECX <- Fa * (M + 1)
  237. SHR ECX,8 // ECX <- W
  238. JZ @1 // W = 0: return B
  239. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  240. MOVD XMM1,EAX // XMM1 <- F
  241. SHL ECX,4 // ECX <- W * 16, byte offset of the table entry
  242. MOVD XMM2,EDX // XMM2 <- B
  243. PUNPCKLBW XMM1,XMM0 // widen F channels to 16-bit words
  244. PUNPCKLBW XMM2,XMM0 // widen B channels to 16-bit words
  245. ADD ECX,alpha_ptr // ECX <- address of the table entry for W
  246. PSUBW XMM1,XMM2 // XMM1 <- F - B
  247. PMULLW XMM1,[ECX] // XMM1 <- T[W] * (F - B), low 16 bits
  248. PSLLW XMM2,8 // XMM2 <- B shl 8
  249. MOV ECX,bias_ptr // ECX <- pointer to bias
  250. PADDW XMM2,[ECX] // add bias
  251. PADDW XMM1,XMM2 // XMM1 <- T[W] * (F - B) + (B shl 8) + bias
  252. PSRLW XMM1,8 // divide by 256
  253. PACKUSWB XMM1,XMM0 // pack words back to bytes
  254. MOVD EAX,XMM1 // EAX <- blended color
  255. POP EBX
  256. RET
  257. @1: MOV EAX,EDX // W = 0: Result := B
  258. POP EBX
  259. {$ENDIF}
  260. {$IFDEF TARGET_x64}
  261. // ECX <- F
  262. // EDX <- B
  263. // R8D <- M
  264. MOV EAX,ECX
  265. SHR EAX,24 // EAX <- Fa
  266. INC R8D // 255:256 range bias
  267. IMUL R8D,EAX // R8D <- Fa * (M + 1)
  268. SHR R8D,8 // R8D <- W
  269. JZ @1 // W = 0: return B
  270. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  271. MOVD XMM1,ECX // XMM1 <- F
  272. SHL R8D,4 // R8D <- W * 16, byte offset of the table entry
  273. MOVD XMM2,EDX // XMM2 <- B
  274. PUNPCKLBW XMM1,XMM0 // widen F channels to 16-bit words
  275. PUNPCKLBW XMM2,XMM0 // widen B channels to 16-bit words
  276. {$IFNDEF FPC}
  277. ADD R8,alpha_ptr // R8 <- address of the table entry for W
  278. {$ELSE}
  279. ADD R8,[RIP+alpha_ptr] // R8 <- address of the table entry for W
  280. {$ENDIF}
  281. PSUBW XMM1,XMM2 // XMM1 <- F - B
  282. PMULLW XMM1,[R8] // XMM1 <- T[W] * (F - B), low 16 bits
  283. PSLLW XMM2,8 // XMM2 <- B shl 8
  284. {$IFNDEF FPC}
  285. MOV R8,bias_ptr // R8 <- pointer to bias
  286. {$ELSE}
  287. MOV R8,[RIP+bias_ptr] // R8 <- pointer to bias
  288. {$ENDIF}
  289. PADDW XMM2,[R8] // add bias
  290. PADDW XMM1,XMM2 // XMM1 <- T[W] * (F - B) + (B shl 8) + bias
  291. PSRLW XMM1,8 // divide by 256
  292. PACKUSWB XMM1,XMM0 // pack words back to bytes
  293. MOVD EAX,XMM1 // EAX <- blended color
  294. RET
  295. @1: MOV EAX,EDX // W = 0: Result := B
  296. {$ENDIF}
  297. end;
  298. //------------------------------------------------------------------------------
  299. // BlendMemEx
  300. //------------------------------------------------------------------------------
  301. procedure BlendMemEx_SSE2(F: TColor32; var B:TColor32; M: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  302. asm
  // In-place variant of BlendRegEx_SSE2: blends F onto the color stored at B,
  // weighting by W := (Fa * (M + 1)) shr 8.
  // B is left untouched when Fa = 0 or when W = 0.
  // NOTE(review): the table behind alpha_ptr is declared outside this routine;
  // presumably entry W holds W replicated in every word - confirm.
  303. {$IFDEF TARGET_x86}
  304. // blend foreground color (F) to a background color (B),
  305. // using alpha channel value of F
  306. // EAX <- F
  307. // [EDX] <- B
  308. // ECX <- M
  309. // [EDX] := (T[W] * (Fargb - Bargb) + (Bargb shl 8) + bias) shr 8
  310. TEST EAX,$FF000000 // alpha of F = 0 ?
  311. JZ @2 // yes: exit before EBX has been pushed
  312. PUSH EBX // EBX is used as scratch and restored at @1
  313. MOV EBX,EAX // EBX <- Fa Fr Fg Fb
  314. SHR EBX,24 // EBX <- 00 00 00 Fa
  315. INC ECX // 255:256 range bias
  316. IMUL ECX,EBX // ECX <- 00 00 W **
  317. SHR ECX,8 // ECX <- 00 00 00 W
  318. JZ @1 // W = 0: leave B untouched
  319. PXOR XMM0,XMM0 // XMM0 <- 00 00 00 00 00 00 00 00
  320. MOVD XMM1,EAX // XMM1 <- 00 00 00 00 Fa Fr Fg Fb
  321. SHL ECX,4 // ECX <- W * 16, byte offset of the table entry
  322. MOVD XMM2,[EDX] // XMM2 <- 00 00 00 00 Ba Br Bg Bb
  323. PUNPCKLBW XMM1,XMM0 // widen F channels to 16-bit words
  324. PUNPCKLBW XMM2,XMM0 // widen B channels to 16-bit words
  325. ADD ECX,alpha_ptr // ECX <- address of the table entry for W
  326. PSUBW XMM1,XMM2 // XMM1 <- F - B
  327. PMULLW XMM1,[ECX] // XMM1 <- T[W] * (F - B), low 16 bits
  328. PSLLW XMM2,8 // XMM2 <- B shl 8
  329. MOV ECX,bias_ptr // ECX <- pointer to bias
  330. PADDW XMM2,[ECX] // add bias
  331. PADDW XMM1,XMM2 // XMM1 <- T[W] * (F - B) + (B shl 8) + bias
  332. PSRLW XMM1,8 // divide by 256
  333. PACKUSWB XMM1,XMM0 // pack words back to bytes
  334. MOVD [EDX],XMM1 // store the blended color
  335. @1:
  336. POP EBX
  337. @2:
  338. {$ENDIF}
  339. {$IFDEF TARGET_x64}
  340. // blend foreground color (F) to a background color (B),
  341. // using alpha channel value of F
  342. // ECX <- F
  343. // [RDX] <- B
  344. // R8D <- M
  345. // [RDX] := (T[W] * (Fargb - Bargb) + (Bargb shl 8) + bias) shr 8
  346. TEST ECX,$FF000000 // alpha of F = 0 ?
  347. JZ @1 // yes: nothing to do
  348. MOV R9D,ECX
  349. SHR R9D,24 // R9D <- Fa
  350. INC R8D // 255:256 range bias
  351. IMUL R8D,R9D // R8D <- Fa * (M + 1)
  352. SHR R8D,8 // R8D <- W
  353. JZ @1 // W = 0: leave B untouched
  354. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  355. MOVD XMM1,ECX // XMM1 <- F
  356. SHL R8D,4 // R8D <- W * 16, byte offset of the table entry
  357. MOVD XMM2,[RDX] // XMM2 <- B
  358. PUNPCKLBW XMM1,XMM0 // widen F channels to 16-bit words
  359. PUNPCKLBW XMM2,XMM0 // widen B channels to 16-bit words
  360. {$IFNDEF FPC}
  361. ADD R8,alpha_ptr // R8 <- address of the table entry for W
  362. {$ELSE}
  363. ADD R8,[RIP+alpha_ptr] // R8 <- address of the table entry for W
  364. {$ENDIF}
  365. PSUBW XMM1,XMM2 // XMM1 <- F - B
  366. PMULLW XMM1,[R8] // XMM1 <- T[W] * (F - B), low 16 bits
  367. PSLLW XMM2,8 // XMM2 <- B shl 8
  368. {$IFNDEF FPC}
  369. MOV R8,bias_ptr // R8 <- pointer to bias
  370. {$ELSE}
  371. MOV R8,[RIP+bias_ptr] // R8 <- pointer to bias
  372. {$ENDIF}
  373. PADDW XMM2,[R8] // add bias
  374. PADDW XMM1,XMM2 // XMM1 <- T[W] * (F - B) + (B shl 8) + bias
  375. PSRLW XMM1,8 // divide by 256
  376. PACKUSWB XMM1,XMM0 // pack words back to bytes
  377. MOVD DWORD PTR [RDX],XMM1 // store the blended color
  378. @1:
  379. {$ENDIF}
  380. end;
  381. //------------------------------------------------------------------------------
  382. // BlendRegRGB
  383. //------------------------------------------------------------------------------
  384. function BlendRegRGB_SSE2(F, B: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  385. asm
  // Blend F onto B using a separate weight for each byte channel, taken from W:
  //   Result := (Wc * (Fc - Bc) + (Bc shl 8) + bias) shr 8, per 16-bit channel c.
  // W is byte-swapped before being widened, so the weights pair up as follows:
  //   byte 0 of the color (bits  7..0 ) is weighted by bits 31..24 of W
  //   byte 1 of the color (bits 15..8 ) is weighted by bits 23..16 of W
  //   byte 2 of the color (bits 23..16) is weighted by bits 15..8  of W
  //   byte 3 of the color (bits 31..24) is weighted by bits  7..0  of W
  // NOTE(review): the expected layout of W is defined by the callers, which are
  // not visible here - confirm the byte order above matches their intent.
  386. {$IFDEF TARGET_x86}
  // EAX <- F, EDX <- B, ECX <- W
  387. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  388. MOVD XMM0,EAX // XMM0 <- F
  389. PUNPCKLBW XMM0,XMM2 // widen F channels to 16-bit words
  390. MOVD XMM1,EDX // XMM1 <- B
  391. PUNPCKLBW XMM1,XMM2 // widen B channels to 16-bit words
  392. BSWAP ECX // reverse the byte order of W
  393. PSUBW XMM0,XMM1 // XMM0 <- F - B
  394. MOVD XMM3,ECX // XMM3 <- byte-swapped W
  395. PUNPCKLBW XMM3,XMM2 // widen the weights to 16-bit words
  396. PMULLW XMM0,XMM3 // XMM0 <- W * (F - B), low 16 bits
  397. MOV EAX,bias_ptr // EAX <- pointer to bias
  398. PSLLW XMM1,8 // XMM1 <- B shl 8
  399. PADDW XMM1,[EAX] // add bias
  400. PADDW XMM1,XMM0 // XMM1 <- W * (F - B) + (B shl 8) + bias
  401. PSRLW XMM1,8 // divide by 256
  402. PACKUSWB XMM1,XMM2 // pack words back to bytes
  403. MOVD EAX,XMM1 // EAX <- blended color
  404. {$ENDIF}
  405. {$IFDEF TARGET_x64}
  // ECX <- F, EDX <- B, R8D <- W
  406. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  407. MOVD XMM0,ECX // XMM0 <- F
  408. PUNPCKLBW XMM0,XMM2 // widen F channels to 16-bit words
  409. MOVD XMM1,EDX // XMM1 <- B
  410. PUNPCKLBW XMM1,XMM2 // widen B channels to 16-bit words
  411. BSWAP R8D // reverse the byte order of W
  412. PSUBW XMM0,XMM1 // XMM0 <- F - B
  413. MOVD XMM3,R8D // XMM3 <- byte-swapped W
  414. PUNPCKLBW XMM3,XMM2 // widen the weights to 16-bit words
  415. PMULLW XMM0,XMM3 // XMM0 <- W * (F - B), low 16 bits
  416. {$IFNDEF FPC}
  417. MOV RAX,bias_ptr // RAX <- pointer to bias
  418. {$ELSE}
  419. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  420. {$ENDIF}
  421. PSLLW XMM1,8 // XMM1 <- B shl 8
  422. PADDW XMM1,[RAX] // add bias
  423. PADDW XMM1,XMM0 // XMM1 <- W * (F - B) + (B shl 8) + bias
  424. PSRLW XMM1,8 // divide by 256
  425. PACKUSWB XMM1,XMM2 // pack words back to bytes
  426. MOVD EAX,XMM1 // EAX <- blended color
  427. {$ENDIF}
  428. end;
  429. //------------------------------------------------------------------------------
  430. // BlendMemRGB
  431. //------------------------------------------------------------------------------
  432. procedure BlendMemRGB_SSE2(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  433. asm
  // In-place per-channel weighted blend of F onto the color stored at B.
  // Each channel uses its own weight byte from W, in reversed byte order:
  // byte 0 of the color is weighted by bits 31..24 of W, byte 1 by bits 23..16,
  // byte 2 by bits 15..8 and byte 3 by bits 7..0.
  // The two targets use different arithmetic:
  //   x86: B := (W * (F - B) + (B shl 8) + bias) shr 8
  //   x64: B := ((W * F + bias) shr 8) + B - ((W * B + bias) shr 8)
  // NOTE(review): the two forms round differently, so results may differ
  // slightly between targets - confirm that this is acceptable.
  434. {$IFDEF TARGET_x86}
  // EAX <- F, [EDX] <- B, ECX <- W
  435. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  436. MOVD XMM0,EAX // XMM0 <- F
  437. PUNPCKLBW XMM0,XMM2 // widen F channels to 16-bit words
  438. MOVD XMM1,[EDX] // XMM1 <- B
  439. PUNPCKLBW XMM1,XMM2 // widen B channels to 16-bit words
  440. BSWAP ECX // reverse the byte order of W
  441. PSUBW XMM0,XMM1 // XMM0 <- F - B
  442. MOVD XMM3,ECX // XMM3 <- byte-swapped W
  443. PUNPCKLBW XMM3,XMM2 // widen the weights to 16-bit words
  444. PMULLW XMM0,XMM3 // XMM0 <- W * (F - B), low 16 bits
  445. MOV EAX,bias_ptr // EAX <- pointer to bias
  446. PSLLW XMM1,8 // XMM1 <- B shl 8
  447. PADDW XMM1,[EAX] // add bias
  448. PADDW XMM1,XMM0 // XMM1 <- W * (F - B) + (B shl 8) + bias
  449. PSRLW XMM1,8 // divide by 256
  450. PACKUSWB XMM1,XMM2 // pack words back to bytes
  451. MOVD [EDX],XMM1 // store the blended color
  452. {$ENDIF}
  453. {$IFDEF TARGET_x64}
  // ECX <- F, [RDX] <- B, R8D <- W
  454. MOVD XMM1,R8D // XMM1 <- W
  455. PXOR XMM4,XMM4 // XMM4 <- 0, used for unpacking and packing
  456. {$IFNDEF FPC}
  457. MOV RAX,bias_ptr // RAX <- pointer to bias
  458. {$ELSE}
  459. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  460. {$ENDIF}
  461. MOVQ XMM5,[RAX] // XMM5 <- low 64 bits of the bias constant
  462. MOVD XMM0,ECX // XMM0 <- F
  463. MOVD XMM2,[RDX] // XMM2 <- B
  464. PUNPCKLBW XMM0,XMM4 // widen F channels to 16-bit words
  465. PUNPCKLBW XMM1,XMM4 // widen the weights to 16-bit words
  466. PUNPCKLBW XMM2,XMM4 // widen B channels to 16-bit words
  467. PSHUFLW XMM1,XMM1,$1B // reverse the order of the four weight words
  468. // C = wA + B - wB
  469. PMULLW XMM0,XMM1 // XMM0 <- W * F
  470. PADDW XMM0,XMM5 // add bias
  471. PSRLW XMM0,8 // XMM0 <- wA = (W * F + bias) shr 8
  472. PADDW XMM0,XMM2 // XMM0 <- wA + B
  473. PMULLW XMM2,XMM1 // XMM2 <- W * B
  474. PADDW XMM2,XMM5 // add bias
  475. PSRLW XMM2,8 // XMM2 <- wB = (W * B + bias) shr 8
  476. PSUBW XMM0,XMM2 // XMM0 <- wA + B - wB
  477. PACKUSWB XMM0,XMM4 // pack words back to bytes
  478. MOVD [RDX],XMM0 // store the blended color
  479. {$ENDIF}
  480. end;
  481. //------------------------------------------------------------------------------
  482. // BlendMemRGB128
  483. //------------------------------------------------------------------------------
  484. {$IFDEF TEST_BLENDMEMRGB128SSE4}
  // Only compiled when TEST_BLENDMEMRGB128SSE4 is defined.
  485. procedure BlendMemRGB128_SSE4(F: TColor32; var B: TColor32; W: UInt64); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  486. asm
  // Two-pixel variant of the per-channel weighted blend: the single color F is
  // duplicated and blended onto the two consecutive pixels (8 bytes) starting at B.
  // W carries eight weight bytes; the four weight words of each pixel are reversed
  // (PSHUFLW/PSHUFHW $1B) before use.
  //   B[i] := ((W * F + bias) shr 8) + B[i] - ((W * B[i] + bias) shr 8), for i = 0..1
  // Uses PINSRD to duplicate F into the second dword of XMM0.
  // The bias constant is loaded with MOVDQA, so the memory behind bias_ptr must be
  // 16-byte aligned.
  487. {$IFDEF TARGET_x86}
  // EAX <- F, [EDX] <- B (two pixels), W is read by name
  488. MOVQ XMM1,W // XMM1 <- the eight weight bytes
  489. PXOR XMM4,XMM4 // XMM4 <- 0, used for unpacking and packing
  490. MOV ECX,[bias_ptr] // ECX <- pointer to bias
  491. MOVDQA XMM5,[ECX] // XMM5 <- bias
  492. MOVD XMM0,EAX // XMM0 dword 0 <- F
  493. PINSRD XMM0,EAX,1 // XMM0 dword 1 <- F
  494. MOVQ XMM2,[EDX].QWORD // XMM2 <- two destination pixels
  495. PUNPCKLBW XMM0,XMM4 // widen F channels to 16-bit words
  496. PUNPCKLBW XMM1,XMM4 // widen the weights to 16-bit words
  497. PUNPCKLBW XMM2,XMM4 // widen B channels to 16-bit words
  498. PSHUFLW XMM1,XMM1,$1B // reverse the four weight words of the first pixel
  499. PSHUFHW XMM1,XMM1,$1B // reverse the four weight words of the second pixel
  500. // C = wA + B - wB
  501. PMULLW XMM0,XMM1 // XMM0 <- W * F
  502. PADDW XMM0,XMM5 // add bias
  503. PSRLW XMM0,8 // XMM0 <- wA
  504. PADDW XMM0,XMM2 // XMM0 <- wA + B
  505. PMULLW XMM2,XMM1 // XMM2 <- W * B
  506. PADDW XMM2,XMM5 // add bias
  507. PSRLW XMM2,8 // XMM2 <- wB
  508. PSUBW XMM0,XMM2 // XMM0 <- wA + B - wB
  509. PACKUSWB XMM0,XMM4 // pack words back to bytes
  510. MOVQ [EDX].QWORD,XMM0 // store both blended pixels
  511. {$ENDIF}
  512. {$IFDEF TARGET_x64}
  // ECX <- F, [RDX] <- B (two pixels), R8 <- W
  513. MOVQ XMM1,R8 // XMM1 <- the eight weight bytes
  514. PXOR XMM4,XMM4 // XMM4 <- 0, used for unpacking and packing
  515. MOV RAX,[RIP+bias_ptr] // RAX <- pointer to bias
  516. MOVDQA XMM5,[RAX] // XMM5 <- bias
  517. MOVD XMM0,ECX // XMM0 dword 0 <- F
  518. PINSRD XMM0,ECX,1 // XMM0 dword 1 <- F
  519. MOVQ XMM2,[RDX].QWORD // XMM2 <- two destination pixels
  520. PUNPCKLBW XMM0,XMM4 // widen F channels to 16-bit words
  521. PUNPCKLBW XMM1,XMM4 // widen the weights to 16-bit words
  522. PUNPCKLBW XMM2,XMM4 // widen B channels to 16-bit words
  523. PSHUFLW XMM1,XMM1,$1B // reverse the four weight words of the first pixel
  524. PSHUFHW XMM1,XMM1,$1B // reverse the four weight words of the second pixel
  525. // C = wA + B - wB
  526. PMULLW XMM0,XMM1 // XMM0 <- W * F
  527. PADDW XMM0,XMM5 // add bias
  528. PSRLW XMM0,8 // XMM0 <- wA
  529. PADDW XMM0,XMM2 // XMM0 <- wA + B
  530. PMULLW XMM2,XMM1 // XMM2 <- W * B
  531. PADDW XMM2,XMM5 // add bias
  532. PSRLW XMM2,8 // XMM2 <- wB
  533. PSUBW XMM0,XMM2 // XMM0 <- wA + B - wB
  534. PACKUSWB XMM0,XMM4 // pack words back to bytes
  535. MOVQ [RDX].QWORD,XMM0 // store both blended pixels
  536. {$ENDIF}
  537. end;
  538. {$ENDIF}
  539. //------------------------------------------------------------------------------
  540. // BlendLine
  541. //------------------------------------------------------------------------------
  542. procedure BlendLine_SSE2(Src, Dst: PColor32; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  543. {$IFDEF FPC}
  544. const
  // Mask selecting the alpha bytes of two packed 32-bit pixels (used by the x64 FPC fast path).
  545. COpaque: QWORD = QWORD($FF000000FF000000);
  546. {$ENDIF}
  547. asm
  // Blends Count pixels from Src onto Dst, in place, using each source pixel's
  // own alpha. Does nothing when Count <= 0.
  // Per pixel, with a = source alpha:
  //   A' = ((a * Argb + bias) shr 8) with the alpha channel set to a
  //   Dst := A' + B - ((a * B + bias) shr 8)
  // Pixels are processed two at a time, with one single-pixel step for an odd Count
  // (done first on x86, last on x64).
  // The bias constant is loaded with MOVDQA, so the memory behind bias_ptr must be
  // 16-byte aligned.
  548. {$IFDEF TARGET_X86}
  549. // EAX <- Src
  550. // EDX <- Dst
  551. // ECX <- Count
  552. TEST ECX,ECX
  553. JLE @3 // Count <= 0: nothing to do
  554. PUSH EBX // EBX is only needed to fetch the bias pointer
  555. PXOR XMM4,XMM4 // XMM4 <- 0, used for unpacking and packing
  556. MOV EBX,[bias_ptr]
  557. MOVDQA XMM5,[EBX] // XMM5 <- bias
  558. POP EBX
  559. TEST ECX, 1 // odd number of pixels ?
  560. JZ @2 // no: go straight to the pair loop
  // Odd count: blend the first pixel on its own
  561. MOVD XMM0,[EAX] // XMM0 <- source pixel
  562. MOVD XMM2,[EDX] // XMM2 <- destination pixel
  563. PUNPCKLBW XMM0,XMM4 // widen source channels to 16-bit words
  564. PUNPCKLBW XMM2,XMM4 // widen destination channels to 16-bit words
  565. PSHUFLW XMM1,XMM0,$FF // XMM1 <- source alpha in all four low words
  566. // premultiply source pixel by its alpha
  567. MOVQ XMM3,XMM1
  568. PSRLQ XMM3,16 // XMM3 <- alpha in the three color words, 0 in the alpha word
  569. PMULLW XMM0,XMM3 // color channels * alpha; alpha channel * 0
  570. PADDW XMM0,XMM5 // add bias
  571. PSRLW XMM0,8 // divide by 256
  572. PSLLQ XMM3,48 // XMM3 <- alpha in the alpha word only
  573. POR XMM0,XMM3 // put the source alpha back into the alpha channel
  574. // C' = A' + B' - aB'
  575. PMULLW XMM1,XMM2 // XMM1 <- a * B
  576. PADDW XMM1,XMM5 // add bias
  577. PSRLW XMM1,8 // XMM1 <- aB
  578. PADDW XMM0,XMM2 // XMM0 <- A' + B
  579. PSUBW XMM0,XMM1 // XMM0 <- A' + B - aB
  580. PACKUSWB XMM0,XMM4 // pack words back to bytes
  581. MOVD [EDX], XMM0 // store the blended pixel
  582. @2:
  583. LEA EAX, [EAX + ECX * 4] // EAX <- one past the last source pixel
  584. LEA EDX, [EDX + ECX * 4] // EDX <- one past the last destination pixel
  585. SHR ECX,1 // ECX <- number of pixel pairs
  586. JZ @3 // no pairs: done
  587. NEG ECX // negative pair index, counted up towards zero
  588. @1:
  589. MOVQ XMM0,[EAX + ECX * 8].QWORD // XMM0 <- two source pixels
  590. MOVQ XMM2,[EDX + ECX * 8].QWORD // XMM2 <- two destination pixels
  591. PUNPCKLBW XMM0,XMM4 // widen source channels to 16-bit words
  592. PUNPCKLBW XMM2,XMM4 // widen destination channels to 16-bit words
  593. PSHUFLW XMM1,XMM0,$FF // first pixel: alpha in all four words
  594. PSHUFHW XMM1,XMM1,$FF // second pixel: alpha in all four words
  595. // premultiply source pixel by its alpha
  596. MOVDQA XMM3,XMM1
  597. PSRLQ XMM3,16 // alpha in the three color words, 0 in the alpha word
  598. PMULLW XMM0,XMM3 // color channels * alpha; alpha channel * 0
  599. PADDW XMM0,XMM5 // add bias
  600. PSRLW XMM0,8 // divide by 256
  601. PSLLQ XMM3,48 // alpha in the alpha word only
  602. POR XMM0,XMM3 // put the source alpha back into the alpha channel
  603. // C' = A' + B' - aB'
  604. PMULLW XMM1,XMM2 // XMM1 <- a * B
  605. PADDW XMM1,XMM5 // add bias
  606. PSRLW XMM1,8 // XMM1 <- aB
  607. PADDW XMM0,XMM2 // XMM0 <- A' + B
  608. PSUBW XMM0,XMM1 // XMM0 <- A' + B - aB
  609. PACKUSWB XMM0,XMM4 // pack words back to bytes
  610. MOVQ [EDX + ECX * 8].QWORD,XMM0 // store both blended pixels
  611. ADD ECX,1
  612. JS @1 // loop while the pair index is still negative
  613. @3:
  614. {$ENDIF}
  615. {$IFDEF TARGET_X64}
  // RCX <- Src
  // RDX <- Dst
  // R8D <- Count
  616. TEST R8D,R8D
  617. JLE @3 // Count <= 0: nothing to do
  618. PXOR XMM4,XMM4 // XMM4 <- 0, used for unpacking and packing
  619. {$IFNDEF FPC}
  620. MOV RAX,bias_ptr
  621. {$ELSE}
  622. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  623. {$ENDIF}
  624. MOVDQA XMM5,[RAX] // XMM5 <- bias
  625. MOV R9D, R8D
  626. SHR R9D, 1 // R9D <- number of pixel pairs
  627. TEST R9D, R9D
  628. JZ @2 // no pairs: only the single-pixel tail remains
  629. @1:
  630. MOVQ XMM0,[RCX].QWORD // XMM0 <- two source pixels
  631. MOVQ RAX,XMM0 // RAX <- the same two source pixels, for the alpha tests below
  632. {$IFDEF FPC}
  // Fast paths (FPC builds only): skip or copy the pair based on its two alpha bytes
  633. AND RAX,[RIP+COpaque] // keep only the two alpha bytes
  634. JZ @1b // both pixels fully transparent: leave the destination untouched
  635. CMP RAX,[RIP+COpaque]
  636. JZ @1a // both pixels fully opaque: store the source pair as-is
  637. {$ENDIF}
  638. MOVQ XMM2,[RDX].QWORD // XMM2 <- two destination pixels
  639. PUNPCKLBW XMM0,XMM4 // widen source channels to 16-bit words
  640. PUNPCKLBW XMM2,XMM4 // widen destination channels to 16-bit words
  641. PSHUFLW XMM1,XMM0,$FF // first pixel: alpha in all four words
  642. PSHUFHW XMM1,XMM1,$FF // second pixel: alpha in all four words
  643. // premultiply source pixel by its alpha
  644. MOVDQA XMM3,XMM1
  645. PSRLQ XMM3,16 // alpha in the three color words, 0 in the alpha word
  646. PMULLW XMM0,XMM3 // color channels * alpha; alpha channel * 0
  647. PADDW XMM0,XMM5 // add bias
  648. PSRLW XMM0,8 // divide by 256
  649. PSLLQ XMM3,48 // alpha in the alpha word only
  650. POR XMM0,XMM3 // put the source alpha back into the alpha channel
  651. // C' = A' + B' - aB'
  652. PMULLW XMM1,XMM2 // XMM1 <- a * B
  653. PADDW XMM1,XMM5 // add bias
  654. PSRLW XMM1,8 // XMM1 <- aB
  655. PADDW XMM0,XMM2 // XMM0 <- A' + B
  656. PSUBW XMM0,XMM1 // XMM0 <- A' + B - aB
  657. PACKUSWB XMM0,XMM4 // pack words back to bytes
  658. @1a: MOVQ [RDX].QWORD,XMM0 // store two pixels (blended, or raw source on the opaque fast path)
  659. @1b: ADD RCX,8 // advance Src by two pixels
  660. ADD RDX,8 // advance Dst by two pixels
  661. SUB R9D,1
  662. JNZ @1 // next pair
  663. @2:
  664. AND R8D, 1 // odd number of pixels ?
  665. JZ @3 // no: done
  // Odd count: blend the last pixel on its own
  666. MOVD XMM0,[RCX] // XMM0 <- source pixel
  667. MOVD XMM2,[RDX] // XMM2 <- destination pixel
  668. PUNPCKLBW XMM0,XMM4 // widen source channels to 16-bit words
  669. PUNPCKLBW XMM2,XMM4 // widen destination channels to 16-bit words
  670. PSHUFLW XMM1,XMM0,$FF // XMM1 <- source alpha in all four low words
  671. // premultiply source pixel by its alpha
  672. MOVQ XMM3,XMM1
  673. PSRLQ XMM3,16 // alpha in the three color words, 0 in the alpha word
  674. PMULLW XMM0,XMM3 // color channels * alpha; alpha channel * 0
  675. PADDW XMM0,XMM5 // add bias
  676. PSRLW XMM0,8 // divide by 256
  677. PSLLQ XMM3,48 // alpha in the alpha word only
  678. POR XMM0,XMM3 // put the source alpha back into the alpha channel
  679. // C' = A' + B' - aB'
  680. PMULLW XMM1,XMM2 // XMM1 <- a * B
  681. PADDW XMM1,XMM5 // add bias
  682. PSRLW XMM1,8 // XMM1 <- aB
  683. PADDW XMM0,XMM2 // XMM0 <- A' + B
  684. PSUBW XMM0,XMM1 // XMM0 <- A' + B - aB
  685. PACKUSWB XMM0,XMM4 // pack words back to bytes
  686. MOVD [RDX], XMM0 // store the blended pixel
  687. @3:
  688. {$ENDIF}
  689. end;
  690. //------------------------------------------------------------------------------
  691. // BlendMems
  692. // Like BlendLine except the Src parameter is static.
  693. //------------------------------------------------------------------------------
  694. procedure BlendMems_SSE2(F: TColor32; B: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF}
  695. asm
  // Blends the single color F onto Count consecutive pixels starting at B, in place.
  //   Count <= 0 or Fa = 0 : returns without touching memory.
  //   Fa = $FF             : the run is filled with F through FillLongword.
  //   otherwise            : F is premultiplied once, then blended onto the pixels
  //                          two at a time, with one single-pixel step for an odd Count.
  696. //
  697. // Result Z = Fa * (Fargb - Bargb) + Bargb
  698. // = Fa * Fargb - Fa * Bargb + Bargb
  699. //
  700. // For Fa * Fargb, ((a*x) div 255) is approximated as ((((a * $101) shr 16) * x + 128) div 256)
  701. // For Fa * Bargb, (x div 255) is approximated as ((x + 128) div 256)
  702. //
  703. {$IFDEF TARGET_X86}
  704. // EAX <- Src: TColor32
  705. // EDX <- Dst: PColor32
  706. // ECX <- Count
  707. // Test the counter for zero or negativity
  708. // JCXZ @Done
  709. TEST ECX, ECX
  710. JLE @Done
  711. // Test if source is fully transparent
  712. TEST EAX, $FF000000
  713. JZ @Done
  714. // Setup division by 255 bias
  715. PUSH EBX // EBX is only needed to fetch the bias pointer
  716. PXOR XMM4, XMM4 // XMM4 <- 0, used for unpacking and packing
  717. MOV EBX, [bias_ptr]
  718. MOVDQA XMM5, [EBX] // XMM5 <- bias (aligned load)
  719. POP EBX
  720. // Load source
  721. MOVD XMM0, EAX // XMM0 <- 00 00 00 00 Fa Fr Fg Fb
  722. // Get source alpha and test if fully opaque
  723. SHR EAX, 24
  724. CMP EAX, $FF
  725. JZ @FillWithSource
  726. PSHUFD XMM0, XMM0, 0 // XMM0[0..3] <- XMM0[0][0..3]
  727. PUNPCKLBW XMM0, XMM4 // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
  728. PSHUFLW XMM1, XMM0, $FF // XMM1 <- 00 Fa 00 Fa 00 Fa 00 Fa
  729. PSHUFHW XMM1, XMM1, $FF // same for the upper quadword (second pixel)
  730. // Premultiply source pixel by its alpha: Fa * Fargb
  731. MOVDQA XMM3, XMM1 // XMM3 <- 2*QWord(XMM1)
  732. PSRLQ XMM3, 16 // XMM3 <- 00 00 00 Fa 00 Fa 00 Fa
  733. PMULLW XMM0, XMM3 // XMM0 <- Frgb * Fa
  734. PADDW XMM0, XMM5 // XMM0 <- Frgb * Fa + Bias
  735. PSRLW XMM0, 8 // XMM0 <- (Frgb * Fa + Bias) div 256
  736. PSLLQ XMM3, 48 // XMM3 <- 00 Fa 00 00 00 00 00 00
  737. POR XMM0, XMM3 // XMM0 <- 00 Fa 00 FR 00 FG 00 FB
  738. // Save alpha multiplier
  739. MOVDQA XMM3, XMM1
  740. // Test for odd/even count
  741. TEST ECX, 1
  742. JZ @Even
  743. // We have an odd number of pixels.
  744. // Blend a single pixel so the remaining count is even.
  745. // Load dest
  746. MOVD XMM2, DWORD PTR [EDX] // XMM2 <- 00 00 00 00 Ba Br Bg Bb
  747. PUNPCKLBW XMM2, XMM4 // XMM2 <- 00 Ba 00 Br 00 Bg 00 Bb
  748. // Blend: C' = A' + B' - aB'
  749. PMULLW XMM1, XMM2 // Z1 = Fa * Brgba
  750. PADDW XMM1, XMM5 // Z1 = Fa * Brgba + Bias
  751. PSRLW XMM1, 8 // Z1 = (Fa * Bargb + Bias) div 256
  752. PADDW XMM2, XMM0 // Z2 = Brgba + FaRGB
  753. PSUBW XMM2, XMM1 // Z2 = Z2 - Z1
  754. PACKUSWB XMM2, XMM4 // pack words back to bytes
  755. MOVD [EDX], XMM2 // store the blended pixel
  756. @Even:
  757. LEA EDX, [EDX + ECX * 4] // EDX <- address one past the last pixel
  758. SHR ECX,1 // Number of QWORDs
  759. JZ @Done
  760. NEG ECX // Negate count so we can use it as an offset to move forward
  761. @Loop:
  762. // Blend two pixels at a time
  763. // Restore alpha multiplier
  764. MOVDQA XMM1, XMM3
  765. // Load dest
  766. MOVQ XMM2, [EDX + ECX * 8].QWORD // XMM2 <- Ba Br Bg Bb Ba Br Bg Bb
  767. PUNPCKLBW XMM2, XMM4 // XMM2 <- 00 Ba 00 Br 00 Bg 00 Bb
  768. // Blend: C' = A' + B' - aB'
  769. PMULLW XMM1, XMM2 // Z1 = Fa * Brgba
  770. PADDW XMM1, XMM5 // Z1 = Fa * Brgba + Bias
  771. PSRLW XMM1, 8 // Z1 = (Fa * Brgba + Bias) div 256
  772. PADDW XMM2, XMM0 // Z2 = Brgba + FaRGB
  773. PSUBW XMM2, XMM1 // Z2 = Z2 - Z1
  774. PACKUSWB XMM2, XMM4 // pack words back to bytes
  775. MOVQ [EDX + ECX * 8].QWORD, XMM2 // store both blended pixels
  776. ADD ECX, 1
  777. JS @Loop // loop while the pair index is still negative
  778. @Done:
  779. RET
  780. @FillWithSource:
  781. // Shuffle registers for FillLongword
  782. MOV EAX, EDX // EAX <- Dest
  783. MOV EDX, ECX // EDX <- Count
  784. MOVD ECX, XMM0 // ECX <- F (XMM0 still holds the unmodified source color)
  785. CALL FillLongword // EAX:Dest, EDX:Count, ECX:Value
  786. {$ENDIF}
  787. {$IFDEF TARGET_X64}
  788. // ECX <- Src: TColor32
  789. // RDX <- Dst: PColor32
  790. // R8D <- Count
  791. // Test the counter for zero or negativity
  792. TEST R8D, R8D
  793. JLE @Done
  794. // Test if source is fully transparent
  795. TEST ECX, $FF000000
  796. JZ @Done
  797. // Get source alpha
  798. MOV EAX, ECX
  799. SHR EAX, 24
  800. // Test if source is fully opaque
  801. CMP EAX, $FF
  802. JZ @FillWithSource
  803. // Setup division by 255 bias
  804. PXOR XMM4, XMM4 // XMM4 <- 0, used for unpacking and packing
  805. {$IFNDEF FPC}
  806. MOV RAX, bias_ptr
  807. {$ELSE}
  808. MOV RAX, [RIP+bias_ptr]
  809. {$ENDIF}
  810. MOVDQA XMM5, [RAX] // XMM5 <- bias (aligned load)
  811. // Load source
  812. MOVQ XMM0, RCX // XMM0 dword 0 <- Fa Fr Fg Fb (only dword 0 is used by the PSHUFD below)
  813. PSHUFD XMM0, XMM0, 0 // XMM0[0..3] <- XMM0[0][0..3]
  814. PUNPCKLBW XMM0, XMM4 // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
  815. PSHUFLW XMM1, XMM0, $FF // XMM1 <- 00 Fa 00 Fa 00 Fa 00 Fa
  816. PSHUFHW XMM1, XMM1, $FF // same for the upper quadword (second pixel)
  817. // Premultiply source pixel by its alpha
  818. MOVDQA XMM3, XMM1 // XMM3 <- 2*QWord(XMM1)
  819. PSRLQ XMM3, 16 // XMM3 <- 00 00 00 Fa 00 Fa 00 Fa
  820. PMULLW XMM0, XMM3 // XMM0 <- Frgb * Fa
  821. PADDW XMM0, XMM5 // XMM0 <- Frgb * Fa + Bias
  822. PSRLW XMM0, 8 // XMM0 <- (Frgb * Fa + Bias) shr 8
  823. PSLLQ XMM3, 48 // XMM3 <- 00 Fa 00 00 00 00 00 00
  824. POR XMM0, XMM3 // XMM0 <- 00 Fa 00 FR 00 FG 00 FB
  825. // Save alpha multiplier
  826. MOVDQA XMM3, XMM1
  827. // Test for odd/even count
  828. MOV R9D, R8D
  829. SHR R9D, 1 // Get number of double pixels
  830. TEST R9D, R9D
  831. JZ @SinglePixel // None; We only have a single pixel
  832. @Loop:
  833. // Blend two pixels at a time
  834. // Load dest
  835. MOVQ XMM2, [RDX].QWORD // XMM2 <- two destination pixels
  836. PUNPCKLBW XMM2, XMM4 // widen destination channels to 16-bit words
  837. // Blend: C' = A' + B' - aB'
  838. PMULLW XMM1, XMM2 // Z1 = Fa * Brgba
  839. PADDW XMM1, XMM5 // Z1 = Fa * Brgba + Bias
  840. PSRLW XMM1, 8 // Z1 = (Fa * Brgba + Bias) shr 8
  841. PADDW XMM2, XMM0 // Z2 = Brgba + FaRGB
  842. PSUBW XMM2, XMM1 // Z2 = Z2 - Z1
  843. // Restore alpha multiplier
  844. MOVDQA XMM1, XMM3
  845. // Store dest
  846. PACKUSWB XMM2, XMM4
  847. MOVQ [RDX].QWORD, XMM2
  848. ADD RDX, 8 // advance Dst by two pixels
  849. SUB R9D, 1
  850. JNZ @Loop
  851. @SinglePixel:
  852. AND R8D, 1 // odd number of pixels ?
  853. JZ @Done
  854. // Blend a single pixel
  855. // Load dest
  856. MOVD XMM2, [RDX]
  857. PUNPCKLBW XMM2, XMM4
  858. // Blend: C' = A' + B' - aB'
  859. PMULLW XMM1, XMM2 // Z1 = Fa * Brgba
  860. PADDW XMM1, XMM5 // Z1 = Fa * Brgba + Bias
  861. PSRLW XMM1, 8 // Z1 = (Fa * Brgba + Bias) shr 8
  862. PADDW XMM0, XMM2 // XMM0 = FaRGB + Brgba (XMM0 is not needed again after this)
  863. PSUBW XMM0, XMM1 // XMM0 = XMM0 - Z1
  864. // Store dest
  865. PACKUSWB XMM0, XMM4
  866. MOVD [RDX], XMM0
  867. @Done:
  868. RET
  869. @FillWithSource:
  870. // Shuffle registers for FillLongword
  871. MOV EAX, ECX // EAX <- F (temporary)
  872. MOV RCX, RDX // RCX <- Dest
  873. MOV EDX, R8D // EDX <- Count
  874. MOV R8D, EAX // R8D <- Value
  // NOTE(review): no stack adjustment is visible in this asm body before the
  // CALL below - confirm that the callee's shadow-space and stack-alignment
  // requirements are met on this target.
  875. {$IFNDEF FPC}
  876. CALL FillLongword // RCX:Dest, EDX:Count, R8D:Value
  877. {$ELSE}
  878. CALL [rip+FillLongword] // RCX:Dest, EDX:Count, R8D:Value
  879. {$ENDIF}
  880. {$ENDIF}
  881. end;
  882. //------------------------------------------------------------------------------
  883. // BlendLineEx
  884. //------------------------------------------------------------------------------
  885. procedure BlendLineEx_SSE2(Src, Dst: PColor32; Count: Integer; M: Cardinal); {$IFDEF FPC} assembler; {$IFDEF TARGET_X64}nostackframe;{$ENDIF} {$ENDIF}
  886. asm
  // Blends Count pixels from Src onto Dst, in place, scaling each source pixel's
  // alpha by the master alpha M. Does nothing when Count <= 0.
  // Per pixel: W := ((Fa + 1) * M) shr 8; the pixel is skipped when Fa = 0 or W = 0,
  // otherwise Dst := (T[W] * (F - B) + (B shl 8) + bias) shr 8, per 16-bit channel,
  // where T[W] is the 16-byte entry at alpha_ptr + W * 16.
  // NOTE(review): the table behind alpha_ptr is declared outside this routine;
  // presumably entry W holds W replicated in every word - confirm.
  // The x64 path additionally returns at once when M = 0; the x86 path has no such test.
  // Label @2 is not the target of any jump in this routine.
  887. {$IFDEF TARGET_X86}
  888. // EAX <- Src
  889. // EDX <- Dst
  890. // ECX <- Count
  // M is read by name below (MOV EDX,M)
  891. // test the counter for zero or negativity
  892. TEST ECX,ECX
  893. JLE @4
  894. PUSH ESI
  895. PUSH EDI
  896. PUSH EBX
  897. MOV ESI,EAX // ESI <- Src
  898. MOV EDI,EDX // EDI <- Dst
  899. MOV EDX,M // EDX <- Master Alpha
  900. // loop start
  901. @1: MOV EAX,[ESI] // EAX <- source pixel
  902. TEST EAX,$FF000000
  903. JZ @3 // complete transparency, proceed to next point
  904. MOV EBX,EAX
  905. SHR EBX,24 // EBX <- Fa
  906. INC EBX // 255:256 range bias
  907. IMUL EBX,EDX // EBX <- (Fa + 1) * M
  908. SHR EBX,8 // EBX <- W
  909. JZ @3 // complete transparency, proceed to next point
  910. // blend
  911. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  912. MOVD XMM1,EAX // XMM1 <- F
  913. SHL EBX,4 // EBX <- W * 16, byte offset of the table entry
  914. MOVD XMM2,[EDI] // XMM2 <- B
  915. PUNPCKLBW XMM1,XMM0 // widen F channels to 16-bit words
  916. PUNPCKLBW XMM2,XMM0 // widen B channels to 16-bit words
  917. ADD EBX,alpha_ptr // EBX <- address of the table entry for W
  918. PSUBW XMM1,XMM2 // XMM1 <- F - B
  919. PMULLW XMM1,[EBX] // XMM1 <- T[W] * (F - B), low 16 bits
  920. PSLLW XMM2,8 // XMM2 <- B shl 8
  921. MOV EBX,bias_ptr // EBX <- pointer to bias
  922. PADDW XMM2,[EBX] // add bias
  923. PADDW XMM1,XMM2 // XMM1 <- T[W] * (F - B) + (B shl 8) + bias
  924. PSRLW XMM1,8 // divide by 256
  925. PACKUSWB XMM1,XMM0 // pack words back to bytes
  926. MOVD EAX,XMM1 // EAX <- blended pixel
  927. @2: MOV [EDI],EAX // store the blended pixel
  928. @3: ADD ESI,4 // next source pixel
  929. ADD EDI,4 // next destination pixel
  930. // loop end
  931. DEC ECX
  932. JNZ @1
  933. POP EBX
  934. POP EDI
  935. POP ESI
  936. @4:
  937. {$ENDIF}
  938. {$IFDEF TARGET_X64}
  939. // RCX <- Src
  940. // RDX <- Dst
  941. // R8D <- Count
  942. // R9D <- M
  943. // test the counter for zero or negativity
  944. TEST R8D,R8D
  945. JLE @4
  946. TEST R9D,R9D // master alpha = 0 ?
  947. JZ @4 // yes: nothing to do
  948. MOV R10,RCX // R10 <- Src
  949. // loop start
  950. @1: MOV ECX,[R10] // ECX <- source pixel
  951. TEST ECX,$FF000000
  952. JZ @3 // complete transparency, proceed to next point
  953. MOV EAX,ECX
  954. SHR EAX,24 // EAX <- Fa
  955. INC EAX // 255:256 range bias
  956. IMUL EAX,R9D // EAX <- (Fa + 1) * M
  957. SHR EAX,8 // EAX <- W
  958. JZ @3 // complete transparency, proceed to next point
  959. // blend
  960. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  961. MOVD XMM1,ECX // XMM1 <- F
  962. SHL EAX,4 // EAX <- W * 16, byte offset of the table entry
  963. MOVD XMM2,[RDX] // XMM2 <- B
  964. PUNPCKLBW XMM1,XMM0 // widen F channels to 16-bit words
  965. PUNPCKLBW XMM2,XMM0 // widen B channels to 16-bit words
  966. {$IFNDEF FPC}
  967. ADD RAX,alpha_ptr // RAX <- address of the table entry for W
  968. {$ELSE}
  969. ADD RAX,[RIP+alpha_ptr] // RAX <- address of the table entry for W
  970. {$ENDIF}
  971. PSUBW XMM1,XMM2 // XMM1 <- F - B
  972. PMULLW XMM1,[RAX] // XMM1 <- T[W] * (F - B), low 16 bits
  973. PSLLW XMM2,8 // XMM2 <- B shl 8
  974. {$IFNDEF FPC}
  975. MOV RAX,bias_ptr // RAX <- pointer to bias
  976. {$ELSE}
  977. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  978. {$ENDIF}
  979. PADDW XMM2,[RAX] // add bias
  980. PADDW XMM1,XMM2 // XMM1 <- T[W] * (F - B) + (B shl 8) + bias
  981. PSRLW XMM1,8 // divide by 256
  982. PACKUSWB XMM1,XMM0 // pack words back to bytes
  983. MOVD ECX,XMM1 // ECX <- blended pixel
  984. @2: MOV [RDX],ECX // store the blended pixel
  985. @3: ADD R10,4 // next source pixel
  986. ADD RDX,4 // next destination pixel
  987. // loop end
  988. DEC R8D
  989. JNZ @1
  990. @4:
  991. {$ENDIF}
  992. end;
  993. //------------------------------------------------------------------------------
  994. //
  995. // Combine
  996. //
  997. //------------------------------------------------------------------------------
  998. //------------------------------------------------------------------------------
  999. // CombineReg
  1000. //------------------------------------------------------------------------------
  1001. function CombineReg_SSE2(X, Y: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Returns the per-channel interpolation of colors X and Y with weight W applied to X
  1002. asm
  1003. {$IFDEF TARGET_X86}
  1004. // EAX - Color X
  1005. // EDX - Color Y
  1006. // ECX - Weight of X [0..255]
  1007. // Result := W * (X - Y) + Y, evaluated as (W * (X - Y) + 256 * Y + bias) shr 8 per channel
  1008. MOVD XMM1,EAX // XMM1 <- Color X
  1009. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  1010. SHL ECX,4 // ECX <- W * 16 (byte offset; table entries presumably 16 bytes each - table defined elsewhere)
  1011. MOVD XMM2,EDX // XMM2 <- Color Y
  1012. PUNPCKLBW XMM1,XMM0 // widen X bytes to words
  1013. PUNPCKLBW XMM2,XMM0 // widen Y bytes to words
  1014. ADD ECX,alpha_ptr // ECX <- address of the table entry for W
  1015. PSUBW XMM1,XMM2 // XMM1 <- X - Y
  1016. PMULLW XMM1,[ECX] // XMM1 <- (X - Y) * table entry
  1017. PSLLW XMM2,8 // XMM2 <- Y * 256
  1018. MOV ECX,bias_ptr // ECX <- address of the rounding bias words
  1019. PADDW XMM2,[ECX] // XMM2 <- Y * 256 + bias
  1020. PADDW XMM1,XMM2 // XMM1 <- (X - Y) * W + Y * 256 + bias
  1021. PSRLW XMM1,8 // divide by 256
  1022. PACKUSWB XMM1,XMM0 // narrow the words back to bytes
  1023. MOVD EAX,XMM1 // result is returned in EAX
  1024. {$ENDIF}
  1025. {$IFDEF TARGET_X64}
  1026. // ECX - Color X
  1027. // EDX - Color Y
  1028. // R8D - Weight of X [0..255]
  1029. // Result := W * (X - Y) + Y, evaluated as (W * (X - Y) + 256 * Y + bias) shr 8 per channel
  1030. MOVD XMM1,ECX // XMM1 <- Color X
  1031. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  1032. SHL R8D,4 // R8D <- W * 16 (byte offset); the 32-bit write also clears the upper half of R8
  1033. MOVD XMM2,EDX // XMM2 <- Color Y
  1034. PUNPCKLBW XMM1,XMM0 // widen X bytes to words
  1035. PUNPCKLBW XMM2,XMM0 // widen Y bytes to words
  1036. {$IFNDEF FPC}
  1037. ADD R8,alpha_ptr
  1038. {$ELSE}
  1039. ADD R8,[RIP+alpha_ptr]
  1040. {$ENDIF}
  1041. PSUBW XMM1,XMM2 // XMM1 <- X - Y
  1042. PMULLW XMM1,[R8] // XMM1 <- (X - Y) * table entry
  1043. PSLLW XMM2,8 // XMM2 <- Y * 256
  1044. {$IFNDEF FPC}
  1045. MOV R8,bias_ptr
  1046. {$ELSE}
  1047. MOV R8,[RIP+bias_ptr]
  1048. {$ENDIF}
  1049. PADDW XMM2,[R8] // XMM2 <- Y * 256 + bias
  1050. PADDW XMM1,XMM2 // XMM1 <- (X - Y) * W + Y * 256 + bias
  1051. PSRLW XMM1,8 // divide by 256
  1052. PACKUSWB XMM1,XMM0 // narrow the words back to bytes
  1053. MOVD EAX,XMM1 // result is returned in EAX
  1054. {$ENDIF}
  1055. end;
  1056. //------------------------------------------------------------------------------
  1057. // CombineMem
  1058. //------------------------------------------------------------------------------
  1059. procedure CombineMem_SSE2_Table(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Interpolates F into B in place with weight W, using the alpha/bias lookup tables
  1060. (*
  1061. Contributed by: Christian-W. Budde
  1062. TestCombineMem:
  1063. Errors: 39.082 = 29,8 % (Limit: -1)
  1064. Differences: 92.042
  1065. Average difference: -0,29
  1066. Max difference: 1 (Limit: 1)
  1067. *)
  1068. asm
  1069. //
  1070. // Result := W * (X - Y) + Y
  1071. //
  1072. // Approximates (x div 255) as ((x + 128) div 256)
  1073. //
  1074. {$IFDEF TARGET_X86}
  1075. // EAX Color X (parameter F)
  1076. // [EDX] Color Y (parameter B, updated in place)
  1077. // ECX Weight of X [0..255]
  1078. // Return ColorY if weight=0 (B is left unchanged)
  1079. JCXZ @exit
  1080. // Return ColorX if weight=255
  1081. CMP ECX, $FF
  1082. JZ @return_x
  1083. // Load ColorX and ColorY
  1084. MOVD XMM1, EAX // XMM1 <- ColorX (Fa Fr Fg Fb)
  1085. MOVD XMM2, [EDX] // XMM2 <- ColorY (Ba Br Bg Bb)
  1086. // Create a Zero for use in unpack
  1087. PXOR XMM0, XMM0 // XMM0 <- 0
  1088. SHL ECX, 4 // ECX <- Offset into AlphaTable
  1089. // Unpack the ColorX and ColorY byte components into words
  1090. PUNPCKLBW XMM1, XMM0 // XMM1.high <- 0 (00 Fa 00 Fr 00 Fg 00 Fb)
  1091. PUNPCKLBW XMM2, XMM0 // XMM2.high <- 0 (00 Ba 00 Br 00 Bg 00 Bb)
  1092. ADD ECX, alpha_ptr // ECX <- &AlphaTable[Weight]
  1093. // Lerp: Result = (Weight * (ColorX - ColorY) + 256 * ColorY) / 256
  1094. PSUBW XMM1, XMM2 // XMM1 <- ColorX - ColorY
  1095. PMULLW XMM1, [ECX] // XMM1 <- (ColorX - ColorY) * AlphaTable[Weight]
  1096. PSLLW XMM2, 8 // XMM2 <- ColorY * 256
  1097. MOV ECX, bias_ptr // ECX <- AlphaTable[128] (= $00800080 = 0.5)
  1098. PADDW XMM2, [ECX] // XMM2 <- (ColorY * 256) + 128
  1099. PADDW XMM1, XMM2 // XMM1 <- (ColorX - ColorY) * Weight + ColorY * 256 + 128
  1100. PSRLW XMM1, 8 // XMM1 <- XMM1 div 256
  1101. // Pack result back from word to byte components
  1102. PACKUSWB XMM1, XMM0 // XMM1 <- XMM1.low (Ra Rr Rg Rb)
  1103. MOVD [EDX], XMM1 // ColorY <- XMM1
  1104. @exit:
  1105. RET
  1106. @return_x:
  1107. MOV [EDX], EAX // ColorY <- ColorX; execution then falls through to the end of the routine
  1108. {$ENDIF}
  1109. {$IFDEF TARGET_X64}
  1110. // ECX - Color X (parameter F)
  1111. // [RDX] - Color Y (parameter B, updated in place)
  1112. // R8D - Weight of X [0..255]
  1113. TEST R8D,R8D // Set flags for R8D
  1114. JZ @1 // W = 0 ? => B is left unchanged
  1115. CMP R8D,$FF
  1116. JZ @2 // W = 255 ? => B := F
  1117. MOVD XMM1,ECX // XMM1 <- ColorX
  1118. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  1119. SHL R8D,4 // R8D <- byte offset into the alpha table; the 32-bit write also clears the upper half of R8
  1120. MOVD XMM2,[RDX] // XMM2 <- ColorY
  1121. PUNPCKLBW XMM1,XMM0 // widen ColorX bytes to words
  1122. PUNPCKLBW XMM2,XMM0 // widen ColorY bytes to words
  1123. {$IFNDEF FPC}
  1124. ADD R8,alpha_ptr
  1125. {$ELSE}
  1126. ADD R8,[RIP+alpha_ptr]
  1127. {$ENDIF}
  1128. PSUBW XMM1,XMM2 // XMM1 <- ColorX - ColorY
  1129. PMULLW XMM1,[R8] // XMM1 <- (ColorX - ColorY) * table entry
  1130. PSLLW XMM2,8 // XMM2 <- ColorY * 256
  1131. {$IFNDEF FPC}
  1132. MOV RAX,bias_ptr
  1133. {$ELSE}
  1134. MOV RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  1135. {$ENDIF}
  1136. PADDW XMM2,[RAX] // XMM2 <- ColorY * 256 + bias
  1137. PADDW XMM1,XMM2 // XMM1 <- (ColorX - ColorY) * Weight + ColorY * 256 + bias
  1138. PSRLW XMM1,8 // divide by 256
  1139. PACKUSWB XMM1,XMM0 // narrow the words back to bytes
  1140. MOVD [RDX],XMM1 // ColorY <- result
  1141. @1: RET
  1142. @2: MOV [RDX],ECX // ColorY <- ColorX; execution then falls through to the end of the routine
  1143. {$ENDIF}
  1144. end;
  1145. //------------------------------------------------------------------------------
  1146. procedure CombineMem_SSE2_128(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Interpolates F into B in place with weight W, without using lookup tables
  1147. (*
  1148. Contributed by: Anders Melander
  1149. Basically the same as CombineMem_SSE2_Table but uses immediate loads instead of tables.
  1150. TestCombineMem:
  1151. Errors: 39.082 = 29,8 % (Limit: -1)
  1152. Differences: 92.042
  1153. Average difference: -0,29
  1154. Max difference: 1 (Limit: 1)
  1155. *)
  1156. asm
  1157. //
  1158. // Result := W * (X - Y) + Y
  1159. //
  1160. // Approximates (x div 255) as ((x + 128) div 256)
  1161. //
  1162. {$IFDEF TARGET_X86}
  1163. // EAX Color X (Foreground)
  1164. // [EDX] Color Y (Background)
  1165. // ECX Weight of X [0..255]
  1166. // Return ColorY if weight=0
  1167. JCXZ @exit
  1168. // Return ColorX if weight=255
  1169. CMP ECX, $FF
  1170. JZ @return_x
  1171. {$ELSE}
  1172. // ECX Color X (Foreground)
  1173. // [RDX] Color Y (Background)
  1174. // R8D Weight of X [0..255]
  1175. // Return ColorY if weight=0
  1176. TEST R8D, R8D
  1177. JZ @exit
  1178. // Return ColorX if weight=255 (the weight lives in R8D here; ECX holds Color X)
  1179. CMP R8D, $FF
  1180. JZ @return_x
  1181. {$ENDIF}
  1182. // Load ColorX and ColorY
  1183. {$IFDEF TARGET_X86}
  1184. MOVD XMM1, EAX // XMM1 <- ColorX (Fa Fr Fg Fb)
  1185. MOVD XMM2, [EDX] // XMM2 <- ColorY (Ba Br Bg Bb)
  1186. {$ELSE}
  1187. MOVD XMM1, ECX // XMM1 <- ColorX (Fa Fr Fg Fb)
  1188. MOVD XMM2, [RDX] // XMM2 <- ColorY (Ba Br Bg Bb)
  1189. {$ENDIF}
  1190. // Duplicate weight into 4 words
  1191. {$IFDEF TARGET_X86}
  1192. MOVD XMM3, ECX // XMM3 <- Weight (00 00 00 00 00 00 00 WW)
  1193. {$ELSE}
  1194. MOVD XMM3, R8D // XMM3 <- Weight (00 00 00 00 00 00 00 WW)
  1195. {$ENDIF}
  1196. PSHUFLW XMM3, XMM3, 0 // (00 WW 00 WW 00 WW 00 WW)
  1197. // Duplicate 128 into 4 words for biasing; ECX is free here (ColorX/Weight were already copied to XMM registers)
  1198. MOV ECX, 128
  1199. MOVD XMM4, ECX // XMM4 <- 128 (00 00 00 00 00 00 00 80)
  1200. PSHUFLW XMM4, XMM4, 0 // (00 80 00 80 00 80 00 80)
  1201. // Create a Zero for use in unpack
  1202. PXOR XMM0, XMM0 // XMM0 <- 0
  1203. // Unpack the ColorX and ColorY byte components into words
  1204. PUNPCKLBW XMM1, XMM0 // XMM1.high <- 0 (00 Fa 00 Fr 00 Fg 00 Fb)
  1205. PUNPCKLBW XMM2, XMM0 // XMM2.high <- 0 (00 Ba 00 Br 00 Bg 00 Bb)
  1206. // Save a copy of ColorY*256
  1207. MOVQ XMM0, XMM2
  1208. PSLLW XMM0, 8 // XMM0 <- (Ba 00 Br 00 Bg 00 Bb 00)
  1209. // Lerp: Result = (weight * (ColorX - ColorY) + 256 * ColorY) / 256
  1210. PSUBW XMM1, XMM2 // XMM1 <- ColorX - ColorY
  1211. PMULLW XMM1, XMM3 // XMM1 <- Weight * (ColorX - ColorY)
  1212. PADDW XMM1, XMM0 // XMM1 <- Weight * (ColorX - ColorY) + 256 * ColorY
  1213. // Add 255:256 correction bias
  1214. PADDW XMM1, XMM4 // XMM1 <- Weight * (ColorX - ColorY) + 256 * ColorY + 128
  1215. PSRLW XMM1, 8 // XMM1 <- (Weight * (ColorX - ColorY) + 256 * ColorY + 128) div 256
  1216. // Pack result back from word to byte components
  1217. PACKUSWB XMM1, XMM1 // XMM1 <- XMM1.low (Ra Rr Rg Rb)
  1218. {$IFDEF TARGET_X86}
  1219. MOVD [EDX], XMM1 // ColorY <- XMM1
  1220. {$ELSE}
  1221. MOVD [RDX], XMM1 // ColorY <- XMM1
  1222. {$ENDIF}
  1223. @exit:
  1224. RET
  1225. @return_x:
  1226. {$IFDEF TARGET_X86}
  1227. MOV [EDX], EAX // ColorY <- ColorX
  1228. {$ELSE}
  1229. MOV [RDX], ECX // ColorY <- ColorX
  1230. {$ENDIF}
  1231. end;
  1232. //------------------------------------------------------------------------------
  1233. procedure CombineMem_SSE41_8081(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Interpolates F into B in place with weight W using 32-bit lanes and a $8081 multiply in place of a division by 255
  1234. (*
  1235. Contributed by: Anders Melander
  1236. Based on CombineMem_SSE41_Kadaif but uses immediate loads instead of tables.
  1237. Also uses a slightly different bias value.
  1238. Also slower :-(
  1239. TestCombineMem:
  1240. Errors: 20 = 0,0 % (Limit: -1)
  1241. Differences: 38
  1242. Average difference: -0,05
  1243. Max difference: 1 (Limit: 1)
  1244. *)
  1245. asm
  1246. //
  1247. // Result := W * (X - Y) + Y
  1248. //
  1249. // Approximates Round(x / 255) as (((x + $7F) * $8081) shr 23) = ((x * $8081 + Bias) shr 23)
  1250. //
  1251. {$IFDEF TARGET_X86}
  1252. // EAX Color X (Foreground)
  1253. // [EDX] Color Y (Background)
  1254. // ECX Weight of X [0..255]
  1255. // Return ColorY if weight=0
  1256. JCXZ @exit
  1257. // Return ColorX if weight=255
  1258. CMP ECX, $FF
  1259. JZ @return_x
  1260. {$ELSE}
  1261. // ECX Color X (Foreground)
  1262. // [RDX] Color Y (Background)
  1263. // R8D Weight of X [0..255]
  1264. // Return ColorY if weight=0
  1265. TEST R8D, R8D
  1266. JZ @exit
  1267. // Return ColorX if weight=255
  1268. CMP R8D, $FF
  1269. JZ @return_x
  1270. {$ENDIF}
  1271. // Load ColorX and ColorY
  1272. {$IFDEF TARGET_X86}
  1273. MOVD XMM1, EAX // XMM1 <- ColorX (Fa Fr Fg Fb)
  1274. MOVD XMM2, [EDX] // XMM2 <- ColorY (Ba Br Bg Bb)
  1275. {$ELSE}
  1276. MOVD XMM1, ECX // XMM1 <- ColorX (Fa Fr Fg Fb)
  1277. MOVD XMM2, [RDX] // XMM2 <- ColorY (Ba Br Bg Bb)
  1278. {$ENDIF}
  1279. // Duplicate weight*$8081 into 4 dwords
  1280. {$IFDEF TARGET_X86}
  1281. IMUL ECX, ECX, $8081
  1282. {$ELSE}
  1283. IMUL ECX, R8D, $8081 // ECX may be overwritten: ColorX was already copied to XMM1
  1284. {$ENDIF}
  1285. MOVD XMM3, ECX // XMM3 <- Weight * $8081
  1286. PSHUFD XMM3, XMM3, 0 // XMM3[0..3] <- XMM3[0]
  1287. // Unpack the ColorX and ColorY byte components into dwords
  1288. // PMOVZXBD is SSE4.1
  1289. PMOVZXBD XMM1, XMM1 // XMM1[0..3] <- ColorX[0][0..3]
  1290. PMOVZXBD XMM0, XMM2 // XMM0[0..3] <- ColorY[0][0..3]; XMM2 keeps the packed ColorY for the final add
  1291. //
  1292. // Lerp: Result = (weight * (ColorX - ColorY) + ColorY)
  1293. // = (($8081 * weight * (ColorX - ColorY)) shr 23 + ColorY)
  1294. //
  1295. PSUBD XMM1, XMM0 // XMM1 <- ColorX - ColorY
  1296. PMULLD XMM1, XMM3 // XMM1 <- (ColorX - ColorY) * Weight * $8081
  1297. // Duplicate bias (~$7F*$8081) into 4 dwords
  1298. MOV ECX, $003FFF0F
  1299. MOVD XMM3, ECX // XMM3 <- Bias
  1300. PSHUFD XMM3, XMM3, 0 // XMM3[0..3] <- XMM3[0]
  1301. // Add bias
  1302. PADDD XMM1, XMM3 // XMM1 <- (ColorX - ColorY) * Weight * $8081 + Bias
  1303. // Reduce 32-bits to 9-bits
  1304. PSRLD XMM1, 23 // XMM1 <- ((ColorX - ColorY) * Weight * $8081 + Bias) shr 23
  1305. // PACKUSDW is SSE4.1
  1306. // Convert from dwords to words
  1307. PACKUSDW XMM1, XMM0 // XMM1.low words <- XMM1[0..3]; the upper words come from XMM0 and are not stored
  1308. // Convert from words.lo to bytes
  1309. PSLLW XMM1, 8 // Get rid of the high byte (drops the 9th, sign, bit)
  1310. PSRLW XMM1, 8
  1311. PACKUSWB XMM1, XMM0 // XMM1[0][0..3] <- low bytes of XMM1 words 0..3
  1312. // Result := Value + ColorY
  1313. PADDB XMM1, XMM2 // XMM1 <- XMM1 + ColorY (byte-wise, wrapping)
  1314. {$IFDEF TARGET_X86}
  1315. MOVD [EDX], XMM1 // ColorY <- XMM1
  1316. {$ELSE}
  1317. MOVD [RDX], XMM1 // ColorY <- XMM1
  1318. {$ENDIF}
  1319. @exit:
  1320. RET
  1321. @return_x:
  1322. {$IFDEF TARGET_X86}
  1323. MOV [EDX], EAX // ColorY <- ColorX
  1324. {$ELSE}
  1325. MOV [RDX], ECX // ColorY <- ColorX
  1326. {$ENDIF}
  1327. end;
  1328. //------------------------------------------------------------------------------
  1329. procedure CombineMem_SSE41_Kadaif(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Interpolates F into B in place with weight W using 32-bit lanes, a $8081 multiply and constant tables
  1330. (*
  1331. Contributed by: Kadaif
  1332. TestCombineMem:
  1333. Errors: 16 = 0,0 % (Limit: -1)
  1334. Differences: 30
  1335. Average difference: 0,20
  1336. Max difference: 1 (Limit: 1)
  1337. *)
  1338. asm
  1339. //
  1340. // Result := W * (X - Y) + Y
  1341. //
  1342. // Approximates Round(x / 255) as ((x * $8081 + Bias) shr 23)
  1343. //
  1344. {$IFDEF TARGET_X86}
  1345. // EAX Color X (Foreground)
  1346. // [EDX] Color Y (Background)
  1347. // ECX Weight of X [0..255]
  1348. // Return ColorY if weight=0
  1349. JCXZ @exit
  1350. // Return ColorX if weight=255
  1351. CMP ECX, $FF
  1352. JZ @return_x
  1353. {$ELSE}
  1354. // ECX Color X (Foreground)
  1355. // [RDX] Color Y (Background)
  1356. // R8D Weight of X [0..255]
  1357. // Return ColorY if weight=0
  1358. TEST R8D, R8D
  1359. JZ @exit
  1360. // Return ColorX if weight=255
  1361. CMP R8D, $FF
  1362. JZ @return_x
  1363. {$ENDIF}
  1364. // Load ColorX and ColorY
  1365. {$IFDEF TARGET_X86}
  1366. MOVD XMM0, EAX // XMM0 <- ColorX (Fa Fr Fg Fb)
  1367. MOVD XMM1, [EDX] // XMM1 <- ColorY (Ba Br Bg Bb)
  1368. {$ELSE}
  1369. MOVD XMM0, ECX // XMM0 <- ColorX (Fa Fr Fg Fb)
  1370. MOVD XMM1, [RDX] // XMM1 <- ColorY (Ba Br Bg Bb)
  1371. {$ENDIF}
  1372. // Weight = Weight * $8081
  1373. {$IFDEF TARGET_X86}
  1374. IMUL ECX, ECX, $8081
  1375. {$ELSE}
  1376. IMUL ECX, R8D, $8081 // ECX may be overwritten: ColorX was already copied to XMM0
  1377. {$ENDIF}
  1378. // Convert from bytes to integers
  1379. // PMOVZXBD is SSE4.1
  1380. PMOVZXBD XMM2, XMM1 // XMM2[0..3] <- ColorY[0][0..3]; XMM1 keeps the packed ColorY for the final add
  1381. PMOVZXBD XMM0, XMM0 // XMM0[0..3] <- ColorX[0][0..3]
  1382. //
  1383. // Lerp: Result = (weight * (ColorX - ColorY) + ColorY)
  1384. // = (($8081 * weight * (ColorX - ColorY)) shr 23 + ColorY)
  1385. //
  1386. PSUBD XMM0, XMM2 // XMM0 <- ColorX - ColorY
  1387. MOVD XMM2, ECX // XMM2 <- Weight * $8081
  1388. PSHUFD XMM2, XMM2, 0 // XMM2[0..3] <- XMM2[0]
  1389. PMULLD XMM2, XMM0 // XMM2 <- (ColorX - ColorY) * Weight * $8081
  1390. // Add bias (~$7F*$8081); the constant is declared elsewhere in the unit
  1391. {$if (not defined(FPC)) or (not defined(TARGET_X64))}
  1392. PADDD XMM2, DQWORD PTR [SSE_003FFF7F_ALIGNED] // XMM2 <- ((ColorX - ColorY) * Weight * $8081) + Bias
  1393. {$else}
  1394. PADDD XMM2, DQWORD PTR [rip+SSE_003FFF7F_ALIGNED]
  1395. {$ifend}
  1396. // Reduce 32-bits to 9-bits
  1397. PSRLD XMM2, 23 // XMM2 <- (((ColorX - ColorY) * Weight * $8081) + Bias) shr 23
  1398. // Convert from dwords to bytes with truncation (losing the sign in the 9th bit)
  1399. {$if (not defined(FPC)) or (not defined(TARGET_X64))}
  1400. PSHUFB XMM2, DQWORD PTR [SSE_0C080400_ALIGNED] // XMM2[0] <- XMM2[0..3][0] (shuffle mask declared elsewhere in the unit)
  1401. {$else}
  1402. PSHUFB XMM2, DQWORD PTR [rip+SSE_0C080400_ALIGNED]
  1403. {$ifend}
  1404. // Result := Value + ColorY
  1405. PADDB XMM2, XMM1 // XMM2 <- XMM2 + ColorY (byte-wise, wrapping)
  1406. {$IFDEF TARGET_X86}
  1407. MOVD [EDX], XMM2 // ColorY <- XMM2
  1408. {$ELSE}
  1409. MOVD [RDX], XMM2 // ColorY <- XMM2
  1410. {$ENDIF}
  1411. @exit:
  1412. RET
  1413. @return_x:
  1414. {$IFDEF TARGET_X86}
  1415. MOV [EDX], EAX // ColorY <- ColorX
  1416. {$ELSE}
  1417. MOV [RDX], ECX // ColorY <- ColorX
  1418. {$ENDIF}
  1419. end;
  1420. //------------------------------------------------------------------------------
  1421. // CombineLine
  1422. //------------------------------------------------------------------------------
  1423. procedure CombineLine_SSE2(Src, Dst: PColor32; Count: Integer; W: Cardinal); {$IFDEF FPC} assembler; {$IFDEF TARGET_X64}nostackframe;{$ENDIF} {$ENDIF} // Interpolates Count pixels of Src into Dst in place, with weight W applied to Src
  1424. asm
  1425. {$IFDEF TARGET_X86}
  1426. // EAX <- Src
  1427. // EDX <- Dst
  1428. // ECX <- Count; W is read from the stack, so the routine has an EBP frame
  1429. // Result := W * (X - Y) + Y
  1430. TEST ECX,ECX // nothing to do for a zero or negative count
  1431. JLE @3 // the EBP frame is already set up here, so exit through the POP EBP at @3
  1432. PUSH EBX
  1433. MOV EBX,W
  1434. TEST EBX,EBX // W = 0: Dst is left unchanged
  1435. JZ @2
  1436. CMP EBX,$FF // W = 255: Dst becomes a plain copy of Src
  1437. JZ @4
  1438. SHL EBX,4 // EBX <- W * 16 (byte offset into the alpha table)
  1439. ADD EBX,alpha_ptr
  1440. MOVQ XMM3,[EBX] // XMM3 <- table entry for W, loaded once for the whole line
  1441. MOV EBX,bias_ptr
  1442. MOVQ XMM4,[EBX] // XMM4 <- rounding bias words, loaded once for the whole line
  1443. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  1444. @1: MOVD XMM1,[EAX] // XMM1 <- source pixel
  1445. MOVD XMM2,[EDX] // XMM2 <- destination pixel
  1446. PUNPCKLBW XMM1,XMM0 // widen source bytes to words
  1447. PUNPCKLBW XMM2,XMM0 // widen destination bytes to words
  1448. PSUBW XMM1,XMM2 // XMM1 <- Src - Dst
  1449. PMULLW XMM1,XMM3 // XMM1 <- (Src - Dst) * table entry
  1450. PSLLW XMM2,8 // XMM2 <- Dst * 256
  1451. PADDW XMM2,XMM4 // XMM2 <- Dst * 256 + bias
  1452. PADDW XMM1,XMM2 // XMM1 <- (Src - Dst) * W + Dst * 256 + bias
  1453. PSRLW XMM1,8 // divide by 256
  1454. PACKUSWB XMM1,XMM0 // narrow the words back to bytes
  1455. MOVD [EDX],XMM1 // store the combined pixel to Dst
  1456. ADD EAX,4 // advance both pointers by one pixel
  1457. ADD EDX,4
  1458. DEC ECX
  1459. JNZ @1
  1460. @2: POP EBX
  1461. @3: POP EBP // manual epilogue: drop the frame pointer pushed by the compiler-generated prologue
  1462. RET $0004 // return and remove the stack parameter W
  1463. @4: SHL ECX,2 // ECX <- Count * 4 = byte count; EAX/EDX still hold Src/Dst
  1464. CALL Move
  1465. POP EBX // execution then falls through to the compiler-generated epilogue
  1466. {$ENDIF}
  1467. {$IFDEF TARGET_X64}
  1468. // RCX <- Src
  1469. // RDX <- Dst
  1470. // R8D <- Count
  1471. // R9D <- W
  1472. TEST R8D,R8D // nothing to do for a zero or negative count
  1473. JLE @2
  1474. TEST R9D,R9D // W = 0: Dst is left unchanged
  1475. JZ @2
  1476. CMP R9D,$FF // W = 255: Dst becomes a plain copy of Src
  1477. JZ @3
  1478. SHL R9D,4 // R9D <- W * 16 (byte offset); the 32-bit write also clears the upper half of R9
  1479. {$IFNDEF FPC}
  1480. ADD R9,alpha_ptr
  1481. {$ELSE}
  1482. ADD R9,[RIP+alpha_ptr]
  1483. {$ENDIF}
  1484. MOVQ XMM3,[R9] // XMM3 <- table entry for W, loaded once for the whole line
  1485. {$IFNDEF FPC}
  1486. MOV R9,bias_ptr
  1487. {$ELSE}
  1488. MOV R9,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
  1489. {$ENDIF}
  1490. MOVQ XMM4,[R9] // XMM4 <- rounding bias words, loaded once for the whole line
  1491. PXOR XMM0,XMM0 // XMM0 <- 0, used for unpacking and packing
  1492. @1: MOVD XMM1,[RCX] // XMM1 <- source pixel
  1493. MOVD XMM2,[RDX] // XMM2 <- destination pixel
  1494. PUNPCKLBW XMM1,XMM0 // widen source bytes to words
  1495. PUNPCKLBW XMM2,XMM0 // widen destination bytes to words
  1496. PSUBW XMM1,XMM2 // XMM1 <- Src - Dst
  1497. PMULLW XMM1,XMM3 // XMM1 <- (Src - Dst) * table entry
  1498. PSLLW XMM2,8 // XMM2 <- Dst * 256
  1499. PADDW XMM2,XMM4 // XMM2 <- Dst * 256 + bias
  1500. PADDW XMM1,XMM2 // XMM1 <- (Src - Dst) * W + Dst * 256 + bias
  1501. PSRLW XMM1,8 // divide by 256
  1502. PACKUSWB XMM1,XMM0 // narrow the words back to bytes
  1503. MOVD [RDX],XMM1 // store the combined pixel to Dst
  1504. ADD RCX,4 // advance both pointers by one pixel
  1505. ADD RDX,4
  1506. DEC R8D
  1507. JNZ @1
  1508. @2: RET
  1509. @3: SHL R8D,2 // R8D <- Count * 4 = byte count; RCX/RDX still hold Src/Dst
  1510. CALL Move
  1511. {$ENDIF}
  1512. end;
  1513. //------------------------------------------------------------------------------
  1514. //
  1515. // Merge
  1516. //
  1517. //------------------------------------------------------------------------------
  1518. //------------------------------------------------------------------------------
  1519. // MergeReg
  1520. //------------------------------------------------------------------------------
  1521. function MergeReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Returns foreground F merged over background B, including the resulting alpha
  1522. asm
  1523. { This is an implementation of the merge formula, as described
  1524. in a paper by Bruce Wallace in 1981. Merging is associative,
  1525. that is, A over (B over C) = (A over B) over C. The formula is,
  1526. Ra = Fa + Ba * (1 - Fa)
  1527. Rc = (Fa * (Fc - Bc * Ba) + Bc * Ba) / Ra
  1528. where
  1529. Rc is the resultant color,
  1530. Ra is the resultant alpha,
  1531. Fc is the foreground color,
  1532. Fa is the foreground alpha,
  1533. Bc is the background color,
  1534. Ba is the background alpha.
  1535. Implementation:
  1536. Ra := 1 - (1 - Fa) * (1 - Ba);
  1537. Wa := Fa / Ra;
  1538. Rc := Bc + Wa * (Fc - Bc);
  1539. (1 - Fa) * (1 - Ba) = 1 - Fa - Ba + Fa * Ba = (1 - Ra)
  1540. }
  1541. {$IFDEF TARGET_X86}
  1542. TEST EAX,$FF000000 // foreground completely transparent =>
  1543. JZ @1 // result = background
  1544. CMP EAX,$FF000000 // foreground completely opaque =>
  1545. JNC @2 // result = foreground
  1546. TEST EDX,$FF000000 // background completely transparent =>
  1547. JZ @2 // result = foreground
  1548. PXOR XMM7,XMM7 // XMM7 <- 00
  1549. MOVD XMM0,EAX // XMM0 <- Fa Fr Fg Fb
  1550. SHR EAX,24 // EAX <- Fa
  1551. ROR EDX,24
  1552. MOVZX ECX,DL // ECX <- Ba
  1553. PUNPCKLBW XMM0,XMM7 // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
  1554. SUB EAX,$FF // EAX <- (Fa - 1)
  1555. XOR ECX,$FF // ECX <- (1 - Ba)
  1556. IMUL ECX,EAX // ECX <- (Fa - 1) * (1 - Ba) = Ra - 1
  1557. IMUL ECX,$8081 // ECX <- Xa 00 00 00
  1558. ADD ECX,$8081*$FF*$FF
  1559. SHR ECX,15 // ECX <- Ra
  1560. MOV DL,CH // EDX <- Br Bg Bb Ra
  1561. ROR EDX,8 // EDX <- Ra Br Bg Bb
  1562. MOVD XMM1,EDX // XMM1 <- Ra Br Bg Bb
  1563. PUNPCKLBW XMM1,XMM7 // XMM1 <- 00 Ra 00 Br 00 Bg 00 Bb
  1564. SHL EAX,20 // EAX <- Fa 00 00
  1565. PSUBW XMM0,XMM1 // XMM0 <- ** Da ** Dr ** Dg ** Db
  1566. ADD EAX,$0FF01000
  1567. PSLLW XMM0,4
  1568. XOR EDX,EDX // EDX <- 00
  1569. DIV ECX // EAX <- Fa / Ra = Wa
  1570. MOVD XMM4,EAX // XMM4 <- Wa
  1571. PSHUFLW XMM4,XMM4,$C0 // XMM4 <- 00 00 ** Wa ** Wa ** Wa
  1572. PMULHW XMM0,XMM4 // XMM0 <- 00 00 ** Pr ** Pg ** Pb
  1573. PADDW XMM0,XMM1 // XMM0 <- 00 Ra 00 Rr 00 Rg 00 Rb
  1574. PACKUSWB XMM0,XMM7 // XMM0 <- Ra Rr Rg Rb
  1575. MOVD EAX,XMM0
  1576. RET
  1577. @1: MOV EAX,EDX
  1578. @2:
  1579. {$ENDIF}
  1580. {$IFDEF TARGET_X64}
  1581. TEST ECX,$FF000000 // foreground completely transparent =>
  1582. JZ @1 // result = background
  1583. MOV EAX,ECX // EAX <- Fa Fr Fg Fb (foreground color)
  1584. CMP EAX,$FF000000 // foreground completely opaque =>
  1585. JNC @2 // result = foreground
  1586. TEST EDX,$FF000000 // background completely transparent =>
  1587. JZ @2 // result = foreground
  1588. PXOR XMM2,XMM2 // XMM2 <- 00 (volatile register; XMM6..XMM15 are callee-saved in the Microsoft x64 convention)
  1589. MOVD XMM0,EAX // XMM0 <- Fa Fr Fg Fb
  1590. SHR EAX,24 // EAX <- Fa
  1591. ROR EDX,24
  1592. MOVZX ECX,DL // ECX <- Ba
  1593. PUNPCKLBW XMM0,XMM2 // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
  1594. SUB EAX,$FF // EAX <- (Fa - 1)
  1595. XOR ECX,$FF // ECX <- (1 - Ba)
  1596. IMUL ECX,EAX // ECX <- (Fa - 1) * (1 - Ba) = Ra - 1
  1597. IMUL ECX,$8081 // ECX <- Xa 00 00 00
  1598. ADD ECX,$8081*$FF*$FF
  1599. SHR ECX,15 // ECX <- Ra
  1600. MOV DL,CH // EDX <- Br Bg Bb Ra
  1601. ROR EDX,8 // EDX <- Ra Br Bg Bb
  1602. MOVD XMM1,EDX // XMM1 <- Ra Br Bg Bb
  1603. PUNPCKLBW XMM1,XMM2 // XMM1 <- 00 Ra 00 Br 00 Bg 00 Bb
  1604. SHL EAX,20 // EAX <- Fa 00 00
  1605. PSUBW XMM0,XMM1 // XMM0 <- ** Da ** Dr ** Dg ** Db
  1606. ADD EAX,$0FF01000
  1607. PSLLW XMM0,4
  1608. XOR EDX,EDX // EDX <- 00
  1609. DIV ECX // EAX <- Fa / Ra = Wa
  1610. MOVD XMM4,EAX // XMM4 <- Wa
  1611. PSHUFLW XMM4,XMM4,$C0 // XMM4 <- 00 00 ** Wa ** Wa ** Wa
  1612. PMULHW XMM0,XMM4 // XMM0 <- 00 00 ** Pr ** Pg ** Pb
  1613. PADDW XMM0,XMM1 // XMM0 <- 00 Ra 00 Rr 00 Rg 00 Rb
  1614. PACKUSWB XMM0,XMM2 // XMM0 <- Ra Rr Rg Rb
  1615. MOVD EAX,XMM0
  1616. RET
  1617. @1: MOV EAX,EDX
  1618. @2:
  1619. {$ENDIF}
  1620. end;
  1621. //------------------------------------------------------------------------------
  1622. //
  1623. // Color algebra
  1624. //
  1625. //------------------------------------------------------------------------------
  1626. //------------------------------------------------------------------------------
  1627. // ColorAdd
  1628. //------------------------------------------------------------------------------
  1629. function ColorAdd_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Per-channel saturating sum of C1 and C2
  1630. asm
  1631. {$IFDEF TARGET_X86}
  1632. MOVD XMM1,EDX // XMM1 <- C2
  1633. MOVD XMM0,EAX // XMM0 <- C1
  1634. PADDUSB XMM0,XMM1 // XMM0 <- C1 + C2, each byte clamped at 255
  1635. MOVD EAX,XMM0 // result is returned in EAX
  1636. {$ENDIF}
  1637. {$IFDEF TARGET_X64}
  1638. MOVD XMM1,EDX // XMM1 <- C2
  1639. MOVD XMM0,ECX // XMM0 <- C1
  1640. PADDUSB XMM0,XMM1 // XMM0 <- C1 + C2, each byte clamped at 255
  1641. MOVD EAX,XMM0 // result is returned in EAX
  1642. {$ENDIF}
  1643. end;
  1644. //------------------------------------------------------------------------------
  1645. // ColorSub
  1646. //------------------------------------------------------------------------------
  1647. function ColorSub_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Per-channel saturating difference C1 - C2
  1648. asm
  1649. {$IFDEF TARGET_X86}
  1650. MOVD XMM1,EDX // XMM1 <- C2
  1651. MOVD XMM0,EAX // XMM0 <- C1
  1652. PSUBUSB XMM0,XMM1 // XMM0 <- C1 - C2, each byte clamped at 0
  1653. MOVD EAX,XMM0 // result is returned in EAX
  1654. {$ENDIF}
  1655. {$IFDEF TARGET_X64}
  1656. MOVD XMM1,EDX // XMM1 <- C2
  1657. MOVD XMM0,ECX // XMM0 <- C1
  1658. PSUBUSB XMM0,XMM1 // XMM0 <- C1 - C2, each byte clamped at 0
  1659. MOVD EAX,XMM0 // result is returned in EAX
  1660. {$ENDIF}
  1661. end;
  1662. //------------------------------------------------------------------------------
  1663. // ColorModulate
  1664. //------------------------------------------------------------------------------
  1665. function ColorModulate_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Per-channel product (C1 * C2) shr 8
  1666. asm
  1667. {$IFDEF TARGET_X86}
  1668. MOVD XMM0,EAX // XMM0 <- C1
  1669. MOVD XMM1,EDX // XMM1 <- C2
  1670. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  1671. PUNPCKLBW XMM0,XMM2 // widen C1 bytes to words
  1672. PUNPCKLBW XMM1,XMM2 // widen C2 bytes to words
  1673. PMULLW XMM0,XMM1 // XMM0 <- C1 * C2 per channel
  1674. PSRLW XMM0,8 // divide by 256
  1675. PACKUSWB XMM0,XMM2 // narrow the words back to bytes
  1676. MOVD EAX,XMM0 // result is returned in EAX
  1677. {$ENDIF}
  1678. {$IFDEF TARGET_X64}
  1679. MOVD XMM0,ECX // XMM0 <- C1
  1680. MOVD XMM1,EDX // XMM1 <- C2
  1681. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  1682. PUNPCKLBW XMM0,XMM2 // widen C1 bytes to words
  1683. PUNPCKLBW XMM1,XMM2 // widen C2 bytes to words
  1684. PMULLW XMM0,XMM1 // XMM0 <- C1 * C2 per channel
  1685. PSRLW XMM0,8 // divide by 256
  1686. PACKUSWB XMM0,XMM2 // narrow the words back to bytes
  1687. MOVD EAX,XMM0 // result is returned in EAX
  1688. {$ENDIF}
  1689. end;
  1690. //------------------------------------------------------------------------------
  1691. // ColorMax
  1692. //------------------------------------------------------------------------------
  1693. function ColorMax_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Per-channel unsigned maximum of C1 and C2
  1694. asm
  1695. {$IFDEF TARGET_X86}
  1696. MOVD XMM1,EDX // XMM1 <- C2
  1697. MOVD XMM0,EAX // XMM0 <- C1
  1698. PMAXUB XMM0,XMM1 // XMM0 <- larger byte of each channel pair
  1699. MOVD EAX,XMM0 // result is returned in EAX
  1700. {$ENDIF}
  1701. {$IFDEF TARGET_X64}
  1702. MOVD XMM1,EDX // XMM1 <- C2
  1703. MOVD XMM0,ECX // XMM0 <- C1
  1704. PMAXUB XMM0,XMM1 // XMM0 <- larger byte of each channel pair
  1705. MOVD EAX,XMM0 // result is returned in EAX
  1706. {$ENDIF}
  1707. end;
  1708. //------------------------------------------------------------------------------
  1709. // ColorMin
  1710. //------------------------------------------------------------------------------
  1711. function ColorMin_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Per-channel unsigned minimum of C1 and C2
  1712. asm
  1713. {$IFDEF TARGET_X86}
  1714. MOVD XMM1,EDX // XMM1 <- C2
  1715. MOVD XMM0,EAX // XMM0 <- C1
  1716. PMINUB XMM0,XMM1 // XMM0 <- smaller byte of each channel pair
  1717. MOVD EAX,XMM0 // result is returned in EAX
  1718. {$ENDIF}
  1719. {$IFDEF TARGET_X64}
  1720. MOVD XMM1,EDX // XMM1 <- C2
  1721. MOVD XMM0,ECX // XMM0 <- C1
  1722. PMINUB XMM0,XMM1 // XMM0 <- smaller byte of each channel pair
  1723. MOVD EAX,XMM0 // result is returned in EAX
  1724. {$ENDIF}
  1725. end;
  1726. //------------------------------------------------------------------------------
  1727. // ColorDifference
  1728. //------------------------------------------------------------------------------
  1729. function ColorDifference_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Per-channel absolute difference |C1 - C2|
  1730. asm
  1731. {$IFDEF TARGET_X86}
  1732. MOVD XMM0,EAX // XMM0 <- C1
  1733. MOVQ XMM2,XMM0 // XMM2 <- copy of C1
  1734. MOVD XMM1,EDX // XMM1 <- C2
  1735. PSUBUSB XMM0,XMM1 // XMM0 <- C1 - C2, clamped at 0
  1736. PSUBUSB XMM1,XMM2 // XMM1 <- C2 - C1, clamped at 0
  1737. POR XMM0,XMM1 // at most one of the two is non-zero per byte, so OR yields the absolute difference
  1738. MOVD EAX,XMM0 // result is returned in EAX
  1739. {$ENDIF}
  1740. {$IFDEF TARGET_X64}
  1741. MOVD XMM0,ECX // XMM0 <- C1
  1742. MOVQ XMM2,XMM0 // XMM2 <- copy of C1
  1743. MOVD XMM1,EDX // XMM1 <- C2
  1744. PSUBUSB XMM0,XMM1 // XMM0 <- C1 - C2, clamped at 0
  1745. PSUBUSB XMM1,XMM2 // XMM1 <- C2 - C1, clamped at 0
  1746. POR XMM0,XMM1 // at most one of the two is non-zero per byte, so OR yields the absolute difference
  1747. MOVD EAX,XMM0 // result is returned in EAX
  1748. {$ENDIF}
  1749. end;
  1750. //------------------------------------------------------------------------------
  1751. // ColorExclusion
  1752. //------------------------------------------------------------------------------
  1753. function ColorExclusion_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Per-channel C1 + C2 - ((C1 * C2) shr 7), saturated
  1754. asm
  1755. {$IFDEF TARGET_X86}
  1756. MOVD XMM0,EAX // XMM0 <- C1
  1757. MOVD XMM1,EDX // XMM1 <- C2
  1758. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  1759. PUNPCKLBW XMM0,XMM2 // widen C1 bytes to words
  1760. PUNPCKLBW XMM1,XMM2 // widen C2 bytes to words
  1761. MOVQ XMM3,XMM0 // XMM3 <- copy of widened C1
  1762. PADDW XMM0,XMM1 // XMM0 <- C1 + C2
  1763. PMULLW XMM1,XMM3 // XMM1 <- C1 * C2
  1764. PSRLW XMM1,7 // XMM1 <- (C1 * C2) shr 7
  1765. PSUBUSW XMM0,XMM1 // XMM0 <- C1 + C2 - (C1 * C2) shr 7, clamped at 0
  1766. PACKUSWB XMM0,XMM2 // narrow the words back to bytes, clamped at 255
  1767. MOVD EAX,XMM0 // result is returned in EAX
  1768. {$ENDIF}
  1769. {$IFDEF TARGET_X64}
  1770. MOVD XMM0,ECX // XMM0 <- C1
  1771. MOVD XMM1,EDX // XMM1 <- C2
  1772. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  1773. PUNPCKLBW XMM0,XMM2 // widen C1 bytes to words
  1774. PUNPCKLBW XMM1,XMM2 // widen C2 bytes to words
  1775. MOVQ XMM3,XMM0 // XMM3 <- copy of widened C1
  1776. PADDW XMM0,XMM1 // XMM0 <- C1 + C2
  1777. PMULLW XMM1,XMM3 // XMM1 <- C1 * C2
  1778. PSRLW XMM1,7 // XMM1 <- (C1 * C2) shr 7
  1779. PSUBUSW XMM0,XMM1 // XMM0 <- C1 + C2 - (C1 * C2) shr 7, clamped at 0
  1780. PACKUSWB XMM0,XMM2 // narrow the words back to bytes, clamped at 255
  1781. MOVD EAX,XMM0 // result is returned in EAX
  1782. {$ENDIF}
  1783. end;
  1784. //------------------------------------------------------------------------------
  1785. // ColorScale
  1786. //------------------------------------------------------------------------------
  1787. function ColorScale_SSE2(C: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Scales every channel of C by the alpha-table entry for W, then divides by 256
  1788. asm
  1789. {$IFDEF TARGET_X86}
  1790. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  1791. SHL EDX,4 // EDX <- W * 16 (byte offset; table entries presumably 16 bytes each - table defined elsewhere)
  1792. MOVD XMM0,EAX // XMM0 <- C
  1793. PUNPCKLBW XMM0,XMM2 // widen C bytes to words
  1794. ADD EDX,alpha_ptr // EDX <- address of the table entry for W
  1795. PMULLW XMM0,[EDX] // XMM0 <- C * table entry, per channel
  1796. PSRLW XMM0,8 // divide by 256
  1797. PACKUSWB XMM0,XMM2 // narrow the words back to bytes
  1798. MOVD EAX,XMM0 // result is returned in EAX
  1799. {$ENDIF}
  1800. {$IFDEF TARGET_X64}
  1801. PXOR XMM2,XMM2 // XMM2 <- 0, used for unpacking and packing
  1802. SHL EDX,4 // EDX <- W * 16; the 32-bit shift zero-extends into RDX so stale upper bits cannot reach the address
  1803. MOVD XMM0,ECX // XMM0 <- C
  1804. PUNPCKLBW XMM0,XMM2 // widen C bytes to words
  1805. {$IFNDEF FPC}
  1806. ADD RDX,alpha_ptr
  1807. {$ELSE}
  1808. ADD RDX,[RIP+alpha_ptr]
  1809. {$ENDIF}
  1810. PMULLW XMM0,[RDX] // XMM0 <- C * table entry, per channel
  1811. PSRLW XMM0,8 // divide by 256
  1812. PACKUSWB XMM0,XMM2 // narrow the words back to bytes
  1813. MOVD EAX,XMM0 // result is returned in EAX
  1814. {$ENDIF}
  1815. end;
  1816. //------------------------------------------------------------------------------
  1817. //
  1818. // Misc
  1819. //
  1820. //------------------------------------------------------------------------------
  1821. //------------------------------------------------------------------------------
  1822. // LightenReg
  1823. //------------------------------------------------------------------------------
  1824. function LightenReg_SSE2(C: TColor32; Amount: Integer): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} // Adds Amount to (or, when negative, subtracts it from) the three low bytes of C with saturation
  1825. asm
  1826. {$IFDEF TARGET_X86}
  1827. // In: EAX = C (TColor32)
  1828. // In: EDX = Amount (signed)
  1829. // Out: EAX = Result
  1830. TEST EDX, EDX // sign of Amount selects lighten or darken
  1831. MOVD XMM0, EAX // XMM0 <- C (MOVD leaves the flags untouched)
  1832. JL @Darken
  1833. // Amount >= 0: lighten
  1834. IMUL EDX, $010101 // copy Amount into the three low bytes (assumes Amount <= 255 - TODO confirm with callers)
  1835. MOVD XMM1, EDX
  1836. PADDUSB XMM0, XMM1 // saturating add; the top byte gets +0
  1837. MOVD EAX, XMM0
  1838. RET
  1839. // Amount < 0: darken by -Amount
  1840. @Darken: NEG EDX
  1841. IMUL EDX, $010101 // copy -Amount into the three low bytes
  1842. MOVD XMM1, EDX
  1843. PSUBUSB XMM0, XMM1 // saturating subtract; the top byte gets -0
  1844. MOVD EAX, XMM0
  1845. {$ENDIF}
  1846. {$IFDEF TARGET_X64}
  1847. // In: ECX = C (TColor32)
  1848. // In: EDX = Amount (signed)
  1849. // Out: EAX = Result
  1850. TEST EDX, EDX // sign of Amount selects lighten or darken
  1851. MOVD XMM0, ECX // XMM0 <- C (MOVD leaves the flags untouched)
  1852. JL @Darken
  1853. // Amount >= 0: lighten
  1854. IMUL EDX, $010101 // copy Amount into the three low bytes (assumes Amount <= 255 - TODO confirm with callers)
  1855. MOVD XMM1, EDX
  1856. PADDUSB XMM0, XMM1 // saturating add; the top byte gets +0
  1857. MOVD EAX, XMM0
  1858. RET
  1859. // Amount < 0: darken by -Amount
  1860. @Darken: NEG EDX
  1861. IMUL EDX, $010101 // copy -Amount into the three low bytes
  1862. MOVD XMM1, EDX
  1863. PSUBUSB XMM0, XMM1 // saturating subtract; the top byte gets -0
  1864. MOVD EAX, XMM0
  1865. {$ENDIF}
  1866. end;
  1867. //------------------------------------------------------------------------------
  1868. // ScaleMems
  1869. //------------------------------------------------------------------------------
  1870. procedure ScaleMems_SSE41(Dst: PColor32; Count: Integer; Weight: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1871. asm
  1872. //
  1873. // Result Z = W * Bargb
  1874. //
  1875. // Approximates (x div 255) as ((x * $8081 + Bias) shr 23)
  1876. //
        // Scales Count pixels at Dst in place, one pixel per loop iteration.
        //
        // - Count <= 0      : returns without touching memory.
        // - Only the low 8 bits of Weight are used (Weight is masked with $FF).
        // - Weight = 0      : the three low bytes of every pixel are cleared and the
        //                     top byte is kept (@Clear).
        // - Weight = 255    : returns immediately; the buffer is left as-is.
        // - Any other weight: all four bytes of each pixel are multiplied by
        //                     Weight * $8081, biased, and shifted right by 23.
        //
        // NOTE(review): the scaling loop also scales the top (alpha) byte, whereas the
        // Weight = 0 path preserves it. Confirm which of the two is intended.
        //
  1877. {$IFDEF TARGET_X86}
  1878. // EAX <- Dst: PColor32
  1879. // EDX <- Count
  1880. // ECX <- Weight: Byte
  1881. // Test the counter for zero or negativity
  1882. TEST EDX, EDX
  1883. JLE @Done
  1884. // Test if:
  1885. // - Weight is 0 (i.e. clear RGB to zero)
  1886. // - Weight is 255 (i.e. no scale)
  1887. AND ECX, $000000FF
  1888. JZ @Clear
        // Must be CMP, not TEST: at this point ECX is in 1..255, so
        // "TEST ECX, $FF" can never set ZF and the 255 shortcut would be dead code.
  1889. CMP ECX, $000000FF
  1890. JE @Done
  1891. // Weight = Weight * $8081
  1892. IMUL ECX, ECX, $8081
  1893. MOVD XMM0, ECX
  1894. // 1*Byte -> 4*DWord
  1895. PSHUFD XMM0, XMM0, 0 // XMM0[0..3] <- XMM0[0][0]
  1896. @Loop:
  1897. // Load dest
  1898. MOVD XMM1, DWORD PTR [EAX] // XMM1 <- 00 00 00 00 Ba Br Bg Bb
  1899. // 4*Byte -> 4*DWord
  1900. PMOVZXBD XMM1, XMM1 // XMM1[0..3] <- Color[0][0..3]
  1901. //
  1902. // Scale: Result = (Weight * Color)
  1903. // = (($8081 * Weight * Color) shr 23)
  1904. //
  1905. PMULLD XMM1, XMM0 // XMM1 <- Color * Weight * $8081
  1906. // Add bias (~$7F*$8081)
  1907. PADDD XMM1, DQWORD PTR [SSE_003FFF7F_ALIGNED] // XMM1 <- (Color * Weight * $8081) + Bias
  1908. // Reduce 32-bits to 9-bits
  1909. PSRLD XMM1, 23 // XMM1 <- ((Color * Weight * $8081) + Bias) shr 23
  1910. // Convert from dwords to bytes with truncation (losing the sign in the 9th bit)
  1911. PSHUFB XMM1, DQWORD PTR [SSE_0C080400_ALIGNED] // XMM1[0] <- XMM1[0..3][0]
  1912. // Store dest
  1913. MOVD [EAX], XMM1
  1914. ADD EAX, 4
  1915. DEC EDX
  1916. JNZ @Loop
  1917. @Done:
  1918. RET
  1919. @Clear:
  1920. // Clear RGB, leave A as-is
  1921. MOV ECX, DWORD PTR [EAX] // ECX (Weight) is no longer needed here
  1922. AND ECX, $FF000000
  1923. MOV DWORD PTR [EAX], ECX
  1924. ADD EAX, 4
  1925. DEC EDX
  1926. JNZ @Clear
  1927. {$ENDIF}
  1928. {$IFDEF TARGET_X64}
  1929. // RCX <- Dst: PColor32
  1930. // RDX <- Count
  1931. // R8D <- Weight: Byte
  1932. // Test the counter for zero or negativity
  1933. TEST EDX, EDX
  1934. JLE @Done
  1935. // Test if:
  1936. // - Weight is 0 (i.e. clear RGB to zero)
  1937. // - Weight is 255 (i.e. no scale)
  1938. AND R8D, $000000FF
  1939. JZ @Clear
        // Must be CMP, not TEST: at this point R8D is in 1..255, so
        // "TEST R8D, $FF" can never set ZF and the 255 shortcut would be dead code.
  1940. CMP R8D, $000000FF
  1941. JE @Done
  1942. // Weight = Weight * $8081
  1943. IMUL R8D, R8D, $8081
  1944. MOVD XMM0, R8D // XMM0 <- Weight * $8081
  1945. // 1*Byte -> 4*DWord
  1946. PSHUFD XMM0, XMM0, 0 // XMM0[0..3] <- XMM0[0][0]
  1947. @Loop:
  1948. // Load dest
  1949. MOVD XMM1, DWORD PTR [RCX] // XMM1 <- 00 00 00 00 Ba Br Bg Bb
  1950. // 4*Byte -> 4*DWord
  1951. PMOVZXBD XMM1, XMM1 // XMM1[0..3] <- Color[0][0..3]
  1952. //
  1953. // Scale: Result = (Weight * Color)
  1954. // = (($8081 * Weight * Color) shr 23)
  1955. //
  1956. PMULLD XMM1, XMM0 // XMM1 <- Color * Weight * $8081
  1957. // Add bias (~$7F*$8081)
  1958. {$if (not defined(FPC))}
  1959. PADDD XMM1, DQWORD PTR [SSE_003FFF7F_ALIGNED] // XMM1 <- (Color * Weight * $8081) + Bias
  1960. {$else}
  1961. PADDD XMM1, DQWORD PTR [rip+SSE_003FFF7F_ALIGNED]
  1962. {$ifend}
  1963. // Reduce 32-bits to 9-bits
  1964. PSRLD XMM1, 23 // XMM1 <- ((Color * Weight * $8081) + Bias) shr 23
  1965. // Convert from dwords to bytes with truncation (losing the sign in the 9th bit)
  1966. {$if (not defined(FPC))}
  1967. PSHUFB XMM1, DQWORD PTR [SSE_0C080400_ALIGNED] // XMM1[0] <- XMM1[0..3][0]
  1968. {$else}
  1969. PSHUFB XMM1, DQWORD PTR [rip+SSE_0C080400_ALIGNED]
  1970. {$ifend}
  1971. // Store dest
  1972. MOVD [RCX], XMM1
  1973. ADD RCX,4
  1974. DEC EDX
  1975. JNZ @Loop
  1976. @Done:
  1977. RET
  1978. @Clear:
  1979. // Clear RGB, leave A as-is
  1980. MOV EAX, DWORD PTR [RCX] // EAX is used as scratch so the pointer in RCX stays intact
  1981. AND EAX, $FF000000
  1982. MOV DWORD PTR [RCX], EAX
  1983. ADD RCX, 4
  1984. DEC EDX
  1985. JNZ @Clear
  1986. {$ENDIF}
  1987. end;
  1988. procedure FastScaleMems_SSE41(Dst: PColor32; Count: Integer; Weight: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  1989. asm
  1990. //
  1991. // Result Z = W * Bargb
  1992. //
  1993. // Approximates (x div 255) as (x shr 8); Same as ColorScale_Pas
  1994. //
        // Scales Count pixels at Dst in place. If Count is odd the first pixel is
        // processed on its own, after which the rest is processed two pixels (one
        // QWORD) per loop iteration.
        //
        // - Count <= 0      : returns without touching memory.
        // - Only the low 8 bits of Weight are used (Weight is masked with $FF).
        // - Weight = 0      : the three low bytes of every pixel are cleared and the
        //                     top byte is kept (@Clear).
        // - Weight = 255    : returns immediately; the buffer is left as-is (i.e. the
        //                     exact result rather than (x * 255) shr 8).
        // - Any other weight: every byte of each pixel becomes (byte * Weight) shr 8.
        //
        // NOTE(review): the scaling loop also scales the top (alpha) byte, whereas the
        // Weight = 0 path preserves it. Confirm which of the two is intended.
        //
  1995. {$IFDEF TARGET_X86}
  1996. // EAX <- Dst: PColor32
  1997. // EDX <- Count
  1998. // ECX <- Weight: Byte
  1999. // Test the counter for zero or negativity
  2000. TEST EDX, EDX
  2001. JLE @Done
  2002. // Test if:
  2003. // - Weight is 0 (i.e. clear RGB to zero)
  2004. // - Weight is 255 (i.e. no scale)
  2005. AND ECX, $000000FF
  2006. JZ @Clear
        // Must be CMP, not TEST: at this point ECX is in 1..255, so
        // "TEST ECX, $FF" can never set ZF and the 255 shortcut would be dead code.
  2007. CMP ECX, $000000FF
  2008. JE @Done
  2009. PXOR XMM2, XMM2 // XMM2 <- 0; used for byte->word unpack and for packing
  2010. // Duplicate Weight into 8 words so we can process two pixels at a time
  2011. MOVD XMM0, ECX // XMM0 <- (00 00 00 00 00 00 00 WW)
  2012. PSHUFLW XMM0, XMM0, 0 // (00 WW 00 WW 00 WW 00 WW)
  2013. PSHUFD XMM0, XMM0, 0 // (00 WW 00 WW 00 WW 00 WW)*2
  2014. // Test for odd/even count
  2015. TEST EDX, 1
  2016. JZ @Even
  2017. // We have an odd number of pixels.
  2018. // Process a single pixel so the remaining count is even.
  2019. // Load dest
  2020. MOVD XMM1, DWORD PTR [EAX] // XMM1 <- 00 00 00 00 Ba Br Bg Bb
  2021. PUNPCKLBW XMM1, XMM2 // XMM1 <- 00 Ba 00 Br 00 Bg 00 Bb
  2022. //
  2023. // Scale: Result = (Weight * Color)
  2024. // = ((Weight * Color) shr 8)
  2025. //
  2026. PMULLW XMM1, XMM0
  2027. PSRLW XMM1, 8
  2028. // Store dest
  2029. // Pack result back from word to byte components
  2030. PACKUSWB XMM1, XMM1
  2031. MOVD [EAX], XMM1
  2032. @Even:
        // EDX still holds the full Count here, so EAX ends up one past the last pixel.
        // With an odd Count the loop below therefore starts at the second pixel.
  2033. LEA EAX, [EAX + EDX * 4] // Get address of last pixel
  2034. SHR EDX, 1 // Number of QWORDs
  2035. JZ @Done
  2036. NEG EDX // Negate count so we can use it as an offset to move forward
  2037. @Loop:
  2038. // Load dest
  2039. MOVQ XMM1, [EAX + EDX * 8].QWORD // XMM1 <- Ba Br Bg Bb Ba Br Bg Bb
  2040. {-$define FASTSCALEMEMS_SKIPWRITE}
  2041. {$ifdef FASTSCALEMEMS_SKIPWRITE}
  2042. // Skip scale (and thus the relatively costly write) if the color is pure black
  2043. PTEST XMM1, XMM1
  2044. JZ @SkipWrite
  2045. {$endif FASTSCALEMEMS_SKIPWRITE}
  2046. // 8*Byte -> 8*Word
  2047. PUNPCKLBW XMM1, XMM2 // XMM1 <- 00 Ba 00 Br 00 Bg 00 Bb
  2048. //
  2049. // Scale: Result = (Weight * Color)
  2050. // = ((Weight * Color) shr 8)
  2051. //
  2052. PMULLW XMM1, XMM0
  2053. PSRLW XMM1, 8
  2054. // Store dest
  2055. PACKUSWB XMM1, XMM2 // Low 8 bytes <- packed result; high 8 bytes <- 0
  2056. MOVQ [EAX + EDX * 8].QWORD, XMM1
  2057. {$ifdef FASTSCALEMEMS_SKIPWRITE}
  2058. @SkipWrite:
  2059. {$endif FASTSCALEMEMS_SKIPWRITE}
  2060. ADD EDX, 1
  2061. JS @Loop // Loop while the (negative) offset has not reached zero
  2062. @Done:
  2063. RET
  2064. @Clear:
  2065. // Clear RGB, leave A as-is
  2066. MOV ECX, DWORD PTR [EAX] // ECX (Weight) is no longer needed here
  2067. AND ECX, $FF000000
  2068. MOV DWORD PTR [EAX], ECX
  2069. ADD EAX, 4
  2070. DEC EDX
  2071. JNZ @Clear
  2072. {$ENDIF}
  2073. {$IFDEF TARGET_X64}
  2074. // RCX <- Dst: PColor32
  2075. // RDX <- Count
  2076. // R8D <- Weight: Byte
  2077. // Test the counter for zero or negativity
        // Count is a 32-bit Integer: only EDX is tested, because the upper half of RDX
        // is not guaranteed to hold a sign extension of Count.
  2078. TEST EDX, EDX
  2079. JLE @Done
        // Count is positive here; zero-extend it so RDX can safely be used as a
        // 64-bit count/offset below.
        MOV EDX, EDX
  2080. // Test if:
  2081. // - Weight is 0 (i.e. clear RGB to zero)
  2082. // - Weight is 255 (i.e. no scale)
  2083. AND R8D, $000000FF
  2084. JZ @Clear
        // Must be CMP, not TEST: at this point R8D is in 1..255, so
        // "TEST R8D, $FF" can never set ZF and the 255 shortcut would be dead code.
  2085. CMP R8D, $000000FF
  2086. JE @Done
  2087. PXOR XMM2, XMM2 // XMM2 <- 0; used for byte->word unpack and for packing
  2088. // Duplicate Weight into 8 words so we can process two pixels at a time
  2089. MOVD XMM0, R8D // XMM0 <- (00 00 00 00 00 00 00 WW)
  2090. PSHUFLW XMM0, XMM0, 0 // (00 WW 00 WW 00 WW 00 WW)
  2091. PSHUFD XMM0, XMM0, 0 // (00 WW 00 WW 00 WW 00 WW)*2
  2092. // Test for odd/even count
  2093. TEST EDX, 1
  2094. JZ @Even
  2095. // We have an odd number of pixels.
  2096. // Process a single pixel so the remaining count is even.
  2097. // Load dest
  2098. MOVD XMM1, DWORD PTR [RCX] // XMM1 <- 00 00 00 00 Ba Br Bg Bb
  2099. PUNPCKLBW XMM1, XMM2 // XMM1 <- 00 Ba 00 Br 00 Bg 00 Bb
  2100. //
  2101. // Scale: Result = (Weight * Color)
  2102. // = ((Weight * Color) shr 8)
  2103. //
  2104. PMULLW XMM1, XMM0
  2105. PSRLW XMM1, 8
  2106. // Store dest
  2107. // Pack result back from word to byte components
  2108. PACKUSWB XMM1, XMM1
  2109. MOVD [RCX], XMM1
  2110. @Even:
        // RDX still holds the full Count here, so RCX ends up one past the last pixel.
        // With an odd Count the loop below therefore starts at the second pixel.
  2111. LEA RCX, [RCX + RDX * 4] // Get address of last pixel
  2112. SHR RDX, 1 // Number of QWORDs
  2113. JZ @Done
  2114. NEG RDX // Negate count so we can use it as an offset to move forward
  2115. @Loop:
  2116. // Load dest
  2117. MOVQ XMM1, [RCX + RDX * 8].QWORD // XMM1 <- Ba Br Bg Bb Ba Br Bg Bb
  2118. // FASTSCALEMEMS_SKIPWRITE has been disabled as it doesn't give us enough and in some
  2119. // cases makes the loop slower. Probably due to branch misprediction.
  2120. {-$define FASTSCALEMEMS_SKIPWRITE}
  2121. {$ifdef FASTSCALEMEMS_SKIPWRITE}
  2122. // Skip scale (and thus the relatively costly write) if the color is pure black
  2123. PTEST XMM1, XMM1
  2124. JZ @SkipWrite
  2125. {$endif FASTSCALEMEMS_SKIPWRITE}
  2126. // 8*Byte -> 8*Word
  2127. PUNPCKLBW XMM1, XMM2 // XMM1 <- 00 Ba 00 Br 00 Bg 00 Bb
  2128. //
  2129. // Scale: Result = (Weight * Color)
  2130. // = ((Weight * Color) shr 8)
  2131. //
  2132. PMULLW XMM1, XMM0
  2133. PSRLW XMM1, 8
  2134. // Store dest
  2135. PACKUSWB XMM1, XMM2 // Low 8 bytes <- packed result; high 8 bytes <- 0
  2136. MOVQ [RCX + RDX * 8].QWORD, XMM1
  2137. {$ifdef FASTSCALEMEMS_SKIPWRITE}
  2138. @SkipWrite:
  2139. {$endif FASTSCALEMEMS_SKIPWRITE}
  2140. ADD RDX, 1
  2141. JS @Loop // Loop while the (negative) offset has not reached zero
  2142. @Done:
  2143. RET
  2144. @Clear:
  2145. // Clear RGB, leave A as-is
        // EAX is used as scratch: loading the pixel into ECX would overwrite the
        // Dst pointer held in RCX and make the store below write to a bogus address.
  2146. MOV EAX, DWORD PTR [RCX]
  2147. AND EAX, $FF000000
  2148. MOV DWORD PTR [RCX], EAX
  2149. ADD RCX, 4
  2150. DEC RDX
  2151. JNZ @Clear
  2152. {$ENDIF}
  2153. end;
  2154. {$ifend}
  2155. //------------------------------------------------------------------------------
  2156. //
  2157. // Bindings
  2158. //
  2159. //------------------------------------------------------------------------------
  2160. procedure RegisterBindingFunctions;
        // Adds the SIMD implementations of this unit to BlendRegistry, one entry per
        // implementation, and gives each entry a display name.
        // The set passed to Add presumably lists the CPU features the implementation
        // requires (verify against the registry's Add declaration).
        // When PUREPASCAL or OMIT_SSE2 is defined nothing is registered.
  2161. begin
  2162. {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
  2163. BlendRegistry[@@MergeReg].Add( @MergeReg_SSE2, [isSSE2]).Name := 'MergeReg_SSE2';
  2164. BlendRegistry[@@CombineReg].Add( @CombineReg_SSE2, [isSSE2]).Name := 'CombineReg_SSE2';
  2165. BlendRegistry[@@CombineMem].Add( @CombineMem_SSE2_128, [isSSE2]).Name := 'CombineMem_SSE2_128';
  2166. BlendRegistry[@@CombineMem].Add( @CombineMem_SSE41_Kadaif, [isSSE41]).Name := 'CombineMem_SSE41_Kadaif';
  2167. {$if defined(BENCHMARK)}
        // Alternative implementations, only registered for benchmarking and at a worse priority
  2168. BlendRegistry[@@CombineMem].Add( @CombineMem_SSE2_Table, [isSSE2], BindingPriorityWorse).Name := 'CombineMem_SSE2_Table';
  2169. BlendRegistry[@@CombineMem].Add( @CombineMem_SSE41_8081, [isSSE41], BindingPriorityWorse).Name := 'CombineMem_SSE41_8081';
  2170. {$ifend}
  2171. BlendRegistry[@@CombineLine].Add( @CombineLine_SSE2, [isSSE2]).Name := 'CombineLine_SSE2';
  2172. BlendRegistry[@@BlendReg].Add( @BlendReg_SSE2, [isSSE2]).Name := 'BlendReg_SSE2';
  2173. BlendRegistry[@@BlendMem].Add( @BlendMem_SSE2, [isSSE2]).Name := 'BlendMem_SSE2';
  2174. BlendRegistry[@@BlendMems].Add( @BlendMems_SSE2, [isSSE2]).Name := 'BlendMems_SSE2';
  2175. BlendRegistry[@@BlendMemEx].Add( @BlendMemEx_SSE2, [isSSE2]).Name := 'BlendMemEx_SSE2';
  2176. BlendRegistry[@@BlendLine].Add( @BlendLine_SSE2, [isSSE2]).Name := 'BlendLine_SSE2';
  2177. BlendRegistry[@@BlendLineEx].Add( @BlendLineEx_SSE2, [isSSE2]).Name := 'BlendLineEx_SSE2';
  2178. BlendRegistry[@@BlendRegEx].Add( @BlendRegEx_SSE2, [isSSE2]).Name := 'BlendRegEx_SSE2';
  2179. BlendRegistry[@@ColorMax].Add( @ColorMax_SSE2, [isSSE2]).Name := 'ColorMax_SSE2';
  2180. BlendRegistry[@@ColorMin].Add( @ColorMin_SSE2, [isSSE2]).Name := 'ColorMin_SSE2';
  2181. BlendRegistry[@@ColorAdd].Add( @ColorAdd_SSE2, [isSSE2]).Name := 'ColorAdd_SSE2';
  2182. BlendRegistry[@@ColorSub].Add( @ColorSub_SSE2, [isSSE2]).Name := 'ColorSub_SSE2';
  2183. BlendRegistry[@@ColorModulate].Add( @ColorModulate_SSE2, [isSSE2]).Name := 'ColorModulate_SSE2';
  2184. BlendRegistry[@@ColorDifference].Add(@ColorDifference_SSE2, [isSSE2]).Name := 'ColorDifference_SSE2';
  2185. BlendRegistry[@@ColorExclusion].Add(@ColorExclusion_SSE2, [isSSE2]).Name := 'ColorExclusion_SSE2';
  2186. BlendRegistry[@@ColorScale].Add( @ColorScale_SSE2, [isSSE2]).Name := 'ColorScale_SSE2';
        // LightenReg_SSE2 moves data between general purpose and XMM registers and uses
        // PADDUSB/PSUBUSB on XMM registers; those are SSE2 instructions, so SSE alone
        // is not a sufficient requirement.
  2187. BlendRegistry[@@LightenReg].Add( @LightenReg_SSE2, [isSSE2]).Name := 'LightenReg_SSE2';
  2188. BlendRegistry[@@BlendRegRGB].Add( @BlendRegRGB_SSE2, [isSSE2]).Name := 'BlendRegRGB_SSE2';
  2189. BlendRegistry[@@BlendMemRGB].Add( @BlendMemRGB_SSE2, [isSSE2]).Name := 'BlendMemRGB_SSE2';
        // GR32_SCALEMEMS_FAST selects the "shr 8" variant of ScaleMems; otherwise the
        // more precise variant is used. With BENCHMARK defined both are registered.
  2190. {$if defined(GR32_SCALEMEMS_FAST) or defined(BENCHMARK)}
  2191. BlendRegistry[@@ScaleMems].Add( @FastScaleMems_SSE41, [isSSE41]).Name := 'FastScaleMems_SSE41';
  2192. {$ifend}
  2193. {$if (not defined(GR32_SCALEMEMS_FAST)) or defined(BENCHMARK)}
  2194. BlendRegistry[@@ScaleMems].Add( @ScaleMems_SSE41, [isSSE41]).Name := 'ScaleMems_SSE41';
  2195. {$ifend}
  2196. {$if defined(TEST_BLENDMEMRGB128SSE4) or defined(BENCHMARK)}
        // NOTE(review): the name suggests an SSE4 implementation but it is registered
        // as requiring only SSE2 - confirm against the instructions it actually uses.
  2197. BlendRegistry[@@BlendMemRGB128].Add(@BlendMemRGB128_SSE4, [isSSE2]).Name := 'BlendMemRGB128_SSE4';
  2198. {$ifend}
  2199. {$ifend}
  2200. end;
  2201. //------------------------------------------------------------------------------
  2202. //------------------------------------------------------------------------------
  2203. //------------------------------------------------------------------------------
  2204. initialization
        // Runs when the unit is initialized: registers this unit's blend bindings.
  2205. RegisterBindingFunctions;
  2206. end.