unit GR32.Blend.SSE2; (* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1 or LGPL 2.1 with linking exception * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * Alternatively, the contents of this file may be used under the terms of the * Free Pascal modified version of the GNU Lesser General Public License * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions * of this license are applicable instead of those above. * Please see the file LICENSE.txt for additional information concerning this * license. * * The Original Code is Graphics32 * * The Initial Developer of the Original Code is * Alex A. Denisov * * Portions created by the Initial Developer are Copyright (C) 2000-2009 * the Initial Developer. All Rights Reserved. * * ***** END LICENSE BLOCK ***** *) interface {$include GR32.inc} // Define GR32_SCALEMEMS_FAST to use the faster, but not very precise version of ScaleMems. // The fast version uses a "shr 8" as a substitute for "div 255" which is also what // ColorScale_Pas does. 
{$define GR32_SCALEMEMS_FAST} uses GR32; {$if not defined(PUREPASCAL)} //------------------------------------------------------------------------------ // // SSE SIMD blend implementations // //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ // Blend //------------------------------------------------------------------------------ function BlendReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} procedure BlendMem_SSE2(F: TColor32; var B: TColor32); {$IFDEF FPC} assembler; {$ENDIF} procedure BlendMems_SSE2(F: TColor32; B: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF} function BlendRegEx_SSE2(F, B: TColor32; M: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF} procedure BlendMemEx_SSE2(F: TColor32; var B:TColor32; M: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} function BlendRegRGB_SSE2(F, B: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF} procedure BlendMemRGB_SSE2(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} procedure BlendLine_SSE2(Src, Dst: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF} procedure BlendLineEx_SSE2(Src, Dst: PColor32; Count: Integer; M: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} //------------------------------------------------------------------------------ // Merge //------------------------------------------------------------------------------ function MergeReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} //------------------------------------------------------------------------------ // Combine //------------------------------------------------------------------------------ function CombineReg_SSE2(X, Y: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF} procedure CombineMem_SSE2_Table(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} procedure CombineMem_SSE2_128(F: 
TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} procedure CombineMem_SSE41_8081(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} procedure CombineMem_SSE41_Kadaif(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} procedure CombineLine_SSE2(Src, Dst: PColor32; Count: Integer; W: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} //------------------------------------------------------------------------------ // Color algebra //------------------------------------------------------------------------------ function ColorAdd_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} function ColorSub_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} function ColorModulate_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} function ColorMax_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} function ColorMin_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} function ColorDifference_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} function ColorExclusion_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF} function ColorScale_SSE2(C: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; {$ENDIF} //------------------------------------------------------------------------------ // Misc //------------------------------------------------------------------------------ function LightenReg_SSE2(C: TColor32; Amount: Integer): TColor32; {$IFDEF FPC} assembler; {$ENDIF} procedure ScaleMems_SSE41(Dst: PColor32; Count: Integer; Weight: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} procedure FastScaleMems_SSE41(Dst: PColor32; Count: Integer; Weight: Cardinal); {$IFDEF FPC} assembler; {$ENDIF} {$ifend} //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ 
//------------------------------------------------------------------------------

implementation

{$if not defined(PUREPASCAL)}

uses
  GR32_Blend,
  GR32_LowLevel,
  GR32_Bindings,
  GR32.Types.SIMD;

//------------------------------------------------------------------------------
//
//      Blend
//
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// BlendReg
//------------------------------------------------------------------------------
// Blends the foreground color F onto the background color B, using the alpha
// channel of F as the blend weight, and returns the blended color.
//
//   Result := Fa * (Fargb - Bargb) + Bargb      (per channel)
//
// The product is computed in 16-bit lanes as (F - B) * Fa + B * 256 + bias and
// then shifted right by 8. The alpha byte of the returned value is forced to
// $FF by the final OR.
//
// Registers:  x86: EAX = F, EDX = B      x64: ECX = F, EDX = B
// The result is returned in EAX on both targets.
//------------------------------------------------------------------------------
function BlendReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
  // blend foreground color (F) to a background color (B),
  // using alpha channel value of F
  // x86: EAX <- F, EDX <- B
  // x64: ECX <- F, EDX <- B
  // Result := Fa * (Fargb - Bargb) + Bargb
  //
  // Lane diagrams below show the low 8 bytes of the XMM register only.

{$IFDEF TARGET_x86}
        MOVD      XMM0,EAX          // XMM0  <-  00 00 00 00 Fa Fr Fg Fb
        PXOR      XMM3,XMM3         // XMM3  <-  00 00 00 00 00 00 00 00
        MOVD      XMM2,EDX          // XMM2  <-  00 00 00 00 Ba Br Bg Bb
        PUNPCKLBW XMM0,XMM3         // XMM0  <-  00 Fa 00 Fr 00 Fg 00 Fb
        MOV       ECX,bias_ptr      // ECX   <-  Pointer to Bias
        PUNPCKLBW XMM2,XMM3         // XMM2  <-  00 Ba 00 Br 00 Bg 00 Bb
        MOVQ      XMM1,XMM0         // XMM1  <-  00 Fa 00 Fr 00 Fg 00 Fb
        PSHUFLW   XMM1,XMM1,$FF     // XMM1  <-  00 Fa 00 Fa 00 Fa 00 Fa
        PSUBW     XMM0,XMM2         // XMM0  <-  F - B per word (Da Dr Dg Db)
        PSLLW     XMM2,8            // XMM2  <-  Ba 00 Br 00 Bg 00 Bb 00
        PMULLW    XMM0,XMM1         // XMM0  <-  (F - B) * Fa, low 16 bits per word
        PADDW     XMM2,[ECX]        // add bias
        PADDW     XMM2,XMM0         // XMM2  <-  B * 256 + bias + (F - B) * Fa
        PSRLW     XMM2,8            // XMM2  <-  00 Qa 00 Qr 00 Qg 00 Qb
        PACKUSWB  XMM2,XMM3         // XMM2  <-  00 00 00 00 Qa Qr Qg Qb
        MOVD      EAX,XMM2          // EAX   <-  Qa Qr Qg Qb
        OR        EAX,$FF000000     // EAX   <-  FF Qr Qg Qb
{$ENDIF}

{$IFDEF TARGET_x64}
        MOVD      XMM0,ECX          // XMM0  <-  00 00 00 00 Fa Fr Fg Fb
        PXOR      XMM3,XMM3         // XMM3  <-  00 00 00 00 00 00 00 00
        MOVD      XMM2,EDX          // XMM2  <-  00 00 00 00 Ba Br Bg Bb
        PUNPCKLBW XMM0,XMM3         // XMM0  <-  00 Fa 00 Fr 00 Fg 00 Fb
{$IFNDEF FPC}
        MOV       RAX,bias_ptr      // RAX   <-  Pointer to Bias
{$ELSE}
        MOV       RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
{$ENDIF}
        PUNPCKLBW XMM2,XMM3         // XMM2  <-  00 Ba 00 Br 00 Bg 00 Bb
        MOVQ      XMM1,XMM0         // XMM1  <-  00 Fa 00 Fr 00 Fg 00 Fb
        PSHUFLW   XMM1,XMM1,$FF     // XMM1  <-  00 Fa 00 Fa 00 Fa 00 Fa
        PSUBW     XMM0,XMM2         // XMM0  <-  F - B per word (Da Dr Dg Db)
        PSLLW     XMM2,8            // XMM2  <-  Ba 00 Br 00 Bg 00 Bb 00
        PMULLW    XMM0,XMM1         // XMM0  <-  (F - B) * Fa, low 16 bits per word
        PADDW     XMM2,[RAX]        // add bias
        PADDW     XMM2,XMM0         // XMM2  <-  B * 256 + bias + (F - B) * Fa
        PSRLW     XMM2,8            // XMM2  <-  00 Qa 00 Qr 00 Qg 00 Qb
        PACKUSWB  XMM2,XMM3         // XMM2  <-  00 00 00 00 Qa Qr Qg Qb
        MOVD      EAX,XMM2          // EAX   <-  Qa Qr Qg Qb
        OR        EAX,$FF000000     // EAX   <-  FF Qr Qg Qb
{$ENDIF}
end;

//------------------------------------------------------------------------------
// BlendMem
//------------------------------------------------------------------------------
// Blends the foreground color F onto the background pixel B in place, using
// the alpha channel of F as the blend weight.
//
//   B := Fa * (Fargb - Bargb) + Bargb           (per channel, alpha included)
//
// Shortcuts: if the alpha byte of F is zero, B is left untouched; if it is
// $FF, F is stored to B unchanged.
//
// Registers:  x86: EAX = F, EDX = @B     x64: ECX = F, RDX = @B
//------------------------------------------------------------------------------
procedure BlendMem_SSE2(F: TColor32; var B: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
  // EAX   - Foreground color F
  // [EDX] - Background color B (read and written)
  // B := Fa * (F - B) + B

        TEST      EAX,$FF000000     // alpha of F = 0 ?
        JZ        @1                // yes: fully transparent, nothing to do
        CMP       EAX,$FF000000     // alpha of F = $FF ?
        JNC       @2                // yes: fully opaque, just copy F

        PXOR      XMM3,XMM3         // XMM3 <- 0, used for unpack/pack
        MOVD      XMM0,EAX          // XMM0 <- F
        MOVD      XMM2,[EDX]        // XMM2 <- B
        PUNPCKLBW XMM0,XMM3         // F bytes -> words
        MOV       ECX,bias_ptr      // ECX  <- Pointer to Bias
        PUNPCKLBW XMM2,XMM3         // B bytes -> words
        MOVQ      XMM1,XMM0
        PSHUFLW   XMM1,XMM1,$FF     // XMM1 <- Fa in all four low words
        PSUBW     XMM0,XMM2         // XMM0 <- F - B
        PSLLW     XMM2,8            // XMM2 <- B * 256
        PMULLW    XMM0,XMM1         // XMM0 <- (F - B) * Fa
        PADDW     XMM2,[ECX]        // add bias
        PADDW     XMM2,XMM0         // XMM2 <- B * 256 + bias + (F - B) * Fa
        PSRLW     XMM2,8            // divide by 256
        PACKUSWB  XMM2,XMM3         // words -> bytes
        MOVD      [EDX],XMM2        // store result to B

@1:     RET

@2:     MOV       [EDX], EAX        // opaque: B := F
{$ENDIF}

{$IFDEF TARGET_x64}
  // ECX   - Foreground color F
  // [RDX] - Background color B (read and written)
  // B := Fa * (F - B) + B

        TEST      ECX,$FF000000     // alpha of F = 0 ?
        JZ        @1                // yes: fully transparent, nothing to do
        CMP       ECX,$FF000000     // alpha of F = $FF ?
        JNC       @2                // yes: fully opaque, just copy F

        PXOR      XMM3,XMM3         // XMM3 <- 0, used for unpack/pack
        MOVD      XMM0,ECX          // XMM0 <- F
        MOVD      XMM2,[RDX]        // XMM2 <- B
        PUNPCKLBW XMM0,XMM3         // F bytes -> words
{$IFNDEF FPC}
        MOV       RAX,bias_ptr      // RAX  <- Pointer to Bias
{$ELSE}
        MOV       RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
{$ENDIF}
        PUNPCKLBW XMM2,XMM3         // B bytes -> words
        MOVQ      XMM1,XMM0
        PSHUFLW   XMM1,XMM1,$FF     // XMM1 <- Fa in all four low words
        PSUBW     XMM0,XMM2         // XMM0 <- F - B
        PSLLW     XMM2,8            // XMM2 <- B * 256
        PMULLW    XMM0,XMM1         // XMM0 <- (F - B) * Fa
        PADDW     XMM2,[RAX]        // add bias
        PADDW     XMM2,XMM0         // XMM2 <- B * 256 + bias + (F - B) * Fa
        PSRLW     XMM2,8            // divide by 256
        PACKUSWB  XMM2,XMM3         // words -> bytes
        MOVD      [RDX],XMM2        // store result to B

@1:     RET

@2:     MOV       [RDX], ECX        // opaque: B := F
{$ENDIF}
end;

//------------------------------------------------------------------------------
// BlendRegEx
//------------------------------------------------------------------------------
// Blends the foreground color F onto the background color B, using the alpha
// channel of F scaled by the master alpha M as the blend weight, and returns
// the blended color.
//
//   W      := ((M + 1) * Fa) shr 8
//   Result := W * (Fargb - Bargb) + Bargb       (per channel)
//
// If W is zero, B is returned unchanged. W is used as an index into the table
// at alpha_ptr with a stride of 16 bytes (SHL 4).
// NOTE(review): the table layout is not visible here; presumably each entry
// holds the weight replicated across the words - confirm in GR32_Blend.
//
// Registers:  x86: EAX = F, EDX = B, ECX = M     x64: ECX = F, EDX = B, R8D = M
//------------------------------------------------------------------------------
function BlendRegEx_SSE2(F, B: TColor32; M: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
  // blend foreground color (F) to a background color (B),
  // using alpha channel value of F
  // Result := M * Fa * (Fargb - Bargb) + Bargb

{$IFDEF TARGET_x86}
  // EAX <- F
  // EDX <- B
  // ECX <- M
        PUSH      EBX               // EBX is used as scratch; preserve it
        MOV       EBX,EAX
        SHR       EBX,24            // EBX <- Fa
        INC       ECX               // 255:256 range bias
        IMUL      ECX,EBX           // ECX <- (M + 1) * Fa
        SHR       ECX,8             // ECX <- W
        JZ        @1                // W = 0: return B

        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack
        MOVD      XMM1,EAX          // XMM1 <- F
        SHL       ECX,4             // ECX  <- W * 16 (table offset)
        MOVD      XMM2,EDX          // XMM2 <- B
        PUNPCKLBW XMM1,XMM0         // F bytes -> words
        PUNPCKLBW XMM2,XMM0         // B bytes -> words
        ADD       ECX,alpha_ptr     // ECX  <- address of table entry for W
        PSUBW     XMM1,XMM2         // XMM1 <- F - B
        PMULLW    XMM1,[ECX]        // XMM1 <- (F - B) * table entry
        PSLLW     XMM2,8            // XMM2 <- B * 256
        MOV       ECX,bias_ptr      // ECX  <- Pointer to Bias
        PADDW     XMM2,[ECX]        // add bias
        PADDW     XMM1,XMM2         // XMM1 <- (F - B) * W + B * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0         // words -> bytes
        MOVD      EAX,XMM1          // EAX  <- result

        POP       EBX
        RET

@1:     MOV       EAX,EDX           // W = 0: result is B
        POP       EBX
{$ENDIF}

{$IFDEF TARGET_x64}
  // ECX <- F
  // EDX <- B
  // R8D <- M
        MOV       EAX,ECX
        SHR       EAX,24            // EAX <- Fa
        INC       R8D               // 255:256 range bias
        IMUL      R8D,EAX           // R8D <- (M + 1) * Fa
        SHR       R8D,8             // R8D <- W
        JZ        @1                // W = 0: return B

        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack
        MOVD      XMM1,ECX          // XMM1 <- F
        SHL       R8D,4             // R8D  <- W * 16 (table offset)
        MOVD      XMM2,EDX          // XMM2 <- B
        PUNPCKLBW XMM1,XMM0         // F bytes -> words
        PUNPCKLBW XMM2,XMM0         // B bytes -> words
{$IFNDEF FPC}
        ADD       R8,alpha_ptr      // R8   <- address of table entry for W
{$ELSE}
        ADD       R8,[RIP+alpha_ptr]
{$ENDIF}
        PSUBW     XMM1,XMM2         // XMM1 <- F - B
        PMULLW    XMM1,[R8]         // XMM1 <- (F - B) * table entry
        PSLLW     XMM2,8            // XMM2 <- B * 256
{$IFNDEF FPC}
        MOV       R8,bias_ptr       // R8   <- Pointer to Bias
{$ELSE}
        MOV       R8,[RIP+bias_ptr]
{$ENDIF}
        PADDW     XMM2,[R8]         // add bias
        PADDW     XMM1,XMM2         // XMM1 <- (F - B) * W + B * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0         // words -> bytes
        MOVD      EAX,XMM1          // EAX  <- result
        RET

@1:     MOV       EAX,EDX           // W = 0: result is B
{$ENDIF}
end;

//------------------------------------------------------------------------------
// BlendMemEx
//------------------------------------------------------------------------------
// Blends the foreground color F onto the background pixel B in place, using
// the alpha channel of F scaled by the master alpha M as the blend weight.
//
//   W := ((M + 1) * Fa) shr 8
//   B := W * (Fargb - Bargb) + Bargb            (per channel, alpha included)
//
// B is left untouched when the alpha byte of F is zero or when W is zero.
//
// Registers:  x86: EAX = F, EDX = @B, ECX = M    x64: ECX = F, RDX = @B, R8D = M
//------------------------------------------------------------------------------
procedure BlendMemEx_SSE2(F: TColor32; var B:TColor32; M: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
  // blend foreground color (F) to a background color (B),
  // using alpha channel value of F
  // EAX <- F
  // [EDX] <- B
  // ECX <- M
  // Result := M * Fa * (Fargb - Bargb) + Bargb

        TEST      EAX,$FF000000     // alpha of F = 0 ?
        JZ        @2                // yes: nothing to do (EBX not yet pushed)

        PUSH      EBX               // EBX is used as scratch; preserve it
        MOV       EBX,EAX           // EBX  <-  Fa Fr Fg Fb
        SHR       EBX,24            // EBX  <-  00 00 00 Fa
        INC       ECX               // 255:256 range bias
        IMUL      ECX,EBX           // ECX  <-  00 00  W **
        SHR       ECX,8             // ECX  <-  00 00 00  W
        JZ        @1                // W = 0: nothing to do

        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack
        MOVD      XMM1,EAX          // XMM1 <- 00 00 00 00 Fa Fr Fg Fb
        SHL       ECX,4             // ECX  <- W * 16 (table offset)
        MOVD      XMM2,[EDX]        // XMM2 <- 00 00 00 00 Ba Br Bg Bb
        PUNPCKLBW XMM1,XMM0         // F bytes -> words
        PUNPCKLBW XMM2,XMM0         // B bytes -> words
        ADD       ECX,alpha_ptr     // ECX  <- address of table entry for W
        PSUBW     XMM1,XMM2         // XMM1 <- F - B
        PMULLW    XMM1,[ECX]        // XMM1 <- (F - B) * table entry
        PSLLW     XMM2,8            // XMM2 <- B * 256
        MOV       ECX,bias_ptr      // ECX  <- Pointer to Bias
        PADDW     XMM2,[ECX]        // add bias
        PADDW     XMM1,XMM2         // XMM1 <- (F - B) * W + B * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0         // words -> bytes
        MOVD      [EDX],XMM1        // store result to B

@1:
        POP       EBX

@2:
{$ENDIF}

{$IFDEF TARGET_x64}
  // blend foreground color (F) to a background color (B),
  // using alpha channel value of F
  // RCX <- F
  // [RDX] <- B
  // R8 <- M
  // Result := M * Fa * (Fargb - Bargb) + Bargb

        TEST      ECX,$FF000000     // alpha of F = 0 ?
        JZ        @1                // yes: nothing to do

        MOV       R9D,ECX
        SHR       R9D,24            // R9D <- Fa
        INC       R8D               // 255:256 range bias
        IMUL      R8D,R9D           // R8D <- (M + 1) * Fa
        SHR       R8D,8             // R8D <- W
        JZ        @1                // W = 0: nothing to do

        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack
        MOVD      XMM1,ECX          // XMM1 <- F
        SHL       R8D,4             // R8D  <- W * 16 (table offset)
        MOVD      XMM2,[RDX]        // XMM2 <- B
        PUNPCKLBW XMM1,XMM0         // F bytes -> words
        PUNPCKLBW XMM2,XMM0         // B bytes -> words
{$IFNDEF FPC}
        ADD       R8,alpha_ptr      // R8   <- address of table entry for W
{$ELSE}
        ADD       R8,[RIP+alpha_ptr]
{$ENDIF}
        PSUBW     XMM1,XMM2         // XMM1 <- F - B
        PMULLW    XMM1,[R8]         // XMM1 <- (F - B) * table entry
        PSLLW     XMM2,8            // XMM2 <- B * 256
{$IFNDEF FPC}
        MOV       R8,bias_ptr       // R8   <- Pointer to Bias
{$ELSE}
        MOV       R8,[RIP+bias_ptr]
{$ENDIF}
        PADDW     XMM2,[R8]         // add bias
        PADDW     XMM1,XMM2         // XMM1 <- (F - B) * W + B * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0         // words -> bytes
        MOVD      DWORD PTR [RDX],XMM1 // store result to B
@1:
{$ENDIF}
end;

//------------------------------------------------------------------------------
// BlendRegRGB
//------------------------------------------------------------------------------
// Blends F onto B using a separate weight per channel taken from the bytes of
// W, and returns the result.
//
//   Result := Wc * (Fc - Bc) + Bc               (for each channel c)
//
// W is byte-swapped (BSWAP) before it is unpacked, so the weight bytes are
// applied to the color channels in reversed byte order.
//
// Registers:  x86: EAX = F, EDX = B, ECX = W     x64: ECX = F, EDX = B, R8D = W
//------------------------------------------------------------------------------
function BlendRegRGB_SSE2(F, B: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        PXOR      XMM2,XMM2         // XMM2 <- 0, used for unpack/pack
        MOVD      XMM0,EAX          // XMM0 <- F
        PUNPCKLBW XMM0,XMM2         // F bytes -> words
        MOVD      XMM1,EDX          // XMM1 <- B
        PUNPCKLBW XMM1,XMM2         // B bytes -> words
        BSWAP     ECX               // reverse the byte order of W
        PSUBW     XMM0,XMM1         // XMM0 <- F - B
        MOVD      XMM3,ECX          // XMM3 <- byte-swapped W
        PUNPCKLBW XMM3,XMM2         // W bytes -> words
        PMULLW    XMM0,XMM3         // XMM0 <- (F - B) * W per channel
        MOV       EAX,bias_ptr      // EAX  <- Pointer to Bias
        PSLLW     XMM1,8            // XMM1 <- B * 256
        PADDW     XMM1,[EAX]        // add bias
        PADDW     XMM1,XMM0         // XMM1 <- (F - B) * W + B * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM2         // words -> bytes
        MOVD      EAX,XMM1          // EAX  <- result
{$ENDIF}

{$IFDEF TARGET_x64}
        PXOR      XMM2,XMM2         // XMM2 <- 0, used for unpack/pack
        MOVD      XMM0,ECX          // XMM0 <- F
        PUNPCKLBW XMM0,XMM2         // F bytes -> words
        MOVD      XMM1,EDX          // XMM1 <- B
        PUNPCKLBW XMM1,XMM2         // B bytes -> words
        BSWAP     R8D               // reverse the byte order of W
        PSUBW     XMM0,XMM1         // XMM0 <- F - B
        MOVD      XMM3,R8D          // XMM3 <- byte-swapped W
        PUNPCKLBW XMM3,XMM2         // W bytes -> words
        PMULLW    XMM0,XMM3         // XMM0 <- (F - B) * W per channel
{$IFNDEF FPC}
        MOV       RAX,bias_ptr      // RAX  <- Pointer to Bias
{$ELSE}
        MOV       RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
{$ENDIF}
        PSLLW     XMM1,8            // XMM1 <- B * 256
        PADDW     XMM1,[RAX]        // add bias
        PADDW     XMM1,XMM0         // XMM1 <- (F - B) * W + B * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM2         // words -> bytes
        MOVD      EAX,XMM1          // EAX  <- result
{$ENDIF}
end;

//------------------------------------------------------------------------------
// BlendMemRGB
//------------------------------------------------------------------------------
// In-place variant of BlendRegRGB: blends F onto the pixel B using a separate
// weight per channel taken from the bytes of W.
//
// The two targets use different formulations:
//   x86:  B := ((F - B) * W + B * 256 + bias) shr 8
//   x64:  B := ((F * W + bias) shr 8) + B - ((B * W + bias) shr 8)
// In both, the weight bytes are applied in reversed order (BSWAP on x86,
// PSHUFLW $1B on x64).
//
// Registers:  x86: EAX = F, EDX = @B, ECX = W    x64: ECX = F, RDX = @B, R8D = W
//------------------------------------------------------------------------------
procedure BlendMemRGB_SSE2(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        PXOR      XMM2,XMM2         // XMM2 <- 0, used for unpack/pack
        MOVD      XMM0,EAX          // XMM0 <- F
        PUNPCKLBW XMM0,XMM2         // F bytes -> words
        MOVD      XMM1,[EDX]        // XMM1 <- B
        PUNPCKLBW XMM1,XMM2         // B bytes -> words
        BSWAP     ECX               // reverse the byte order of W
        PSUBW     XMM0,XMM1         // XMM0 <- F - B
        MOVD      XMM3,ECX          // XMM3 <- byte-swapped W
        PUNPCKLBW XMM3,XMM2         // W bytes -> words
        PMULLW    XMM0,XMM3         // XMM0 <- (F - B) * W per channel
        MOV       EAX,bias_ptr      // EAX  <- Pointer to Bias
        PSLLW     XMM1,8            // XMM1 <- B * 256
        PADDW     XMM1,[EAX]        // add bias
        PADDW     XMM1,XMM0         // XMM1 <- (F - B) * W + B * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM2         // words -> bytes
        MOVD      [EDX],XMM1        // store result to B
{$ENDIF}

{$IFDEF TARGET_x64}
        MOVD      XMM1,R8D          // XMM1 <- W
        PXOR      XMM4,XMM4         // XMM4 <- 0, used for unpack/pack
{$IFNDEF FPC}
        MOV       RAX,bias_ptr      // RAX  <- Pointer to Bias
{$ELSE}
        MOV       RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
{$ENDIF}
        MOVQ      XMM5,[RAX]        // XMM5 <- bias (low 4 words)
        MOVD      XMM0,ECX          // XMM0 <- F
        MOVD      XMM2,[RDX]        // XMM2 <- B
        PUNPCKLBW XMM0,XMM4         // F bytes -> words
        PUNPCKLBW XMM1,XMM4         // W bytes -> words
        PUNPCKLBW XMM2,XMM4         // B bytes -> words
        PSHUFLW   XMM1,XMM1,$1B     // reverse the order of the four weight words

        // C = wA + B - wB
        PMULLW    XMM0,XMM1         // XMM0 <- w * A
        PADDW     XMM0,XMM5         // add bias
        PSRLW     XMM0,8            // divide by 256
        PADDW     XMM0,XMM2         // XMM0 <- wA + B
        PMULLW    XMM2,XMM1         // XMM2 <- w * B
        PADDW     XMM2,XMM5         // add bias
        PSRLW     XMM2,8            // divide by 256
        PSUBW     XMM0,XMM2         // XMM0 <- wA + B - wB
        PACKUSWB  XMM0,XMM4         // words -> bytes
        MOVD      [RDX],XMM0        // store result to B
{$ENDIF}
end;

//------------------------------------------------------------------------------
// BlendMemRGB128
//------------------------------------------------------------------------------
// Experimental two-pixel variant of BlendMemRGB; only compiled when
// TEST_BLENDMEMRGB128SSE4 is defined. F is duplicated into two pixels
// (PINSRD), W supplies 8 weight bytes, and 8 bytes (two pixels) are read from
// and written back to B.
//------------------------------------------------------------------------------
{$IFDEF TEST_BLENDMEMRGB128SSE4}
procedure BlendMemRGB128_SSE4(F: TColor32; var B: TColor32; W: UInt64); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        MOVQ      XMM1,W            // XMM1 <- 8 weight bytes
        PXOR      XMM4,XMM4         // XMM4 <- 0, used for unpack/pack
        MOV       ECX,[bias_ptr]    // ECX  <- Pointer to Bias
        MOVDQA    XMM5,[ECX]        // XMM5 <- bias (8 words)
        MOVD      XMM0,EAX          // XMM0 <- F
        PINSRD    XMM0,EAX,1        // XMM0 <- F F (two pixels)
        MOVQ      XMM2,[EDX].QWORD  // XMM2 <- two destination pixels
        PUNPCKLBW XMM0,XMM4         // F bytes -> words
        PUNPCKLBW XMM1,XMM4         // W bytes -> words
        PUNPCKLBW XMM2,XMM4         // B bytes -> words
        PSHUFLW   XMM1,XMM1,$1B     // reverse the weight words, low pixel
        PSHUFHW   XMM1,XMM1,$1B     // reverse the weight words, high pixel

        // C = wA + B - wB
        PMULLW    XMM0,XMM1         // XMM0 <- w * A
        PADDW     XMM0,XMM5         // add bias
        PSRLW     XMM0,8            // divide by 256
        PADDW     XMM0,XMM2         // XMM0 <- wA + B
        PMULLW    XMM2,XMM1         // XMM2 <- w * B
        PADDW     XMM2,XMM5         // add bias
        PSRLW     XMM2,8            // divide by 256
        PSUBW     XMM0,XMM2         // XMM0 <- wA + B - wB
        PACKUSWB  XMM0,XMM4         // words -> bytes
        MOVQ      [EDX].QWORD,XMM0  // store two pixels
{$ENDIF}

{$IFDEF TARGET_x64}
        MOVQ      XMM1,R8           // XMM1 <- 8 weight bytes
        PXOR      XMM4,XMM4         // XMM4 <- 0, used for unpack/pack
        MOV       RAX,[RIP+bias_ptr] // RAX <- Pointer to Bias
        MOVDQA    XMM5,[RAX]        // XMM5 <- bias (8 words)
        MOVD      XMM0,ECX          // XMM0 <- F
        PINSRD    XMM0,ECX,1        // XMM0 <- F F (two pixels)
        MOVQ      XMM2,[RDX].QWORD  // XMM2 <- two destination pixels
        PUNPCKLBW XMM0,XMM4         // F bytes -> words
        PUNPCKLBW XMM1,XMM4         // W bytes -> words
        PUNPCKLBW XMM2,XMM4         // B bytes -> words
        PSHUFLW   XMM1,XMM1,$1B     // reverse the weight words, low pixel
        PSHUFHW   XMM1,XMM1,$1B     // reverse the weight words, high pixel

        // C = wA + B - wB
        PMULLW    XMM0,XMM1         // XMM0 <- w * A
        PADDW     XMM0,XMM5         // add bias
        PSRLW     XMM0,8            // divide by 256
        PADDW     XMM0,XMM2         // XMM0 <- wA + B
        PMULLW    XMM2,XMM1         // XMM2 <- w * B
        PADDW     XMM2,XMM5         // add bias
        PSRLW     XMM2,8            // divide by 256
        PSUBW     XMM0,XMM2         // XMM0 <- wA + B - wB
        PACKUSWB  XMM0,XMM4         // words -> bytes
        MOVQ      [RDX].QWORD,XMM0  // store two pixels
{$ENDIF}
end;
{$ENDIF}

//------------------------------------------------------------------------------
// BlendLine
//------------------------------------------------------------------------------
// Blends Count pixels from Src onto Dst, each source pixel weighted by its own
// alpha. Returns immediately when Count <= 0.
//
// Per pixel:  A' := source with its RGB words premultiplied by its alpha
//                   (the alpha word itself is kept unscaled)
//             C' := A' + B' - a * B'
//
// Pixels are processed two at a time (one QWORD); an odd pixel is handled
// separately (first on x86, last on x64). The FPC x64 build additionally skips
// pixel pairs whose alphas are both zero and copies pairs whose alphas are
// both $FF.
//
// Registers:  x86: EAX = Src, EDX = Dst, ECX = Count
//             x64: RCX = Src, RDX = Dst, R8D = Count
//------------------------------------------------------------------------------
procedure BlendLine_SSE2(Src, Dst: PColor32; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
{$IFDEF FPC}
const
  COpaque: QWORD = QWORD($FF000000FF000000); // alpha mask for two packed pixels
{$ENDIF}
asm
{$IFDEF TARGET_X86}
  // EAX <- Src
  // EDX <- Dst
  // ECX <- Count

        TEST      ECX,ECX
        JLE       @3                // Count <= 0: nothing to do

        PUSH      EBX               // EBX is only needed to fetch the bias
        PXOR      XMM4,XMM4         // XMM4 <- 0, used for unpack/pack
        MOV       EBX,[bias_ptr]
        MOVDQA    XMM5,[EBX]        // XMM5 <- bias (8 words)
        POP       EBX

        TEST      ECX, 1            // odd number of pixels?
        JZ        @2                // no: go straight to the pair loop
        MOVD      XMM0,[EAX]        // XMM0 <- first source pixel
        MOVD      XMM2,[EDX]        // XMM2 <- first destination pixel

        PUNPCKLBW XMM0,XMM4         // source bytes -> words
        PUNPCKLBW XMM2,XMM4         // dest bytes -> words

        PSHUFLW   XMM1,XMM0,$FF     // XMM1 <- source alpha in all four words

        // premultiply source pixel by its alpha
        MOVQ      XMM3,XMM1
        PSRLQ     XMM3,16           // XMM3 <- 00 a a a (alpha lane zeroed)
        PMULLW    XMM0,XMM3         // RGB * a (alpha lane becomes 0)
        PADDW     XMM0,XMM5         // add bias
        PSRLW     XMM0,8            // divide by 256
        PSLLQ     XMM3,48           // XMM3 <- a 00 00 00
        POR       XMM0,XMM3         // put the unscaled alpha back

        // C' = A' + B' - aB'
        PMULLW    XMM1,XMM2         // a * B'
        PADDW     XMM1,XMM5         // add bias
        PSRLW     XMM1,8            // divide by 256
        PADDW     XMM0,XMM2         // A' + B'
        PSUBW     XMM0,XMM1         // A' + B' - aB'

        PACKUSWB  XMM0,XMM4         // words -> bytes
        MOVD      [EDX], XMM0       // store first pixel

@2:
        LEA       EAX, [EAX + ECX * 4] // EAX <- just past the last source pixel
        LEA       EDX, [EDX + ECX * 4] // EDX <- just past the last dest pixel

        SHR       ECX,1             // ECX <- number of pixel pairs
        JZ        @3
        NEG       ECX               // negative index counting up to zero

@1:
        MOVQ      XMM0,[EAX + ECX * 8].QWORD // two source pixels
        MOVQ      XMM2,[EDX + ECX * 8].QWORD // two destination pixels

        PUNPCKLBW XMM0,XMM4         // source bytes -> words
        PUNPCKLBW XMM2,XMM4         // dest bytes -> words

        PSHUFLW   XMM1,XMM0,$FF     // broadcast alpha of the low pixel
        PSHUFHW   XMM1,XMM1,$FF     // broadcast alpha of the high pixel

        // premultiply source pixel by its alpha
        MOVDQA    XMM3,XMM1
        PSRLQ     XMM3,16           // zero the alpha lane of each pixel
        PMULLW    XMM0,XMM3         // RGB * a
        PADDW     XMM0,XMM5         // add bias
        PSRLW     XMM0,8            // divide by 256
        PSLLQ     XMM3,48           // keep only the alpha lane
        POR       XMM0,XMM3         // put the unscaled alpha back

        // C' = A' + B' - aB'
        PMULLW    XMM1,XMM2         // a * B'
        PADDW     XMM1,XMM5         // add bias
        PSRLW     XMM1,8            // divide by 256
        PADDW     XMM0,XMM2         // A' + B'
        PSUBW     XMM0,XMM1         // A' + B' - aB'

        PACKUSWB  XMM0,XMM4         // words -> bytes
        MOVQ      [EDX + ECX * 8].QWORD,XMM0 // store two pixels

        ADD       ECX,1
        JS        @1                // loop while the index is still negative
@3:
{$ENDIF}

{$IFDEF TARGET_X64}
  // RCX <- Src
  // RDX <- Dst
  // R8D <- Count

        TEST      R8D,R8D
        JLE       @3                // Count <= 0: nothing to do

        PXOR      XMM4,XMM4         // XMM4 <- 0, used for unpack/pack
{$IFNDEF FPC}
        MOV       RAX,bias_ptr
{$ELSE}
        MOV       RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
{$ENDIF}
        MOVDQA    XMM5,[RAX]        // XMM5 <- bias (8 words)

        MOV       R9D, R8D
        SHR       R9D, 1            // R9D <- number of pixel pairs
        TEST      R9D, R9D
        JZ        @2                // no pairs: only a single pixel

@1:
        MOVQ      XMM0,[RCX].QWORD  // two source pixels
        MOVQ      RAX,XMM0
{$IFDEF FPC}
        AND       RAX,[RIP+COpaque] // isolate both alpha bytes
        JZ        @1b               // both transparent: skip the pair
        CMP       RAX,[RIP+COpaque]
        JZ        @1a               // both opaque: store the source as is
{$ENDIF}

        MOVQ      XMM2,[RDX].QWORD  // two destination pixels
        PUNPCKLBW XMM0,XMM4         // source bytes -> words
        PUNPCKLBW XMM2,XMM4         // dest bytes -> words

        PSHUFLW   XMM1,XMM0,$FF     // broadcast alpha of the low pixel
        PSHUFHW   XMM1,XMM1,$FF     // broadcast alpha of the high pixel

        // premultiply source pixel by its alpha
        MOVDQA    XMM3,XMM1
        PSRLQ     XMM3,16           // zero the alpha lane of each pixel
        PMULLW    XMM0,XMM3         // RGB * a
        PADDW     XMM0,XMM5         // add bias
        PSRLW     XMM0,8            // divide by 256
        PSLLQ     XMM3,48           // keep only the alpha lane
        POR       XMM0,XMM3         // put the unscaled alpha back

        // C' = A' + B' - aB'
        PMULLW    XMM1,XMM2         // a * B'
        PADDW     XMM1,XMM5         // add bias
        PSRLW     XMM1,8            // divide by 256
        PADDW     XMM0,XMM2         // A' + B'
        PSUBW     XMM0,XMM1         // A' + B' - aB'

        PACKUSWB  XMM0,XMM4         // words -> bytes
@1a:    MOVQ      [RDX].QWORD,XMM0  // store two pixels

@1b:    ADD       RCX,8             // advance Src by two pixels
        ADD       RDX,8             // advance Dst by two pixels

        SUB       R9D,1
        JNZ       @1

@2:
        AND       R8D, 1            // odd number of pixels?
        JZ        @3

        MOVD      XMM0,[RCX]        // XMM0 <- last source pixel
        MOVD      XMM2,[RDX]        // XMM2 <- last destination pixel

        PUNPCKLBW XMM0,XMM4         // source bytes -> words
        PUNPCKLBW XMM2,XMM4         // dest bytes -> words

        PSHUFLW   XMM1,XMM0,$FF     // XMM1 <- source alpha in all four words

        // premultiply source pixel by its alpha
        MOVQ      XMM3,XMM1
        PSRLQ     XMM3,16           // zero the alpha lane
        PMULLW    XMM0,XMM3         // RGB * a
        PADDW     XMM0,XMM5         // add bias
        PSRLW     XMM0,8            // divide by 256
        PSLLQ     XMM3,48           // keep only the alpha lane
        POR       XMM0,XMM3         // put the unscaled alpha back

        // C' = A' + B' - aB'
        PMULLW    XMM1,XMM2         // a * B'
        PADDW     XMM1,XMM5         // add bias
        PSRLW     XMM1,8            // divide by 256
        PADDW     XMM0,XMM2         // A' + B'
        PSUBW     XMM0,XMM1         // A' + B' - aB'

        PACKUSWB  XMM0,XMM4         // words -> bytes
        MOVD      [RDX], XMM0       // store last pixel
@3:
{$ENDIF}
end;

//------------------------------------------------------------------------------
// BlendMems
// Like BlendLine except the Src parameter is static.
//------------------------------------------------------------------------------
// Blends the single color F onto Count consecutive pixels starting at B.
//
// Returns immediately when Count <= 0 or when the alpha byte of F is zero.
// When the alpha byte of F is $FF the run is filled with F via FillLongword.
// Otherwise F is premultiplied by its alpha once and every destination pixel
// is updated as C' := A' + B' - a * B', two pixels per iteration plus one
// single pixel when Count is odd.
//
// Registers:  x86: EAX = F, EDX = B, ECX = Count
//             x64: ECX = F, RDX = B, R8D = Count
//------------------------------------------------------------------------------
procedure BlendMems_SSE2(F: TColor32; B: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF}
asm
//
// Result Z = Fa * (Fargb - Bargb) + Bargb
//          = Fa * Fargb - Fa * Bargb + Bargb
//
// For Fa * Fargb, ((a*x) div 255) is approximated as ((((a * $101) shr 16) * x + 128) div 256)
// For Fa * Bargb, (x div 255) is approximated as ((x + 128) div 256)
//

{$IFDEF TARGET_X86}
  // EAX <- Src: TColor32
  // EDX <- Dst: PColor32
  // ECX <- Count

  // Test the counter for zero or negativity
  // JCXZ @Done
        TEST      ECX, ECX
        JLE       @Done

  // Test if source is fully transparent
        TEST      EAX, $FF000000
        JZ        @Done

  // Setup division by 255 bias
        PUSH      EBX
        PXOR      XMM4, XMM4        // XMM4 <- 0, used for unpack/pack
        MOV       EBX, [bias_ptr]
        MOVDQA    XMM5, [EBX]       // XMM5 <- bias (8 words)
        POP       EBX

  // Load source
        MOVD      XMM0, EAX         // XMM0 <- 00 00 00 00 Fa Fr Fg Fb

  // Get source alpha and test if fully opaque
        SHR       EAX, 24
        CMP       EAX, $FF
        JZ        @FillWithSource

        PSHUFD    XMM0, XMM0, 0     // XMM0[0..3] <- XMM0[0][0..3]
        PUNPCKLBW XMM0, XMM4        // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
        PSHUFLW   XMM1, XMM0, $FF   // XMM1 <- 00 Fa 00 Fa 00 Fa 00 Fa
        PSHUFHW   XMM1, XMM1, $FF   // same for the high pixel

  // Premultiply source pixel by its alpha: Fa * Fargb
        MOVDQA    XMM3, XMM1        // XMM3 <- 2*QWord(XMM1)
        PSRLQ     XMM3, 16          // XMM3 <- 00 00 00 Fa 00 Fa 00 Fa
        PMULLW    XMM0, XMM3        // XMM0 <- Frgb * Fa
        PADDW     XMM0, XMM5        // XMM0 <- Frgb * Fa + Bias
        PSRLW     XMM0, 8           // XMM0 <- (Frgb * Fa + Bias) div 256
        PSLLQ     XMM3, 48          // XMM3 <- 00 Fa 00 00 00 00 00 00
        POR       XMM0, XMM3        // XMM0 <- 00 Fa 00 FR 00 FG 00 FB

  // Save alpha multiplier
        MOVDQA    XMM3, XMM1

  // Test for odd/even count
        TEST      ECX, 1
        JZ        @Even

  // We have an odd number of pixels.
  // Blend a single pixel so the remaining count is even.

  // Load dest
        MOVD      XMM2, DWORD PTR [EDX] // XMM2 <- 00 00 00 00 Ba Br Bg Bb
        PUNPCKLBW XMM2, XMM4        // XMM2 <- 00 Ba 00 Br 00 Bg 00 Bb

  // Blend: C' = A' + B' - aB'
        PMULLW    XMM1, XMM2        // Z1 = Fa * Brgba
        PADDW     XMM1, XMM5        // Z1 = Fa * Brgba + Bias
        PSRLW     XMM1, 8           // Z1 = (Fa * Bargb + Bias) div 256
        PADDW     XMM2, XMM0        // Z2 = Brgba + FaRGB
        PSUBW     XMM2, XMM1        // Z2 = Z2 - Z1

        PACKUSWB  XMM2, XMM4
        MOVD      [EDX], XMM2

@Even:
        LEA       EDX, [EDX + ECX * 4] // EDX <- address just past the last pixel
        SHR       ECX,1             // Number of QWORDs
        JZ        @Done
        NEG       ECX               // Negate count so we can use it as an offset to move forward

@Loop:
  // Blend two pixels at a time

  // Restore alpha multiplier
        MOVDQA    XMM1, XMM3

  // Load dest
        MOVQ      XMM2, [EDX + ECX * 8].QWORD // XMM2 <- Ba Br Bg Bb Ba Br Bg Bb
        PUNPCKLBW XMM2, XMM4        // XMM2 <- 00 Ba 00 Br 00 Bg 00 Bb (x2)

  // Blend: C' = A' + B' - aB'
        PMULLW    XMM1, XMM2
        PADDW     XMM1, XMM5
        PSRLW     XMM1, 8
        PADDW     XMM2, XMM0
        PSUBW     XMM2, XMM1

        PACKUSWB  XMM2, XMM4
        MOVQ      [EDX + ECX * 8].QWORD, XMM2

        ADD       ECX, 1
        JS        @Loop

@Done:
        RET

@FillWithSource:
  // Shuffle registers for FillLongword
        MOV       EAX, EDX
        MOV       EDX, ECX
        MOVD      ECX, XMM0         // XMM0 still holds the unmodified source color here
        CALL      FillLongword      // EAX:Dest, EDX:Count, ECX:Value
{$ENDIF}

{$IFDEF TARGET_X64}
  // ECX <- Src: TColor32
  // RDX <- Dst: PColor32
  // R8D <- Count

  // Test the counter for zero or negativity
        TEST      R8D, R8D
        JLE       @Done

  // Test if source is fully transparent
        TEST      ECX, $FF000000
        JZ        @Done

  // Get source alpha
        MOV       EAX, ECX
        SHR       EAX, 24

  // Test if source is fully opaque
        CMP       EAX, $FF
        JZ        @FillWithSource

  // Setup division by 255 bias
        PXOR      XMM4, XMM4        // XMM4 <- 0, used for unpack/pack
{$IFNDEF FPC}
        MOV       RAX, bias_ptr
{$ELSE}
        MOV       RAX, [RIP+bias_ptr]
{$ENDIF}
        MOVDQA    XMM5, [RAX]       // XMM5 <- bias (8 words)

  // Load source
        MOVQ      XMM0, RCX         // only the low dword (Fa Fr Fg Fb) is used below
        PSHUFD    XMM0, XMM0, 0     // XMM0[0..3] <- XMM0[0][0..3]
        PUNPCKLBW XMM0, XMM4        // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
        PSHUFLW   XMM1, XMM0, $FF   // XMM1 <- 00 Fa 00 Fa 00 Fa 00 Fa
        PSHUFHW   XMM1, XMM1, $FF   // same for the high pixel

  // Premultiply source pixel by its alpha
        MOVDQA    XMM3, XMM1        // XMM3 <- 2*QWord(XMM1)
        PSRLQ     XMM3, 16          // XMM3 <- 00 00 00 Fa 00 Fa 00 Fa
        PMULLW    XMM0, XMM3        // XMM0 <- Frgb * Fa
        PADDW     XMM0, XMM5        // XMM0 <- Frgb * Fa + Bias
        PSRLW     XMM0, 8           // XMM0 <- (Frgb * Fa + Bias) shr 8
        PSLLQ     XMM3, 48          // XMM3 <- 00 Fa 00 00 00 00 00 00
        POR       XMM0, XMM3        // XMM0 <- 00 Fa 00 FR 00 FG 00 FB

  // Save alpha multiplier
        MOVDQA    XMM3, XMM1

  // Test for odd/even count
        MOV       R9D, R8D
        SHR       R9D, 1            // Get number of double pixels
        TEST      R9D, R9D
        JZ        @SinglePixel      // None; We only have a single pixel

@Loop:
  // Blend two pixels at a time

  // Load dest
        MOVQ      XMM2, [RDX].QWORD
        PUNPCKLBW XMM2, XMM4

  // Blend: C' = A' + B' - aB'
        PMULLW    XMM1, XMM2
        PADDW     XMM1, XMM5
        PSRLW     XMM1, 8
        PADDW     XMM2, XMM0
        PSUBW     XMM2, XMM1

  // Restore alpha multiplier
        MOVDQA    XMM1, XMM3

  // Store dest
        PACKUSWB  XMM2, XMM4
        MOVQ      [RDX].QWORD, XMM2

        ADD       RDX, 8

        SUB       R9D, 1
        JNZ       @Loop

@SinglePixel:
        AND       R8D, 1
        JZ        @Done

  // Blend a single pixel

  // Load dest
        MOVD      XMM2, [RDX]
        PUNPCKLBW XMM2, XMM4

  // Blend: C' = A' + B' - aB'
        PMULLW    XMM1, XMM2
        PADDW     XMM1, XMM5
        PSRLW     XMM1, 8
        PADDW     XMM0, XMM2
        PSUBW     XMM0, XMM1

  // Store dest
        PACKUSWB  XMM0, XMM4
        MOVD      [RDX], XMM0

@Done:
        RET

@FillWithSource:
  // Shuffle registers for FillLongword
        MOV       EAX, ECX
        MOV       RCX, RDX
        MOV       EDX, R8D
        MOV       R8D, EAX
{$IFNDEF FPC}
        CALL      FillLongword      // RCX:Dest, EDX:Count, R8D:Value
{$ELSE}
        CALL      [rip+FillLongword] // RCX:Dest, EDX:Count, R8D:Value
{$ENDIF}
{$ENDIF}
end;

//------------------------------------------------------------------------------
// BlendLineEx
//------------------------------------------------------------------------------
// Blends Count pixels from Src onto Dst. Each source pixel is weighted by
//
//   W := ((Fa + 1) * M) shr 8
//
// where Fa is the pixel's own alpha and M is the master alpha. Pixels whose
// alpha is zero, or for which W is zero, leave the destination untouched.
// Returns immediately when Count <= 0 (and, on x64, when M is zero).
//
// Registers:  x86: EAX = Src, EDX = Dst, ECX = Count, M referenced by name
//             x64: RCX = Src, RDX = Dst, R8D = Count, R9D = M
//------------------------------------------------------------------------------
procedure BlendLineEx_SSE2(Src, Dst: PColor32; Count: Integer; M: Cardinal); {$IFDEF FPC} assembler; {$IFDEF TARGET_X64}nostackframe;{$ENDIF} {$ENDIF}
asm
{$IFDEF TARGET_X86}
  // EAX <- Src
  // EDX <- Dst
  // ECX <- Count

  // test the counter for zero or negativity
        TEST      ECX,ECX
        JLE       @4

        PUSH      ESI
        PUSH      EDI
        PUSH      EBX

        MOV       ESI,EAX           // ESI <- Src
        MOV       EDI,EDX           // EDI <- Dst
        MOV       EDX,M             // EDX <- Master Alpha

  // loop start
@1:
        MOV       EAX,[ESI]
        TEST      EAX,$FF000000
        JZ        @3                // complete transparency, proceed to next point
        MOV       EBX,EAX
        SHR       EBX,24
        INC       EBX               // 255:256 range bias
        IMUL      EBX,EDX
        SHR       EBX,8
        JZ        @3                // complete transparency, proceed to next point

  // blend
        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack
        MOVD      XMM1,EAX          // XMM1 <- source pixel
        SHL       EBX,4             // EBX  <- W * 16 (table offset)
        MOVD      XMM2,[EDI]        // XMM2 <- destination pixel
        PUNPCKLBW XMM1,XMM0
        PUNPCKLBW XMM2,XMM0
        ADD       EBX,alpha_ptr     // EBX  <- address of table entry for W
        PSUBW     XMM1,XMM2         // XMM1 <- F - B
        PMULLW    XMM1,[EBX]        // XMM1 <- (F - B) * table entry
        PSLLW     XMM2,8            // XMM2 <- B * 256
        MOV       EBX,bias_ptr
        PADDW     XMM2,[EBX]        // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0
        MOVD      EAX,XMM1

@2:
        MOV       [EDI],EAX

@3:
        ADD       ESI,4
        ADD       EDI,4

  // loop end
        DEC       ECX
        JNZ       @1

        POP       EBX
        POP       EDI
        POP       ESI

@4:
{$ENDIF}

{$IFDEF TARGET_X64}
  // RCX <- Src
  // RDX <- Dst
  // R8D <- Count
  // R9D <- M

  // test the counter for zero or negativity
        TEST      R8D,R8D
        JLE       @4

  // a master alpha of zero leaves every destination pixel unchanged
        TEST      R9D,R9D
        JZ        @4

        MOV       R10,RCX           // R10 <- Src (RCX is reused for the pixel value)

  // loop start
@1:
        MOV       ECX,[R10]
        TEST      ECX,$FF000000
        JZ        @3                // complete transparency, proceed to next point
        MOV       EAX,ECX
        SHR       EAX,24
        INC       EAX               // 255:256 range bias
        IMUL      EAX,R9D
        SHR       EAX,8
        JZ        @3                // complete transparency, proceed to next point

  // blend
        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack
        MOVD      XMM1,ECX          // XMM1 <- source pixel
        SHL       EAX,4             // EAX  <- W * 16 (table offset)
        MOVD      XMM2,[RDX]        // XMM2 <- destination pixel
        PUNPCKLBW XMM1,XMM0
        PUNPCKLBW XMM2,XMM0
{$IFNDEF FPC}
        ADD       RAX,alpha_ptr     // RAX  <- address of table entry for W
{$ELSE}
        ADD       RAX,[RIP+alpha_ptr]
{$ENDIF}
        PSUBW     XMM1,XMM2         // XMM1 <- F - B
        PMULLW    XMM1,[RAX]        // XMM1 <- (F - B) * table entry
        PSLLW     XMM2,8            // XMM2 <- B * 256
{$IFNDEF FPC}
        MOV       RAX,bias_ptr
{$ELSE}
        MOV       RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
{$ENDIF}
        PADDW     XMM2,[RAX]        // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0
        MOVD      ECX,XMM1

@2:
        MOV       [RDX],ECX

@3:
        ADD       R10,4
        ADD       RDX,4

  // loop end
        DEC       R8D
        JNZ       @1
@4:
{$ENDIF}
end;

//------------------------------------------------------------------------------
//
//      Combine
//
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// CombineReg
//------------------------------------------------------------------------------
// Linearly interpolates between the colors X and Y with weight W and returns
// the result:  Result := W * (X - Y) + Y  (per channel, W in [0..255]).
//
// W indexes the table at alpha_ptr with a stride of 16 bytes (SHL 4).
//
// Registers:  x86: EAX = X, EDX = Y, ECX = W     x64: ECX = X, EDX = Y, R8D = W
//------------------------------------------------------------------------------
function CombineReg_SSE2(X, Y: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
  // EAX - Color X
  // EDX - Color Y
  // ECX - Weight of X [0..255]
  // Result := W * (X - Y) + Y

        MOVD      XMM1,EAX          // XMM1 <- X
        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack
        SHL       ECX,4             // ECX  <- W * 16 (table offset)

        MOVD      XMM2,EDX          // XMM2 <- Y
        PUNPCKLBW XMM1,XMM0         // X bytes -> words
        PUNPCKLBW XMM2,XMM0         // Y bytes -> words

        ADD       ECX,alpha_ptr     // ECX  <- address of table entry for W

        PSUBW     XMM1,XMM2         // XMM1 <- X - Y
        PMULLW    XMM1,[ECX]        // XMM1 <- (X - Y) * table entry
        PSLLW     XMM2,8            // XMM2 <- Y * 256

        MOV       ECX,bias_ptr      // ECX  <- Pointer to Bias

        PADDW     XMM2,[ECX]        // add bias
        PADDW     XMM1,XMM2         // XMM1 <- (X - Y) * W + Y * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0         // words -> bytes
        MOVD      EAX,XMM1          // EAX  <- result
{$ENDIF}

{$IFDEF TARGET_X64}
  // ECX - Color X
  // EDX - Color Y
  // R8D - Weight of X [0..255]
  // Result := W * (X - Y) + Y

        MOVD      XMM1,ECX          // XMM1 <- X
        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack
        SHL       R8D,4             // R8D  <- W * 16 (table offset)

        MOVD      XMM2,EDX          // XMM2 <- Y
        PUNPCKLBW XMM1,XMM0         // X bytes -> words
        PUNPCKLBW XMM2,XMM0         // Y bytes -> words

{$IFNDEF FPC}
        ADD       R8,alpha_ptr      // R8   <- address of table entry for W
{$ELSE}
        ADD       R8,[RIP+alpha_ptr]
{$ENDIF}

        PSUBW     XMM1,XMM2         // XMM1 <- X - Y
        PMULLW    XMM1,[R8]         // XMM1 <- (X - Y) * table entry
        PSLLW     XMM2,8            // XMM2 <- Y * 256

{$IFNDEF FPC}
        MOV       R8,bias_ptr       // R8   <- Pointer to Bias
{$ELSE}
        MOV       R8,[RIP+bias_ptr]
{$ENDIF}

        PADDW     XMM2,[R8]         // add bias
        PADDW     XMM1,XMM2         // XMM1 <- (X - Y) * W + Y * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0         // words -> bytes
        MOVD      EAX,XMM1          // EAX  <- result
{$ENDIF}
end;

//------------------------------------------------------------------------------
// CombineMem
//------------------------------------------------------------------------------
// In-place variant of CombineReg: B := W * (F - B) + B, with W in [0..255].
// W = 0 leaves B untouched and W = 255 stores F to B unchanged.
//
// Registers:  x86: EAX = F, EDX = @B, ECX = W    x64: ECX = F, RDX = @B, R8D = W
//------------------------------------------------------------------------------
procedure CombineMem_SSE2_Table(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
(*
Contributed by: Christian-W. Budde

TestCombineMem:
Errors: 39.082 = 29,8 % (Limit: -1)
Differences: 92.042
Average difference: -0,29
Max difference: 1 (Limit: 1)
*)
asm
  //
  // Result := W * (X - Y) + Y
  //
  // Approximates (x div 255) as ((x + 128) div 256)
  //
{$IFDEF TARGET_X86}
  // EAX      Color X
  // [EDX]    Color Y
  // ECX      Weight of X [0..255]

  // Return ColorY if weight=0
        JCXZ      @exit

  // Return ColorX if weight=255
        CMP       ECX, $FF
        JZ        @return_x

  // Load ColorX and ColorY
        MOVD      XMM1, EAX         // XMM1 <- ColorX (Fa Fr Fg Fb)
        MOVD      XMM2, [EDX]       // XMM2 <- ColorY (Ba Br Bg Bb)

  // Create a Zero for use in unpack
        PXOR      XMM0, XMM0        // XMM0 <- 0

        SHL       ECX, 4            // ECX <- Offset into AlphaTable

  // Unpack the ColorX and ColorY bytes into words
        PUNPCKLBW XMM1, XMM0        // XMM1.high <- 0 (00 Fa 00 Fr 00 Fg 00 Fb)
        PUNPCKLBW XMM2, XMM0        // XMM2.high <- 0 (00 Ba 00 Br 00 Bg 00 Bb)

        ADD       ECX, alpha_ptr    // ECX <- &AlphaTable[Weight]

  // Lerp: Result = (Weight * (ColorX - ColorY) + 256 * ColorY) / 256
        PSUBW     XMM1, XMM2        // XMM1 <- ColorX - ColorY
        PMULLW    XMM1, [ECX]       // XMM1 <- (ColorX - ColorY) * AlphaTable[Weight]
        PSLLW     XMM2, 8           // XMM2 <- ColorY * 256

        MOV       ECX, bias_ptr     // ECX <- AlphaTable[128] (= $00800080 = 0.5)

        PADDW     XMM2, [ECX]       // XMM2 <- (ColorY * 256) + 128
        PADDW     XMM1, XMM2        // XMM1 <- (ColorX - ColorY) * Weight + ColorY * 256 + 128
        PSRLW     XMM1, 8           // XMM1 <- XMM1 div 256

  // Pack result back from word to byte components
        PACKUSWB  XMM1, XMM0        // XMM1 <- XMM1.low (Ra Rr Rg Rb)
        MOVD      [EDX], XMM1       // ColorY <- XMM1

@exit:
        RET

@return_x:
        MOV       [EDX], EAX        // ColorY <- ColorX
{$ENDIF}

{$IFDEF TARGET_X64}
  // ECX - Color X
  // [RDX] - Color Y
  // R8D - Weight of X [0..255]

        TEST      R8D,R8D           // Set flags for R8
        JZ        @1                // W = 0 ?  => leave [RDX] unchanged
        CMP       R8D,$FF
        JZ        @2                // W = 255 ? => [RDX] := Color X

        MOVD      XMM1,ECX          // XMM1 <- Color X
        PXOR      XMM0,XMM0         // XMM0 <- 0, used for unpack/pack

        SHL       R8D,4             // R8D  <- W * 16 (table offset)

        MOVD      XMM2,[RDX]        // XMM2 <- Color Y
        PUNPCKLBW XMM1,XMM0         // X bytes -> words
        PUNPCKLBW XMM2,XMM0         // Y bytes -> words

{$IFNDEF FPC}
        ADD       R8,alpha_ptr      // R8   <- address of table entry for W
{$ELSE}
        ADD       R8,[RIP+alpha_ptr]
{$ENDIF}

        PSUBW     XMM1,XMM2         // XMM1 <- X - Y
        PMULLW    XMM1,[R8]         // XMM1 <- (X - Y) * table entry
        PSLLW     XMM2,8            // XMM2 <- Y * 256

{$IFNDEF FPC}
        MOV       RAX,bias_ptr      // RAX  <- Pointer to Bias
{$ELSE}
        MOV       RAX,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64
{$ENDIF}

        PADDW     XMM2,[RAX]        // add bias
        PADDW     XMM1,XMM2         // XMM1 <- (X - Y) * W + Y * 256 + bias
        PSRLW     XMM1,8            // divide by 256
        PACKUSWB  XMM1,XMM0         // words -> bytes
        MOVD      [RDX],XMM1        // store result to Color Y

@1:     RET

@2:     MOV       [RDX],ECX         // Color Y <- Color X
{$ENDIF}
end;

//------------------------------------------------------------------------------
// B := W * (F - B) + B, with W in [0..255]; same contract as
// CombineMem_SSE2_Table but the weight and the bias are built from immediate
// values instead of being read from tables.
// W = 0 leaves B untouched and W = 255 stores F to B unchanged.
//
// Registers:  x86: EAX = F, EDX = @B, ECX = W    x64: ECX = F, RDX = @B, R8D = W
//------------------------------------------------------------------------------
procedure CombineMem_SSE2_128(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
(*
Contributed by: Anders Melander

Basically the same as CombineMem_SSE2_Table but uses immediate loads instead of tables.

TestCombineMem:
Errors: 39.082 = 29,8 % (Limit: -1)
Differences: 92.042
Average difference: -0,29
Max difference: 1 (Limit: 1)
*)
asm
  //
  // Result := W * (X - Y) + Y
  //
  // Approximates (x div 255) as ((x + 128) div 256)
  //
{$IFDEF TARGET_X86}
  // EAX      Color X (Foreground)
  // [EDX]    Color Y (Background)
  // ECX      Weight of X [0..255]

  // Return ColorY if weight=0
        JCXZ      @exit

  // Return ColorX if weight=255
        CMP       ECX, $FF
        JZ        @return_x
{$ELSE}
  // ECX      Color X (Foreground)
  // [RDX]    Color Y (Background)
  // R8D      Weight of X [0..255]

  // Return ColorY if weight=0
        TEST      R8D, R8D
        JZ        @exit

  // Return ColorX if weight=255.
  // The weight lives in R8D on this target; ECX holds Color X, so comparing
  // ECX here would test the color instead of the weight.
        CMP       R8D, $FF
        JZ        @return_x
{$ENDIF}

  // Load ColorX and ColorY
{$IFDEF TARGET_X86}
        MOVD      XMM1, EAX         // XMM1 <- ColorX (Fa Fr Fg Fb)
        MOVD      XMM2, [EDX]       // XMM2 <- ColorY (Ba Br Bg Bb)
{$ELSE}
        MOVD      XMM1, ECX         // XMM1 <- ColorX (Fa Fr Fg Fb)
        MOVD      XMM2, [RDX]       // XMM2 <- ColorY (Ba Br Bg Bb)
{$ENDIF}

  // Duplicate weight into 4 words
{$IFDEF TARGET_X86}
        MOVD      XMM3, ECX         // XMM3 <- Weight (00 00 00 00 00 00 00 WW)
{$ELSE}
        MOVD      XMM3, R8D         // XMM3 <- Weight (00 00 00 00 00 00 00 WW)
{$ENDIF}
        PSHUFLW   XMM3, XMM3, 0     // (00 WW 00 WW 00 WW 00 WW)

  // Duplicate 128 into 4 words for the rounding bias.
  // ECX is free to clobber here: ColorX (x64) and the weight (x86) have
  // already been copied into XMM registers above.
        MOV       ECX, 128
        MOVD      XMM4, ECX         // XMM4 <- (00 00 00 00 00 00 00 80)
        PSHUFLW   XMM4, XMM4, 0     // (00 80 00 80 00 80 00 80)

  // Create a Zero for use in unpack
        PXOR      XMM0, XMM0        // XMM0 <- 0

  // Unpack the ColorX and ColorY byte components into words
        PUNPCKLBW XMM1, XMM0        // XMM1.high <- 0 (00 Fa 00 Fr 00 Fg 00 Fb)
        PUNPCKLBW XMM2, XMM0        // XMM2.high <- 0 (00 Ba 00 Br 00 Bg 00 Bb)

  // Save a copy of ColorY*256
        MOVQ      XMM0, XMM2
        PSLLW     XMM0, 8           // XMM0 <- (Ba 00 Br 00 Bg 00 Bb 00)

  // Lerp: Result = (weight * (ColorX - ColorY) + 256 * ColorY) / 256
        PSUBW     XMM1, XMM2        // XMM1 <- ColorX - ColorY
        PMULLW    XMM1, XMM3        // XMM1 <- Weight * (ColorX - ColorY)
        PADDW     XMM1, XMM0        // XMM1 <- Weight * (ColorX - ColorY) + 256 * ColorY

  // Add 255:256 correction bias
        PADDW     XMM1, XMM4        // XMM1 <- Weight * (ColorX - ColorY) + 256 * ColorY + 128

        PSRLW     XMM1, 8           // XMM1 <- (Weight * (ColorX - ColorY) + 256 * ColorY + 128) div 256

  // Pack result back from word to byte components
        PACKUSWB  XMM1, XMM1        // XMM1 <- XMM1.low (Ra Rr Rg Rb)
{$IFDEF TARGET_X86}
        MOVD      [EDX], XMM1       // ColorY <- XMM1
{$ELSE}
        MOVD      [RDX], XMM1       // ColorY <- XMM1
{$ENDIF}

@exit:
        RET

@return_x:
{$IFDEF TARGET_X86}
        MOV       [EDX], EAX        // ColorY <- ColorX
{$ELSE}
        MOV       [RDX], ECX        // ColorY <- ColorX
{$ENDIF}
end;

//------------------------------------------------------------------------------
procedure CombineMem_SSE41_8081(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
(*
Contributed by: Anders Melander

Based on CombineMem_SSE41_Kadaif but uses immediate loads instead of tables.
Also uses a slight different bias value.
Also slower :-( TestCombineMem: Errors: 20 = 0,0 % (Limit: -1) Differences: 38 Average difference: -0,05 Max difference: 1 (Limit: 1) *) asm // // Result := W * (X - Y) + Y // // Approximates Round(x / 255) as (((x + $7F) * $8081) shr 23) = ((x * $8081 + Bias) shr 23) // {$IFDEF TARGET_X86} // EAX Color X (Foreground) // [EDX] Color Y (Background) // ECX Weight of X [0..255] // Return ColorY if weight=0 JCXZ @exit // Return ColorX if weight=255 CMP ECX, $FF JZ @return_x {$ELSE} // ECX Color X (Foreground) // [RDX] Color Y (Background) // R8D Weight of X [0..255] // Return ColorY if weight=0 TEST R8D, R8D JZ @exit // Return ColorX if weight=255 CMP R8D, $FF JZ @return_x {$ENDIF} // Load ColorX and ColorY {$IFDEF TARGET_X86} MOVD XMM1, EAX // XMM1 <- ColorX (Fa Fr Fg Fb) MOVD XMM2, [EDX] // XMM2 <- ColorY (Ba Br Bg Bb) {$ELSE} MOVD XMM1, ECX // XMM1 <- ColorX (Fa Fr Fg Fb) MOVD XMM2, [RDX] // XMM2 <- ColorY (Ba Br Bg Bb) {$ENDIF} // Duplicate weight*$8081 into 4 dwords {$IFDEF TARGET_X86} IMUL ECX, ECX, $8081 {$ELSE} IMUL ECX, R8D, $8081 {$ENDIF} MOVD XMM3, ECX // XMM3 <- Weight * $8081 PSHUFD XMM3, XMM3, 0 // XMM3[0..3] <- XMM3[0][0..3] // Unpack the ColorX and ColorY byte components into dwords // PMOVZXBD is SSE4.1 PMOVZXBD XMM1, XMM1 // XMM1[0..3] <- ColorX[0][0..3] PMOVZXBD XMM0, XMM2 // XMM0[0..3] <- ColorY[0][0..3] // // Lerp: Result = (weight * (ColorX - ColorY) + ColorY) // = (($8081 * weight * (ColorX - ColorY)) shr 23 + ColorY) // PSUBD XMM1, XMM0 // XMM1 <- ColorX - ColorY PMULLD XMM1, XMM3 // XMM1 <- (ColorX - ColorY) * Weight * $8081 // Duplicate bias (~$7F*$8081) into 4 dwords MOV ECX, $003FFF0F MOVD XMM3, ECX // XMM3 <- Bias PSHUFD XMM3, XMM3, 0 // XMM3[0..3] <- XMM3[0][0..3] // Add bias PADDD XMM1, XMM3 // XMM2 <- (ColorX - ColorY) * Weight * $8081 + Bias // Reduce 32-bits to 9-bits PSRLD XMM1, 23 // XMM2 <- ((ColorX - ColorY) * Weight * $8081 + Bias) shr 23 // PACKUSDW is SSE4.1 // Convert from dwords to words PACKUSDW XMM1, XMM0 // XMM1[0..1][0..1] 
<- XMM1[0..3] // Convert from words.lo to bytes PSLLW XMM1, 8 // Get rid of the high byte PSRLW XMM1, 8 PACKUSWB XMM1, XMM0 // XMM1[0][0..3] <- XMM1[0..1][0..1] // Result := Value + ColorY PADDB XMM1, XMM2 // XMM0 <- XMM2 + ColorY {$IFDEF TARGET_X86} MOVD [EDX], XMM1 // ColorY <- XMM1 {$ELSE} MOVD [RDX], XMM1 // ColorY <- XMM1 {$ENDIF} @exit: RET @return_x: {$IFDEF TARGET_X86} MOV [EDX], EAX // ColorY <- ColorX {$ELSE} MOV [RDX], ECX // ColorY <- ColorX {$ENDIF} end; //------------------------------------------------------------------------------ procedure CombineMem_SSE41_Kadaif(F: TColor32; var B: TColor32; W: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF} (* Contributed by: Kadaif TestCombineMem: Errors: 16 = 0,0 % (Limit: -1) Differences: 30 Average difference: 0,20 Max difference: 1 (Limit: 1) *) asm // // Result := W * (X - Y) + Y // // Approximates Round(x / 255) as ((x * $8081 + Bias) shr 23) // {$IFDEF TARGET_X86} // EAX Color X (Foreground) // [EDX] Color Y (Background) // ECX Weight of X [0..255] // Return ColorY if weight=0 JCXZ @exit // Return ColorX if weight=255 CMP ECX, $FF JZ @return_x {$ELSE} // ECX Color X (Foreground) // [RDX] Color Y (Background) // R8D Weight of X [0..255] // Return ColorY if weight=0 TEST R8D, R8D JZ @exit // Return ColorX if weight=255 CMP R8D, $FF JZ @return_x {$ENDIF} // Load ColorX and ColorY {$IFDEF TARGET_X86} MOVD XMM0, EAX // XMM0 <- ColorX (Fa Fr Fg Fb) MOVD XMM1, [EDX] // XMM1 <- ColorY (Ba Br Bg Bb) {$ELSE} MOVD XMM0, ECX // XMM0 <- ColorX (Fa Fr Fg Fb) MOVD XMM1, [RDX] // XMM1 <- ColorY (Ba Br Bg Bb) {$ENDIF} // Weight = Weight * $8081 {$IFDEF TARGET_X86} IMUL ECX, ECX, $8081 {$ELSE} IMUL ECX, R8D, $8081 {$ENDIF} // Convert from bytes to integers // PMOVZXBD is SSE4.1 PMOVZXBD XMM2, XMM1 // XMM2[0..3] <- ColorY[0][0..3] PMOVZXBD XMM0, XMM0 // XMM0[0..3] <- ColorX[0][0..3] // // Lerp: Result = (weight * (ColorX - ColorY) + ColorY) // = (($8081 * weight * (ColorX - ColorY)) shr 23 + ColorY) // PSUBD XMM0, 
XMM2 // XMM0 <- ColorX - ColorY MOVD XMM2, ECX // XMM2 <- Weight * $8081 PSHUFD XMM2, XMM2, 0 // XMM2[0..3] <- XMM2[0][0..3] PMULLD XMM2, XMM0 // XMM2 <- (ColorX - ColorY) * Weight * $8081 // Add bias (~$7F*$8081) {$if (not defined(FPC)) or (not defined(TARGET_X64))} PADDD XMM2, DQWORD PTR [SSE_003FFF7F_ALIGNED] // XMM2 <- ((ColorX - ColorY) * Weight * $8081) + Bias {$else} PADDD XMM2, DQWORD PTR [rip+SSE_003FFF7F_ALIGNED] {$ifend} // Reduce 32-bits to 9-bits PSRLD XMM2, 23 // XMM2 <- (((ColorX - ColorY) * Weight * $8081) + Bias) shr 23 // Convert from dwords to bytes with truncation (losing the sign in the 9th bit) {$if (not defined(FPC)) or (not defined(TARGET_X64))} PSHUFB XMM2, DQWORD PTR [SSE_0C080400_ALIGNED] // XMM2[0] <- XMM4[0..3][0] {$else} PSHUFB XMM2, DQWORD PTR [rip+SSE_0C080400_ALIGNED] {$ifend} // Result := Value + ColorY PADDB XMM2, XMM1 // XMM2 <- XMM2 + ColorY {$IFDEF TARGET_X86} MOVD [EDX], XMM2 // ColorY <- XMM2 {$ELSE} MOVD [RDX], XMM2 // ColorY <- XMM2 {$ENDIF} @exit: RET @return_x: {$IFDEF TARGET_X86} MOV [EDX], EAX // ColorY <- ColorX {$ELSE} MOV [RDX], ECX // ColorY <- ColorX {$ENDIF} end; //------------------------------------------------------------------------------ // CombineLine //------------------------------------------------------------------------------ procedure CombineLine_SSE2(Src, Dst: PColor32; Count: Integer; W: Cardinal); {$IFDEF FPC} assembler; {$IFDEF TARGET_X64}nostackframe;{$ENDIF} {$ENDIF} asm {$IFDEF TARGET_X86} // EAX <- Src // EDX <- Dst // ECX <- Count // Result := W * (X - Y) + Y TEST ECX,ECX JZ @3 PUSH EBX MOV EBX,W TEST EBX,EBX JZ @2 CMP EBX,$FF JZ @4 SHL EBX,4 ADD EBX,alpha_ptr MOVQ XMM3,[EBX] MOV EBX,bias_ptr MOVQ XMM4,[EBX] PXOR XMM0,XMM0 @1: MOVD XMM1,[EAX] MOVD XMM2,[EDX] PUNPCKLBW XMM1,XMM0 PUNPCKLBW XMM2,XMM0 PSUBW XMM1,XMM2 PMULLW XMM1,XMM3 PSLLW XMM2,8 PADDW XMM2,XMM4 PADDW XMM1,XMM2 PSRLW XMM1,8 PACKUSWB XMM1,XMM0 MOVD [EDX],XMM1 ADD EAX,4 ADD EDX,4 DEC ECX JNZ @1 @2: POP EBX POP EBP @3: RET $0004 @4: 
SHL ECX,2 CALL Move POP EBX {$ENDIF} {$IFDEF TARGET_X64} // ECX <- Src // EDX <- Dst // R8D <- Count // Result := W * (X - Y) + Y TEST R8D,R8D JZ @2 TEST R9D,R9D JZ @2 CMP R9D,$FF JZ @3 SHL R9D,4 {$IFNDEF FPC} ADD R9,alpha_ptr {$ELSE} ADD R9,[RIP+alpha_ptr] {$ENDIF} MOVQ XMM3,[R9] {$IFNDEF FPC} MOV R9,bias_ptr {$ELSE} MOV R9,[RIP+bias_ptr] // XXX : Enabling PIC by relative offsetting for x64 {$ENDIF} MOVQ XMM4,[R9] PXOR XMM0,XMM0 @1: MOVD XMM1,[RCX] MOVD XMM2,[RDX] PUNPCKLBW XMM1,XMM0 PUNPCKLBW XMM2,XMM0 PSUBW XMM1,XMM2 PMULLW XMM1,XMM3 PSLLW XMM2,8 PADDW XMM2,XMM4 PADDW XMM1,XMM2 PSRLW XMM1,8 PACKUSWB XMM1,XMM0 MOVD [RDX],XMM1 ADD RCX,4 ADD RDX,4 DEC R8D JNZ @1 @2: RET @3: SHL R8D,2 CALL Move {$ENDIF} end; //------------------------------------------------------------------------------ // // Merge // //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ // MergeReg //------------------------------------------------------------------------------ function MergeReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm { This is an implementation of the merge formula, as described in a paper by Bruce Wallace in 1981. Merging is associative, that is, A over (B over C) = (A over B) over C. The formula is, Ra = Fa + Ba * (1 - Fa) Rc = (Fa * (Fc - Bc * Ba) + Bc * Ba) / Ra where Rc is the resultant color, Ra is the resultant alpha, Fc is the foreground color, Fa is the foreground alpha, Bc is the background color, Ba is the background alpha. 
Implementation: Ra := 1 - (1 - Fa) * (1 - Ba); Wa := Fa / Ra; Rc := Bc + Wa * (Fc - Bc); (1 - Fa) * (1 - Ba) = 1 - Fa - Ba + Fa * Ba = (1 - Ra) } {$IFDEF TARGET_X86} TEST EAX,$FF000000 // foreground completely transparent => JZ @1 // result = background CMP EAX,$FF000000 // foreground completely opaque => JNC @2 // result = foreground TEST EDX,$FF000000 // background completely transparent => JZ @2 // result = foreground PXOR XMM7,XMM7 // XMM7 <- 00 MOVD XMM0,EAX // XMM0 <- Fa Fr Fg Fb SHR EAX,24 // EAX <- Fa ROR EDX,24 MOVZX ECX,DL // ECX <- Ba PUNPCKLBW XMM0,XMM7 // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb SUB EAX,$FF // EAX <- (Fa - 1) XOR ECX,$FF // ECX <- (1 - Ba) IMUL ECX,EAX // ECX <- (Fa - 1) * (1 - Ba) = Ra - 1 IMUL ECX,$8081 // ECX <- Xa 00 00 00 ADD ECX,$8081*$FF*$FF SHR ECX,15 // ECX <- Ra MOV DL,CH // EDX <- Br Bg Bb Ra ROR EDX,8 // EDX <- Ra Br Bg Bb MOVD XMM1,EDX // XMM1 <- Ra Br Bg Bb PUNPCKLBW XMM1,XMM7 // XMM1 <- 00 Ra 00 Br 00 Bg 00 Bb SHL EAX,20 // EAX <- Fa 00 00 PSUBW XMM0,XMM1 // XMM0 <- ** Da ** Dr ** Dg ** Db ADD EAX,$0FF01000 PSLLW XMM0,4 XOR EDX,EDX // EDX <- 00 DIV ECX // EAX <- Fa / Ra = Wa MOVD XMM4,EAX // XMM3 <- Wa PSHUFLW XMM4,XMM4,$C0 // XMM3 <- 00 00 ** Wa ** Wa ** Wa PMULHW XMM0,XMM4 // XMM0 <- 00 00 ** Pr ** Pg ** Pb PADDW XMM0,XMM1 // XMM0 <- 00 Ra 00 Rr 00 Rg 00 Rb PACKUSWB XMM0,XMM7 // XMM0 <- Ra Rr Rg Rb MOVD EAX,XMM0 RET @1: MOV EAX,EDX @2: {$ENDIF} {$IFDEF TARGET_X64} TEST ECX,$FF000000 // foreground completely transparent => JZ @1 // result = background MOV EAX,ECX // EAX <- Fa CMP EAX,$FF000000 // foreground completely opaque => JNC @2 // result = foreground TEST EDX,$FF000000 // background completely transparent => JZ @2 // result = foreground PXOR XMM7,XMM7 // XMM7 <- 00 MOVD XMM0,EAX // XMM0 <- Fa Fr Fg Fb SHR EAX,24 // EAX <- Fa ROR EDX,24 MOVZX ECX,DL // ECX <- Ba PUNPCKLBW XMM0,XMM7 // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb SUB EAX,$FF // EAX <- (Fa - 1) XOR ECX,$FF // ECX <- (1 - Ba) IMUL ECX,EAX // ECX <- (Fa - 1) * (1 - Ba) = 
Ra - 1 IMUL ECX,$8081 // ECX <- Xa 00 00 00 ADD ECX,$8081*$FF*$FF SHR ECX,15 // ECX <- Ra MOV DL,CH // EDX <- Br Bg Bb Ra ROR EDX,8 // EDX <- Ra Br Bg Bb MOVD XMM1,EDX // XMM1 <- Ra Br Bg Bb PUNPCKLBW XMM1,XMM7 // XMM1 <- 00 Ra 00 Br 00 Bg 00 Bb SHL EAX,20 // EAX <- Fa 00 00 PSUBW XMM0,XMM1 // XMM0 <- ** Da ** Dr ** Dg ** Db ADD EAX,$0FF01000 PSLLW XMM0,4 XOR EDX,EDX // EDX <- 00 DIV ECX // EAX <- Fa / Ra = Wa MOVD XMM4,EAX // XMM3 <- Wa PSHUFLW XMM4,XMM4,$C0 // XMM3 <- 00 00 ** Wa ** Wa ** Wa PMULHW XMM0,XMM4 // XMM0 <- 00 00 ** Pr ** Pg ** Pb PADDW XMM0,XMM1 // XMM0 <- 00 Ra 00 Rr 00 Rg 00 Rb PACKUSWB XMM0,XMM7 // XMM0 <- Ra Rr Rg Rb MOVD EAX,XMM0 RET @1: MOV EAX,EDX @2: {$ENDIF} end; //------------------------------------------------------------------------------ // // Color algebra // //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ // ColorAdd //------------------------------------------------------------------------------ function ColorAdd_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} MOVD XMM0,EAX MOVD XMM1,EDX PADDUSB XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} {$IFDEF TARGET_X64} MOVD XMM0,ECX MOVD XMM1,EDX PADDUSB XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} end; //------------------------------------------------------------------------------ // ColorSub //------------------------------------------------------------------------------ function ColorSub_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} MOVD XMM0,EAX MOVD XMM1,EDX PSUBUSB XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} {$IFDEF TARGET_X64} MOVD XMM0,ECX MOVD XMM1,EDX PSUBUSB XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} end; //------------------------------------------------------------------------------ // ColorModulate //------------------------------------------------------------------------------ function 
ColorModulate_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} PXOR XMM2,XMM2 MOVD XMM0,EAX PUNPCKLBW XMM0,XMM2 MOVD XMM1,EDX PUNPCKLBW XMM1,XMM2 PMULLW XMM0,XMM1 PSRLW XMM0,8 PACKUSWB XMM0,XMM2 MOVD EAX,XMM0 {$ENDIF} {$IFDEF TARGET_X64} PXOR XMM2,XMM2 MOVD XMM0,ECX PUNPCKLBW XMM0,XMM2 MOVD XMM1,EDX PUNPCKLBW XMM1,XMM2 PMULLW XMM0,XMM1 PSRLW XMM0,8 PACKUSWB XMM0,XMM2 MOVD EAX,XMM0 {$ENDIF} end; //------------------------------------------------------------------------------ // ColorMax //------------------------------------------------------------------------------ function ColorMax_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} MOVD XMM0,EAX MOVD XMM1,EDX PMAXUB XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} {$IFDEF TARGET_X64} MOVD XMM0,ECX MOVD XMM1,EDX PMAXUB XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} end; //------------------------------------------------------------------------------ // ColorMin //------------------------------------------------------------------------------ function ColorMin_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} MOVD XMM0,EAX MOVD XMM1,EDX PMINUB XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} {$IFDEF TARGET_X64} MOVD XMM0,ECX MOVD XMM1,EDX PMINUB XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} end; //------------------------------------------------------------------------------ // ColorDifference //------------------------------------------------------------------------------ function ColorDifference_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} MOVD XMM0,EAX MOVD XMM1,EDX MOVQ XMM2,XMM0 PSUBUSB XMM0,XMM1 PSUBUSB XMM1,XMM2 POR XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} {$IFDEF TARGET_X64} MOVD XMM0,ECX MOVD XMM1,EDX MOVQ XMM2,XMM0 PSUBUSB XMM0,XMM1 PSUBUSB XMM1,XMM2 POR XMM0,XMM1 MOVD EAX,XMM0 {$ENDIF} end; 
//------------------------------------------------------------------------------ // ColorExclusion //------------------------------------------------------------------------------ function ColorExclusion_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} PXOR XMM2,XMM2 MOVD XMM0,EAX PUNPCKLBW XMM0,XMM2 MOVD XMM1,EDX PUNPCKLBW XMM1,XMM2 MOVQ XMM3,XMM0 PADDW XMM0,XMM1 PMULLW XMM1,XMM3 PSRLW XMM1,7 PSUBUSW XMM0,XMM1 PACKUSWB XMM0,XMM2 MOVD EAX,XMM0 {$ENDIF} {$IFDEF TARGET_X64} PXOR XMM2,XMM2 MOVD XMM0,ECX PUNPCKLBW XMM0,XMM2 MOVD XMM1,EDX PUNPCKLBW XMM1,XMM2 MOVQ XMM3,XMM0 PADDW XMM0,XMM1 PMULLW XMM1,XMM3 PSRLW XMM1,7 PSUBUSW XMM0,XMM1 PACKUSWB XMM0,XMM2 MOVD EAX,XMM0 {$ENDIF} end; //------------------------------------------------------------------------------ // ColorScale //------------------------------------------------------------------------------ function ColorScale_SSE2(C: TColor32; W: Cardinal): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} PXOR XMM2,XMM2 SHL EDX,4 MOVD XMM0,EAX PUNPCKLBW XMM0,XMM2 ADD EDX,alpha_ptr PMULLW XMM0,[EDX] PSRLW XMM0,8 PACKUSWB XMM0,XMM2 MOVD EAX,XMM0 {$ENDIF} {$IFDEF TARGET_X64} PXOR XMM2,XMM2 SHL RDX,4 MOVD XMM0,ECX PUNPCKLBW XMM0,XMM2 {$IFNDEF FPC} ADD RDX,alpha_ptr {$ELSE} ADD RDX,[RIP+alpha_ptr] {$ENDIF} PMULLW XMM0,[RDX] PSRLW XMM0,8 PACKUSWB XMM0,XMM2 MOVD EAX,XMM0 {$ENDIF} end; //------------------------------------------------------------------------------ // // Misc // //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ // LightenReg //------------------------------------------------------------------------------ function LightenReg_SSE2(C: TColor32; Amount: Integer): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF TARGET_X86} // EAX <- C: TColor32 // EDX <- Amount: integer // EAX -> Result MOVD XMM0, EAX 
TEST EDX, EDX JL @1 // Positive: Lighten IMUL EDX, $010101 MOVD XMM1, EDX PADDUSB XMM0, XMM1 MOVD EAX, XMM0 RET // Negative: Darken @1: NEG EDX IMUL EDX, $010101 MOVD XMM1, EDX PSUBUSB XMM0, XMM1 MOVD EAX, XMM0 {$ENDIF} {$IFDEF TARGET_X64} // ECX <- C: TColor32 // EDX <- Amount: integer // EAX -> Result MOVD XMM0, ECX TEST EDX, EDX JL @1 // Positive: Lighten IMUL EDX, $010101 MOVD XMM1, EDX PADDUSB XMM0, XMM1 MOVD EAX, XMM0 RET // Negative: Darken @1: NEG EDX IMUL EDX, $010101 MOVD XMM1, EDX PSUBUSB XMM0, XMM1 MOVD EAX, XMM0 {$ENDIF} end; //------------------------------------------------------------------------------ // ScaleMems //------------------------------------------------------------------------------ procedure ScaleMems_SSE41(Dst: PColor32; Count: Integer; Weight: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm // // Result Z = W * Bargb // // Approximates (x div 255) as ((x * $8081 + Bias) shr 23) // {$IFDEF TARGET_X86} // EAX <- Dst: PColor32 // EDX <- Count // ECX <- Weight: Byte // Test the counter for zero or negativity TEST EDX, EDX JLE @Done // Test if: // - Weight is 0 (i.e. clear RGB to zero) // - Weight is 255 (i.e. 
no scale) AND ECX, $000000FF JZ @Clear TEST ECX, $000000FF JE @Done // Weight = Weight * $8081 IMUL ECX, ECX, $8081 MOVD XMM0, ECX // 1*Byte -> 4*DWord PSHUFD XMM0, XMM0, 0 // XMM0[0..3] <- XMM0[0][0] @Loop: // Load dest MOVD XMM1, DWORD PTR [EAX] // XMM1 <- 00 00 00 00 Ba Br Bg Bb // 4*Byte -> 4*DWord PMOVZXBD XMM1, XMM1 // XMM1[0..3] <- Color[0][0..3] // // Scale: Result = (Weight * Color) // = (($8081 * Weight * Color) shr 23) // PMULLD XMM1, XMM0 // XMM1 <- Color * Weight * $8081 // Add bias (~$7F*$8081) PADDD XMM1, DQWORD PTR [SSE_003FFF7F_ALIGNED] // XMM1 <- (Color * Weight * $8081) + Bias // Reduce 32-bits to 9-bits PSRLD XMM1, 23 // XMM1 <- ((Color * Weight * $8081) + Bias) shr 23 // Convert from dwords to bytes with truncation (losing the sign in the 9th bit) PSHUFB XMM1, DQWORD PTR [SSE_0C080400_ALIGNED] // XMM1[0] <- XMM1[0..3][0] // Store dest MOVD [EAX], XMM1 ADD EAX, 4 DEC EDX JNZ @Loop @Done: RET @Clear: // Clear RGB, leave A as-is MOV ECX, DWORD PTR [EAX] AND ECX, $FF000000 MOV DWORD PTR [EAX], ECX ADD EAX, 4 DEC EDX JNZ @Clear {$ENDIF} {$IFDEF TARGET_X64} // RCX <- Dst: PColor32 // RDX <- Count // R8D <- Weight: Byte // Test the counter for zero or negativity TEST EDX, EDX JLE @Done // Test if: // - Weight is 0 (i.e. clear RGB to zero) // - Weight is 255 (i.e. 
no scale) AND R8D, $000000FF JZ @Clear TEST R8D, $000000FF JE @Done // Weight = Weight * $8081 IMUL R8D, R8D, $8081 MOVD XMM0, R8D // XMM0 <- Weight * $8081 // 1*Byte -> 4*DWord PSHUFD XMM0, XMM0, 0 // XMM0[0..3] <- XMM0[0][0] @Loop: // Load dest MOVD XMM1, DWORD PTR [RCX] // XMM1 <- 00 00 00 00 Ba Br Bg Bb // 4*Byte -> 4*DWord PMOVZXBD XMM1, XMM1 // XMM1[0..3] <- Color[0][0..3] // // Scale: Result = (Weight * Color) // = (($8081 * Weight * Color) shr 23) // PMULLD XMM1, XMM0 // XMM1 <- Color * Weight * $8081 // Add bias (~$7F*$8081) {$if (not defined(FPC))} PADDD XMM1, DQWORD PTR [SSE_003FFF7F_ALIGNED] // XMM1 <- (Color * Weight * $8081) + Bias {$else} PADDD XMM1, DQWORD PTR [rip+SSE_003FFF7F_ALIGNED] {$ifend} // Reduce 32-bits to 9-bits PSRLD XMM1, 23 // XMM1 <- ((Color * Weight * $8081) + Bias) shr 23 // Convert from dwords to bytes with truncation (losing the sign in the 9th bit) {$if (not defined(FPC))} PSHUFB XMM1, DQWORD PTR [SSE_0C080400_ALIGNED] // XMM1[0] <- XMM1[0..3][0] {$else} PSHUFB XMM1, DQWORD PTR [rip+SSE_0C080400_ALIGNED] {$ifend} // Store dest MOVD [RCX], XMM1 ADD RCX,4 DEC EDX JNZ @Loop @Done: RET @Clear: // Clear RGB, leave A as-is MOV EAX, DWORD PTR [RCX] AND EAX, $FF000000 MOV DWORD PTR [RCX], EAX ADD RCX, 4 DEC EDX JNZ @Clear {$ENDIF} end; procedure FastScaleMems_SSE41(Dst: PColor32; Count: Integer; Weight: Cardinal); {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm // // Result Z = W * Bargb // // Approximates (x div 255) as (x shr 8); Same as ColorScale_Pas // {$IFDEF TARGET_X86} // EAX <- Dst: PColor32 // EDX <- Count // ECX <- Weight: Byte // Test the counter for zero or negativity TEST EDX, EDX JLE @Done // Test if: // - Weight is 0 (i.e. clear RGB to zero) // - Weight is 255 (i.e. 
no scale) AND ECX, $000000FF JZ @Clear TEST ECX, $000000FF JE @Done PXOR XMM2, XMM2 // Duplicate Weight into 8 words so we can process two pixels at a time MOVD XMM0, ECX // XMM0 <- (00 00 00 00 00 00 00 WW) PSHUFLW XMM0, XMM0, 0 // (00 WW 00 WW 00 WW 00 WW) PSHUFD XMM0, XMM0, 0 // (00 WW 00 WW 00 WW 00 WW)*2 // Test for odd/even count TEST EDX, 1 JZ @Even // We have an odd number of pixels. // Process a single pixel so the remaining count is even. // Load dest MOVD XMM1, DWORD PTR [EAX] // XMM1 <- 00 00 00 00 Ba Br Bg Bb PUNPCKLBW XMM1, XMM2 // XMM1 <- 00 Ba 00 Br 00 Bg 00 Bb // // Scale: Result = (Weight * Color) // = ((Weight * Color) shr 8) // PMULLW XMM1, XMM0 PSRLW XMM1, 8 // Store dest // Pack result back from word to byte components PACKUSWB XMM1, XMM1 MOVD [EAX], XMM1 @Even: LEA EAX, [EAX + EDX * 4] // Get address of last pixel SHR EDX, 1 // Number of QWORDs JZ @Done NEG EDX // Negate count so we can use it as an offset to move forward @Loop: // Load dest MOVQ XMM1, [EAX + EDX * 8].QWORD // XMM2 <- Ba Br Bg Bb Ba Br Bg Bb {-$define FASTSCALEMEMS_SKIPWRITE} {$ifdef FASTSCALEMEMS_SKIPWRITE} // Skip scale (and thus the relatively costly write) if the color is pure black PTEST XMM1, XMM1 JZ @SkipWrite {$endif FASTSCALEMEMS_SKIPWRITE} // 8*Byte -> 8*Word PUNPCKLBW XMM1, XMM2 // XMM2 <- 00 Ba 00 Br 00 Bg 00 Bb // // Scale: Result = (Weight * Color) // = ((Weight * Color) shr 8) // PMULLW XMM1, XMM0 PSRLW XMM1, 8 // Store dest PACKUSWB XMM1, XMM2 MOVQ [EAX + EDX * 8].QWORD, XMM1 {$ifdef FASTSCALEMEMS_SKIPWRITE} @SkipWrite: {$endif FASTSCALEMEMS_SKIPWRITE} ADD EDX, 1 JS @Loop @Done: RET @Clear: // Clear RGB, leave A as-is MOV ECX, DWORD PTR [EAX] AND ECX, $FF000000 MOV DWORD PTR [EAX], ECX ADD EAX, 4 DEC EDX JNZ @Clear {$ENDIF} {$IFDEF TARGET_X64} // RCX <- Dst: PColor32 // RDX <- Count // R8D <- Weight: Byte // Test the counter for zero or negativity TEST RDX, RDX JLE @Done // Test if: // - Weight is 0 (i.e. clear RGB to zero) // - Weight is 255 (i.e. 
no scale) AND R8D, $000000FF JZ @Clear TEST R8D, $000000FF JE @Done PXOR XMM2, XMM2 // Duplicate Weight into 8 words so we can process two pixels at a time MOVD XMM0, R8D // XMM0 <- (00 00 00 00 00 00 00 WW) PSHUFLW XMM0, XMM0, 0 // (00 WW 00 WW 00 WW 00 WW) PSHUFD XMM0, XMM0, 0 // (00 WW 00 WW 00 WW 00 WW)*2 // Test for odd/even count TEST EDX, 1 JZ @Even // We have an odd number of pixels. // Process a single pixel so the remaining count is even. // Load dest MOVD XMM1, DWORD PTR [RCX] // XMM1 <- 00 00 00 00 Ba Br Bg Bb PUNPCKLBW XMM1, XMM2 // XMM1 <- 00 Ba 00 Br 00 Bg 00 Bb // // Scale: Result = (Weight * Color) // = ((Weight * Color) shr 8) // PMULLW XMM1, XMM0 PSRLW XMM1, 8 // Store dest // Pack result back from word to byte components PACKUSWB XMM1, XMM1 MOVD [RCX], XMM1 @Even: LEA RCX, [RCX + RDX * 4] // Get address of last pixel SHR RDX, 1 // Number of QWORDs JZ @Done NEG RDX // Negate count so we can use it as an offset to move forward @Loop: // Load dest MOVQ XMM1, [RCX + RDX * 8].QWORD // XMM1 <- Ba Br Bg Bb Ba Br Bg Bb // FASTSCALEMEMS_SKIPWRITE has been disabled as it doesn't give us enough and in some // cases makes the loop slower. Probably due to branch misprediction. 
{-$define FASTSCALEMEMS_SKIPWRITE} {$ifdef FASTSCALEMEMS_SKIPWRITE} // Skip scale (and thus the relatively costly write) if the color is pure black PTEST XMM1, XMM1 JZ @SkipWrite {$endif FASTSCALEMEMS_SKIPWRITE} // 8*Byte -> 8*Word PUNPCKLBW XMM1, XMM2 // XMM1 <- 00 Ba 00 Br 00 Bg 00 Bb // // Scale: Result = (Weight * Color) // = ((Weight * Color) shr 8) // PMULLW XMM1, XMM0 PSRLW XMM1, 8 // Store dest PACKUSWB XMM1, XMM2 MOVQ [RCX + RDX * 8].QWORD, XMM1 {$ifdef FASTSCALEMEMS_SKIPWRITE} @SkipWrite: {$endif FASTSCALEMEMS_SKIPWRITE} ADD RDX, 1 JS @Loop @Done: RET @Clear: // Clear RGB, leave A as-is MOV ECX, DWORD PTR [RCX] AND ECX, $FF000000 MOV DWORD PTR [RCX], ECX ADD RCX, 4 DEC RDX JNZ @Clear {$ENDIF} end; {$ifend} //------------------------------------------------------------------------------ // // Bindings // //------------------------------------------------------------------------------ procedure RegisterBindingFunctions; begin {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))} BlendRegistry[@@MergeReg].Add( @MergeReg_SSE2, [isSSE2]).Name := 'MergeReg_SSE2'; BlendRegistry[@@CombineReg].Add( @CombineReg_SSE2, [isSSE2]).Name := 'CombineReg_SSE2'; BlendRegistry[@@CombineMem].Add( @CombineMem_SSE2_128, [isSSE2]).Name := 'CombineMem_SSE2_128'; BlendRegistry[@@CombineMem].Add( @CombineMem_SSE41_Kadaif, [isSSE41]).Name := 'CombineMem_SSE41_Kadaif'; {$if defined(BENCHMARK)} BlendRegistry[@@CombineMem].Add( @CombineMem_SSE2_Table, [isSSE2], BindingPriorityWorse).Name := 'CombineMem_SSE2_Table'; BlendRegistry[@@CombineMem].Add( @CombineMem_SSE41_8081, [isSSE41], BindingPriorityWorse).Name := 'CombineMem_SSE41_8081'; {$ifend} BlendRegistry[@@CombineLine].Add( @CombineLine_SSE2, [isSSE2]).Name := 'CombineLine_SSE2'; BlendRegistry[@@BlendReg].Add( @BlendReg_SSE2, [isSSE2]).Name := 'BlendReg_SSE2'; BlendRegistry[@@BlendMem].Add( @BlendMem_SSE2, [isSSE2]).Name := 'BlendMem_SSE2'; BlendRegistry[@@BlendMems].Add( @BlendMems_SSE2, [isSSE2]).Name := 'BlendMems_SSE2'; 
BlendRegistry[@@BlendMemEx].Add( @BlendMemEx_SSE2, [isSSE2]).Name := 'BlendMemEx_SSE2'; BlendRegistry[@@BlendLine].Add( @BlendLine_SSE2, [isSSE2]).Name := 'BlendLine_SSE2'; BlendRegistry[@@BlendLineEx].Add( @BlendLineEx_SSE2, [isSSE2]).Name := 'BlendLineEx_SSE2'; BlendRegistry[@@BlendRegEx].Add( @BlendRegEx_SSE2, [isSSE2]).Name := 'BlendRegEx_SSE2'; BlendRegistry[@@ColorMax].Add( @ColorMax_SSE2, [isSSE2]).Name := 'ColorMax_SSE2'; BlendRegistry[@@ColorMin].Add( @ColorMin_SSE2, [isSSE2]).Name := 'ColorMin_SSE2'; BlendRegistry[@@ColorAdd].Add( @ColorAdd_SSE2, [isSSE2]).Name := 'ColorAdd_SSE2'; BlendRegistry[@@ColorSub].Add( @ColorSub_SSE2, [isSSE2]).Name := 'ColorSub_SSE2'; BlendRegistry[@@ColorModulate].Add( @ColorModulate_SSE2, [isSSE2]).Name := 'ColorModulate_SSE2'; BlendRegistry[@@ColorDifference].Add(@ColorDifference_SSE2, [isSSE2]).Name := 'ColorDifference_SSE2'; BlendRegistry[@@ColorExclusion].Add(@ColorExclusion_SSE2, [isSSE2]).Name := 'ColorExclusion_SSE2'; BlendRegistry[@@ColorScale].Add( @ColorScale_SSE2, [isSSE2]).Name := 'ColorScale_SSE2'; BlendRegistry[@@LightenReg].Add( @LightenReg_SSE2, [isSSE]).Name := 'LightenReg_SSE2'; BlendRegistry[@@BlendRegRGB].Add( @BlendRegRGB_SSE2, [isSSE2]).Name := 'BlendRegRGB_SSE2'; BlendRegistry[@@BlendMemRGB].Add( @BlendMemRGB_SSE2, [isSSE2]).Name := 'BlendMemRGB_SSE2'; {$if defined(GR32_SCALEMEMS_FAST) or defined(BENCHMARK)} BlendRegistry[@@ScaleMems].Add( @FastScaleMems_SSE41, [isSSE41]).Name := 'FastScaleMems_SSE41'; {$ifend} {$if (not defined(GR32_SCALEMEMS_FAST)) or defined(BENCHMARK)} BlendRegistry[@@ScaleMems].Add( @ScaleMems_SSE41, [isSSE41]).Name := 'ScaleMems_SSE41'; {$ifend} {$if defined(TEST_BLENDMEMRGB128SSE4) or defined(BENCHMARK)} BlendRegistry[@@BlendMemRGB128].Add(@BlendMemRGB128_SSE4, [isSSE2]).Name := 'BlendMemRGB128_SSE4'; {$ifend} {$ifend} end; //------------------------------------------------------------------------------ 
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------ initialization RegisterBindingFunctions; end.