unit GR32.Blur.RecursiveGaussian; (* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1 or LGPL 2.1 with linking exception * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * Alternatively, the contents of this file may be used under the terms of the * Free Pascal modified version of the GNU Lesser General Public License * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions * of this license are applicable instead of those above. * Please see the file LICENSE.txt for additional information concerning this * license. * * The Original Code is Recursive Gaussian Blur for Graphics32 * * The Initial Developer of the Original Code is * Anders Melander * * Portions created by the Initial Developer are Copyright (C) 2010-2018 * the Initial Developer. All Rights Reserved. * * ***** END LICENSE BLOCK ***** *) interface {$include GR32.inc} // IIR = Infinite Input Response {$define IIR_BLUR_DEFAULT} // Register as default Blur32 implementation. {$define IIR_BLUR_SIMD} // Enable forward/backward filter implemented using Streaming SIMD Extensions (SSE). {$define IIR_BLUR_EDGE_CORRECTION} // Apply the boundary corrections from [3] {$define IIR_BLUR_EDGE_CORRECTION_SIMD} // Enable SIMD implementation of [3] - significantly faster than the Pascal version but unfortunately not a hotspot {.$define IIR_BLUR_INKSCAPE_COEFFICIENTS}// Enable use of InkScape [7] coefficient calculation {.$define IIR_BLUR_GABOR_COEFFICIENTS} // Enable use of Gabor [2] coefficient calculation {$define IIR_USE_HADDPS} // Use the HADDPS SSE3 instruction instead of two ADDPS SSE instructions. See comment in code. {$define IIR_USE_DPPS} // Use the DPPS SSE4.1 instruction instead of a MULPS and two ADDPS SSE instructions. See comment in code. {$define IIR_BLUR_ALIGN_BUFFERS} // Use aligned buffers (note only start of buffer is aligned; Individual rows are not) {$define IIR_BLUR_DIV_LUT} // Use a Look Up Table for alpha pre- and unpremultiplication and gamma correction. Slightly faster and much more precise. // Note that GammaBlur32 requires that IIR_BLUR_DIV_LUT is defined. uses GR32; //------------------------------------------------------------------------------ // // Recursive Gaussian Blur // //------------------------------------------------------------------------------ // Anders Melander, August 2010-September 2018 //------------------------------------------------------------------------------ // References: // // [1] Recursive implementation of the Gaussian filter // Ian T. Young, Lucas J. van Vliet // Signal Processing, Volume 44, Number 2, June 1995, pp. 139-151(13) // // [2] Recursive Gabor Filtering // Ian T. Young, Lucas J. van Vliet, and Michael van Ginkel // IEEE Transactions on Signal Processing, Volume 50, Number 11, Nov 2002, pp. 2799–2805 // // [3] Boundary Conditions for Young - van Vliet Recursive Filtering // Bill Triggs, Michael Sdika // IEEE Transactions on Signal Processing, Volume 54, Number 6, June 2006, pp. 2365-2367 // // [4] Recursive Gaussian Derivative Filters // L. J. Van Vliet, I. T. Young, and P. W. Verbeek // Fourteenth International Conference on Pattern Recognition, 1998. Brisbane. // Proceedings, Volume 1, pp. 509-514 // // [5] Performance of three recursive algorithms for fast space-variant Gaussian filtering // Sovira Tan, Jason L. Dale, Alan Johnston // Real-Time Imaging, Number 9, 2003, pp 215–228 // // [6] Gimp, Retinex plug-in // Fabien Pelisson // // [7] Inkscape, Gaussian blur renderer // Niko Kiirala, bulia byak, Jasper van de Gronde //------------------------------------------------------------------------------ // // From [1]: // // This implementation yields an infinite impulse response filter that has six // MADDs (Multiply/Add) per dimension independent of the value of sigma in the // Gaussian kernel. In contrast to the Deriche implementation (1987), the // coefficients of our recursive filter have a simple, closed-form solution for a // desired value of the Gaussian sigma. Our implementation is, in general, faster // than: // // (1) an implementation based upon direct convolution with samples of a Gaussian, // (2) repeated convolutions with a kernel such as the uniform filter, and // (3) an FFT implementation of a Gaussian filter. // //------------------------------------------------------------------------------ procedure RecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat); procedure RecursiveGaussianBlurRadius(Src, Dst: TBitmap32; Radius: TFloat); procedure RecursiveGaussianBlurGamma(Src, Dst: TBitmap32; Sigma: TFloat); procedure RecursiveGaussianBlurRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat); procedure RecursiveGaussianBlurHorizontalRadius(Src, Dst: TBitmap32; Radius: TFloat); procedure RecursiveGaussianBlurHorizontalRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat); //------------------------------------------------------------------------------ // // Recursive Gaussian Blur internal API types // //------------------------------------------------------------------------------ // For now this implementation only works when TFloat=Single since the SSE // and transpose code has been written for 32-bit values. //------------------------------------------------------------------------------ type TQuadFloatRec = packed record v0: TFloat; v1: TFloat; v2: TFloat; v3: TFloat; end; TQuadFloat = array[0..3] of TFloat; PQuadFloat = ^TQuadFloat; TNineFloats = array[0..8] of TFloat; //------------------------------------------------------------------------------ // // Recursive Gaussian Blur internal bindings // //------------------------------------------------------------------------------ type TIIR_BlurFilterBackwardProc = procedure (pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat); TIIR_BlurFilterForwardProc = procedure (pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat); TIIR_BlurApplyEdgeCorrection = procedure (iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats); var IIR_BlurFilterBackward: TIIR_BlurFilterBackwardProc = nil; IIR_BlurFilterForward: TIIR_BlurFilterForwardProc = nil; IIR_BlurApplyEdgeCorrection: TIIR_BlurApplyEdgeCorrection = nil; //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ implementation uses SyncObjs, // TCriticalSection Math, GR32.Blur, GR32_Bindings, GR32.Transpose, GR32.Math.Complex, GR32_OrdinalMaps, GR32_Blend, GR32_LowLevel, GR32_Math, GR32_System, GR32_Gamma; // Ensure that we use the GR32.TFloat and not FPC's Math.TFloat (which is an alias for Double!) type TFloat = GR32.TFloat; PFloat = ^TFloat; const OneOver255: TFloat = 1/255; //------------------------------------------------------------------------------ // // PremultiplyLUT // //------------------------------------------------------------------------------ // Lookup tables for alpha premultiplication. // // MulDiv255[a,b] = a * b / 255 Used for premultiplication // Mul255Div[a,b] = Round(a * 255 / b) Used for unpremultiplication // // where // // a: Color value // b: Alpha value // // PremultiplyLUT is used for pre- and unpremultiplication. // GammaPremultiplyLUT rolls the gamma correction and pre-/unpremultiplication // operations into one for a significant gain in precision at no extra cost in // performance. // //------------------------------------------------------------------------------ {$ifdef IIR_BLUR_DIV_LUT} type PPremultiplyLUT = ^TPremultiplyLUT; TPremultiplyLUT = record strict private class constructor Create; class destructor Destroy; strict private class var FLock: TCriticalSection; FPremultiplyLUT: PPremultiplyLUT; FGammaPremultiplyLUT: PPremultiplyLUT; strict private FsRGB: boolean; FGamma: Double; FGammaInv: Double; procedure SetGamma(const GammaValue: Double; sRGB: boolean); procedure GammaChangedHandler; public Mul255Div: array[byte, byte] of byte; MulDiv255: array[byte, byte] of TFloat; public class var public class function PremultiplyLUT: PPremultiplyLUT; static; class function GammaPremultiplyLUT: PPremultiplyLUT; static; property sRGB: boolean read FsRGB; property Gamma: Double read FGamma; end; //------------------------------------------------------------------------------ class constructor TPremultiplyLUT.Create; begin FLock := TCriticalSection.Create; FPremultiplyLUT := nil; FGammaPremultiplyLUT := nil; end; class destructor TPremultiplyLUT.Destroy; begin FLock.Free; if (FPremultiplyLUT <> nil) then Dispose(FPremultiplyLUT); if (FGammaPremultiplyLUT <> nil) then begin UnregisterGammaChangeNotification(FGammaPremultiplyLUT.GammaChangedHandler); Dispose(FGammaPremultiplyLUT); end; end; //------------------------------------------------------------------------------ class function TPremultiplyLUT.PremultiplyLUT: PPremultiplyLUT; var AlphaValue, ColorValue: Integer; begin if (FPremultiplyLUT = nil) then begin FLock.Acquire; if (FPremultiplyLUT = nil) then begin New(FPremultiplyLUT); for ColorValue := 0 to 255 do begin FPremultiplyLUT.Mul255Div[ColorValue, 0] := 0; FPremultiplyLUT.MulDiv255[ColorValue, 0] := 0.0; for AlphaValue := 1 to 255 do begin FPremultiplyLUT.Mul255Div[ColorValue, AlphaValue] := Clamp(Round(ColorValue * 255 / AlphaValue)); FPremultiplyLUT.MulDiv255[ColorValue, AlphaValue] := ColorValue * AlphaValue * OneOver255;; end; end; end; FLock.Release; end; Result := FPremultiplyLUT; end; //------------------------------------------------------------------------------ class function TPremultiplyLUT.GammaPremultiplyLUT: PPremultiplyLUT; begin if (FGammaPremultiplyLUT = nil) then begin FLock.Acquire; if (FGammaPremultiplyLUT = nil) then begin New(FGammaPremultiplyLUT); RegisterGammaChangeNotification(FGammaPremultiplyLUT.GammaChangedHandler); FGammaPremultiplyLUT.SetGamma(GAMMA_VALUE, GAMMA_IS_SRGB); end; FLock.Release; end; Result := FGammaPremultiplyLUT; end; procedure TPremultiplyLUT.GammaChangedHandler; begin SetGamma(GAMMA_VALUE, GAMMA_IS_SRGB); end; procedure TPremultiplyLUT.SetGamma(const GammaValue: Double; sRGB: boolean); var AlphaValue, ColorValue: Integer; n: Single; ColorLinear, ColorRGB: TFloat; begin if (FsRGB = sRGB) and ((FsRGB) or (GammaValue = FGamma)) then exit; FsRGB := sRGB; if (not FsRGB) then begin FGamma := GammaValue; FGammaInv := 1 / FGamma; end; for ColorValue := 0 to 255 do begin Mul255Div[ColorValue, 0] := 0; MulDiv255[ColorValue, 0] := 0.0; // sRGB -> Linear RGB / 255 ColorLinear := ColorValue * OneOver255; if (FsRGB) then begin if (ColorLinear >= 0.04045) then ColorLinear := Power((ColorLinear + 0.055) * (1 / 1.055), 2.4) else ColorLinear := ColorLinear * (1 / 12.92); end else ColorLinear := Power(ColorLinear, FGamma); // ColorValue: Color, AlphaValue: Alpha for AlphaValue := 1 to 255 do begin // Linear RGB -> Premultiplied, Linear RGB n := ColorLinear * AlphaValue; MulDiv255[ColorValue, AlphaValue] := n; // Premultiplied, Linear RGB -> Unpremultiplied, Linear RGB n := ColorValue / AlphaValue; // Linear RGB -> sRGB / 255 if (FsRGB) then begin if (n >= 0.0031308) then ColorRGB := 1.055 * Power(n, 1 / 2.4) - 0.055 else ColorRGB := n * 12.92; end else ColorRGB := Power(n, FGammaInv); Mul255Div[ColorValue, AlphaValue] := Clamp(Round(ColorRGB * 255)); end; end; end; {$endif IIR_BLUR_DIV_LUT} //------------------------------------------------------------------------------ // // Recursive Gaussian Blur // //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ // Low level, SIMD implementations //------------------------------------------------------------------------------ {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))} procedure BlurFilterForward_SSE41(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat); {$IFDEF FPC}assembler;{$ENDIF} // Parameters (x86): // EAX <- pIn // EDX <- pOut // ECX <- B // Stack[0] <- Width // Stack[1] <- v // Preserves: (none) // // Parameters (x64): // RCX <- pIn // RDX <- pOut // R8 <- B // R9 <- Width // Stack[0] <- v // Preserves: (none) // // SSE register usage: // XMM0: B3 | B2 | B1 | B0 // XMM1: * | v1 | v2 | v3 // XMM2, XMM3: misc use asm {$if defined(TARGET_x64)} {$IFNDEF FPC} // nothing {$ELSE} // nothing {$ENDIF} {$elseif defined(TARGET_x86)} // nothing {$else} {$message fatal 'Unsupported target'} {$ifend} MOVUPS XMM0, TQuadFloatRec PTR [B] // XMM0 <- B3 | B2 | B1 | B0 (* ** Initialization of forward pass ** ** v[1] := pIn^ / (B[0] + B[1] + B[2] + B[3]); ** v[2] := v[1]; ** v[3] := v[1]; *) MOVAPS XMM2, XMM0 // XMM2 <- B3 | B2 | B1 | B0 // XMM2[0] <- (B + B1 + B2 + B3) // TODO : HADDPS is slow according to this: // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026 // However, my own benchmarks indicate that the alternative is slower... {$ifdef IIR_USE_HADDPS} HADDPS XMM2, XMM2 // XMM2[0..3] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3] HADDPS XMM2, XMM2 {$else IIR_USE_HADDPS} MOVHLPS XMM3, XMM2 // XMM3[0,1] <- XMM2[2,3] ADDPS XMM2, XMM3 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3] PSHUFD XMM3, XMM2, $01 ADDSS XMM2, XMM3 {$endif IIR_USE_HADDPS} MOVSS XMM1, [pIn] // XMM1[0] <- pIn^ DIVSS XMM1, XMM2 // XMM1[0] <- XMM1[0] / XMM2[0] SHUFPS XMM1, XMM1, $00 // XMM1[0..3] <- XMM1[0] // Iterate over all cols using the "negative offset technique" {$if defined(TARGET_x86)} // We're now done with the value in the "B" register so it's repurposed for the "Width" counter. MOV ECX, Width LEA pIn, [pIn+ECX*4] LEA pOut, [pOut+ECX*4] NEG ECX {$elseif defined(TARGET_x64)} LEA pIn, [pIn+R9*4] LEA pOut, [pOut+R9*4] NEG R9 {$ifend} @LOOP: (* ** v0 := (pIn^ * B0) + (v1 * B1) + (v2 * B2) + (v3 * B3); *) {$if defined(TARGET_x86)} MOVSS XMM2, [pIn+ECX*4] // XMM2[0] <- pIn {$elseif defined(TARGET_x64)} MOVSS XMM2, TFloat PTR [pIn+R9*4] // XMM2[0] <- pIn {$ifend} MOVSS XMM1, XMM2 // XMM1[0] <- XMM2[0] MOVAPS XMM2, XMM1 // XMM2 <- XMM1 // TODO : DPPS might be slow on some CPUs: // https://stackoverflow.com/questions/37879678/dot-product-performance-with-sse-instructions-is-dpps-worth-using // On my system it's significantly faster than the alternative {$ifdef IIR_USE_DPPS} DPPS XMM2, XMM0, $FF {$else IIR_USE_DPPS} // Multiply B[] with v[] MULPS XMM2, XMM0 // XMM2 <- XMM2 * XMM0 // Add the four results of the previous vectorized multiplication MOVHLPS XMM3, XMM2 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3] ADDPS XMM2, XMM3 PSHUFD XMM3, XMM2, $01 ADDSS XMM2, XMM3 {$endif IIR_USE_DPPS} MOVSS XMM1, XMM2 // XMM1[0] <- v0 // pOut^ := v0; {$if defined(TARGET_x86)} MOVSS [pOut+ECX*4], XMM2 {$elseif defined(TARGET_x64)} MOVSS TFloat PTR [pOut+R9*4], XMM2 {$ifend} (* ** Shift cached output values ** v3 := v2; ** v2 := v1; ** v1 := v0; **) SHUFPS XMM1, XMM1, $93 // XMM1[0,1,2,3] <- XMM1[3,0,1,2] // Loop to next col {$if defined(TARGET_x86)} INC ECX {$elseif defined(TARGET_x64)} INC R9 {$ifend} JNZ @LOOP // Update v1, v2, v3 for backward filter {$if defined(TARGET_x86)} MOV EDX, v MOVUPS [EDX], XMM1 {$elseif defined(TARGET_x64)} // TODO : For some reason this doesn't work ("v" isn't updated): // MOVUPS [TQuadFloat PTR v], XMM1 MOV RDX, v MOVUPS [RDX], XMM1 {$ifend} {$if defined(TARGET_x64)} {$IFDEF FPC} // nothing {$ENDIF} {$elseif defined(TARGET_x86)} // nothing {$ifend} end; //------------------------------------------------------------------------------ procedure BlurFilterBackward_SSE41(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; const v: TQuadFloat); {$IFDEF FPC}assembler;{$ENDIF} // Parameters (x86): // EAX <- pIn // EDX <- pOut // ECX <- B // Stack[0] <- Width // Stack[1] <- v // Preserves: (none) // // Parameters (x64): // RCX <- pIn // RDX <- pOut // R8 <- B // R9 <- Width // Stack[0] <- v // Preserves: (none) // // SSE register usage: // XMM0: B3 | B2 | B1 | B0 // XMM1: * | v1 | v2 | v3 // XMM2, XMM3: misc use asm MOVUPS XMM0, TQuadFloatRec PTR [B] // XMM0 <- B3 | B2 | B1 | B0 {$if defined(TARGET_x86)} MOV ECX, v MOVUPS XMM1, [ECX] // XMM1 <- v3 | v2 | v1 | * {$elseif defined(TARGET_x64)} // TODO : For some reason this doesn't work ("v" isn't loaded): // MOVUPS XMM1, [TQuadFloat PTR v] // XMM1 <- v3 | v2 | v1 | * MOV R8, QWORD PTR v MOVUPS XMM1, [R8] // XMM1 <- v3 | v2 | v1 | * {$ifend} // Iterate over all cols // Note: We're iterating backwards from the last to the first col. // On entry, pIn and pOut points to the last col. Here we adjust them to point to the first col // so we can use the col counter as an offset. {$if defined(TARGET_x86)} // We're now done with the value in the "B" register so it's repurposed for the "Width" counter. MOV ECX, Width // Although the compiler accepts [RegA-RegB*4] it assembles it to [RegA+RegB*4] so we need // to temporarily negate the offset value NEG ECX LEA pIn, [pIn+ECX*4] LEA pOut, [pOut+ECX*4] NEG ECX {$elseif defined(TARGET_x64)} NEG R9 LEA pIn, [pIn+R9*4] LEA pOut, [pOut+R9*4] NEG R9 {$ifend} @LOOP: (* ** v0 := (B0 * pIn^) + (v1 * B1) + (v2 * B2) + (v3 * B3); *) // Multiply B[] with v[] {$if defined(TARGET_x86)} INSERTPS XMM1, [pIn+ECX*4], $00 // XMM1[0] <- pIn {$elseif defined(TARGET_x64)} INSERTPS XMM1, TFloat PTR [pIn+R9*4], $00 // XMM1[0] <- pIn {$ifend} MOVAPS XMM2, XMM1 // XMM2 <- XMM1 {$ifdef IIR_USE_DPPS} DPPS XMM2, XMM0, $FF // XMM2 <- XMM2 dot XMM1 {$else IIR_USE_DPPS} MULPS XMM2, XMM0 // XMM2 <- XMM2 * XMM0 // Add the four results of the previous vectorized multiplication MOVHLPS XMM3, XMM2 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3] ADDPS XMM2, XMM3 PSHUFD XMM3, XMM2, $01 ADDSS XMM2, XMM3 {$endif IIR_USE_DPPS} MOVSS XMM1, XMM2 // XMM1[0] <- v0 // pOut^ := v0; {$if defined(TARGET_x86)} MOVSS [pOut+ECX*4], XMM2 {$elseif defined(TARGET_x64)} MOVSS TFloat PTR [pOut+R9*4], XMM2 {$ifend} (* ** Shift cached output values ** v3 := v2; ** v2 := v1; ** v1 := v0; **) SHUFPS XMM1, XMM1, $93 // XMM0 <- v2 | v1 | v0 | v3 // Loop to next (previous actually) col {$if defined(TARGET_x86)} DEC ECX {$elseif defined(TARGET_x64)} DEC R9 {$ifend} JNZ @LOOP end; //------------------------------------------------------------------------------ procedure BlurApplyEdgeCorrection_SSE41(iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats); // {$IFDEF FPC}assembler;{$ENDIF} // Parameters (x86): // EAX <- iPlus // EDX <- B0 // ECX <- v // Stack[0] <- M // Preserves: (none) // // Parameters (x64): // RCX <- iPlus // RDX <- B0 // R8 <- v // R9 <- M // Preserves: XMM4 // // SSE register usage: // XMM0: B3 | B2 | B1 | B0 // XMM1: * | v1 | v2 | v3 // XMM2, XMM3, XMM4: misc use {$if defined(TARGET_x64) and defined(FPC)}begin{$ifend} asm {$if defined(TARGET_x64)} {$IFNDEF FPC} .SAVENV XMM4 {$ENDIF} {$elseif defined(TARGET_x86)} // nothing {$else} {$message fatal 'Unsupported target'} {$ifend} (* ** unp0 := v[1] - uPlus; ** unp1 := v[2] - uPlus; ** unp2 := v[3] - uPlus; *) // XMM1 = [v3 | v2 | v1 | 0] MOVUPS XMM1, TQuadFloat PTR [v] // XMM1 <- v3 | v2 | v1 | v0 // Save v0 MOVAPS XMM5, XMM1 // Now get rid of XMM1[0] SHUFPS XMM1, XMM1, $39 // XMM1 <- v0 | v3 | v2 | v1 INSERTPS XMM1, XMM1, $F8 // XMM1 <- 0 | v3 | v2 | v1 // XMM0 = [0 | u+ | u+ | u+] MOVSS XMM0, [iPlus] // XMM0[0] <- u+ SHUFPS XMM0, XMM0, $00 // XMM0 <- u+ | u+ | u+ | u+ // XMM1 = XMM1*XMM0 = [0, unp2, unp1, unp0] SUBPS XMM1, XMM0 // XMM1 <- XMM0 - u+ // XMM1 = [0, unp2,unp1,unp0] INSERTPS XMM1, XMM1, $F8 // XMM1[3] <- 0 (* ** v[1] := ( (M0 * unp0) + (M1 * unp1) + (M2 * unp2) ) * B0 + vPlus; ** v[2] := ( (M3 * unp0) + (M4 * unp1) + (M5 * unp2) ) * B0 + vPlus; ** v[3] := ( (M6 * unp0) + (M7 * unp1) + (M8 * unp2) ) * B0 + vPlus; *) // XMM2 = [0 | M2 | M1 | M0] // XMM3 = [0 | M5 | M4 | M3] // XMM4 = [0 | M8 | M7 | M6] {$if defined(TARGET_x86)} MOV EAX, M MOVUPS XMM2, [EAX] // XMM1 <- * | M2 | M1 | M0 {$elseif defined(TARGET_x64)} MOVUPS XMM2, TQuadFloat PTR [M] // XMM1 <- * | M2 | M1 | M0 {$ifend} INSERTPS XMM2, XMM2, $F8 // XMM2 <- 0 | M2 | M1 | M0 {$if defined(TARGET_x86)} ADD EAX, 12//SizeOf(TFloat)*3 MOVUPS XMM3, [EAX] // XMM3 <- * | M5 | M4 | M3 {$elseif defined(TARGET_x64)} ADD M, 12//SizeOf(TFloat)*3 MOVUPS XMM3, TQuadFloat PTR [M] // XMM3 <- * | M5 | M4 | M3 {$ifend} INSERTPS XMM3, XMM3, $F8 // XMM3 <- 0 | M5 | M4 | M3 {$if defined(TARGET_x86)} ADD EAX, 12//SizeOf(TFloat)*3 MOVUPS XMM4, [EAX] // XMM4 <- * | M8 | M7 | M6 {$elseif defined(TARGET_x64)} ADD M, 12//SizeOf(TFloat)*3 MOVUPS XMM4, TQuadFloat PTR [M] // XMM4 <- * | M8 | M7 | M6 {$ifend} INSERTPS XMM4, XMM4, $F8 // XMM4 <- 0 | M8 | M7 | M6 // XMM2[1] = [M2, M1, M0]*[unp2, unp1, unp0] = (M0 * unp0) + (M1 * unp1) + (M2 * unp2) // TODO : {$ifdef IIR_USE_DPPS} DPPS XMM2, XMM1, $72 // XMM2[1] <- [M2, M1, M0]*[unp2, unp1, unp0] // XMM3[2] = [M5, M4, M3]*[unp2, unp1, unp0] = (M3 * unp0) + (M4 * unp1) + (M5 * unp2) // TODO : {$ifdef IIR_USE_DPPS} DPPS XMM3, XMM1, $74 // XMM3[2] <- [M5, M4, M3]*[unp2, unp1, unp0] // XMM4[3] = [M8, M7, M6]*[unp2, unp1, unp0] = (M6 * unp0) + (M7 * unp1) + (M8 * unp2) // TODO : {$ifdef IIR_USE_DPPS} DPPS XMM4, XMM1, $78 // XMM4[3] <- [M8, M7, M6]*[unp2, unp1, unp0] // XMM2[1..3] = [ XMM2[1] | XMM3[2] | XMM4[3] ] INSERTPS XMM2, XMM3, $A0 // XMM2[2] <- XMM3[2] INSERTPS XMM2, XMM4, $F0 // XMM2[3] <- XMM4[3] // XMM1 = B[0] {$IFNDEF FPC} MOVSS XMM1, TFloat PTR [B] // XMM1[1] <- B0 {$ELSE} {$if defined(TARGET_x64)} MOVSS XMM1, TFloat PTR [RDX] // XMM1[1] <- B0 {$else} MOVSS XMM1, TFloat PTR [EDX] // XMM1[1] <- B0 {$ifend} {$ENDIF} SHUFPS XMM1, XMM1, 0 // XMM1 <- B0 | B0 | B0 | B0 // XMM2 = XMM2 * B0 MULPS XMM2, XMM1 // XMM2 <- XMM2 * B0 // XMM2 = XMM2 + v+ ADDPS XMM2, XMM0 // XMM2 <- XMM2 + v+ INSERTPS XMM2, XMM5, $00 // XMM2[0] <- XMM5[0] MOVUPS TQuadFloat PTR [v], XMM2 // XMM2 -> v {$if defined(TARGET_x64) and defined(FPC)}end['XMM4'];{$ifend} end; {$ifend} //------------------------------------------------------------------------------ // Low level, Pascal implementations //------------------------------------------------------------------------------ procedure BlurFilterForward_Pas(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var vv: TQuadFloat); inline; var i: integer; v: TQuadFloat; begin // Initialization of forward pass // [2] Equation (20) // // in[1] // w[1] = w[2] = w[3] = ────────────────── // 1+ fd1 + fd2 + fd3 // v[1] := pIn[0] / (B[0] + B[1] + B[2] + B[3]); v[2] := v[1]; v[3] := v[1]; // Forward pass // [1] Equation (9a) // // B[1]*v[n-1] + B[2]*v[n-2] + B[3]*v[n-3] ∑ B[1..3] // w[n] = B * in[n] + ─────────────────────────────────────── , B = 1 - ───────── ([1] equation 10) // B[0] B[0] // for i := 0 to Width-1 do begin // Note: In this implementation B[1..3] has already been scaled by 1/b0 and B[0] instead contains B. v[0] := (pIn[i] * B[0]) + (v[1] * B[1]) + (v[2] * B[2]) + (v[3] * B[3]); pOut[i] := v[0]; // Shift cached output values v[3] := v[2]; v[2] := v[1]; v[1] := v[0]; end; vv := v; end; //------------------------------------------------------------------------------ procedure BlurFilterBackward_Pas(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat); inline; var i: integer; begin Dec(PFloat(pIn), Width-1); Dec(PFloat(pOut), Width-1); // Backward pass // [1] Equation (9b) // // B[1]*out[n+1] + B[2]*out[n+2] + B[3]*out[n+3] ∑ B[1..3] // out[n] = B * w[n] + ───────────────────────────────────────────── , B = 1 - ───────── ([1] equation 10) // B[0] B[0] // for i := Width-1 downto 0 do // Using negative array index works too but will give the purists a heart attack begin v[0] := (B[0] * pIn[i]) + (v[1] * B[1]) + (v[2] * B[2]) + (v[3] * B[3]); pOut[i] := v[0]; // Shift cached output values v[3] := v[2]; v[2] := v[1]; v[1] := v[0]; end; end; //------------------------------------------------------------------------------ procedure BlurApplyEdgeCorrection_Pas(iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats); var uPlus, vPlus: TFloat; unp0, unp1, unp2: TFloat; begin // [3] Equation (15) // iPlus = infinite stream of input values = value of rightmost pixel in our case // [3] Equation (15a) // // i+ i+ // u+ = ────────────── = ──── // 1 - ∑ b1..b3 B // // Note: B = B[0] = 1-(B[1]+B[2]+B[3]), [1] Equation (10) // See calculation of B[0] in ComputeCoefficients* uPlus := iPlus^; // We have already scaled by B // [3] Equation (15b) // // u+ u+ // v+ = ────────────── = ──── // 1 - ∑ b1..b3 B // vPlus := uPlus; // We have already scaled by B // [3] Equation (14) // // ┌ ┐ ┌ ┐ // │v[2] - u+│ │ v+ │ // v[] = M[] * │v[1] - u+│ + │ v+ │ // │v[0] - u+│ │ v+ │ // └ ┘ └ ┘ // unp0 := v[1] - uPlus; unp1 := v[2] - uPlus; unp2 := v[3] - uPlus; v[1] := ( (M[0] * unp0) + (M[1] * unp1) + (M[2] * unp2) ) * B[0] + vPlus; v[2] := ( (M[3] * unp0) + (M[4] * unp1) + (M[5] * unp2) ) * B[0] + vPlus; v[3] := ( (M[6] * unp0) + (M[7] * unp1) + (M[8] * unp2) ) * B[0] + vPlus; end; //------------------------------------------------------------------------------ // Coefficient calculation //------------------------------------------------------------------------------ // Uses algorithm from Inkscape to calculate coefficients (Adapted from [7]. Based on [4]). // (Iterative and supposedly even better approximation than [2]). // Note: My (anme) own tests shows that [2] produces a much more precise approximation // of the gaussian curve. procedure ComputeCoefficientsInkscape(Sigma: TFloat; var B: TQuadFloat); var d1_org: TComplex; d3_org: Double; qLow, qHigh: Double; SigmaSqr: Double; s, q: Double; d1: TComplex; d3: Double; sSqr: Double; AbsD1Sqr: Double; re2d1: Double; B0: Double; Limit: Double; begin (* The use of ComputeCoefficientsInkscape is currently disabled by default because: - ComputeCoefficientsInkscape is only valid for Sigma >= 20 and the transition between the default coefficient and ComputeCoefficientsInkscape isn't smooth. *) // [4] Table 1, 3rd order filter d1_org := TComplex.From(1.40098, 1.00236); d3_org := 1.85132; SigmaSqr := Sqr(Sigma); // [4] suggests using "a few iterations of a linear extrapolation scheme" to find // the value of q that yields a filter h[n] with the specified variance. // Here we use a binary search instead. qLow := 1; // Don't go lower than sigma=2 (we'd probably want a normal convolution in that case anyway) qHigh := 2 * Sigma; {$if (SizeOf(TFloat) > SizeOf(Single))} Limit := Sigma / (1 shl 30) {$else} Limit := Sigma / (1 shl 20); {$ifend} repeat q := (qLow + qHigh) / 2; // Compute scaled filter coefficients // [4] Equation (20) // 1 jθ(d) 1 jθ(d) // ─── ───── ─── ───── -2 // var(h[n]) = ∑ 2|d| q * e q * ( |d| q - e q ) // d1 := TComplex.Power(d1_org, 1 / q); d3 := Power(d3_org, 1 / q); // Compute actual sigma^2 // [4] Equation (10) // // 2d // ∑ n²*h[n] = ∑ ──────── // (d - 1)² // sSqr := 2 * (2 * (d1 / TComplex.Sqr(d1 - 1)).Real + d3 / Sqr(d3 - 1)); if (sSqr < SigmaSqr) then qLow := q else qHigh := q; s := qHigh - qLow; until (s <= Limit); q := (qLow + qHigh) / 2; // Compute z-poles // [4] Equation (20) d1 := TComplex.Power(d1_org, 1 / q); d3 := Power(d3_org, 1 / q); AbsD1Sqr := TComplex.AbsSqr(d1); // d1*d2 = d1*conj(d1) = |d1|^2 = std::norm(d1) re2d1 := 2 * d1.Real; // d1+d2 = d1+conj(d1) = 2*real(d1) // Compute filter coefficients // [4] Equation (13) // Note: For performance b1..b3 are divided b0 to avoid doing so in the inner loop. // Additionally B is stored in B[0] since we no longer need b0. B0 := 1 / (AbsD1Sqr * d3); B[1] := (AbsD1Sqr + d3 * re2d1) * B0; B[2] := -(d3 + re2d1) * B0; B[3] := B0; // B: Normalization constant // [1] Equation (10) B[0] := 1.0 - (B[1] + B[2] + B[3]); end; //------------------------------------------------------------------------------ // Uses algorithm from [2] to calculate coefficients (more precise approximation). // Not stable for sigma >= 20 (Radius >= 50) - produces "crazy" results. (* The use of ComputeCoefficientsGabor is currently disabled by default because: a) ComputeCoefficientsGaussianDerivative produces almost the same values for Radius=1 as ComputeCoefficientsGabor does for Radius=2 which makes no sense to the user. b) ComputeCoefficientsGabor doesn't support Sigma < 0.5 so we can't just use it instead of ComputeCoefficientsGaussianDerivative. * Radius: 1; Sigma: 0.300386630413846 ComputeCoefficientsGaussianDerivative B[0.840293169021606, 0.170402005314827, -0.0110350390896201, 0.00033983588218689] * Radius: 2; Sigma: 0.300386630413846) 0.600773260827692 ComputeCoefficientsGabor B[0.845094859600067, 0.164933204650879, -0.0103367269039154, 0.000308647722704336] *) procedure ComputeCoefficientsGabor(Sigma: TFloat; var B: TQuadFloat); var q, qq, qqq: Double; B0: Double; const // [2] Equation (7) m0 = 1.16680; m1 = 1.10783; m2 = 1.40586; m1m1 = m1*m1; m2m2 = m2*m2; begin // Compute q from Standard Deviation σ // [2] Equation (16) // // ┌ // │ 0.9804(σ -3.556) + 2.5091 for σ >= 3.556 // q = ┤ // │ 0.0561σ² +0.5784σ - 0.2568 for 0.5 <= σ <= 3.556 // └ // Note: Doesn't work well for values smaller than 0.5 if (Sigma < 3.556) then q := (0.0561 * Sigma + 0.5784) * Sigma - 0.2568 // = 0.0561*Sqr(Sigma) + 0.5784*Sigma - 0.2568 else q := 0.9804 * (Sigma - 3.556) + 2.5091; qq := Sqr(q); qqq := qq * q; // Compute filter coefficients // [2] Equation (15) // // b0 = 1 [*] scale = (m0 + q)(m1² + m2² + 2m1q + q²) [*] Something is missing here in the original paper // b1 = -q(2m0m1 + m1² + m2² + (2m0 + 4m1)q + 3q²) / scale // b2 = q²(m0 + 2m1 + 3q) / scale // b3 = -q³ / scale // // Note: We have reversed the sign of b1, b2 & b3 from [2] // Note: For performance we scale B[1..3] by 1/b0 to avoid doing so in the inner loop. // Additionally we store B in B[0]. B0 := 1 / ((m0 + q) * (m1m1 + m2m2 + 2 * m1 * q + qq)); B[1] := (q * (2 * m0 * m1 + m1m1 + m2m2 + (2 * m0 + 4 * m1) * q + 3 * qq)) * B0; B[2] := (-qq * (m0 + 2 * m1 + 3 * q)) * B0; B[3] := (qqq) * B0; // [2] Equation (19) // // B = (m0(m1² + m2²)/scale)² // B[0] := (m0 * (m1m1 + m2m2)) * B0; end; //------------------------------------------------------------------------------ // Uses algorithm from [4] to calculate coefficients procedure ComputeCoefficientsGaussianDerivative(Sigma: TFloat; var B: TQuadFloat); var q, qq, qqq: Double; B0: Double; // Sigma2, Sigma3, Sigma4: Double; begin // Compute q from Standard Deviation (* if (Sigma <= 4) and (Sigma >= 1) then begin // [5] Equation (22) // Valid for the range σ = [1,4] Sigma2 := Sqr(Sigma); Sigma3 := Sigma2 * Sigma; Sigma4 := Sigma3 * Sigma; q := 0.0001 * Sigma4 - 0.0021 * Sigma3 + 0.0207 * Sigma2 + 0.3797 * Sigma + 0.1763; end else // Well, this sucks: The transition between the case above (Radius=13, Sigma=3.9) and the one // below (Radius=14, Sigma=4.2) isn't smooth; There's a visible "jump" in the amount of blur // when we transition between them. *) if (Sigma >= 2.5) then // [1] Equation (11b) q := 0.98711 * Sigma - 0.96330 else if (Sigma >= 0.5) then // [1] Equation (11b) q := 3.97156 - 4.14554 * Sqrt(1.0 - 0.26891 * Sigma) else // [6] q := 0.1147705018520355224609375; qq := Sqr(q); qqq := qq * q; // Compute filter coefficients // [1] Equation (8c) B0 := 1 / (1.57825 + (2.44413 * q) + (1.4281 * qq) + (0.422205 * qqq)); B[1] := ( (2.44413 * q) + (2.85619 * qq) + (1.26661 * qqq)) * B0; B[2] := (-( (1.4281 * qq) + (1.26661 * qqq))) * B0; B[3] := ( 0.422205 * qqq) * B0; // B: Normalization constant // [1] Equation (10) B[0] := 1.0 - (B[1] + B[2] + B[3]); end; //------------------------------------------------------------------------------ // Edge correction matrix [3] //------------------------------------------------------------------------------ procedure ComputeBoundaryCorrectionMatrix(const B: TQuadFloat; var M: TNineFloats); var Scale: Double; B3B3, B2B2: Double; B3B1, B3B2: Double; begin // Compute edge correction matrix M[3,3] // [3] Equation (5) // // // 1 // Scale = ──────────────────────────────────────── // (1+a1−a2+a3)(1−a1−a2−a3)(1+a2+(a1−a3)a3) // // ┌ ┐ // │ −a3a1+1−a3²-a2 (a3+a1)(a2+a3a1) a3(a1+a3a2) │ // M = Scale * │ a1+a3a2 −(a2−1)(a2+a3a1) −(a3a1+a3²+a2−1)a3 │ // │ a3a1+a2+a1²−a2² a1a2+a3a2²−a1a3²−a3³−a3a2+a3 a3(a1+a3a2) │ // └ ┘ // Scale := 1 / ((1 + B[1] - B[2] + B[3]) * (1 - B[1] - B[2] - B[3]) * (1 + B[2] + (B[1] - B[3]) * B[3])); B3B3 := Sqr(B[3]); B2B2 := Sqr(B[2]); B3B1 := B[3] * B[1]; B3B2 := B[3] * B[2]; M[0] := Scale * (-B3B1 + 1 - B3B3 - B[2]); // [0,0] M[1] := Scale * (B[3] + B[1]) * (B[2] + B3B1); // [0,1] M[3] := Scale * (B[1] + B3B2); // [1,0] M[2] := M[3] * B[3]; // [0,2] M[4] := -Scale * (B[2] - 1) * (B[2] + B3B1); // [1,1] M[5] := -Scale * B[3] * (B3B1 + B3B3 + B[2] - 1); // [1,2] M[6] := Scale * (B3B1 + B[2] + Sqr(B[1]) - B2B2); // [2,0] M[7] := Scale * (B[1] * B[2] + B[3] * B2B2 - B3B3 * (B[1]+B[3]) - B3B2 + B[3]); // [2,1] M[8] := Scale * B[3] * (B[1] + B3B2); // [2,2] end; //------------------------------------------------------------------------------ // Single channel buffer I/O //------------------------------------------------------------------------------ // Note: Although the layout of the Graphics32 TColor32Entry type depends on the // RGBA_FORMAT define, the position of the Alpha channel in the record is // constant which is what is important for us here. //------------------------------------------------------------------------------ procedure LoadChannel(Src: TBitmap32; Channel: integer; Buffer: PFloatArray {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif}); var DestValues: PFloatArray; SourceValues: PColor32EntryArray; AlphaValues: PColor32EntryArray; Count: integer; i: integer; Alpha: Byte; begin Count := Src.Height * Src.Width; DestValues := Buffer; SourceValues := PColor32EntryArray(@(PByte(Src.Bits)[Channel])); if (Channel = Ord(ccAlpha)) then begin for i := 0 to Count-1 do DestValues[i] := SourceValues[i].Planes[0]; end else begin AlphaValues := PColor32EntryArray(@(PColor32Entry(Src.Bits).A)); for i := 0 to Count-1 do begin Alpha := AlphaValues[i].Planes[0]; // Important: // 1) First convert to linear color space. // 2) Then alpha premultiply. if (Alpha = 255) then // Most common case first {$ifdef IIR_BLUR_DIV_LUT} DestValues[i] := PremultiplyLUT.MulDiv255[SourceValues[i].Planes[0], 255] {$else IIR_BLUR_DIV_LUT} DestValues[i] := GAMMA_DECODING_TABLE[SourceValues[i].Planes[0]] {$endif IIR_BLUR_DIV_LUT} else if (Alpha = 0) then DestValues[i] := 0 else // Premultiply: ColorPremult := Color * Alpha / 255 {$ifdef IIR_BLUR_DIV_LUT} DestValues[i] := PremultiplyLUT.MulDiv255[SourceValues[i].Planes[0], Alpha]; {$else IIR_BLUR_DIV_LUT} DestValues[i] := GAMMA_DECODING_TABLE[SourceValues[i].Planes[0]] * Alpha * OneOver255; {$endif IIR_BLUR_DIV_LUT} end; end; end; //------------------------------------------------------------------------------ procedure SaveChannel(Dst: TBitmap32; Channel: integer; Buffer: PFloatArray {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif}); var SourceValues: PFloatArray; DestValues: PColor32EntryArray; AlphaValues: PColor32EntryArray; Count: integer; i: integer; n: TFloat; Color, Alpha: Byte; begin Count := Dst.Height * Dst.Width; SourceValues := Buffer; DestValues := PColor32EntryArray(@(PByte(Dst.Bits)[Channel])); if (Channel = Ord(ccAlpha)) then begin for i := 0 to Count-1 do begin Alpha := Clamp(FastRound(SourceValues[i])); DestValues[i].Planes[0] := Alpha; end; end else begin AlphaValues := PColor32EntryArray(@(PColor32Entry(Dst.Bits).A)); for i := 0 to Count-1 do begin Alpha := AlphaValues[i].Planes[0]; // Important: // 1) First alpha unpremultiply. // 2) Then convert to gamma/sRGB color space. if (Alpha = 255) then // Most common case first begin {$ifdef IIR_BLUR_DIV_LUT} n := SourceValues[i]; Color := Clamp(FastRound(n)); Color := PremultiplyLUT.Mul255Div[Color, 255]; {$else IIR_BLUR_DIV_LUT} Color := FastRound(SourceValues[i]); Color := GAMMA_ENCODING_TABLE[Color]; {$endif IIR_BLUR_DIV_LUT} end else if (Alpha = 0) then Color := 0 else begin // Unpremultiply: Color := ColorPremult * 255 / AlphaValues {$ifdef IIR_BLUR_DIV_LUT} n := SourceValues[i]; Color := Clamp(FastRound(n)); // Warning: Loss of precision due to rounding before division Color := PremultiplyLUT.Mul255Div[Color, Alpha]; {$else IIR_BLUR_DIV_LUT} // Note: We're doing a double Round() here because SourceValues[i]/Alpha can be greater than 1 // since we're dividing an unrounded value by a rounded value. // For example if both values are equal (e.g. source was $FFFFFFFF) and the 'alpha' value was rounded // down, then we will be dividing a larger value (e.g. 25.4) with a smaller (e.g. 25). // To avoid this rounding error we would have to keep a copy of the unrounded alpha values and // divide with those. Color := FastRound(FastRound(SourceValues[i]) / Alpha * 255 ); Color := GAMMA_ENCODING_TABLE[Color]; {$endif IIR_BLUR_DIV_LUT} end; DestValues[i].Planes[0] := Color; end; end; end; //------------------------------------------------------------------------------ // Buffer management //------------------------------------------------------------------------------ function AllocateChannelBuffer(out Buffer: pointer; Size: integer): PFloatArray; const Alignment: NativeUInt = 64*1024; begin {$ifdef IIR_BLUR_ALIGN_BUFFERS} // Allocate a cache-friendly channel buffer // TODO : Although this gives us a small performance improvement it is most likely not worth the effort Inc(Size, 2 * Alignment); GetMem(Buffer, Size); Result := Buffer; Inc(PByte(Result), Alignment); // Ensure data alignment by X bytes Result := Pointer(NativeUInt(Result) and (NativeUInt(-1) xor (Alignment - 1))); // Make sure data is not aligned by 2*A bytes NativeUInt(Result) := NativeUInt(Result) or Alignment; {$else IIR_BLUR_ALIGN_BUFFERS} GetMem(Buffer, Size); Result := Buffer; {$endif IIR_BLUR_ALIGN_BUFFERS} end; //------------------------------------------------------------------------------ // Core function //------------------------------------------------------------------------------ procedure InternalRecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat; TwoDimensional: boolean {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif}); var B: TQuadFloat; M: TNineFloats; // [3] Fix boundary Buffer, TransposedBuffer: PFloatArray; RowBuffer: PFloatArray; procedure BlurRow(Row: integer; Width, Height: integer; InBuffer, OutBuffer: PFloatArray); var InOffset, OutOffset: integer; pIn, pOut: PFloatArray; v: TQuadFloat; begin (* ** Causal filter (forward) *) InOffset := Row * Width; pIn := PFloatArray(@InBuffer[InOffset]); pOut := RowBuffer; IIR_BlurFilterForward(pIn, pOut, B, Width, v); (* ** Anti-causal filter (backward) *) OutOffset := InOffset + Width-1; pIn := @RowBuffer[Width-1]; pOut := @OutBuffer[OutOffset]; {$ifdef IIR_BLUR_EDGE_CORRECTION} (* ** [3] Apply Triggs/Sdika edge correction *) begin IIR_BlurApplyEdgeCorrection(@InBuffer[InOffset + Width-1], B, v, M); pOut[0] := v[1]; Dec(PFloat(pOut)); Dec(PFloat(pIn)); Dec(Width); // Adjust backward loop count end; {$else IIR_BLUR_EDGE_CORRECTION} (* ** Regular initialization of backward pass. Not used with [3] *) // [2] Equation (21) v[1] := pIn[0] / (B[0] + B[1] + B[2] + B[3]); v[2] := v[1]; v[3] := v[1]; {$endif IIR_BLUR_EDGE_CORRECTION} IIR_BlurFilterBackward(pIn, pOut, B, Width, v); end; procedure BlurBuffer; var i: integer; begin (* ** A note about transpose: ** An earlier version performed the transpose operation between the horizontal and vertical pass, as part of the ** backward (anti-causal) row filter. ** Tests has shown that this is at least 25% slower than performing the transpose in one go, after the complete ** horizontal and vertical passes, using a blocked transpose algorithm. *) for i := 0 to Src.Height-1 do BlurRow(i, Src.Width, Src.Height, Buffer, Buffer); Transpose32(@Buffer[0], @TransposedBuffer[0], Src.Width, Src.Height); for i := 0 to Src.Width-1 do BlurRow(i, Src.Height, Src.Width, TransposedBuffer, TransposedBuffer); Transpose32(@TransposedBuffer[0], @Buffer[0], Src.Height, Src.Width); end; procedure BlurHorizontal; var i: integer; begin for i := 0 to Src.Height-1 do BlurRow(i, Src.Width, Src.Height, Buffer, Buffer); end; var Channel: integer; MaxRows: integer; UnAlignRowBuffer: pointer; UnAlignBuffer: pointer; UnAlignTransposedBuffer: pointer; begin if (Sigma < GaussianRadiusToSigma) then begin Src.CopyMapTo(Dst); exit; end; Dst.SetSize(Src.Width, Src.Height); {$ifdef IIR_BLUR_INKSCAPE_COEFFICIENTS} if (Sigma >= 20) then ComputeCoefficientsInkscape(Sigma, B) else {$endif IIR_BLUR_INKSCAPE_COEFFICIENTS} {$ifdef IIR_BLUR_GABOR_COEFFICIENTS} if (Sigma >= 0.5) then ComputeCoefficientsGabor(Sigma, B) else {$endif IIR_BLUR_GABOR_COEFFICIENTS} ComputeCoefficientsGaussianDerivative(Sigma, B); // [3] Fix boundary ComputeBoundaryCorrectionMatrix(B, M); MaxRows := Max(Src.Height, Src.Width); UnAlignBuffer := nil; UnAlignTransposedBuffer := nil; UnAlignRowBuffer := nil; try Buffer := AllocateChannelBuffer(UnAlignBuffer, Src.Width * Src.Height * SizeOf(TFloat)); if (TwoDimensional) then TransposedBuffer := AllocateChannelBuffer(UnAlignTransposedBuffer, Src.Width * Src.Height * SizeOf(TFloat)); RowBuffer := AllocateChannelBuffer(UnAlignRowBuffer, MaxRows * SizeOf(TFloat)); // Start with A channel so unpremult of RGB channels uses correct value for Channel := 3 downto 0 do begin LoadChannel(Src, Channel, Buffer {$ifdef IIR_BLUR_DIV_LUT}, PremultiplyLUT{$endif}); if (TwoDimensional) then BlurBuffer else BlurHorizontal; SaveChannel(Dst, Channel, Buffer {$ifdef IIR_BLUR_DIV_LUT}, PremultiplyLUT{$endif}); end; finally Freemem(UnAlignRowBuffer); Freemem(UnAlignTransposedBuffer); Freemem(UnAlignBuffer); end; end; //------------------------------------------------------------------------------ procedure RecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat); begin InternalRecursiveGaussianBlur(Src, Dst, Sigma, True {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif}); end; procedure RecursiveGaussianBlurRadius(Src, Dst: TBitmap32; Radius: TFloat); begin InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, True {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif}); end; procedure RecursiveGaussianBlurGamma(Src, Dst: TBitmap32; Sigma: TFloat); begin InternalRecursiveGaussianBlur(Src, Dst, Sigma, True {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif}); end; procedure RecursiveGaussianBlurRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat); begin InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, True {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif}); end; //------------------------------------------------------------------------------ procedure RecursiveGaussianBlurHorizontalRadius(Src, Dst: TBitmap32; Radius: TFloat); begin InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, False {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif}); end; procedure RecursiveGaussianBlurHorizontalRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat); begin InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, False {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif}); end; //------------------------------------------------------------------------------ // // Registration of internal bindings // //------------------------------------------------------------------------------ procedure RegisterBindings; begin BlurRegistry.RegisterBinding(@@IIR_BlurFilterForward, 'IIR_BlurFilterForward'); BlurRegistry.RegisterBinding(@@IIR_BlurFilterBackward, 'IIR_BlurFilterBackward'); BlurRegistry.RegisterBinding(@@IIR_BlurApplyEdgeCorrection, 'IIR_BlurApplyEdgeCorrection'); {$ifdef IIR_BLUR_DEFAULT} // Register as default Blur32 implementation BlurRegistry[@@Blur32Proc].Add( @RecursiveGaussianBlurRadius, [isPascal]).Name := 'RecursiveGaussianBlurRadius'; BlurRegistry[@@GammaBlur32Proc].Add( @RecursiveGaussianBlurRadiusGamma, [isPascal]).Name := 'RecursiveGaussianBlurRadiusGamma'; {$endif IIR_BLUR_DEFAULT} BlurRegistry[@@HorizontalBlur32].Add( @RecursiveGaussianBlurHorizontalRadius, [isPascal]).Name := 'RecursiveGaussianBlurHorizontalRadius'; BlurRegistry[@@GammaHorizontalBlur32].Add( @RecursiveGaussianBlurHorizontalRadiusGamma, [isPascal]).Name := 'RecursiveGaussianBlurHorizontalRadiusGamma'; BlurRegistry[@@IIR_BlurFilterForward].Add( @BlurFilterForward_Pas, [isPascal]).Name := 'BlurFilterForward_Pas'; BlurRegistry[@@IIR_BlurFilterBackward].Add( @BlurFilterBackward_Pas, [isPascal]).Name := 'BlurFilterBackward_Pas'; BlurRegistry[@@IIR_BlurApplyEdgeCorrection].Add(@BlurApplyEdgeCorrection_Pas, [isPascal]).Name := 'BlurApplyEdgeCorrection_Pas'; {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))} {$ifdef IIR_BLUR_SIMD} BlurRegistry[@@IIR_BlurFilterForward].Add( @BlurFilterForward_SSE41, {$if defined(IIR_USE_DPPS)} [isSSE41]).Name := 'BlurFilterForward_SSE41'; {$elseif defined(IIR_USE_HADDPS)} [isSSE3]).Name := 'BlurFilterForward_SSE41'; {$else} [isSSE2]).Name := 'BlurFilterForward_SSE41'; {$ifend} BlurRegistry[@@IIR_BlurFilterBackward].Add( @BlurFilterBackward_SSE41, [isSSE41]).Name := 'BlurFilterBackward_SSE41'; {$endif IIR_BLUR_SIMD} {$ifdef IIR_BLUR_EDGE_CORRECTION_SIMD} BlurRegistry[@@IIR_BlurApplyEdgeCorrection].Add(@BlurApplyEdgeCorrection_SSE41, [isSSE41]).Name := 'BlurApplyEdgeCorrection_SSE41'; {$endif IIR_BLUR_EDGE_CORRECTION_SIMD} BlendRegistry.RebindAll; {$ifend} end; //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ initialization RegisterBindings; end.