123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587 |
- unit GR32.Blur.RecursiveGaussian;
- (* ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1 or LGPL 2.1 with linking exception
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * Alternatively, the contents of this file may be used under the terms of the
- * Free Pascal modified version of the GNU Lesser General Public License
- * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
- * of this license are applicable instead of those above.
- * Please see the file LICENSE.txt for additional information concerning this
- * license.
- *
- * The Original Code is Recursive Gaussian Blur for Graphics32
- *
- * The Initial Developer of the Original Code is
- * Anders Melander <[email protected]>
- *
- * Portions created by the Initial Developer are Copyright (C) 2010-2018
- * the Initial Developer. All Rights Reserved.
- *
- * ***** END LICENSE BLOCK ***** *)
- interface
- {$include GR32.inc}
- // IIR = Infinite Input Response
- {$define IIR_BLUR_DEFAULT} // Register as default Blur32 implementation.
- {$define IIR_BLUR_SIMD} // Enable forward/backward filter implemented using Streaming SIMD Extensions (SSE).
- {$define IIR_BLUR_EDGE_CORRECTION} // Apply the boundary corrections from [3]
- {$define IIR_BLUR_EDGE_CORRECTION_SIMD} // Enable SIMD implementation of [3] - significantly faster than the Pascal version but unfortunately not a hotspot
- {.$define IIR_BLUR_INKSCAPE_COEFFICIENTS}// Enable use of InkScape [7] coefficient calculation
- {.$define IIR_BLUR_GABOR_COEFFICIENTS} // Enable use of Gabor [2] coefficient calculation
- {$define IIR_USE_HADDPS} // Use the HADDPS SSE3 instruction instead of two ADDPS SSE instructions. See comment in code.
- {$define IIR_USE_DPPS} // Use the DPPS SSE4.1 instruction instead of a MULPS and two ADDPS SSE instructions. See comment in code.
- {$define IIR_BLUR_ALIGN_BUFFERS} // Use aligned buffers (note only start of buffer is aligned; Individual rows are not)
- {$define IIR_BLUR_DIV_LUT} // Use a Look Up Table for alpha pre- and unpremultiplication and gamma correction. Slightly faster and much more precise.
- // Note that GammaBlur32 requires that IIR_BLUR_DIV_LUT is defined.
- uses
- GR32;
- //------------------------------------------------------------------------------
- //
- // Recursive Gaussian Blur
- //
- //------------------------------------------------------------------------------
- // Anders Melander, August 2010-September 2018
- //------------------------------------------------------------------------------
- // References:
- //
- // [1] Recursive implementation of the Gaussian filter
- // Ian T. Young, Lucas J. van Vliet
- // Signal Processing, Volume 44, Number 2, June 1995, pp. 139-151(13)
- //
- // [2] Recursive Gabor Filtering
- // Ian T. Young, Lucas J. van Vliet, and Michael van Ginkel
- // IEEE Transactions on Signal Processing, Volume 50, Number 11, Nov 2002, pp. 2799–2805
- //
- // [3] Boundary Conditions for Young - van Vliet Recursive Filtering
- // Bill Triggs, Michael Sdika
- // IEEE Transactions on Signal Processing, Volume 54, Number 6, June 2006, pp. 2365-2367
- //
- // [4] Recursive Gaussian Derivative Filters
- // L. J. Van Vliet, I. T. Young, and P. W. Verbeek
- // Fourteenth International Conference on Pattern Recognition, 1998. Brisbane.
- // Proceedings, Volume 1, pp. 509-514
- //
- // [5] Performance of three recursive algorithms for fast space-variant Gaussian filtering
- // Sovira Tan, Jason L. Dale, Alan Johnston
- // Real-Time Imaging, Number 9, 2003, pp 215–228
- //
- // [6] Gimp, Retinex plug-in
- // Fabien Pelisson
- //
- // [7] Inkscape, Gaussian blur renderer
- // Niko Kiirala, bulia byak, Jasper van de Gronde
- //------------------------------------------------------------------------------
- //
- // From [1]:
- //
- // This implementation yields an infinite impulse response filter that has six
- // MADDs (Multiply/Add) per dimension independent of the value of sigma in the
- // Gaussian kernel. In contrast to the Deriche implementation (1987), the
- // coefficients of our recursive filter have a simple, closed-form solution for a
- // desired value of the Gaussian sigma. Our implementation is, in general, faster
- // than:
- //
- // (1) an implementation based upon direct convolution with samples of a Gaussian,
- // (2) repeated convolutions with a kernel such as the uniform filter, and
- // (3) an FFT implementation of a Gaussian filter.
- //
- //------------------------------------------------------------------------------
- procedure RecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat);
- procedure RecursiveGaussianBlurRadius(Src, Dst: TBitmap32; Radius: TFloat);
- procedure RecursiveGaussianBlurGamma(Src, Dst: TBitmap32; Sigma: TFloat);
- procedure RecursiveGaussianBlurRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat);
- procedure RecursiveGaussianBlurHorizontalRadius(Src, Dst: TBitmap32; Radius: TFloat);
- procedure RecursiveGaussianBlurHorizontalRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat);
- //------------------------------------------------------------------------------
- //
- // Recursive Gaussian Blur internal API types
- //
- //------------------------------------------------------------------------------
- // For now this implementation only works when TFloat=Single since the SSE
- // and transpose code has been written for 32-bit values.
- //------------------------------------------------------------------------------
- type
- TQuadFloatRec = packed record
- v0: TFloat;
- v1: TFloat;
- v2: TFloat;
- v3: TFloat;
- end;
- TQuadFloat = array[0..3] of TFloat;
- PQuadFloat = ^TQuadFloat;
- TNineFloats = array[0..8] of TFloat;
- //------------------------------------------------------------------------------
- //
- // Recursive Gaussian Blur internal bindings
- //
- //------------------------------------------------------------------------------
- type
- TIIR_BlurFilterBackwardProc = procedure (pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat);
- TIIR_BlurFilterForwardProc = procedure (pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat);
- TIIR_BlurApplyEdgeCorrection = procedure (iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats);
- var
- IIR_BlurFilterBackward: TIIR_BlurFilterBackwardProc = nil;
- IIR_BlurFilterForward: TIIR_BlurFilterForwardProc = nil;
- IIR_BlurApplyEdgeCorrection: TIIR_BlurApplyEdgeCorrection = nil;
- //------------------------------------------------------------------------------
- //------------------------------------------------------------------------------
- //------------------------------------------------------------------------------
- implementation
- uses
- SyncObjs, // TCriticalSection
- Math,
- GR32.Blur,
- GR32_Bindings,
- GR32.Transpose,
- GR32.Math.Complex,
- GR32_OrdinalMaps,
- GR32_Blend,
- GR32_LowLevel,
- GR32_Math,
- GR32_System,
- GR32_Gamma;
- // Ensure that we use the GR32.TFloat and not FPC's Math.TFloat (which is an alias for Double!)
- type
- TFloat = GR32.TFloat;
- PFloat = ^TFloat;
- const
- OneOver255: TFloat = 1/255;
- //------------------------------------------------------------------------------
- //
- // PremultiplyLUT
- //
- //------------------------------------------------------------------------------
- // Lookup tables for alpha premultiplication.
- //
- // MulDiv255[a,b] = a * b / 255 Used for premultiplication
- // Mul255Div[a,b] = Round(a * 255 / b) Used for unpremultiplication
- //
- // where
- //
- // a: Color value
- // b: Alpha value
- //
- // PremultiplyLUT is used for pre- and unpremultiplication.
- // GammaPremultiplyLUT rolls the gamma correction and pre-/unpremultiplication
- // operations into one for a significant gain in precision at no extra cost in
- // performance.
- //
- //------------------------------------------------------------------------------
- {$ifdef IIR_BLUR_DIV_LUT}
- type
- PPremultiplyLUT = ^TPremultiplyLUT;
- TPremultiplyLUT = record
- strict private
- class constructor Create;
- class destructor Destroy;
- strict private class var
- FLock: TCriticalSection;
- FPremultiplyLUT: PPremultiplyLUT;
- FGammaPremultiplyLUT: PPremultiplyLUT;
- strict private
- FsRGB: boolean;
- FGamma: Double;
- FGammaInv: Double;
- procedure SetGamma(const GammaValue: Double; sRGB: boolean);
- procedure GammaChangedHandler;
- public
- Mul255Div: array[byte, byte] of byte;
- MulDiv255: array[byte, byte] of TFloat;
- public class var
- public
- class function PremultiplyLUT: PPremultiplyLUT; static;
- class function GammaPremultiplyLUT: PPremultiplyLUT; static;
- property sRGB: boolean read FsRGB;
- property Gamma: Double read FGamma;
- end;
- //------------------------------------------------------------------------------
- class constructor TPremultiplyLUT.Create;
- begin
- FLock := TCriticalSection.Create;
- FPremultiplyLUT := nil;
- FGammaPremultiplyLUT := nil;
- end;
- class destructor TPremultiplyLUT.Destroy;
- begin
- FLock.Free;
- if (FPremultiplyLUT <> nil) then
- Dispose(FPremultiplyLUT);
- if (FGammaPremultiplyLUT <> nil) then
- begin
- UnregisterGammaChangeNotification(FGammaPremultiplyLUT.GammaChangedHandler);
- Dispose(FGammaPremultiplyLUT);
- end;
- end;
- //------------------------------------------------------------------------------
- class function TPremultiplyLUT.PremultiplyLUT: PPremultiplyLUT;
- var
- AlphaValue, ColorValue: Integer;
- begin
- if (FPremultiplyLUT = nil) then
- begin
- FLock.Acquire;
- if (FPremultiplyLUT = nil) then
- begin
- New(FPremultiplyLUT);
- for ColorValue := 0 to 255 do
- begin
- FPremultiplyLUT.Mul255Div[ColorValue, 0] := 0;
- FPremultiplyLUT.MulDiv255[ColorValue, 0] := 0.0;
- for AlphaValue := 1 to 255 do
- begin
- FPremultiplyLUT.Mul255Div[ColorValue, AlphaValue] := Clamp(Round(ColorValue * 255 / AlphaValue));
- FPremultiplyLUT.MulDiv255[ColorValue, AlphaValue] := ColorValue * AlphaValue * OneOver255;;
- end;
- end;
- end;
- FLock.Release;
- end;
- Result := FPremultiplyLUT;
- end;
- //------------------------------------------------------------------------------
- class function TPremultiplyLUT.GammaPremultiplyLUT: PPremultiplyLUT;
- begin
- if (FGammaPremultiplyLUT = nil) then
- begin
- FLock.Acquire;
- if (FGammaPremultiplyLUT = nil) then
- begin
- New(FGammaPremultiplyLUT);
- RegisterGammaChangeNotification(FGammaPremultiplyLUT.GammaChangedHandler);
- FGammaPremultiplyLUT.SetGamma(GAMMA_VALUE, GAMMA_IS_SRGB);
- end;
- FLock.Release;
- end;
- Result := FGammaPremultiplyLUT;
- end;
- procedure TPremultiplyLUT.GammaChangedHandler;
- begin
- SetGamma(GAMMA_VALUE, GAMMA_IS_SRGB);
- end;
- procedure TPremultiplyLUT.SetGamma(const GammaValue: Double; sRGB: boolean);
- var
- AlphaValue, ColorValue: Integer;
- n: Single;
- ColorLinear, ColorRGB: TFloat;
- begin
- if (FsRGB = sRGB) and ((FsRGB) or (GammaValue = FGamma)) then
- exit;
- FsRGB := sRGB;
- if (not FsRGB) then
- begin
- FGamma := GammaValue;
- FGammaInv := 1 / FGamma;
- end;
- for ColorValue := 0 to 255 do
- begin
- Mul255Div[ColorValue, 0] := 0;
- MulDiv255[ColorValue, 0] := 0.0;
- // sRGB -> Linear RGB / 255
- ColorLinear := ColorValue * OneOver255;
- if (FsRGB) then
- begin
- if (ColorLinear >= 0.04045) then
- ColorLinear := Power((ColorLinear + 0.055) * (1 / 1.055), 2.4)
- else
- ColorLinear := ColorLinear * (1 / 12.92);
- end else
- ColorLinear := Power(ColorLinear, FGamma);
- // ColorValue: Color, AlphaValue: Alpha
- for AlphaValue := 1 to 255 do
- begin
- // Linear RGB -> Premultiplied, Linear RGB
- n := ColorLinear * AlphaValue;
- MulDiv255[ColorValue, AlphaValue] := n;
- // Premultiplied, Linear RGB -> Unpremultiplied, Linear RGB
- n := ColorValue / AlphaValue;
- // Linear RGB -> sRGB / 255
- if (FsRGB) then
- begin
- if (n >= 0.0031308) then
- ColorRGB := 1.055 * Power(n, 1 / 2.4) - 0.055
- else
- ColorRGB := n * 12.92;
- end else
- ColorRGB := Power(n, FGammaInv);
- Mul255Div[ColorValue, AlphaValue] := Clamp(Round(ColorRGB * 255));
- end;
- end;
- end;
- {$endif IIR_BLUR_DIV_LUT}
- //------------------------------------------------------------------------------
- //
- // Recursive Gaussian Blur
- //
- //------------------------------------------------------------------------------
- //------------------------------------------------------------------------------
- // Low level, SIMD implementations
- //------------------------------------------------------------------------------
- {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
- procedure BlurFilterForward_SSE41(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat); {$IFDEF FPC}assembler;{$ENDIF}
- // Parameters (x86):
- // EAX <- pIn
- // EDX <- pOut
- // ECX <- B
- // Stack[0] <- Width
- // Stack[1] <- v
- // Preserves: (none)
- //
- // Parameters (x64):
- // RCX <- pIn
- // RDX <- pOut
- // R8 <- B
- // R9 <- Width
- // Stack[0] <- v
- // Preserves: (none)
- //
- // SSE register usage:
- // XMM0: B3 | B2 | B1 | B0
- // XMM1: * | v1 | v2 | v3
- // XMM2, XMM3: misc use
- asm
- {$if defined(TARGET_x64)}
- {$IFNDEF FPC}
- // nothing
- {$ELSE}
- // nothing
- {$ENDIF}
- {$elseif defined(TARGET_x86)}
- // nothing
- {$else}
- {$message fatal 'Unsupported target'}
- {$ifend}
- MOVUPS XMM0, TQuadFloatRec PTR [B] // XMM0 <- B3 | B2 | B1 | B0
- (*
- ** Initialization of forward pass
- **
- ** v[1] := pIn^ / (B[0] + B[1] + B[2] + B[3]);
- ** v[2] := v[1];
- ** v[3] := v[1];
- *)
- MOVAPS XMM2, XMM0 // XMM2 <- B3 | B2 | B1 | B0
- // XMM2[0] <- (B + B1 + B2 + B3)
- // TODO : HADDPS is slow according to this:
- // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
- // However, my own benchmarks indicate that the alternative is slower...
- {$ifdef IIR_USE_HADDPS}
- HADDPS XMM2, XMM2 // XMM2[0..3] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3]
- HADDPS XMM2, XMM2
- {$else IIR_USE_HADDPS}
- MOVHLPS XMM3, XMM2 // XMM3[0,1] <- XMM2[2,3]
- ADDPS XMM2, XMM3 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3]
- PSHUFD XMM3, XMM2, $01
- ADDSS XMM2, XMM3
- {$endif IIR_USE_HADDPS}
- MOVSS XMM1, [pIn] // XMM1[0] <- pIn^
- DIVSS XMM1, XMM2 // XMM1[0] <- XMM1[0] / XMM2[0]
- SHUFPS XMM1, XMM1, $00 // XMM1[0..3] <- XMM1[0]
- // Iterate over all cols using the "negative offset technique"
- {$if defined(TARGET_x86)}
- // We're now done with the value in the "B" register so it's repurposed for the "Width" counter.
- MOV ECX, Width
- LEA pIn, [pIn+ECX*4]
- LEA pOut, [pOut+ECX*4]
- NEG ECX
- {$elseif defined(TARGET_x64)}
- LEA pIn, [pIn+R9*4]
- LEA pOut, [pOut+R9*4]
- NEG R9
- {$ifend}
- @LOOP:
- (*
- ** v0 := (pIn^ * B0) + (v1 * B1) + (v2 * B2) + (v3 * B3);
- *)
- {$if defined(TARGET_x86)}
- MOVSS XMM2, [pIn+ECX*4] // XMM2[0] <- pIn
- {$elseif defined(TARGET_x64)}
- MOVSS XMM2, TFloat PTR [pIn+R9*4] // XMM2[0] <- pIn
- {$ifend}
- MOVSS XMM1, XMM2 // XMM1[0] <- XMM2[0]
- MOVAPS XMM2, XMM1 // XMM2 <- XMM1
- // TODO : DPPS might be slow on some CPUs:
- // https://stackoverflow.com/questions/37879678/dot-product-performance-with-sse-instructions-is-dpps-worth-using
- // On my system it's significantly faster than the alternative
- {$ifdef IIR_USE_DPPS}
- DPPS XMM2, XMM0, $FF
- {$else IIR_USE_DPPS}
- // Multiply B[] with v[]
- MULPS XMM2, XMM0 // XMM2 <- XMM2 * XMM0
- // Add the four results of the previous vectorized multiplication
- MOVHLPS XMM3, XMM2 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3]
- ADDPS XMM2, XMM3
- PSHUFD XMM3, XMM2, $01
- ADDSS XMM2, XMM3
- {$endif IIR_USE_DPPS}
- MOVSS XMM1, XMM2 // XMM1[0] <- v0
- // pOut^ := v0;
- {$if defined(TARGET_x86)}
- MOVSS [pOut+ECX*4], XMM2
- {$elseif defined(TARGET_x64)}
- MOVSS TFloat PTR [pOut+R9*4], XMM2
- {$ifend}
- (*
- ** Shift cached output values
- ** v3 := v2;
- ** v2 := v1;
- ** v1 := v0;
- **)
- SHUFPS XMM1, XMM1, $93 // XMM1[0,1,2,3] <- XMM1[3,0,1,2]
- // Loop to next col
- {$if defined(TARGET_x86)}
- INC ECX
- {$elseif defined(TARGET_x64)}
- INC R9
- {$ifend}
- JNZ @LOOP
- // Update v1, v2, v3 for backward filter
- {$if defined(TARGET_x86)}
- MOV EDX, v
- MOVUPS [EDX], XMM1
- {$elseif defined(TARGET_x64)}
- // TODO : For some reason this doesn't work ("v" isn't updated):
- // MOVUPS [TQuadFloat PTR v], XMM1
- MOV RDX, v
- MOVUPS [RDX], XMM1
- {$ifend}
- {$if defined(TARGET_x64)}
- {$IFDEF FPC}
- // nothing
- {$ENDIF}
- {$elseif defined(TARGET_x86)}
- // nothing
- {$ifend}
- end;
- //------------------------------------------------------------------------------
- procedure BlurFilterBackward_SSE41(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; const v: TQuadFloat); {$IFDEF FPC}assembler;{$ENDIF}
- // Parameters (x86):
- // EAX <- pIn
- // EDX <- pOut
- // ECX <- B
- // Stack[0] <- Width
- // Stack[1] <- v
- // Preserves: (none)
- //
- // Parameters (x64):
- // RCX <- pIn
- // RDX <- pOut
- // R8 <- B
- // R9 <- Width
- // Stack[0] <- v
- // Preserves: (none)
- //
- // SSE register usage:
- // XMM0: B3 | B2 | B1 | B0
- // XMM1: * | v1 | v2 | v3
- // XMM2, XMM3: misc use
- asm
- MOVUPS XMM0, TQuadFloatRec PTR [B] // XMM0 <- B3 | B2 | B1 | B0
- {$if defined(TARGET_x86)}
- MOV ECX, v
- MOVUPS XMM1, [ECX] // XMM1 <- v3 | v2 | v1 | *
- {$elseif defined(TARGET_x64)}
- // TODO : For some reason this doesn't work ("v" isn't loaded):
- // MOVUPS XMM1, [TQuadFloat PTR v] // XMM1 <- v3 | v2 | v1 | *
- MOV R8, QWORD PTR v
- MOVUPS XMM1, [R8] // XMM1 <- v3 | v2 | v1 | *
- {$ifend}
- // Iterate over all cols
- // Note: We're iterating backwards from the last to the first col.
- // On entry, pIn and pOut points to the last col. Here we adjust them to point to the first col
- // so we can use the col counter as an offset.
- {$if defined(TARGET_x86)}
- // We're now done with the value in the "B" register so it's repurposed for the "Width" counter.
- MOV ECX, Width
- // Although the compiler accepts [RegA-RegB*4] it assembles it to [RegA+RegB*4] so we need
- // to temporarily negate the offset value
- NEG ECX
- LEA pIn, [pIn+ECX*4]
- LEA pOut, [pOut+ECX*4]
- NEG ECX
- {$elseif defined(TARGET_x64)}
- NEG R9
- LEA pIn, [pIn+R9*4]
- LEA pOut, [pOut+R9*4]
- NEG R9
- {$ifend}
- @LOOP:
- (*
- ** v0 := (B0 * pIn^) + (v1 * B1) + (v2 * B2) + (v3 * B3);
- *)
- // Multiply B[] with v[]
- {$if defined(TARGET_x86)}
- INSERTPS XMM1, [pIn+ECX*4], $00 // XMM1[0] <- pIn
- {$elseif defined(TARGET_x64)}
- INSERTPS XMM1, TFloat PTR [pIn+R9*4], $00 // XMM1[0] <- pIn
- {$ifend}
- MOVAPS XMM2, XMM1 // XMM2 <- XMM1
- {$ifdef IIR_USE_DPPS}
- DPPS XMM2, XMM0, $FF // XMM2 <- XMM2 dot XMM1
- {$else IIR_USE_DPPS}
- MULPS XMM2, XMM0 // XMM2 <- XMM2 * XMM0
- // Add the four results of the previous vectorized multiplication
- MOVHLPS XMM3, XMM2 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3]
- ADDPS XMM2, XMM3
- PSHUFD XMM3, XMM2, $01
- ADDSS XMM2, XMM3
- {$endif IIR_USE_DPPS}
- MOVSS XMM1, XMM2 // XMM1[0] <- v0
- // pOut^ := v0;
- {$if defined(TARGET_x86)}
- MOVSS [pOut+ECX*4], XMM2
- {$elseif defined(TARGET_x64)}
- MOVSS TFloat PTR [pOut+R9*4], XMM2
- {$ifend}
- (*
- ** Shift cached output values
- ** v3 := v2;
- ** v2 := v1;
- ** v1 := v0;
- **)
- SHUFPS XMM1, XMM1, $93 // XMM0 <- v2 | v1 | v0 | v3
- // Loop to next (previous actually) col
- {$if defined(TARGET_x86)}
- DEC ECX
- {$elseif defined(TARGET_x64)}
- DEC R9
- {$ifend}
- JNZ @LOOP
- end;
- //------------------------------------------------------------------------------
- procedure BlurApplyEdgeCorrection_SSE41(iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats); // {$IFDEF FPC}assembler;{$ENDIF}
- // Parameters (x86):
- // EAX <- iPlus
- // EDX <- B0
- // ECX <- v
- // Stack[0] <- M
- // Preserves: (none)
- //
- // Parameters (x64):
- // RCX <- iPlus
- // RDX <- B0
- // R8 <- v
- // R9 <- M
- // Preserves: XMM4
- //
- // SSE register usage:
- // XMM0: B3 | B2 | B1 | B0
- // XMM1: * | v1 | v2 | v3
- // XMM2, XMM3, XMM4: misc use
- {$if defined(TARGET_x64) and defined(FPC)}begin{$ifend}
- asm
- {$if defined(TARGET_x64)}
- {$IFNDEF FPC}
- .SAVENV XMM4
- {$ENDIF}
- {$elseif defined(TARGET_x86)}
- // nothing
- {$else}
- {$message fatal 'Unsupported target'}
- {$ifend}
- (*
- ** unp0 := v[1] - uPlus;
- ** unp1 := v[2] - uPlus;
- ** unp2 := v[3] - uPlus;
- *)
- // XMM1 = [v3 | v2 | v1 | 0]
- MOVUPS XMM1, TQuadFloat PTR [v] // XMM1 <- v3 | v2 | v1 | v0
- // Save v0
- MOVAPS XMM5, XMM1
- // Now get rid of XMM1[0]
- SHUFPS XMM1, XMM1, $39 // XMM1 <- v0 | v3 | v2 | v1
- INSERTPS XMM1, XMM1, $F8 // XMM1 <- 0 | v3 | v2 | v1
- // XMM0 = [0 | u+ | u+ | u+]
- MOVSS XMM0, [iPlus] // XMM0[0] <- u+
- SHUFPS XMM0, XMM0, $00 // XMM0 <- u+ | u+ | u+ | u+
- // XMM1 = XMM1*XMM0 = [0, unp2, unp1, unp0]
- SUBPS XMM1, XMM0 // XMM1 <- XMM0 - u+
- // XMM1 = [0, unp2,unp1,unp0]
- INSERTPS XMM1, XMM1, $F8 // XMM1[3] <- 0
- (*
- ** v[1] := ( (M0 * unp0) + (M1 * unp1) + (M2 * unp2) ) * B0 + vPlus;
- ** v[2] := ( (M3 * unp0) + (M4 * unp1) + (M5 * unp2) ) * B0 + vPlus;
- ** v[3] := ( (M6 * unp0) + (M7 * unp1) + (M8 * unp2) ) * B0 + vPlus;
- *)
- // XMM2 = [0 | M2 | M1 | M0]
- // XMM3 = [0 | M5 | M4 | M3]
- // XMM4 = [0 | M8 | M7 | M6]
- {$if defined(TARGET_x86)}
- MOV EAX, M
- MOVUPS XMM2, [EAX] // XMM1 <- * | M2 | M1 | M0
- {$elseif defined(TARGET_x64)}
- MOVUPS XMM2, TQuadFloat PTR [M] // XMM1 <- * | M2 | M1 | M0
- {$ifend}
- INSERTPS XMM2, XMM2, $F8 // XMM2 <- 0 | M2 | M1 | M0
- {$if defined(TARGET_x86)}
- ADD EAX, 12//SizeOf(TFloat)*3
- MOVUPS XMM3, [EAX] // XMM3 <- * | M5 | M4 | M3
- {$elseif defined(TARGET_x64)}
- ADD M, 12//SizeOf(TFloat)*3
- MOVUPS XMM3, TQuadFloat PTR [M] // XMM3 <- * | M5 | M4 | M3
- {$ifend}
- INSERTPS XMM3, XMM3, $F8 // XMM3 <- 0 | M5 | M4 | M3
- {$if defined(TARGET_x86)}
- ADD EAX, 12//SizeOf(TFloat)*3
- MOVUPS XMM4, [EAX] // XMM4 <- * | M8 | M7 | M6
- {$elseif defined(TARGET_x64)}
- ADD M, 12//SizeOf(TFloat)*3
- MOVUPS XMM4, TQuadFloat PTR [M] // XMM4 <- * | M8 | M7 | M6
- {$ifend}
- INSERTPS XMM4, XMM4, $F8 // XMM4 <- 0 | M8 | M7 | M6
- // XMM2[1] = [M2, M1, M0]*[unp2, unp1, unp0] = (M0 * unp0) + (M1 * unp1) + (M2 * unp2)
- // TODO : {$ifdef IIR_USE_DPPS}
- DPPS XMM2, XMM1, $72 // XMM2[1] <- [M2, M1, M0]*[unp2, unp1, unp0]
- // XMM3[2] = [M5, M4, M3]*[unp2, unp1, unp0] = (M3 * unp0) + (M4 * unp1) + (M5 * unp2)
- // TODO : {$ifdef IIR_USE_DPPS}
- DPPS XMM3, XMM1, $74 // XMM3[2] <- [M5, M4, M3]*[unp2, unp1, unp0]
- // XMM4[3] = [M8, M7, M6]*[unp2, unp1, unp0] = (M6 * unp0) + (M7 * unp1) + (M8 * unp2)
- // TODO : {$ifdef IIR_USE_DPPS}
- DPPS XMM4, XMM1, $78 // XMM4[3] <- [M8, M7, M6]*[unp2, unp1, unp0]
- // XMM2[1..3] = [ XMM2[1] | XMM3[2] | XMM4[3] ]
- INSERTPS XMM2, XMM3, $A0 // XMM2[2] <- XMM3[2]
- INSERTPS XMM2, XMM4, $F0 // XMM2[3] <- XMM4[3]
- // XMM1 = B[0]
- {$IFNDEF FPC}
- MOVSS XMM1, TFloat PTR [B] // XMM1[1] <- B0
- {$ELSE}
- {$if defined(TARGET_x64)}
- MOVSS XMM1, TFloat PTR [RDX] // XMM1[1] <- B0
- {$else}
- MOVSS XMM1, TFloat PTR [EDX] // XMM1[1] <- B0
- {$ifend}
- {$ENDIF}
- SHUFPS XMM1, XMM1, 0 // XMM1 <- B0 | B0 | B0 | B0
- // XMM2 = XMM2 * B0
- MULPS XMM2, XMM1 // XMM2 <- XMM2 * B0
- // XMM2 = XMM2 + v+
- ADDPS XMM2, XMM0 // XMM2 <- XMM2 + v+
- INSERTPS XMM2, XMM5, $00 // XMM2[0] <- XMM5[0]
- MOVUPS TQuadFloat PTR [v], XMM2 // XMM2 -> v
- {$if defined(TARGET_x64) and defined(FPC)}end['XMM4'];{$ifend}
- end;
- {$ifend}
- //------------------------------------------------------------------------------
- // Low level, Pascal implementations
- //------------------------------------------------------------------------------
- procedure BlurFilterForward_Pas(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var vv: TQuadFloat); inline;
- var
- i: integer;
- v: TQuadFloat;
- begin
- // Initialization of forward pass
- // [2] Equation (20)
- //
- // in[1]
- // w[1] = w[2] = w[3] = ──────────────────
- // 1+ fd1 + fd2 + fd3
- //
- v[1] := pIn[0] / (B[0] + B[1] + B[2] + B[3]);
- v[2] := v[1];
- v[3] := v[1];
- // Forward pass
- // [1] Equation (9a)
- //
- // B[1]*v[n-1] + B[2]*v[n-2] + B[3]*v[n-3] ∑ B[1..3]
- // w[n] = B * in[n] + ─────────────────────────────────────── , B = 1 - ───────── ([1] equation 10)
- // B[0] B[0]
- //
- for i := 0 to Width-1 do
- begin
- // Note: In this implementation B[1..3] has already been scaled by 1/b0 and B[0] instead contains B.
- v[0] := (pIn[i] * B[0]) + (v[1] * B[1]) + (v[2] * B[2]) + (v[3] * B[3]);
- pOut[i] := v[0];
- // Shift cached output values
- v[3] := v[2];
- v[2] := v[1];
- v[1] := v[0];
- end;
- vv := v;
- end;
- //------------------------------------------------------------------------------
- procedure BlurFilterBackward_Pas(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat); inline;
- var
- i: integer;
- begin
- Dec(PFloat(pIn), Width-1);
- Dec(PFloat(pOut), Width-1);
- // Backward pass
- // [1] Equation (9b)
- //
- // B[1]*out[n+1] + B[2]*out[n+2] + B[3]*out[n+3] ∑ B[1..3]
- // out[n] = B * w[n] + ───────────────────────────────────────────── , B = 1 - ───────── ([1] equation 10)
- // B[0] B[0]
- //
- for i := Width-1 downto 0 do // Using negative array index works too but will give the purists a heart attack
- begin
- v[0] := (B[0] * pIn[i]) + (v[1] * B[1]) + (v[2] * B[2]) + (v[3] * B[3]);
- pOut[i] := v[0];
- // Shift cached output values
- v[3] := v[2];
- v[2] := v[1];
- v[1] := v[0];
- end;
- end;
- //------------------------------------------------------------------------------
- procedure BlurApplyEdgeCorrection_Pas(iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats);
- var
- uPlus, vPlus: TFloat;
- unp0, unp1, unp2: TFloat;
- begin
- // [3] Equation (15)
- // iPlus = infinite stream of input values = value of rightmost pixel in our case
- // [3] Equation (15a)
- //
- // i+ i+
- // u+ = ────────────── = ────
- // 1 - ∑ b1..b3 B
- //
- // Note: B = B[0] = 1-(B[1]+B[2]+B[3]), [1] Equation (10)
- // See calculation of B[0] in ComputeCoefficients*
- uPlus := iPlus^; // We have already scaled by B
- // [3] Equation (15b)
- //
- // u+ u+
- // v+ = ────────────── = ────
- // 1 - ∑ b1..b3 B
- //
- vPlus := uPlus; // We have already scaled by B
- // [3] Equation (14)
- //
- // ┌ ┐ ┌ ┐
- // │v[2] - u+│ │ v+ │
- // v[] = M[] * │v[1] - u+│ + │ v+ │
- // │v[0] - u+│ │ v+ │
- // └ ┘ └ ┘
- //
- unp0 := v[1] - uPlus;
- unp1 := v[2] - uPlus;
- unp2 := v[3] - uPlus;
- v[1] := ( (M[0] * unp0) + (M[1] * unp1) + (M[2] * unp2) ) * B[0] + vPlus;
- v[2] := ( (M[3] * unp0) + (M[4] * unp1) + (M[5] * unp2) ) * B[0] + vPlus;
- v[3] := ( (M[6] * unp0) + (M[7] * unp1) + (M[8] * unp2) ) * B[0] + vPlus;
- end;
- //------------------------------------------------------------------------------
- // Coefficient calculation
- //------------------------------------------------------------------------------
- // Uses algorithm from Inkscape to calculate coefficients (Adapted from [7]. Based on [4]).
- // (Iterative and supposedly even better approximation than [2]).
- // Note: My (anme) own tests shows that [2] produces a much more precise approximation
- // of the gaussian curve.
- procedure ComputeCoefficientsInkscape(Sigma: TFloat; var B: TQuadFloat);
- var
- d1_org: TComplex;
- d3_org: Double;
- qLow, qHigh: Double;
- SigmaSqr: Double;
- s, q: Double;
- d1: TComplex;
- d3: Double;
- sSqr: Double;
- AbsD1Sqr: Double;
- re2d1: Double;
- B0: Double;
- Limit: Double;
- begin
- (*
- The use of ComputeCoefficientsInkscape is currently disabled by default because:
- - ComputeCoefficientsInkscape is only valid for Sigma >= 20 and the transition
- between the default coefficient and ComputeCoefficientsInkscape isn't smooth.
- *)
- // [4] Table 1, 3rd order filter
- d1_org := TComplex.From(1.40098, 1.00236);
- d3_org := 1.85132;
- SigmaSqr := Sqr(Sigma);
- // [4] suggests using "a few iterations of a linear extrapolation scheme" to find
- // the value of q that yields a filter h[n] with the specified variance.
- // Here we use a binary search instead.
- qLow := 1; // Don't go lower than sigma=2 (we'd probably want a normal convolution in that case anyway)
- qHigh := 2 * Sigma;
- {$if (SizeOf(TFloat) > SizeOf(Single))}
- Limit := Sigma / (1 shl 30)
- {$else}
- Limit := Sigma / (1 shl 20);
- {$ifend}
- repeat
- q := (qLow + qHigh) / 2;
- // Compute scaled filter coefficients
- // [4] Equation (20)
- // 1 jθ(d) 1 jθ(d)
- // ─── ───── ─── ───── -2
- // var(h[n]) = ∑ 2|d| q * e q * ( |d| q - e q )
- //
- d1 := TComplex.Power(d1_org, 1 / q);
- d3 := Power(d3_org, 1 / q);
- // Compute actual sigma^2
- // [4] Equation (10)
- //
- // 2d
- // ∑ n²*h[n] = ∑ ────────
- // (d - 1)²
- //
- sSqr := 2 * (2 * (d1 / TComplex.Sqr(d1 - 1)).Real + d3 / Sqr(d3 - 1));
- if (sSqr < SigmaSqr) then
- qLow := q
- else
- qHigh := q;
- s := qHigh - qLow;
- until (s <= Limit);
- q := (qLow + qHigh) / 2;
- // Compute z-poles
- // [4] Equation (20)
- d1 := TComplex.Power(d1_org, 1 / q);
- d3 := Power(d3_org, 1 / q);
- AbsD1Sqr := TComplex.AbsSqr(d1); // d1*d2 = d1*conj(d1) = |d1|^2 = std::norm(d1)
- re2d1 := 2 * d1.Real; // d1+d2 = d1+conj(d1) = 2*real(d1)
- // Compute filter coefficients
- // [4] Equation (13)
- // Note: For performance b1..b3 are divided b0 to avoid doing so in the inner loop.
- // Additionally B is stored in B[0] since we no longer need b0.
- B0 := 1 / (AbsD1Sqr * d3);
- B[1] := (AbsD1Sqr + d3 * re2d1) * B0;
- B[2] := -(d3 + re2d1) * B0;
- B[3] := B0;
- // B: Normalization constant
- // [1] Equation (10)
- B[0] := 1.0 - (B[1] + B[2] + B[3]);
- end;
- //------------------------------------------------------------------------------
- // Uses algorithm from [2] to calculate coefficients (more precise approximation).
- // Not stable for sigma >= 20 (Radius >= 50) - produces "crazy" results.
- (*
- The use of ComputeCoefficientsGabor is currently disabled by default because:
- a) ComputeCoefficientsGaussianDerivative produces almost the same values for
- Radius=1 as ComputeCoefficientsGabor does for Radius=2 which makes no
- sense to the user.
- b) ComputeCoefficientsGabor doesn't support Sigma < 0.5 so we can't just use
- it instead of ComputeCoefficientsGaussianDerivative.
- * Radius: 1; Sigma: 0.300386630413846
- ComputeCoefficientsGaussianDerivative
- B[0.840293169021606, 0.170402005314827, -0.0110350390896201, 0.00033983588218689]
- * Radius: 2; Sigma: 0.300386630413846) 0.600773260827692
- ComputeCoefficientsGabor
- B[0.845094859600067, 0.164933204650879, -0.0103367269039154, 0.000308647722704336]
- *)
- procedure ComputeCoefficientsGabor(Sigma: TFloat; var B: TQuadFloat);
- var
- q, qq, qqq: Double;
- B0: Double;
- const
- // [2] Equation (7)
- m0 = 1.16680;
- m1 = 1.10783;
- m2 = 1.40586;
- m1m1 = m1*m1;
- m2m2 = m2*m2;
- begin
- // Compute q from Standard Deviation σ
- // [2] Equation (16)
- //
- // ┌
- // │ 0.9804(σ -3.556) + 2.5091 for σ >= 3.556
- // q = ┤
- // │ 0.0561σ² +0.5784σ - 0.2568 for 0.5 <= σ <= 3.556
- // └
- // Note: Doesn't work well for values smaller than 0.5
- if (Sigma < 3.556) then
- q := (0.0561 * Sigma + 0.5784) * Sigma - 0.2568 // = 0.0561*Sqr(Sigma) + 0.5784*Sigma - 0.2568
- else
- q := 0.9804 * (Sigma - 3.556) + 2.5091;
- qq := Sqr(q);
- qqq := qq * q;
- // Compute filter coefficients
- // [2] Equation (15)
- //
- // b0 = 1 [*] scale = (m0 + q)(m1² + m2² + 2m1q + q²) [*] Something is missing here in the original paper
- // b1 = -q(2m0m1 + m1² + m2² + (2m0 + 4m1)q + 3q²) / scale
- // b2 = q²(m0 + 2m1 + 3q) / scale
- // b3 = -q³ / scale
- //
- // Note: We have reversed the sign of b1, b2 & b3 from [2]
- // Note: For performance we scale B[1..3] by 1/b0 to avoid doing so in the inner loop.
- // Additionally we store B in B[0].
- B0 := 1 / ((m0 + q) * (m1m1 + m2m2 + 2 * m1 * q + qq));
- B[1] := (q * (2 * m0 * m1 + m1m1 + m2m2 + (2 * m0 + 4 * m1) * q + 3 * qq)) * B0;
- B[2] := (-qq * (m0 + 2 * m1 + 3 * q)) * B0;
- B[3] := (qqq) * B0;
- // [2] Equation (19)
- //
- // B = (m0(m1² + m2²)/scale)²
- //
- B[0] := (m0 * (m1m1 + m2m2)) * B0;
- end;
- //------------------------------------------------------------------------------
- // Uses algorithm from [4] to calculate coefficients
- procedure ComputeCoefficientsGaussianDerivative(Sigma: TFloat; var B: TQuadFloat);
- var
- q, qq, qqq: Double;
- B0: Double;
- // Sigma2, Sigma3, Sigma4: Double;
- begin
- // Compute q from Standard Deviation
- (*
- if (Sigma <= 4) and (Sigma >= 1) then
- begin
- // [5] Equation (22)
- // Valid for the range σ = [1,4]
- Sigma2 := Sqr(Sigma);
- Sigma3 := Sigma2 * Sigma;
- Sigma4 := Sigma3 * Sigma;
- q := 0.0001 * Sigma4 - 0.0021 * Sigma3 + 0.0207 * Sigma2 + 0.3797 * Sigma + 0.1763;
- end else
- // Well, this sucks: The transition between the case above (Radius=13, Sigma=3.9) and the one
- // below (Radius=14, Sigma=4.2) isn't smooth; There's a visible "jump" in the amount of blur
- // when we transition between them.
- *)
- if (Sigma >= 2.5) then
- // [1] Equation (11b)
- q := 0.98711 * Sigma - 0.96330
- else
- if (Sigma >= 0.5) then
- // [1] Equation (11b)
- q := 3.97156 - 4.14554 * Sqrt(1.0 - 0.26891 * Sigma)
- else
- // [6]
- q := 0.1147705018520355224609375;
- qq := Sqr(q);
- qqq := qq * q;
- // Compute filter coefficients
- // [1] Equation (8c)
- B0 := 1 / (1.57825 + (2.44413 * q) + (1.4281 * qq) + (0.422205 * qqq));
- B[1] := ( (2.44413 * q) + (2.85619 * qq) + (1.26661 * qqq)) * B0;
- B[2] := (-( (1.4281 * qq) + (1.26661 * qqq))) * B0;
- B[3] := ( 0.422205 * qqq) * B0;
- // B: Normalization constant
- // [1] Equation (10)
- B[0] := 1.0 - (B[1] + B[2] + B[3]);
- end;
- //------------------------------------------------------------------------------
- // Edge correction matrix [3]
- //------------------------------------------------------------------------------
- procedure ComputeBoundaryCorrectionMatrix(const B: TQuadFloat; var M: TNineFloats);
- var
- Scale: Double;
- B3B3, B2B2: Double;
- B3B1, B3B2: Double;
- begin
- // Compute edge correction matrix M[3,3]
- // [3] Equation (5)
- //
- //
- // 1
- // Scale = ────────────────────────────────────────
- // (1+a1−a2+a3)(1−a1−a2−a3)(1+a2+(a1−a3)a3)
- //
- // ┌ ┐
- // │ −a3a1+1−a3²-a2 (a3+a1)(a2+a3a1) a3(a1+a3a2) │
- // M = Scale * │ a1+a3a2 −(a2−1)(a2+a3a1) −(a3a1+a3²+a2−1)a3 │
- // │ a3a1+a2+a1²−a2² a1a2+a3a2²−a1a3²−a3³−a3a2+a3 a3(a1+a3a2) │
- // └ ┘
- //
- Scale := 1 / ((1 + B[1] - B[2] + B[3]) * (1 - B[1] - B[2] - B[3]) * (1 + B[2] + (B[1] - B[3]) * B[3]));
- B3B3 := Sqr(B[3]);
- B2B2 := Sqr(B[2]);
- B3B1 := B[3] * B[1];
- B3B2 := B[3] * B[2];
- M[0] := Scale * (-B3B1 + 1 - B3B3 - B[2]); // [0,0]
- M[1] := Scale * (B[3] + B[1]) * (B[2] + B3B1); // [0,1]
- M[3] := Scale * (B[1] + B3B2); // [1,0]
- M[2] := M[3] * B[3]; // [0,2]
- M[4] := -Scale * (B[2] - 1) * (B[2] + B3B1); // [1,1]
- M[5] := -Scale * B[3] * (B3B1 + B3B3 + B[2] - 1); // [1,2]
- M[6] := Scale * (B3B1 + B[2] + Sqr(B[1]) - B2B2); // [2,0]
- M[7] := Scale * (B[1] * B[2] + B[3] * B2B2 - B3B3 * (B[1]+B[3]) - B3B2 + B[3]); // [2,1]
- M[8] := Scale * B[3] * (B[1] + B3B2); // [2,2]
- end;
- //------------------------------------------------------------------------------
- // Single channel buffer I/O
- //------------------------------------------------------------------------------
- // Note: Although the layout of the Graphics32 TColor32Entry type depends on the
- // RGBA_FORMAT define, the position of the Alpha channel in the record is
- // constant which is what is important for us here.
- //------------------------------------------------------------------------------
- procedure LoadChannel(Src: TBitmap32; Channel: integer; Buffer: PFloatArray
- {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif});
- var
- DestValues: PFloatArray;
- SourceValues: PColor32EntryArray;
- AlphaValues: PColor32EntryArray;
- Count: integer;
- i: integer;
- Alpha: Byte;
- begin
- Count := Src.Height * Src.Width;
- DestValues := Buffer;
- SourceValues := PColor32EntryArray(@(PByte(Src.Bits)[Channel]));
- if (Channel = Ord(ccAlpha)) then
- begin
- for i := 0 to Count-1 do
- DestValues[i] := SourceValues[i].Planes[0];
- end else
- begin
- AlphaValues := PColor32EntryArray(@(PColor32Entry(Src.Bits).A));
- for i := 0 to Count-1 do
- begin
- Alpha := AlphaValues[i].Planes[0];
- // Important:
- // 1) First convert to linear color space.
- // 2) Then alpha premultiply.
- if (Alpha = 255) then // Most common case first
- {$ifdef IIR_BLUR_DIV_LUT}
- DestValues[i] := PremultiplyLUT.MulDiv255[SourceValues[i].Planes[0], 255]
- {$else IIR_BLUR_DIV_LUT}
- DestValues[i] := GAMMA_DECODING_TABLE[SourceValues[i].Planes[0]]
- {$endif IIR_BLUR_DIV_LUT}
- else
- if (Alpha = 0) then
- DestValues[i] := 0
- else
- // Premultiply: ColorPremult := Color * Alpha / 255
- {$ifdef IIR_BLUR_DIV_LUT}
- DestValues[i] := PremultiplyLUT.MulDiv255[SourceValues[i].Planes[0], Alpha];
- {$else IIR_BLUR_DIV_LUT}
- DestValues[i] := GAMMA_DECODING_TABLE[SourceValues[i].Planes[0]] * Alpha * OneOver255;
- {$endif IIR_BLUR_DIV_LUT}
- end;
- end;
- end;
- //------------------------------------------------------------------------------
- procedure SaveChannel(Dst: TBitmap32; Channel: integer; Buffer: PFloatArray
- {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif});
- var
- SourceValues: PFloatArray;
- DestValues: PColor32EntryArray;
- AlphaValues: PColor32EntryArray;
- Count: integer;
- i: integer;
- n: TFloat;
- Color, Alpha: Byte;
- begin
- Count := Dst.Height * Dst.Width;
- SourceValues := Buffer;
- DestValues := PColor32EntryArray(@(PByte(Dst.Bits)[Channel]));
- if (Channel = Ord(ccAlpha)) then
- begin
- for i := 0 to Count-1 do
- begin
- Alpha := Clamp(FastRound(SourceValues[i]));
- DestValues[i].Planes[0] := Alpha;
- end;
- end else
- begin
- AlphaValues := PColor32EntryArray(@(PColor32Entry(Dst.Bits).A));
- for i := 0 to Count-1 do
- begin
- Alpha := AlphaValues[i].Planes[0];
- // Important:
- // 1) First alpha unpremultiply.
- // 2) Then convert to gamma/sRGB color space.
- if (Alpha = 255) then // Most common case first
- begin
- {$ifdef IIR_BLUR_DIV_LUT}
- n := SourceValues[i];
- Color := Clamp(FastRound(n));
- Color := PremultiplyLUT.Mul255Div[Color, 255];
- {$else IIR_BLUR_DIV_LUT}
- Color := FastRound(SourceValues[i]);
- Color := GAMMA_ENCODING_TABLE[Color];
- {$endif IIR_BLUR_DIV_LUT}
- end else
- if (Alpha = 0) then
- Color := 0
- else
- begin
- // Unpremultiply: Color := ColorPremult * 255 / AlphaValues
- {$ifdef IIR_BLUR_DIV_LUT}
- n := SourceValues[i];
- Color := Clamp(FastRound(n)); // Warning: Loss of precision due to rounding before division
- Color := PremultiplyLUT.Mul255Div[Color, Alpha];
- {$else IIR_BLUR_DIV_LUT}
- // Note: We're doing a double Round() here because SourceValues[i]/Alpha can be greater than 1
- // since we're dividing an unrounded value by a rounded value.
- // For example if both values are equal (e.g. source was $FFFFFFFF) and the 'alpha' value was rounded
- // down, then we will be dividing a larger value (e.g. 25.4) with a smaller (e.g. 25).
- // To avoid this rounding error we would have to keep a copy of the unrounded alpha values and
- // divide with those.
- Color := FastRound(FastRound(SourceValues[i]) / Alpha * 255 );
- Color := GAMMA_ENCODING_TABLE[Color];
- {$endif IIR_BLUR_DIV_LUT}
- end;
- DestValues[i].Planes[0] := Color;
- end;
- end;
- end;
- //------------------------------------------------------------------------------
- // Buffer management
- //------------------------------------------------------------------------------
- function AllocateChannelBuffer(out Buffer: pointer; Size: integer): PFloatArray;
- const
- Alignment: NativeUInt = 64*1024;
- begin
- {$ifdef IIR_BLUR_ALIGN_BUFFERS}
- // Allocate a cache-friendly channel buffer
- // TODO : Although this gives us a small performance improvement it is most likely not worth the effort
- Inc(Size, 2 * Alignment);
- GetMem(Buffer, Size);
- Result := Buffer;
- Inc(PByte(Result), Alignment);
- // Ensure data alignment by X bytes
- Result := Pointer(NativeUInt(Result) and (NativeUInt(-1) xor (Alignment - 1)));
- // Make sure data is not aligned by 2*A bytes
- NativeUInt(Result) := NativeUInt(Result) or Alignment;
- {$else IIR_BLUR_ALIGN_BUFFERS}
- GetMem(Buffer, Size);
- Result := Buffer;
- {$endif IIR_BLUR_ALIGN_BUFFERS}
- end;
- //------------------------------------------------------------------------------
- // Core function
- //------------------------------------------------------------------------------
- procedure InternalRecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat; TwoDimensional: boolean
- {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif});
- var
- B: TQuadFloat;
- M: TNineFloats; // [3] Fix boundary
- Buffer, TransposedBuffer: PFloatArray;
- RowBuffer: PFloatArray;
- procedure BlurRow(Row: integer; Width, Height: integer; InBuffer, OutBuffer: PFloatArray);
- var
- InOffset, OutOffset: integer;
- pIn, pOut: PFloatArray;
- v: TQuadFloat;
- begin
- (*
- ** Causal filter (forward)
- *)
- InOffset := Row * Width;
- pIn := PFloatArray(@InBuffer[InOffset]);
- pOut := RowBuffer;
- IIR_BlurFilterForward(pIn, pOut, B, Width, v);
- (*
- ** Anti-causal filter (backward)
- *)
- OutOffset := InOffset + Width-1;
- pIn := @RowBuffer[Width-1];
- pOut := @OutBuffer[OutOffset];
- {$ifdef IIR_BLUR_EDGE_CORRECTION}
- (*
- ** [3] Apply Triggs/Sdika edge correction
- *)
- begin
- IIR_BlurApplyEdgeCorrection(@InBuffer[InOffset + Width-1], B, v, M);
- pOut[0] := v[1];
- Dec(PFloat(pOut));
- Dec(PFloat(pIn));
- Dec(Width); // Adjust backward loop count
- end;
- {$else IIR_BLUR_EDGE_CORRECTION}
- (*
- ** Regular initialization of backward pass. Not used with [3]
- *)
- // [2] Equation (21)
- v[1] := pIn[0] / (B[0] + B[1] + B[2] + B[3]);
- v[2] := v[1];
- v[3] := v[1];
- {$endif IIR_BLUR_EDGE_CORRECTION}
- IIR_BlurFilterBackward(pIn, pOut, B, Width, v);
- end;
- procedure BlurBuffer;
- var
- i: integer;
- begin
- (*
- ** A note about transpose:
- ** An earlier version performed the transpose operation between the horizontal and vertical pass, as part of the
- ** backward (anti-causal) row filter.
- ** Tests has shown that this is at least 25% slower than performing the transpose in one go, after the complete
- ** horizontal and vertical passes, using a blocked transpose algorithm.
- *)
- for i := 0 to Src.Height-1 do
- BlurRow(i, Src.Width, Src.Height, Buffer, Buffer);
- Transpose32(@Buffer[0], @TransposedBuffer[0], Src.Width, Src.Height);
- for i := 0 to Src.Width-1 do
- BlurRow(i, Src.Height, Src.Width, TransposedBuffer, TransposedBuffer);
- Transpose32(@TransposedBuffer[0], @Buffer[0], Src.Height, Src.Width);
- end;
- procedure BlurHorizontal;
- var
- i: integer;
- begin
- for i := 0 to Src.Height-1 do
- BlurRow(i, Src.Width, Src.Height, Buffer, Buffer);
- end;
- var
- Channel: integer;
- MaxRows: integer;
- UnAlignRowBuffer: pointer;
- UnAlignBuffer: pointer;
- UnAlignTransposedBuffer: pointer;
- begin
- if (Sigma < GaussianRadiusToSigma) then
- begin
- Src.CopyMapTo(Dst);
- exit;
- end;
- Dst.SetSize(Src.Width, Src.Height);
- {$ifdef IIR_BLUR_INKSCAPE_COEFFICIENTS}
- if (Sigma >= 20) then
- ComputeCoefficientsInkscape(Sigma, B)
- else
- {$endif IIR_BLUR_INKSCAPE_COEFFICIENTS}
- {$ifdef IIR_BLUR_GABOR_COEFFICIENTS}
- if (Sigma >= 0.5) then
- ComputeCoefficientsGabor(Sigma, B)
- else
- {$endif IIR_BLUR_GABOR_COEFFICIENTS}
- ComputeCoefficientsGaussianDerivative(Sigma, B);
- // [3] Fix boundary
- ComputeBoundaryCorrectionMatrix(B, M);
- MaxRows := Max(Src.Height, Src.Width);
- UnAlignBuffer := nil;
- UnAlignTransposedBuffer := nil;
- UnAlignRowBuffer := nil;
- try
- Buffer := AllocateChannelBuffer(UnAlignBuffer, Src.Width * Src.Height * SizeOf(TFloat));
- if (TwoDimensional) then
- TransposedBuffer := AllocateChannelBuffer(UnAlignTransposedBuffer, Src.Width * Src.Height * SizeOf(TFloat));
- RowBuffer := AllocateChannelBuffer(UnAlignRowBuffer, MaxRows * SizeOf(TFloat));
- // Start with A channel so unpremult of RGB channels uses correct value
- for Channel := 3 downto 0 do
- begin
- LoadChannel(Src, Channel, Buffer
- {$ifdef IIR_BLUR_DIV_LUT}, PremultiplyLUT{$endif});
- if (TwoDimensional) then
- BlurBuffer
- else
- BlurHorizontal;
- SaveChannel(Dst, Channel, Buffer
- {$ifdef IIR_BLUR_DIV_LUT}, PremultiplyLUT{$endif});
- end;
- finally
- Freemem(UnAlignRowBuffer);
- Freemem(UnAlignTransposedBuffer);
- Freemem(UnAlignBuffer);
- end;
- end;
- //------------------------------------------------------------------------------
- procedure RecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat);
- begin
- InternalRecursiveGaussianBlur(Src, Dst, Sigma, True
- {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif});
- end;
- procedure RecursiveGaussianBlurRadius(Src, Dst: TBitmap32; Radius: TFloat);
- begin
- InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, True
- {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif});
- end;
- procedure RecursiveGaussianBlurGamma(Src, Dst: TBitmap32; Sigma: TFloat);
- begin
- InternalRecursiveGaussianBlur(Src, Dst, Sigma, True
- {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif});
- end;
- procedure RecursiveGaussianBlurRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat);
- begin
- InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, True
- {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif});
- end;
- //------------------------------------------------------------------------------
- procedure RecursiveGaussianBlurHorizontalRadius(Src, Dst: TBitmap32; Radius: TFloat);
- begin
- InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, False
- {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif});
- end;
- procedure RecursiveGaussianBlurHorizontalRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat);
- begin
- InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, False
- {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif});
- end;
- //------------------------------------------------------------------------------
- //
- // Registration of internal bindings
- //
- //------------------------------------------------------------------------------
- procedure RegisterBindings;
- begin
- BlurRegistry.RegisterBinding(@@IIR_BlurFilterForward, 'IIR_BlurFilterForward');
- BlurRegistry.RegisterBinding(@@IIR_BlurFilterBackward, 'IIR_BlurFilterBackward');
- BlurRegistry.RegisterBinding(@@IIR_BlurApplyEdgeCorrection, 'IIR_BlurApplyEdgeCorrection');
- {$ifdef IIR_BLUR_DEFAULT}
- // Register as default Blur32 implementation
- BlurRegistry[@@Blur32Proc].Add( @RecursiveGaussianBlurRadius, [isPascal]).Name := 'RecursiveGaussianBlurRadius';
- BlurRegistry[@@GammaBlur32Proc].Add( @RecursiveGaussianBlurRadiusGamma, [isPascal]).Name := 'RecursiveGaussianBlurRadiusGamma';
- {$endif IIR_BLUR_DEFAULT}
- BlurRegistry[@@HorizontalBlur32].Add( @RecursiveGaussianBlurHorizontalRadius, [isPascal]).Name := 'RecursiveGaussianBlurHorizontalRadius';
- BlurRegistry[@@GammaHorizontalBlur32].Add( @RecursiveGaussianBlurHorizontalRadiusGamma, [isPascal]).Name := 'RecursiveGaussianBlurHorizontalRadiusGamma';
- BlurRegistry[@@IIR_BlurFilterForward].Add( @BlurFilterForward_Pas, [isPascal]).Name := 'BlurFilterForward_Pas';
- BlurRegistry[@@IIR_BlurFilterBackward].Add( @BlurFilterBackward_Pas, [isPascal]).Name := 'BlurFilterBackward_Pas';
- BlurRegistry[@@IIR_BlurApplyEdgeCorrection].Add(@BlurApplyEdgeCorrection_Pas, [isPascal]).Name := 'BlurApplyEdgeCorrection_Pas';
- {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
- {$ifdef IIR_BLUR_SIMD}
- BlurRegistry[@@IIR_BlurFilterForward].Add( @BlurFilterForward_SSE41,
- {$if defined(IIR_USE_DPPS)}
- [isSSE41]).Name := 'BlurFilterForward_SSE41';
- {$elseif defined(IIR_USE_HADDPS)}
- [isSSE3]).Name := 'BlurFilterForward_SSE41';
- {$else}
- [isSSE2]).Name := 'BlurFilterForward_SSE41';
- {$ifend}
- BlurRegistry[@@IIR_BlurFilterBackward].Add( @BlurFilterBackward_SSE41, [isSSE41]).Name := 'BlurFilterBackward_SSE41';
- {$endif IIR_BLUR_SIMD}
- {$ifdef IIR_BLUR_EDGE_CORRECTION_SIMD}
- BlurRegistry[@@IIR_BlurApplyEdgeCorrection].Add(@BlurApplyEdgeCorrection_SSE41, [isSSE41]).Name := 'BlurApplyEdgeCorrection_SSE41';
- {$endif IIR_BLUR_EDGE_CORRECTION_SIMD}
- BlendRegistry.RebindAll;
- {$ifend}
- end;
- //------------------------------------------------------------------------------
- //------------------------------------------------------------------------------
- //------------------------------------------------------------------------------
- initialization
- RegisterBindings;
- end.
|