unit GR32.Blur.SelectiveGaussian;

(* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1 or LGPL 2.1 with linking exception
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * Free Pascal modified version of the GNU Lesser General Public License
 * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
 * of this license are applicable instead of those above.
 * Please see the file LICENSE.txt for additional information concerning this
 * license.
 *
 * The Original Code is Selective Gaussian Blur for Graphics32
 *
 * The Initial Developer of the Original Code is
 * Mattias Andersson
 *
 * Portions created by the Initial Developer are Copyright (C) 2005
 * the Initial Developer. All Rights Reserved.
 *
 * ***** END LICENSE BLOCK ***** *)

interface

{$include GR32.inc}

uses
{$if not defined(FPC)}
  System.SysUtils, // Must be before GR32 so we get the correct PByteArray
{$else}
  SysUtils,
{$ifend}
  GR32;


//------------------------------------------------------------------------------
//
//      Selective Gaussian Blur
//
//------------------------------------------------------------------------------
// Definition of Selective Gaussian Blur from the GIMP User Manual:
//
// The Selective Gaussian Blur filter performs a mathematical region-based
// selection of the image in small chunks, and determines the level of detail
// within that chunk. After this it applies a Gaussian-based blur to it.
// Selective Gaussian Blur can be very processor intensive, but produces very
// controlled blurring.
//
// The Blur Radius setting affects the maximum number of pixels considered for
// blurring. The higher the setting, the higher the number of pixels that will
// be included in the region analysis. Be aware that a higher setting will take
// considerably longer to compute.
//
// The Delta affects the level of detail that will be blurred. A higher setting
// here will produce more smoothing of the pixels in the radius.
//
// A common use for the Selective Gaussian Blur filter is smoothing areas
// affected by populations of JPEG artifacts, or bad pixelization distortions.
//------------------------------------------------------------------------------
// Can, in theory, be used as (a bad) ordinary Gaussian Blur by specifying
// Delta >= 255.
//
// Note that the selective blur, by design, does not blur the alpha channel.
//------------------------------------------------------------------------------
type
  // Common signature of the selective Gaussian blur entry points in this unit:
  // blur ASource into ADest using the given blur Radius and value tolerance
  // Delta (see the description above).
  TSelectiveGaussian32Proc = procedure(ASource, ADest: TBitmap32; Radius: TFloat; Delta: Integer);

var
  // Dispatch variables holding the currently selected implementation.
  // NOTE(review): they are assigned outside the part of the unit shown here;
  // presumably the "Gamma" variable selects a gamma-corrected variant - confirm.
  SelectiveGaussianBlur32: TSelectiveGaussian32Proc;
  GammaSelectiveGaussianBlur32: TSelectiveGaussian32Proc;


//------------------------------------------------------------------------------
// Selective Gaussian Blur
// Mattias Andersson, 2005
//------------------------------------------------------------------------------
// SIMD optimized versions
//------------------------------------------------------------------------------
// The performance of SelectiveGaussian1 is generally better than
// SelectiveGaussian2 (25% faster on some images) but it also has a slightly
// higher signal loss. For example:
// - SelectiveGaussian1 on a solid color (value=255), yields value=253.
// - SelectiveGaussian2 on a solid color (value=255), yields value=254.
// This is generally not a problem since such a small difference isn't visible.
//------------------------------------------------------------------------------
procedure SelectiveGaussian1(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
procedure SelectiveGaussianGamma1(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
procedure SelectiveGaussian2(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
procedure SelectiveGaussianGamma2(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);

type
  // Signature of the inner accumulation kernel (see Accumulate_Pas for the
  // reference implementation):
  //   PSrc    - Count sample bytes
  //   PFact   - Count kernel weights, one per sample
  //   Min,Max - exclusive bounds; only samples with Min < value < Max contribute
  //   Sum     - receives the sum of (sample * weight) shr 8 over accepted samples
  //   FactSum - receives the sum of the weights of the accepted samples
  TSelectiveGaussianAccumulateProc = procedure(PSrc: PByteArray; PFact: PWordArray; Count, Min, Max: Integer; out Sum, FactSum: Cardinal);

var
  // Currently selected accumulation kernel.
  // NOTE(review): assigned outside the part of the unit shown here.
  SelectiveGaussianAccumulate: TSelectiveGaussianAccumulateProc;


//------------------------------------------------------------------------------
// Selective Gaussian Blur
// Eric Grange, 2005
//------------------------------------------------------------------------------
// https://borland.public.delphi.language.basm.narkive.com/XiSH6pUn/anyone-up-for-a-selective-gaussian-optimization
// https://web.archive.org/web/20240914225741/https://borland.public.delphi.language.basm.narkive.com/XiSH6pUn/anyone-up-for-a-selective-gaussian-optimization
//------------------------------------------------------------------------------
// Unoptimized reference implementation.
//------------------------------------------------------------------------------
procedure SelectiveGaussianGimp(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
procedure SelectiveGaussianGimpGamma(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);


//------------------------------------------------------------------------------
// Various Selective Gaussian Blur variations
// Mattias Andersson, 2005
//------------------------------------------------------------------------------
// http://delphi.newswhat.com/geoxml/forumhistorythread?groupname=borland.public.delphi.language.basm&messageid=42c91608$1@newsgroups.borland.com (link dead)
// https://groups.google.com/g/borland.public.delphi.language.basm/c/QXxiJZnIOa8/m/YMID8XaqzdsJ
// https://web.archive.org/web/20240914232817/https://groups.google.com/g/borland.public.delphi.language.basm/c/QXxiJZnIOa8/m/YMID8XaqzdsJ
//------------------------------------------------------------------------------
// Unoptimized reference implementations.
//------------------------------------------------------------------------------
procedure SelectiveGaussianNew(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
procedure SelectiveGaussianNewGamma(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
procedure SelectiveGaussianHorzVert(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer); deprecated 'Destroys source pixels';


//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------

implementation

uses
{$if not defined(FPC)}
  System.Math,
  System.SyncObjs, // TCriticalSection
{$else}
  Math,
  SyncObjs, // TCriticalSection
{$ifend}
  GR32_Gamma,
  GR32.Blur,
  GR32_Bindings,
  GR32_LowLevel,
  GR32_System,
  GR32_OrdinalMaps,
  GR32.Types.SIMD;

// Ensure that we use the GR32.TFloat and not FPC's Math.TFloat (which is an alias for Double!)
type
  TFloat = GR32.TFloat;
  PFloat = ^TFloat;

//------------------------------------------------------------------------------
//
//      PremultiplyLUT
//
//------------------------------------------------------------------------------
// Lookup tables for alpha premultiplication.
//
//   MulDiv255[a,b] = a * b / 255           Used for premultiplication
//   Mul255Div[a,b] = Round(a * 255 / b)    Used for unpremultiplication
//
// where
//
//   a: Color value
//   b: Alpha value
//
// PremultiplyLUT is used for pre- and unpremultiplication.
// GammaPremultiplyLUT rolls the gamma correction and pre-/unpremultiplication
// operations into one for a significant gain in precision at no extra cost in
// performance.
//
//------------------------------------------------------------------------------
type
  PPremultiplyLUT = ^TPremultiplyLUT;

  // A pair of 256x256 byte lookup tables used for alpha pre- and
  // unpremultiplication, both indexed as [ColorValue, AlphaValue].
  // Two lazily created, heap allocated instances are shared by the whole unit:
  // a plain one (PremultiplyLUT) and one that also applies gamma conversion
  // (GammaPremultiplyLUT).
  TPremultiplyLUT = record
  strict private
    // Create the lock / release the lock and the shared instances.
    class constructor Create;
    class destructor Destroy;
    const OneOver255 = 1 / 255;
  strict private class var
    // Serializes the lazy creation of the two shared instances
    FLock: TCriticalSection;
    // Shared instances; nil until first requested
    FPremultiplyLUT: PPremultiplyLUT;
    FGammaPremultiplyLUT: PPremultiplyLUT;
  strict private
    // Gamma state the tables were last built for; maintained by SetGamma
    FsRGB: boolean;
    FGamma: Double;
    FGammaInv: Double; // 1 / FGamma
    // Rebuilds both tables for the given gamma settings
    procedure SetGamma(const GammaValue: Double; sRGB: boolean);
    // Gamma change notification callback; rebuilds the tables from the
    // global GAMMA_VALUE/GAMMA_IS_SRGB settings
    procedure GammaChangedHandler;
  public type
    // [ColorValue, AlphaValue] -> byte
    TLUT88 = array[byte, byte] of byte;
  public
    Mul255Div: TLUT88; // Unpremultiplication table
    MulDiv255: TLUT88; // Premultiplication table
  public
    // Returns the shared plain table pair, building it on first use
    class function PremultiplyLUT: PPremultiplyLUT; static;
    // Returns the shared gamma-aware table pair, building it on first use
    class function GammaPremultiplyLUT: PPremultiplyLUT; static;
    // In-place: Values[i] := LUT[Values[i], Alpha[i]] for Count entries
    class procedure Apply(const LUT: TLUT88; Values, Alpha: PByteArray; Count: integer); static;

    property sRGB: boolean read FsRGB;
    property Gamma: Double read FGamma;
  end;

//------------------------------------------------------------------------------

// Unit-level initialization of the shared state: neither table exists yet
// (they are built on first request) and the lock guarding their creation is
// allocated up front.
class constructor TPremultiplyLUT.Create;
begin
  FGammaPremultiplyLUT := nil;
  FPremultiplyLUT := nil;
  FLock := TCriticalSection.Create;
end;

// Unit-level finalization: releases whichever shared tables were created and
// the lock that guarded their creation.
class destructor TPremultiplyLUT.Destroy;
begin
  if Assigned(FGammaPremultiplyLUT) then
  begin
    // The gamma table subscribed to gamma change notifications when it was
    // created; unsubscribe before the memory goes away.
    UnregisterGammaChangeNotification(FGammaPremultiplyLUT.GammaChangedHandler);
    Dispose(FGammaPremultiplyLUT);
  end;

  if Assigned(FPremultiplyLUT) then
    Dispose(FPremultiplyLUT);

  FLock.Free;
end;

//------------------------------------------------------------------------------

// Maps Count values in place through a lookup table:
//   Values[i] := LUT[Values[i], Alpha[i]]
// Values and Alpha are walked as plain byte pointers rather than indexed, so
// the declared bounds of PByteArray never come into play.
class procedure TPremultiplyLUT.Apply(const LUT: TLUT88; Values, Alpha: PByteArray; Count: integer);
var
  pValue, pAlpha: PByte;
  Remaining: integer;
begin
  pValue := PByte(Values);
  pAlpha := PByte(Alpha);

  for Remaining := Count downto 1 do
  begin
    pValue^ := LUT[pValue^, pAlpha^];
    Inc(pValue);
    Inc(pAlpha);
  end;
end;

//------------------------------------------------------------------------------

// Returns the shared, plain (no gamma) premultiplication table pair, building
// it on first use:
//
//   MulDiv255[Color, Alpha] = Round(Color * Alpha / 255)
//   Mul255Div[Color, Alpha] = Clamp(Round(Color * 255 / Alpha))
//
// with both tables returning 0 for Alpha = 0.
//
// The table is built into a local pointer and only assigned to the shared
// class variable once it is complete. Otherwise a thread taking the unlocked
// fast path below could see a non-nil pointer and read a partially filled
// table.
class function TPremultiplyLUT.PremultiplyLUT: PPremultiplyLUT;
var
  AlphaValue, ColorValue: Integer;
  LUT: PPremultiplyLUT;
begin
  if (FPremultiplyLUT = nil) then
  begin
    FLock.Acquire;
    try
      // Re-check under the lock; another thread may have built it meanwhile
      if (FPremultiplyLUT = nil) then
      begin
        New(LUT);
        try
          // New does not clear the record; give the gamma state defined
          // values (the plain table applies no gamma conversion).
          LUT.FsRGB := False;
          LUT.FGamma := 1;
          LUT.FGammaInv := 1;

          for ColorValue := 0 to 255 do
          begin
            LUT.Mul255Div[ColorValue, 0] := 0;
            LUT.MulDiv255[ColorValue, 0] := 0;

            for AlphaValue := 1 to 255 do
            begin
              LUT.Mul255Div[ColorValue, AlphaValue] := Clamp(Round(ColorValue * 255 / AlphaValue));
              LUT.MulDiv255[ColorValue, AlphaValue] := Round(ColorValue * AlphaValue * OneOver255);
            end;
          end;
        except
          Dispose(LUT);
          raise;
        end;

        // Publish the fully initialized table
        FPremultiplyLUT := LUT;
      end;
    finally
      FLock.Release;
    end;
  end;

  Result := FPremultiplyLUT;
end;

//------------------------------------------------------------------------------

// Returns the shared gamma-aware premultiplication table pair, building it on
// first use from the global GAMMA_VALUE/GAMMA_IS_SRGB settings and subscribing
// it to gamma change notifications so it is rebuilt when those change.
//
// Notes on the construction order:
// - New does not clear the record, and SetGamma skips the rebuild when the
//   stored gamma state already matches the requested one. The stored state is
//   therefore seeded with values that cannot match, so the first SetGamma
//   always builds the tables.
// - The table is built into a local pointer and only assigned to the shared
//   class variable once it is complete, so that a thread taking the unlocked
//   fast path below never sees a partially filled table.
class function TPremultiplyLUT.GammaPremultiplyLUT: PPremultiplyLUT;
var
  LUT: PPremultiplyLUT;
begin
  if (FGammaPremultiplyLUT = nil) then
  begin
    FLock.Acquire;
    try
      // Re-check under the lock; another thread may have built it meanwhile
      if (FGammaPremultiplyLUT = nil) then
      begin
        New(LUT);
        try
          // Force a mismatch with the requested mode so SetGamma cannot
          // early-exit on uninitialized fields.
          LUT.FsRGB := not GAMMA_IS_SRGB;
          LUT.FGamma := 0;
          LUT.FGammaInv := 0;

          LUT.SetGamma(GAMMA_VALUE, GAMMA_IS_SRGB);
          RegisterGammaChangeNotification(LUT.GammaChangedHandler);
        except
          Dispose(LUT);
          raise;
        end;

        // Publish the fully initialized table
        FGammaPremultiplyLUT := LUT;
      end;
    finally
      FLock.Release;
    end;
  end;

  Result := FGammaPremultiplyLUT;
end;

// Gamma change notification callback (registered by GammaPremultiplyLUT):
// rebuilds the tables from the current global gamma settings.
procedure TPremultiplyLUT.GammaChangedHandler;
begin
  SetGamma(GAMMA_VALUE, GAMMA_IS_SRGB);
end;

// Rebuilds both tables so that the gamma conversion is folded into the lookup:
//
//   MulDiv255[Color, Alpha] = Round(Linear(Color / 255) * Alpha)
//     encoded color value -> premultiplied, linear value
//
//   Mul255Div[Value, Alpha] = Clamp(Round(Encode(Value / Alpha) * 255))
//     premultiplied, linear value -> unpremultiplied, encoded color value
//
// Linear/Encode are the sRGB transfer functions when sRGB is True; otherwise
// they are a plain power curve with exponent GammaValue and 1/GammaValue.
// Both tables return 0 for Alpha = 0.
//
// Parameters:
//   GammaValue - power curve exponent; ignored (and not stored) in sRGB mode
//   sRGB       - selects the sRGB transfer functions instead of the power curve
procedure TPremultiplyLUT.SetGamma(const GammaValue: Double; sRGB: boolean);
var
  AlphaValue, ColorValue: Integer;
  n: Single;
  ColorLinear, ColorRGB: TFloat;
begin
  // Nothing to do if the mode is unchanged and, for the power curve, the
  // exponent is unchanged too. This compares against the stored state, so the
  // stored state must be valid (or deliberately mismatching) on first call.
  if (FsRGB = sRGB) and ((FsRGB) or (GammaValue = FGamma)) then
    exit;

  FsRGB := sRGB;

  if (not FsRGB) then
  begin
    FGamma := GammaValue;
    FGammaInv := 1 / FGamma;
  end;

  for ColorValue := 0 to 255 do
  begin
    // Alpha = 0: fully transparent, both directions yield 0
    Mul255Div[ColorValue, 0] := 0;
    MulDiv255[ColorValue, 0] := 0;

    // Encoded color (0..255) -> linear color (0..1)
    ColorLinear := ColorValue * OneOver255;
    if (FsRGB) then
    begin
      // sRGB decoding: power segment above the threshold, linear segment below
      if (ColorLinear >= 0.04045) then
        ColorLinear := Power((ColorLinear + 0.055) * (1 / 1.055), 2.4)
      else
        ColorLinear := ColorLinear * (1 / 12.92);

    end else
      ColorLinear := Power(ColorLinear, FGamma);

    // ColorValue: Color, AlphaValue: Alpha
    for AlphaValue := 1 to 255 do
    begin
      // Linear color (0..1) -> premultiplied, linear value (0..Alpha)
      n := ColorLinear * AlphaValue;
      MulDiv255[ColorValue, AlphaValue] := Round(n);

      // For the reverse table ColorValue is the premultiplied, linear value:
      // dividing by alpha gives the unpremultiplied, linear color (nominally
      // 0..1; larger when the value exceeds alpha, which the Clamp below caps).
      n := ColorValue / AlphaValue;

      // Linear color -> encoded color (0..1)
      if (FsRGB) then
      begin
        // sRGB encoding: power segment above the threshold, linear segment below
        if (n >= 0.0031308) then
          ColorRGB := 1.055 * Power(n, 1 / 2.4) - 0.055
        else
          ColorRGB := n * 12.92;
      end else
        ColorRGB := Power(n, FGammaInv);

      Mul255Div[ColorValue, AlphaValue] := Clamp(Round(ColorRGB * 255));
    end;
  end;
end;

//------------------------------------------------------------------------------

type
  TSingleDynArray = array of Single;

// Returns the one-sided Gaussian kernel for the given blur radius:
//
//   Result[i] = 1 / (Sqrt(2*Pi) * Sigma) * Exp(-i^2 / (2 * Sigma^2))
//
// for i = 0..Ceil(Radius), with Sigma = Radius * GaussianRadiusToSigma.
// Since the kernel is symmetric, callers index it with Abs(offset).
//
// The normalization factor was previously computed as 1 / Sqrt(2*Pi*Sigma),
// which is not the Gaussian normalization. The callers in this unit divide
// the weighted sum by the sum of the weights, so the constant cancels and
// their output is unaffected beyond floating point rounding.
//
// A non-positive sigma (e.g. Radius <= 0) yields the identity kernel [1]
// instead of dividing by zero.
function GaussianKernel(Radius: Single): TSingleDynArray;
var
  i, R: Integer;
  StdDev, C1, C2: Single;
begin
  StdDev := Radius * GaussianRadiusToSigma;

  if not (StdDev > 0) then
  begin
    SetLength(Result, 1);
    Result[0] := 1.0;
    exit;
  end;

  R := Ceil(Radius);
  SetLength(Result, R + 1);
  C1 := 1.0 / (Sqrt(2.0 * Pi) * StdDev);
  C2 := -0.5 / Sqr(StdDev);
  for i := 0 to R do
    Result[i] := C1 * Exp(Sqr(i) * C2);
end;


//------------------------------------------------------------------------------
//
// The original implementation of selective gaussian blur (similar to the
// one in GIMP).
//
//------------------------------------------------------------------------------
// Adapted from Eric Grange's version which in turn was based on the source
// of "Selective gaussian blur filter for the GIMP".
//------------------------------------------------------------------------------
// Reference (brute force) selective Gaussian blur.
//
// For every pixel and each of the three color planes, a (2R+1)x(2R+1) window
// (R = Ceil(Radius), clipped to the bitmap) is scanned and the premultiplied
// sample values that lie within +/-Delta of the premultiplied reference value
// are averaged, weighted by the 2D Gaussian kernel. The result is then
// unpremultiplied with the reference pixel's alpha. Alpha itself is copied
// unmodified.
//
// Parameters:
//   Src, Dst       - source and destination bitmaps; must not be the same object
//   Radius         - blur radius
//   Delta          - maximum difference from the reference value for a sample
//                    to be included
//   PremultiplyLUT - tables used for pre-/unpremultiplication (and, for the
//                    gamma variant, gamma conversion)
//
// If Radius is below GaussianRadiusToSigma, Delta <= 0 or Src is empty, Src is
// simply copied to Dst.
procedure InternalSelectiveGaussianGimp(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer; const PremultiplyLUT: TPremultiplyLUT);
var
  X, Y, Plane, WindowX, WindowY, R, MaxX, MaxY: Integer;
  MinValue, MaxValue: Integer;
  Kernel: TSingleDynArray;
  Sum, Fact, Weight: Single;
  RefColor: TColor32Entry;
  RefValue: Integer;
  SampleColor: TColor32Entry;
  SampleValue: Integer;
  DstValue: Byte;
  pDestColor: PColor32Entry;
  pDstLine, pSrcLine: PColor32Array;
begin
  ASSERT(Src <> Dst);

  if (Radius < GaussianRadiusToSigma) or (Delta <= 0) or (Src.Empty) then
  begin
    Src.CopyMapTo(Dst);
    exit;
  end;

  R := Ceil(Radius);
  MaxX := Src.Width - 1;
  MaxY := Src.Height - 1;

  Dst.SetSizeFrom(Src);
  Kernel := GaussianKernel(Radius);

  for Y := 0 to MaxY do
  begin
    pDstLine := Dst.ScanLine[Y];

    for X := 0 to MaxX do
    begin
      RefColor := TColor32Entry(Src[X, Y]);
      pDestColor := @(pDstLine[X]);

      // Process each of the RGB channels in turn
      for Plane := 0 to 2 do
      begin
        Sum := 0;
        Fact := 0;

        RefValue := RefColor.Planes[Plane];
        // Premultiply and gamma (sRGB->LinearRGB)
        RefValue := PremultiplyLUT.MulDiv255[RefValue, RefColor.A];

        MinValue := RefValue - Delta;
        MaxValue := RefValue + Delta;

        for WindowY := -R to R do
        begin
          // Clip the window to the bitmap
          if WindowY + Y < 0 then
            Continue;
          if WindowY + Y > MaxY then
            Break;

          pSrcLine := Src.ScanLine[WindowY + Y];
          for WindowX := -R to R do
          begin
            if WindowX + X < 0 then
              Continue;
            if WindowX + X > MaxX then
              Break;

            SampleColor := TColor32Entry(pSrcLine[WindowX + X]);
            SampleValue := SampleColor.Planes[Plane];
            // Premultiply and gamma (sRGB->LinearRGB)
            SampleValue := PremultiplyLUT.MulDiv255[SampleValue, SampleColor.A];

            if (SampleValue >= MinValue) and (SampleValue <= MaxValue) then
            begin
              Weight := Kernel[Abs(WindowX)] * Kernel[Abs(WindowY)];
              Sum := Sum + SampleValue * Weight;
              Fact := Fact + Weight;
            end;
          end;
        end;

        // The reference pixel itself always passes the range test, so Fact
        // is never zero here. Clamp because accumulated rounding errors must
        // not be allowed to wrap around when stored in a byte.
        DstValue := Clamp(FastRound(Sum / Fact));
        // Unpremultiply and gamma (LinearRGB->sRGB)
        DstValue := PremultiplyLUT.Mul255Div[DstValue, RefColor.A];

        pDestColor.Planes[Plane] := DstValue;
      end;

      // Copy alpha
      pDestColor.A := RefColor.A;
    end;
  end;
end;

//------------------------------------------------------------------------------

// Reference selective Gaussian blur using the plain (no gamma) premultiplication
// tables.
procedure SelectiveGaussianGimp(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
begin
  InternalSelectiveGaussianGimp(Src, Dst, Radius, Delta, TPremultiplyLUT.PremultiplyLUT()^);
end;

// Reference selective Gaussian blur using the gamma-aware premultiplication
// tables.
procedure SelectiveGaussianGimpGamma(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
begin
  InternalSelectiveGaussianGimp(Src, Dst, Radius, Delta, TPremultiplyLUT.GammaPremultiplyLUT()^);
end;


//------------------------------------------------------------------------------
//
// Optimized algorithm that performs horizontal and vertical blurring only
// once for each reference color at a certain position (x, y). A table is
// used for caching the convolution sums of the reference color values that have
// already been visited. Vertical blurring can then be performed in a
// single pass by looking up already cached entries for a given reference
// color (and we can thus take advantage of the fact the gaussian is
// separable). In order to minimize required memory, the horizontal pass is
// performed on one column at a time. This requires Src.Height * 2^BitDepth
// bytes of memory, so it's a problem if we want to support 16-bit images.
// Another problem with images with higher bit-depth is that there is a
// smaller probability that adjacent colors have the same pixel values.
//
//------------------------------------------------------------------------------
// Selective Gaussian blur that exploits the separability of the kernel by
// caching horizontal convolution sums per (row, reference value).
//
// The bitmap is processed one column and one color plane at a time:
//
//  1. For each pixel in the column, the horizontal selective sums for its
//     reference value are computed for the rows Y-R..Y+R and stored in
//     SumCache[row][reference value]. LastPos remembers, per reference value,
//     the last row already cached so rows are not recomputed when the same
//     value reoccurs further down the column.
//
//  2. For each pixel in the column, the cached row sums for its reference
//     value are combined vertically, and the result is unpremultiplied and
//     stored in Dst.
//
// Alpha is copied unmodified from Src.
//
// Parameters:
//   Src, Dst       - source and destination bitmaps; must not be the same object
//   Radius         - blur radius
//   Delta          - maximum difference from the reference value for a sample
//                    to be included
//   PremultiplyLUT - tables used for pre-/unpremultiplication (and, for the
//                    gamma variant, gamma conversion)
//
// If Radius is below GaussianRadiusToSigma, Delta <= 0 or Src is empty, Src is
// simply copied to Dst.
procedure InternalSelectiveGaussianNew(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer; const PremultiplyLUT: TPremultiplyLUT);
type
  PCacheEntry = ^TCacheEntry;
  TCacheEntry = record
    Sum: Single;  // intermediate sum of convolution
    Fact: Single; // intermediate sum of kernel weights
  end;
var
  X, Y, Plane, WindowX, WindowY, LoY, SampleValue, R, MaxX, MaxY: Integer;
  MinValue, MaxValue: Integer;
  Kernel: TSingleDynArray;
  Sum, Fact, Weight: Single;
  Color: TColor32Entry;
  pColor: PColor32Entry;
  RefValue: Integer;
  PSrcLine: PColor32Array;
  PEntry: PCacheEntry;
  SumCache: array of array [Byte] of TCacheEntry;
  LastPos: array [Byte] of Integer;
  Value: Byte;
begin
  ASSERT(Src <> Dst);

  if (Radius < GaussianRadiusToSigma) or (Delta <= 0) or (Src.Empty) then
  begin
    Src.CopyMapTo(Dst);
    exit;
  end;

  R := Ceil(Radius);
  MaxX := Src.Width - 1;
  MaxY := Src.Height - 1;

  Dst.SetSizeFrom(Src);
  SetLength(SumCache, Src.Height);
  Kernel := GaussianKernel(Radius);

  for X := 0 to MaxX do
  begin

    // Process each of the RGB channels in turn
    for Plane := 0 to 2 do
    begin
      // Nothing is cached yet for this column/plane
      FillLongword(LastPos[0], Length(LastPos), Cardinal(Low(Integer)));

      // Pass 1: horizontal sums, cached per row and reference value
      for Y := 0 to MaxY do
      begin

        Color := TColor32Entry(Src[X, Y]);
        RefValue := Color.Planes[Plane];

        // Premultiply and gamma (sRGB->LinearRGB)
        RefValue := PremultiplyLUT.MulDiv255[RefValue, Color.A];

        MinValue := RefValue - Delta;
        MaxValue := RefValue + Delta;

        // Skip the rows already cached for this reference value
        if LastPos[RefValue] < Y - R then
          LoY := Y - R
        else
          LoY := LastPos[RefValue] + 1;

        for WindowY := LoY to Y + R do
        begin
          if WindowY < 0 then
            Continue;
          if WindowY > MaxY then
            Break;

          Sum := 0;
          Fact := 0;
          PSrcLine := Src.Scanline[WindowY];

          for WindowX := -R to R do
          begin
            if WindowX + X < 0 then
              Continue;
            if WindowX + X > MaxX then
              Break;

            Color := TColor32Entry(PSrcLine[WindowX + X]);
            SampleValue := Color.Planes[Plane];
            // Premultiply and gamma (sRGB->LinearRGB)
            SampleValue := PremultiplyLUT.MulDiv255[SampleValue, Color.A];

            if (SampleValue >= MinValue) and (SampleValue <= MaxValue) then
            begin
              Weight := Kernel[Abs(WindowX)];
              Sum := Sum + SampleValue * Weight;
              Fact := Fact + Weight;
            end;
          end;

          PEntry := @SumCache[WindowY][RefValue];
          PEntry.Sum := Sum;
          PEntry.Fact := Fact;
        end;

        LastPos[RefValue] := Y + R;
      end;

      // Pass 2: combine the cached row sums vertically
      for Y := 0 to MaxY do
      begin
        Color := TColor32Entry(Src[X, Y]);
        RefValue := Color.Planes[Plane];
        // Premultiply and gamma (sRGB->LinearRGB)
        RefValue := PremultiplyLUT.MulDiv255[RefValue, Color.A];

        Sum := 0;
        Fact := 0;
        for WindowY := -R to R do
        begin
          if WindowY + Y < 0 then
            Continue;
          if WindowY + Y > MaxY then
            Break;
          Weight := Kernel[Abs(WindowY)];
          PEntry := @SumCache[WindowY + Y][RefValue];
          Sum := Sum + PEntry.Sum * Weight;
          Fact := Fact + PEntry.Fact * Weight;
        end;

        pColor := PColor32Entry(Dst.PixelPtr[X, Y]);

        // Row Y always contains the reference pixel itself, so Fact is never
        // zero here. Clamp so rounding errors cannot wrap around in a byte.
        Value := Clamp(Round(Sum / Fact));
        // Unpremultiply and gamma (LinearRGB->sRGB).
        // This must use the *source* alpha: the destination alpha has not
        // been written yet when the first plane is processed.
        Value := PremultiplyLUT.Mul255Div[Value, Color.A];

        pColor.Planes[Plane] := Value;
      end;
    end;

    // Copy alpha; once per column is enough
    for Y := 0 to MaxY do
      PColor32Entry(Dst.PixelPtr[X, Y]).A := TColor32Entry(Src[X, Y]).A;
  end;
end;

//------------------------------------------------------------------------------

// Cached/separable selective Gaussian blur using the plain (no gamma)
// premultiplication tables.
procedure SelectiveGaussianNew(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
begin
  InternalSelectiveGaussianNew(Src, Dst, Radius, Delta, TPremultiplyLUT.PremultiplyLUT()^);
end;

// Cached/separable selective Gaussian blur using the gamma-aware
// premultiplication tables.
procedure SelectiveGaussianNewGamma(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
begin
  InternalSelectiveGaussianNew(Src, Dst, Radius, Delta, TPremultiplyLUT.GammaPremultiplyLUT()^);
end;


//------------------------------------------------------------------------------
//
// This algorithm performs selective blurring first horizontally (storing
// the intermediate result in a bitmap), and then vertically. The output
// is slightly different from ordinary gaussian selective blurring, but
// the speed-up is significant.
//
//------------------------------------------------------------------------------
// Two-pass selective Gaussian blur: a horizontal selective blur of Src into an
// intermediate bitmap, followed by a vertical selective blur of the
// intermediate into Dst. The second pass selects samples relative to the
// horizontally blurred values, so the output differs slightly from the true 2D
// selective blur.
//
// No premultiplication or gamma handling is performed; the color planes are
// processed as-is and alpha is copied unmodified from Src.
//
// The intermediate result is kept in a private temporary bitmap, so Src is
// left untouched. Earlier revisions used Src itself as the scratch buffer
// (which is what the "deprecated" hint on the interface declaration refers to)
// and also took the output alpha from destination pixels whose alpha had never
// been written.
//
// Parameters:
//   Src, Dst - source and destination bitmaps; must not be the same object
//   Radius   - blur radius
//   Delta    - maximum difference from the reference value for a sample to be
//              included
//
// If Radius is below GaussianRadiusToSigma, Delta <= 0 or Src is empty, Src is
// simply copied to Dst.
procedure SelectiveGaussianHorzVert(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
var
  X, Y, Plane, WindowX, WindowY, SampleValue, R, MaxX, MaxY: Integer;
  MinValue, MaxValue: Integer;
  Kernel: TSingleDynArray;
  Sum, Fact, Weight: Single;
  RefColor: TColor32Entry;
  RefValue: Integer;
  Temp: TBitmap32;
  PDstLine, PSrcLine: PColor32Array;
begin
  ASSERT(Src <> Dst);

  if (Radius < GaussianRadiusToSigma) or (Delta <= 0) or (Src.Empty) then
  begin
    Src.CopyMapTo(Dst);
    exit;
  end;

  R := Ceil(Radius);
  MaxX := Src.Width - 1;
  MaxY := Src.Height - 1;

  Dst.SetSizeFrom(Src);
  Kernel := GaussianKernel(Radius);

  Temp := TBitmap32.Create;
  try
    Temp.SetSizeFrom(Src);

    // Pass 1: horizontal, Src -> Temp
    for Y := 0 to MaxY do
    begin
      PSrcLine := Src.ScanLine[Y];
      PDstLine := Temp.ScanLine[Y];
      for X := 0 to MaxX do
      begin
        RefColor := TColor32Entry(PSrcLine[X]);
        for Plane := 0 to 2 do
        begin
          Sum := 0;
          Fact := 0;
          RefValue := RefColor.Planes[Plane];
          MinValue := RefValue - Delta;
          MaxValue := RefValue + Delta;
          for WindowX := -R to R do
          begin
            // Clip the window to the bitmap
            if WindowX + X < 0 then
              Continue;
            if WindowX + X > MaxX then
              Break;

            SampleValue := TColor32Entry(PSrcLine[WindowX + X]).Planes[Plane];
            if (SampleValue >= MinValue) and (SampleValue <= MaxValue) then
            begin
              Weight := Kernel[Abs(WindowX)];
              Sum := Sum + SampleValue * Weight;
              Fact := Fact + Weight;
            end;
          end;
          // The reference pixel always passes the range test, so Fact > 0
          TColor32Entry(PDstLine[X]).Planes[Plane] := Clamp(Round(Sum / Fact));
        end;
        // Carry the source alpha along so pass 2 can copy it to the output
        TColor32Entry(PDstLine[X]).A := RefColor.A;
      end;
    end;

    // Pass 2: vertical, Temp -> Dst
    for Y := 0 to MaxY do
    begin
      PDstLine := Dst.ScanLine[Y];
      for X := 0 to MaxX do
      begin
        RefColor := TColor32Entry(Temp[X, Y]);
        for Plane := 0 to 2 do
        begin
          Sum := 0;
          Fact := 0;
          RefValue := RefColor.Planes[Plane];
          MinValue := RefValue - Delta;
          MaxValue := RefValue + Delta;
          for WindowY := -R to R do
          begin
            if WindowY + Y < 0 then
              Continue;
            if WindowY + Y > MaxY then
              Break;

            SampleValue := TColor32Entry(Temp[X, WindowY + Y]).Planes[Plane];
            if (SampleValue >= MinValue) and (SampleValue <= MaxValue) then
            begin
              Weight := Kernel[Abs(WindowY)];
              Sum := Sum + SampleValue * Weight;
              Fact := Fact + Weight;
            end;
          end;
          TColor32Entry(PDstLine[X]).Planes[Plane] := Clamp(Round(Sum / Fact));
        end;
        TColor32Entry(PDstLine[X]).A := RefColor.A;
      end;
    end;
  finally
    Temp.Free;
  end;
end;


//------------------------------------------------------------------------------
//
// SIMD optimized Selective Gaussian Blur
// Originally by Mattias Andersson
//
//------------------------------------------------------------------------------
// Modified to also blur alpha channel.
// Various fixes for size not mod 4 and buffer overflows.
// Replaced MMX version with SSE2 version.
//------------------------------------------------------------------------------
type
  // Integer accumulator pair: a convolution sum and the matching sum of kernel
  // weights.
  // NOTE(review): the code using this type is outside the part of the unit
  // shown here; presumably it is the cache entry of the optimized blur - confirm.
  PCacheEntry = ^TCacheEntry;
  TCacheEntry = record
    Sum: Cardinal;  // intermediate sum of convolution
    Fact: Cardinal; // intermediate sum of kernel weights
  end;

  // A byte value range with an associated sum. Declared packed, so the fields
  // are laid out without alignment padding.
  // NOTE(review): the code using this type is outside the part of the unit
  // shown here; verify the exact meaning of the fields against it.
  PRangeEntry = ^TRangeEntry;
  TRangeEntry = packed record
    Min: Byte;
    Max: Byte;
    Sum: Cardinal;
  end;

//------------------------------------------------------------------------------

// Returns the full, symmetric Gaussian kernel for the given blur radius as
// integer weights:
//
//   Result[Center + i] = Result[Center - i] = Round(255 * Exp(-i^2 / (2 * Sigma^2)))
//
// for i = 0..Center, with Center = Ceil(Radius) and
// Sigma = Radius * GaussianRadiusToSigma. The array thus has 2 * Center + 1
// entries and the center weight is 255.
function GaussianKernelInt(Radius: Single): TArrayOfWord;
var
  Offset, Center: Integer;
  Exponent: Single;
  Weight: Word;
begin
  Center := Ceil(Radius);
  SetLength(Result, Center * 2 + 1);
  Exponent := -0.5 / Sqr(Radius * GaussianRadiusToSigma);

  // Compute one half and mirror it around the center
  for Offset := 0 to Center do
  begin
    Weight := Round(255 * Exp(Sqr(Offset) * Exponent));
    Result[Center + Offset] := Weight;
    Result[Center - Offset] := Weight;
  end;
end;

//------------------------------------------------------------------------------

// Pascal reference implementation of the accumulation kernel.
//
// Scans Count samples and, for every sample strictly between Min and Max,
// adds (sample * weight) shr 8 to Sum and the weight to FactSum.
//
// Parameters:
//   pSrc    - Count sample bytes
//   pFact   - Count kernel weights, one per sample
//   Min,Max - exclusive bounds of the accepted sample range
//   Sum     - receives the accumulated, scaled products
//   FactSum - receives the accumulated weights
procedure Accumulate_Pas(pSrc: PByteArray; pFact: PWordArray; Count, Min, Max: Integer; out Sum, FactSum: Cardinal);
var
  Index: Integer;
  Sample: Byte;
  Weight: Word;
begin
  Sum := 0;
  FactSum := 0;

  for Index := Count - 1 downto 0 do
  begin
    Sample := pSrc[Index];

    // Reject samples on or outside the bounds
    if (Sample <= Min) or (Sample >= Max) then
      Continue;

    Weight := pFact[Index];
    Sum := Sum + ((Sample * Weight) shr 8);
    FactSum := FactSum + Weight;
  end;
end;

{$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}

procedure Accumulate_SSE2(pSrc: Pointer; pFact: Pointer; Count, Min, Max: Integer; out Sum, FactSum: Cardinal); //{$IFDEF FPC} assembler; {$ENDIF}
  // Parameters (x86):
  //   EAX <- pSrc
  //   EDX <- pFact
  //   ECX <- Count
  //   Stack[0] <- Min
  //   Stack[1] <- Max
  //   Stack[2] <- @Sum
  //   Stack[3] <- @FactSum
  //
  // Parameters (x64):
  //   RCX <- pSrc
  //   RDX <- pFact
  //   R8 <- Count
  //   R9 <- Min
  //   Stack[0] <- Max
  //   Stack[1] <- @Sum
  //   Stack[2] <- @FactSum

  // SSE register usage:
  //   XMM0: Min | Min | Min | Min
  //   XMM1: Max | Max | Max | Max
  //   XMM2: Four pSrc bytes
  //   XMM3: Four pFact words
  //   XMM4: "Misc"
  //   XMM5: Sum
  //   XMM6: FactSum
  //   XMM7: "Zero"
{$if defined(TARGET_x64) and defined(FPC)}begin{$ifend}
asm
{$if defined(TARGET_x64)}
{$IFNDEF FPC}
  .SAVENV XMM4
  .SAVENV XMM5
  .SAVENV XMM6
  .SAVENV XMM7
{$ENDIF}
{$elseif defined(TARGET_x86)}
  // nothing
{$else}
{$message fatal 'Unsupported target'}
{$ifend}
{$IFDEF FPC}
{$define RETARD_COMPILER} // Just to make it clear what I think of FPC's assembler
{$ENDIF}

  // initialize
        // M0 := Min;
        MOVD        XMM0, Min
        PUNPCKLWD   XMM0, XMM0  // Unpack Low Data ([ab][cd] -> [acbd])
        PUNPCKLDQ   XMM0, XMM0
        // M1 := Max;
        MOVD        XMM1, Max
        PUNPCKLWD   XMM1, XMM1
        PUNPCKLDQ   XMM1, XMM1

        PXOR        XMM5, XMM5
        PXOR        XMM6, XMM6
        PXOR        XMM7, XMM7

        // Negative offset "trick"
{$if defined(TARGET_x86)}
        LEA         pSrc, [pSrc+Count]
        LEA         pFact, [pFact+Count*2]
        NEG         Count
{$elseif defined(TARGET_x64)}
{$IFNDEF RETARD_COMPILER}
        LEA         pSrc, [pSrc+R8]
        LEA         pFact, [pFact+R8*2]
{$ELSE}
        LEA         ECX, [RCX+R8]
        LEA         EDX, [RDX+R8*2]
{$ENDIF}
        NEG         R8
{$ifend}

        // if (Count mod 4 = 0) then goto :ProcessFours
{$if defined(TARGET_x86)}
        TEST        Count, $0003
{$elseif defined(TARGET_x64)}
        TEST        R8, $0003
{$ifend}
        JZ          @ProcessFours

        // Process Count/4 remainders
{$if defined(TARGET_x86)}
        PUSH        EBX
        PUSH        EDI
{$ifend}
@NextOne:


{$if defined(TARGET_x86)}
        // if (pSrc[Count] <= Min) or (pSrc[Count] >= Max) then goto :SkipOne
        MOVZX       EBX, BYTE PTR[pSrc+Count] // Load single byte

        CMP         Min, EBX
        JGE         @SkipOne
        CMP         EBX, Max
        JGE         @SkipOne

        // Sum := Sum + ((pSrc[Count] * pFact[Count]) shr 8);
        MOVZX       EDI, WORD PTR[pFact+Count*2] // Load single word

        IMUL        EBX, EDI
        SHR         EBX, 8
        MOVD        XMM2, EBX
        PADDD       XMM5, XMM2

        // FactSum := FactSum + pFact[Count];
        MOVD        XMM3, EDI
        PADDD       XMM6, XMM3
{$elseif defined(TARGET_x64)}
        // if (pSrc[Count] <= Min) or (pSrc[Count] >= Max) then goto :SkipOne
{$IFNDEF RETARD_COMPILER}
        MOVZX       R10D, BYTE PTR[pSrc+R8] // Load single byte
{$ELSE}
        MOVZX       R10D, BYTE PTR[RCX+R8] // Load single byte
{$ENDIF}

{$IFNDEF RETARD_COMPILER}
        CMP         Min, R10D
{$ELSE}
        CMP         R9D, R10D
{$ENDIF}
        JGE         @SkipOne
        CMP         R10D, Max
        JGE         @SkipOne

        // Sum := Sum + ((pSrc[Count] * pFact[Count]) shr 8);
{$IFNDEF RETARD_COMPILER}
        MOVZX       R11D, WORD PTR[pFact+R8*2] // Load single word
{$ELSE}
        MOVZX       R11D, WORD PTR[RDX+R8*2] // Load single word
{$ENDIF}

        IMUL        R10D, R11D
        SHR         R10D, 8
        MOVD        XMM2, R10D
        PADDD       XMM5, XMM2

        // FactSum := FactSum + pFact[Count];
        MOVD        XMM3, R11D
        PADDD       XMM6, XMM3
{$ifend}

@SkipOne:
{$if defined(TARGET_x86)}
        INC         Count
{$elseif defined(TARGET_x64)}
        INC         R8
{$ifend}
        // if (Count mod 4 <> 0) then goto :NextOne
{$if defined(TARGET_x86)}
        TEST        Count, $0003
{$elseif defined(TARGET_x64)}
        TEST        R8, $0003
{$ifend}
        JNZ         @NextOne

{$if defined(TARGET_x86)}
        POP         EDI
        POP         EBX
{$ifend}

@ProcessFours:
        // Count := Count div 4
        // if (Count = 0) then goto :Done
{$if defined(TARGET_x86)}
        SAR         Count, 2
        JCXZ        @Done
{$elseif defined(TARGET_x64)}
        SAR         R8, 2
        JZ          @Done
{$ifend}


  // loop start
@Loop:
        // if (pSrc[Count] > Min) and (pSrc[Count] < Max) then
        // begin
        //   Sum := Sum + pSrc[Count] * pFact[Count];
        //   FactSum := FactSum + pFact[Count];
        // end;

        // M2 := pSrc[Count];
{$if defined(TARGET_x86)}
        MOVD        XMM2, DWORD PTR [pSrc+Count*4] // Load four bytes
{$elseif defined(TARGET_x64)}
{$IFNDEF RETARD_COMPILER}
        MOVD        XMM2, DWORD PTR [pSrc+R8*4] // Load four bytes
{$ELSE}
        MOVD        XMM2, DWORD PTR [RCX+R8*4] // Load four bytes
{$ENDIF}
{$ifend}
        PUNPCKLBW   XMM2, XMM7
        // M3 := pFact[Count];
{$if defined(TARGET_x86)}
        MOVQ        XMM3, QWORD PTR [pFact+Count*8] // Load four words
{$elseif defined(TARGET_x64)}
{$IFNDEF RETARD_COMPILER}
        MOVQ        XMM3, QWORD PTR [pFact+R8*8] // Load four words
{$ELSE}
        MOVQ        XMM3, QWORD PTR [RDX+R8*8] // Load four words
{$ENDIF}
{$ifend}

  // store threshold mask in MM4
        // M4 := M2;
        MOVQ        XMM4, XMM2
        // if (M4 > Min) then M4 := $FF else M4 := $00;
        PCMPGTW     XMM4, XMM0  // Compare Packed Signed Integers for Greater Than
        // M4 := M4 and Max;
        PAND        XMM4, XMM1
        // if (M4 > M2) then M4 := $FF else M4 := $00;
        PCMPGTW     XMM4, XMM2

  // mask colors and weights
        // M2 := M2 and M4;
        PAND        XMM2, XMM4
        // M3 := M3 and M4;
        PAND        XMM3, XMM4

  // multiply colors and weights
        // M2 := M2 * M3;
        PMULLW      XMM2, XMM3
        // Clear lower byte of four words
{$if (not defined(FPC)) or (not defined(TARGET_X64))}
        PAND        XMM2, DQWORD PTR [SSE_FF00FF00_ALIGNED]
{$else}
        PAND        XMM2, DQWORD PTR [rip+SSE_FF00FF00_ALIGNED]
{$ifend}

  // perform accumulation
        // M5 := M5 + M2;
        PSADBW      XMM2, XMM7  // Compute Sum of Absolute Differences
                                // This sums the four ((M2 * M3) shl 8).
        PADDD       XMM5, XMM2

        // M6 := M6 + M3;
        PSADBW      XMM3, XMM7
        PADDD       XMM6, XMM3

  // loop end
{$if defined(TARGET_x86)}
        INC         Count
{$elseif defined(TARGET_x64)}
        INC         R8
{$ifend}
        JNZ         @Loop

@Done:

{$if defined(TARGET_x86)}
        MOV         EAX, Sum
        MOVD        DWORD PTR [EAX], XMM5
        MOV         EAX, FactSum
        MOVD        DWORD PTR [EAX], XMM6
{$elseif defined(TARGET_x64)}
        MOV         RAX, Sum
        MOVD        DWORD PTR [RAX], XMM5
        MOV         RAX, FactSum
        MOVD        DWORD PTR [RAX], XMM6
{$ifend}

{$if defined(TARGET_x64) and defined(FPC)}end['XMM4', 'XMM5', 'XMM6', 'XMM7'];{$ifend}
end;
{$ifend}

//------------------------------------------------------------------------------

// Selective Gaussian blur worker, variant 1.
//
// Blurs Src into Dst (which must be a different bitmap). The blue, green and
// red channels are processed independently on premultiplied byte maps; the
// alpha channel is copied unchanged from Src to Dst.
//
// Parameters:
//   Src, Dst        Source and destination bitmaps. Dst is resized to match Src.
//   Radius          Blur radius. The kernel spans Ceil(Radius) samples to
//                   each side of the center.
//   Delta           Only samples strictly within RefValue +/- Delta of the
//                   center sample take part (see SelectiveGaussianAccumulate).
//   PremultiplyLUT  Lookup tables used to premultiply (MulDiv255) and
//                   unpremultiply (Mul255Div) the color channels.
//
// If Radius is below GaussianRadiusToSigma, Delta is not positive, or Src is
// empty, Src is simply copied to Dst.
//
// For every column X and plane, the rows are walked top to bottom. The
// horizontal (row) sums produced by SelectiveGaussianAccumulate only depend on
// the row and on the center value (since the accepted range is
// RefValue +/- Delta), so they are cached in SumCache[Row][RefValue] and
// reused by later rows of the same column that have the same center value.
procedure InternalSelectiveGaussian1(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer; const PremultiplyLUT: TPremultiplyLUT);
const
  SourcePlanes: array[0..2] of TConversionType = (ctBlue, ctGreen, ctRed);
var
  X, Y, WindowY, MinX: NativeInt; // NativeInt required on FPC to support negative array offset
  Plane, LoY, R, MaxX, MaxY: Integer;
  XCount, ColCount: Integer;
  MinValue, MaxValue: Integer;
  Kernel: TArrayOfWord;
  PKernel: PWordArray;
  VSum, VFact, Sum: Cardinal;
  Weight, Fact: Cardinal;
  RefValue: Cardinal;
  CacheEntry: PCacheEntry;
  Value: Cardinal;
  pColor: PColor32Entry;

  // Per row, per center value: cached horizontal sum and weight sum
  SumCache: array of array [Byte] of TCacheEntry;
  // Per center value: last row for which SumCache holds a valid entry
  // (for the column/plane currently being processed)
  LastPos: array [Byte] of Integer;

  Map: array[Low(SourcePlanes)..High(SourcePlanes)] of TByteMap;
  pMap: PByteArray;
begin
  ASSERT(Src <> Dst);

  // Nothing to blur: pass the source through unchanged
  if (Radius < GaussianRadiusToSigma) or (Delta <= 0) or (Src.Empty) then
  begin
    Src.CopyMapTo(Dst);
    exit;
  end;

  R := Ceil(Radius);
  MaxX := Src.Width - 1;
  MaxY := Src.Height - 1;
  ColCount := Src.Width;

  Dst.SetSizeFrom(Src);
  SetLength(SumCache, Src.Height);

  Kernel := GaussianKernelInt(Radius);
  PKernel := PWordArray(@Kernel[R]); // Note: Pointer to midpoint; indexed with -R..R

  for Plane := Low(Map) to High(Map) do
    Map[Plane] := TByteMap.Create;
  try

    // Load RGB into separate maps
    for Plane := Low(Map) to High(Map) do
    begin
      Map[Plane].ReadFrom(Src, SourcePlanes[Plane]);

      // Premultiply and gamma (sRGB->LinearRGB)
      pMap := Map[Plane].Bits;
      for X := 0 to Src.Width*Src.Height-1 do
      begin
        // Alpha is not blurred; it is carried over as-is
        TColor32Entry(Dst.Bits[X]).A := TColor32Entry(Src.Bits[X]).A;
        pMap[X] := PremultiplyLUT.MulDiv255[pMap[X], TColor32Entry(Src.Bits[X]).A];
      end;
    end;

    for X := 0 to MaxX do
    begin
      // Clip the horizontal kernel window against the left/right bitmap edges
      MinX := Max(-R, -X);
      XCount := Min(R, MaxX - X) - MinX;

      // Process each channel in turn
      for Plane := Low(Map) to High(Map) do
      begin
        // pMap points at the leftmost sample of the window in row 0
        pMap := PByteArray(Map[Plane].ValPtr[X + MinX, 0]);

        // Invalidate the cache positions for this column/plane
        FillLongword(LastPos[0], Length(LastPos), Cardinal(Low(Integer)));
        for Y := 0 to MaxY do
        begin
          // Center sample; "- MinX" steps from the window start back to column X
          RefValue := pMap[Y * ColCount - MinX];

          MinValue := integer(RefValue) - Delta;
          MaxValue := integer(RefValue) + Delta;

          // LoY is the first row of the vertical window that has no cached
          // row sum for this center value
          if LastPos[RefValue] < Y - R then
          begin
            if Y < R then
              LoY := 0
            else
              LoY := Y - R
          end else
            LoY := LastPos[RefValue] + 1;

          VSum := 0;
          VFact := 0;

          // Rows Y-R..LoY-1: reuse the cached row sums
          for WindowY := Y - R to LoY - 1 do
          begin
            if WindowY < 0 then
              Continue;
            Weight := PKernel[WindowY - Y];
            CacheEntry := @SumCache[WindowY][RefValue];
            VSum := VSum + CacheEntry.Sum * Weight;
            VFact := VFact + CacheEntry.Fact * Weight;
          end;

          // Rows LoY..Y+R: compute the row sums and store them in the cache
          for WindowY := LoY to Y + R do
          begin
            if WindowY > MaxY then
              Break;

            SelectiveGaussianAccumulate(@pMap[WindowY * ColCount], @PKernel[MinX], XCount, MinValue, MaxValue, Sum, Fact);

            CacheEntry := @SumCache[WindowY][RefValue];
            CacheEntry.Sum := Sum;
            CacheEntry.Fact := Fact;

            Weight := PKernel[WindowY - Y];
            VSum := VSum + Sum * Weight;
            VFact := VFact + Fact * Weight;
          end;
          LastPos[RefValue] := Min(Y + R, MaxY);

          Value := 0;
          pColor := PColor32Entry(Dst.PixelPtr[X, Y]);

          // Note:
          // An earlier version scaled the divisor down ("VFact shr 8")
          // before dividing, because scaling the dividend up ("VSum shl 8")
          // could overflow 31 bits and turn the result negative.
          // The sum variables are now unsigned so all 32 bits can be used,
          // which allows the dividend to be scaled instead. This lessens
          // the rounding error:
          //   Value := (VSum shl 8) div VFact;

          if (VSum <> 0) and (VFact <> 0) then
          begin

            // We could improve the precision and lessen the signal loss by
            // doing a Round instead of a Div here, but it only improves
            // the loss slightly and it absolutely kills the performance.
            // Value := Round((VSum shl 8) / VFact);

            Value := (VSum shl 8) div VFact;

            // Unpremultiply and gamma (LinearRGB->sRGB)
            Value := PremultiplyLUT.Mul255Div[Value, pColor.A];

          end;

          pColor.Planes[Plane] := Value;

        end;
      end;
    end;

  finally
    for Plane := Low(Map) to High(Map) do
      Map[Plane].Free;
  end;
end;

//------------------------------------------------------------------------------

// Selective Gaussian blur, variant 1, using the lookup table returned by
// TPremultiplyLUT.PremultiplyLUT.
procedure SelectiveGaussian1(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
var
  LUT: PPremultiplyLUT;
begin
  // Resolve the table pointer, then pass the table itself to the worker
  LUT := TPremultiplyLUT.PremultiplyLUT;
  InternalSelectiveGaussian1(Src, Dst, Radius, Delta, LUT^);
end;

// Selective Gaussian blur, variant 1, using the lookup table returned by
// TPremultiplyLUT.GammaPremultiplyLUT.
procedure SelectiveGaussianGamma1(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
var
  LUT: PPremultiplyLUT;
begin
  // Resolve the table pointer, then pass the table itself to the worker
  LUT := TPremultiplyLUT.GammaPremultiplyLUT;
  InternalSelectiveGaussian1(Src, Dst, Radius, Delta, LUT^);
end;

//------------------------------------------------------------------------------

// Selective Gaussian blur worker, variant 2.
//
// Blurs Src into Dst (which must be a different bitmap). The blue, green and
// red channels are processed independently on premultiplied byte maps; the
// alpha channel is copied unchanged from Src to Dst.
//
// Parameters:
//   Src, Dst        Source and destination bitmaps. Dst is resized to match Src.
//   Radius          Blur radius. The kernel spans Ceil(Radius) samples to
//                   each side of the center.
//   Delta           Samples are only included if they lie within
//                   RefValue +/- Delta of the center sample.
//   PremultiplyLUT  Lookup tables used to premultiply (MulDiv255) and
//                   unpremultiply (Mul255Div) the color channels.
//
// If Radius is below GaussianRadiusToSigma, Delta is not positive, or Src is
// empty, Src is simply copied to Dst.
//
// In addition to the per-row/per-center-value sum cache (SumCache), this
// variant precomputes, for each row of the current column, the minimum,
// maximum and unconditional weighted sum of the horizontal window
// (RangeCache). When a row's min/max shows that the whole window lies inside
// the accepted range, the precomputed sum is used and the call to
// SelectiveGaussianAccumulate is skipped.
procedure InternalSelectiveGaussian2(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer; const PremultiplyLUT: TPremultiplyLUT);
const
  SourcePlanes: array[0..2] of TConversionType = (ctBlue, ctGreen, ctRed);
var
  KernelMinX, KernelMaxX, WindowX, WindowY: NativeInt;
  Plane, LoY, R: Integer;
  X, Y, MaxX, MaxY: integer;
  MinValue, MaxValue: integer;
  ColCount: integer;
  Kernel: TArrayOfWord;
  PKernel: PWordArray;
  VSum, VFact, Sum: Cardinal;
  Weight, Fact: Cardinal;
  WindowFact: Cardinal; // Sum of the kernel weights inside the clipped horizontal window
  RefValue: integer;
  PSrcLine: PByteArray;
  CacheEntry: PCacheEntry;
  RangeEntry: PRangeEntry;
  pColor: PColor32Entry;
  SampleValue: integer;
  Value: Cardinal;

  // Per row, per center value: cached horizontal sum and weight sum
  SumCache: array of array [Byte] of TCacheEntry;
  // Per row: min, max and unconditional weighted sum of the horizontal window
  RangeCache: array of TRangeEntry;
  // Per center value: last row for which SumCache holds a valid entry
  // (for the column/plane currently being processed)
  LastPos: array [Byte] of Integer;

  Map: array[Low(SourcePlanes)..High(SourcePlanes)] of TByteMap;
  PMap: PByteArray;
begin
  ASSERT(Src <> Dst);

  // Nothing to blur: pass the source through unchanged
  if (Radius < GaussianRadiusToSigma) or (Delta <= 0) or (Src.Empty) then
  begin
    Src.CopyMapTo(Dst);
    exit;
  end;

  R := Ceil(Radius);
  MaxX := Src.Width - 1;
  MaxY := Src.Height - 1;
  ColCount := Src.Width;

  Dst.SetSizeFrom(Src);
  SetLength(SumCache, Src.Height);
  SetLength(RangeCache, Src.Height);

  Kernel := GaussianKernelInt(Radius);
  PKernel := PWordArray(@Kernel[R]); // Pointer to midpoint; indexed with -R..R

  for Plane := Low(Map) to High(Map) do
    Map[Plane] := TByteMap.Create;
  try

    // Load RGB into separate maps
    for Plane := Low(Map) to High(Map) do
    begin
      Map[Plane].ReadFrom(Src, SourcePlanes[Plane]);

      // Premultiply and gamma (sRGB->LinearRGB)
      pMap := Map[Plane].Bits;
      for X := 0 to Src.Width*Src.Height-1 do
      begin
        // Alpha is not blurred; it is carried over as-is
        TColor32Entry(Dst.Bits[X]).A := TColor32Entry(Src.Bits[X]).A;
        pMap[X] := PremultiplyLUT.MulDiv255[pMap[X], TColor32Entry(Src.Bits[X]).A];
      end;
    end;

    for X := 0 to MaxX do
    begin
      // Clip the horizontal kernel window against the left/right bitmap edges
      KernelMinX := Max(-R, -X);
      KernelMaxX := Min(R, MaxX - X);

      // Total weight of the clipped window. This is the weight sum that goes
      // with RangeEntry.Sum, which is computed over the exact same window.
      // Summing the window directly handles clipping at the left edge, the
      // right edge, and both edges at once (bitmap narrower than the kernel).
      WindowFact := 0;
      for WindowX := KernelMinX to KernelMaxX do
        WindowFact := WindowFact + PKernel[WindowX];

      // Process each channel in turn
      for Plane := Low(Map) to High(Map) do
      begin
        // PMap points at column X of row 0
        PMap := PByteArray(Map[Plane].ValPtr[X, 0]);

        // compute range entries

        for Y := 0 to MaxY do
        begin
          PSrcLine := PByteArray(Map[Plane].ValPtr[X, Y]);

          Sum := 0;
          MinValue := 255;
          MaxValue := 0;

          for WindowX := KernelMinX to KernelMaxX do
          begin
            SampleValue := PSrcLine[WindowX];
            Sum := Sum + Cardinal(PKernel[WindowX] * SampleValue);

            if SampleValue < MinValue then
              MinValue := SampleValue;
            if SampleValue > MaxValue then
              MaxValue := SampleValue;
          end;

          RangeEntry := @RangeCache[Y];
          RangeEntry.Min := MinValue;
          RangeEntry.Max := MaxValue;
          RangeEntry.Sum := Sum shr 8;
        end;

        // Invalidate the cache positions for this column/plane
        FillLongword(LastPos[0], Length(LastPos), Cardinal(Low(Integer)));
        for Y := 0 to MaxY do
        begin
          // Center sample
          RefValue := PMap[Y * ColCount];

          MinValue := RefValue - Delta;
          MaxValue := RefValue + Delta;

          // LoY is the first row of the vertical window that has no cached
          // row sum for this center value
          if LastPos[RefValue] < Y - R then
          begin
            if Y < R then
              LoY := 0
            else
              LoY := Y - R
          end else
            LoY := LastPos[RefValue] + 1;

          VSum := 0;
          VFact := 0;

          // Rows Y-R..LoY-1: reuse the cached row sums
          for WindowY := Y - R to LoY - 1 do
          begin
            if WindowY < 0 then
              Continue;
            Weight := PKernel[WindowY - Y];
            CacheEntry := @SumCache[WindowY][RefValue];
            VSum := VSum + CacheEntry.Sum * Weight;
            VFact := VFact + CacheEntry.Fact * Weight;
          end;

          // Rows LoY..Y+R: compute the row sums and store them in the cache
          for WindowY := LoY to Y + R do
          begin
            if WindowY > MaxY then
              break;

            RangeEntry := @RangeCache[WindowY];

            // NOTE(review): this test treats samples equal to MinValue or
            // MaxValue as inside the range, while the SSE2 accumulate routine
            // appears to skip samples that are <= Min or >= Max - confirm
            // whether the boundary handling is meant to differ.
            if (RangeEntry.Min < MinValue) or (RangeEntry.Max > MaxValue) then
            begin

              // At least one sample is out of range: do the selective sum
              SelectiveGaussianAccumulate(@PMap[WindowY * ColCount + KernelMinX], @PKernel[KernelMinX], KernelMaxX - KernelMinX, MinValue, MaxValue, Sum, Fact);

            end else
            begin
              // All samples are in range: use the precomputed row sum
              // together with the total weight of the clipped window
              Sum := RangeEntry.Sum;
              Fact := WindowFact;
            end;

            CacheEntry := @SumCache[WindowY][RefValue];
            CacheEntry.Sum := Sum;
            CacheEntry.Fact := Fact;

            Weight := PKernel[WindowY - Y];
            VSum := VSum + Sum * Weight;
            VFact := VFact + Fact * Weight;
          end;

          LastPos[RefValue] := Min(Y + R, MaxY);

          Value := 0;
          pColor := PColor32Entry(Dst.PixelPtr[X, Y]);

          // The dividend is scaled up ("VSum shl 8") instead of scaling the
          // divisor down ("VFact shr 8"). The sums are unsigned so all
          // 32 bits are available for the scaled value.

          if (VSum <> 0) and (VFact <> 0) then
          begin

            Value := (VSum shl 8) div VFact;

            // Unpremultiply and gamma (LinearRGB->sRGB)
            Value := PremultiplyLUT.Mul255Div[Value, pColor.A];

          end;

          pColor.Planes[Plane] := Value;

        end;
      end;
    end;

  finally
    for Plane := Low(Map) to High(Map) do
      Map[Plane].Free;
  end;
end;

//------------------------------------------------------------------------------

// Selective Gaussian blur, variant 2, using the lookup table returned by
// TPremultiplyLUT.PremultiplyLUT.
procedure SelectiveGaussian2(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
var
  LUT: PPremultiplyLUT;
begin
  // Resolve the table pointer, then pass the table itself to the worker
  LUT := TPremultiplyLUT.PremultiplyLUT;
  InternalSelectiveGaussian2(Src, Dst, Radius, Delta, LUT^);
end;

// Selective Gaussian blur, variant 2, using the lookup table returned by
// TPremultiplyLUT.GammaPremultiplyLUT.
procedure SelectiveGaussianGamma2(Src, Dst: TBitmap32; Radius: TFloat; Delta: Integer);
var
  LUT: PPremultiplyLUT;
begin
  // Resolve the table pointer, then pass the table itself to the worker
  LUT := TPremultiplyLUT.GammaPremultiplyLUT;
  InternalSelectiveGaussian2(Src, Dst, Radius, Delta, LUT^);
end;


//------------------------------------------------------------------------------
//
//      Bindings
//
//------------------------------------------------------------------------------
// Placeholder blur implementation: always raises an exception.
// Assigned to the blur function variables in the unit's initialization
// section before the bindings are registered.
procedure SelectiveGaussian32NotImplemented(ASource, ADest: TBitmap32; Radius: TFloat; Delta: Integer);
const
  SNotImplemented = 'This blur function has not been implemented';
begin
  raise Exception.Create(SNotImplemented);
end;

// Registers the unit's three function bindings with BlurRegistry, adds the
// available implementations to each binding and finally rebinds them.
procedure RegisterBindings;
begin
  // Declare the bindable function variables
  BlurRegistry.RegisterBinding(@@SelectiveGaussianBlur32, 'SelectiveGaussianBlur32');
  BlurRegistry.RegisterBinding(@@GammaSelectiveGaussianBlur32, 'GammaSelectiveGaussianBlur32');
  BlurRegistry.RegisterBinding(@@SelectiveGaussianAccumulate, 'SelectiveGaussianAccumulate');


  (*
  ** SelectiveGaussianAccumulate
  *)
  BlurRegistry[@@SelectiveGaussianAccumulate].Add(@Accumulate_Pas,              [isPascal]).Name := 'Accumulate_Pas';
  // The SSE2 variant is only registered when assembler and SSE2 are not disabled
{$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
  BlurRegistry[@@SelectiveGaussianAccumulate].Add(@Accumulate_SSE2,             [isSSE2]).Name := 'Accumulate_SSE2';
{$ifend}


  // Implementation ordered by performance:
  // 1. SelectiveGaussian1
  // 2. SelectiveGaussian2
  // 3. SelectiveGaussianHorzVert (destructive, disqualified)
  // 4. SelectiveGaussianNew
  // 5. SelectiveGaussianGimp
  //
  // NOTE(review): the third Add argument presumably is a priority where a
  // lower value is preferred; the implementation listed as fastest above is
  // given the lowest value - confirm against the registry implementation.

  (*
  ** SelectiveGaussianBlur32
  *)
  BlurRegistry[@@SelectiveGaussianBlur32].Add(@SelectiveGaussianGimp,         [isPascal], 1024).Name := 'SelectiveGaussianGimp';
  BlurRegistry[@@SelectiveGaussianBlur32].Add(@SelectiveGaussianNew,          [isPascal], 768).Name := 'SelectiveGaussianNew';
  BlurRegistry[@@SelectiveGaussianBlur32].Add(@SelectiveGaussian2,            [isPascal], 0).Name := 'SelectiveGaussian2';
  BlurRegistry[@@SelectiveGaussianBlur32].Add(@SelectiveGaussian1,            [isPascal], -256).Name := 'SelectiveGaussian1';

  (*
  ** GammaSelectiveGaussianBlur32
  *)
  BlurRegistry[@@GammaSelectiveGaussianBlur32].Add(@SelectiveGaussianGimpGamma,    [isPascal], 1024).Name := 'SelectiveGaussianGimpGamma';
  BlurRegistry[@@GammaSelectiveGaussianBlur32].Add(@SelectiveGaussianNewGamma,     [isPascal], 768).Name := 'SelectiveGaussianNewGamma';
  BlurRegistry[@@GammaSelectiveGaussianBlur32].Add(@SelectiveGaussianGamma2,       [isPascal], 0).Name := 'SelectiveGaussianGamma2';
  BlurRegistry[@@GammaSelectiveGaussianBlur32].Add(@SelectiveGaussianGamma1,       [isPascal], -256).Name := 'SelectiveGaussianGamma1';


  (*
  ** Rebind the above bindings
  *)
  BlurRegistry[@@SelectiveGaussianBlur32].Rebind;
  BlurRegistry[@@GammaSelectiveGaussianBlur32].Rebind;
  BlurRegistry[@@SelectiveGaussianAccumulate].Rebind;
end;

initialization
  // Point the blur function variables at a stub that raises an exception,
  // so they are never unassigned; RegisterBindings then registers the real
  // implementations and rebinds the variables.
  SelectiveGaussianBlur32 := SelectiveGaussian32NotImplemented;
  GammaSelectiveGaussianBlur32 := SelectiveGaussian32NotImplemented;
  RegisterBindings;
end.