GR32.Blur.RecursiveGaussian.pas 52 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587
  1. unit GR32.Blur.RecursiveGaussian;
  2. (* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1 or LGPL 2.1 with linking exception
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * Alternatively, the contents of this file may be used under the terms of the
  16. * Free Pascal modified version of the GNU Lesser General Public License
  17. * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
  18. * of this license are applicable instead of those above.
  19. * Please see the file LICENSE.txt for additional information concerning this
  20. * license.
  21. *
  22. * The Original Code is Recursive Gaussian Blur for Graphics32
  23. *
  24. * The Initial Developer of the Original Code is
  25. * Anders Melander <[email protected]>
  26. *
  27. * Portions created by the Initial Developer are Copyright (C) 2010-2018
  28. * the Initial Developer. All Rights Reserved.
  29. *
  30. * ***** END LICENSE BLOCK ***** *)
  31. interface
  32. {$include GR32.inc}
  33. // IIR = Infinite Input Response
  34. {$define IIR_BLUR_DEFAULT} // Register as default Blur32 implementation.
  35. {$define IIR_BLUR_SIMD} // Enable forward/backward filter implemented using Streaming SIMD Extensions (SSE).
  36. {$define IIR_BLUR_EDGE_CORRECTION} // Apply the boundary corrections from [3]
  37. {$define IIR_BLUR_EDGE_CORRECTION_SIMD} // Enable SIMD implementation of [3] - significantly faster than the Pascal version but unfortunately not a hotspot
  38. {.$define IIR_BLUR_INKSCAPE_COEFFICIENTS}// Enable use of InkScape [7] coefficient calculation
  39. {.$define IIR_BLUR_GABOR_COEFFICIENTS} // Enable use of Gabor [2] coefficient calculation
  40. {$define IIR_USE_HADDPS} // Use the HADDPS SSE3 instruction instead of two ADDPS SSE instructions. See comment in code.
  41. {$define IIR_USE_DPPS} // Use the DPPS SSE4.1 instruction instead of a MULPS and two ADDPS SSE instructions. See comment in code.
  42. {$define IIR_BLUR_ALIGN_BUFFERS} // Use aligned buffers (note only start of buffer is aligned; Individual rows are not)
  43. {$define IIR_BLUR_DIV_LUT} // Use a Look Up Table for alpha pre- and unpremultiplication and gamma correction. Slightly faster and much more precise.
  44. // Note that GammaBlur32 requires that IIR_BLUR_DIV_LUT is defined.
  45. uses
  46. GR32;
  47. //------------------------------------------------------------------------------
  48. //
  49. // Recursive Gaussian Blur
  50. //
  51. //------------------------------------------------------------------------------
  52. // Anders Melander, August 2010-September 2018
  53. //------------------------------------------------------------------------------
  54. // References:
  55. //
  56. // [1] Recursive implementation of the Gaussian filter
  57. // Ian T. Young, Lucas J. van Vliet
  58. // Signal Processing, Volume 44, Number 2, June 1995, pp. 139-151(13)
  59. //
  60. // [2] Recursive Gabor Filtering
  61. // Ian T. Young, Lucas J. van Vliet, and Michael van Ginkel
  62. // IEEE Transactions on Signal Processing, Volume 50, Number 11, Nov 2002, pp. 2799–2805
  63. //
  64. // [3] Boundary Conditions for Young - van Vliet Recursive Filtering
  65. // Bill Triggs, Michael Sdika
  66. // IEEE Transactions on Signal Processing, Volume 54, Number 6, June 2006, pp. 2365-2367
  67. //
  68. // [4] Recursive Gaussian Derivative Filters
  69. // L. J. Van Vliet, I. T. Young, and P. W. Verbeek
  70. // Fourteenth International Conference on Pattern Recognition, 1998. Brisbane.
  71. // Proceedings, Volume 1, pp. 509-514
  72. //
  73. // [5] Performance of three recursive algorithms for fast space-variant Gaussian filtering
  74. // Sovira Tan, Jason L. Dale, Alan Johnston
  75. // Real-Time Imaging, Number 9, 2003, pp 215–228
  76. //
  77. // [6] Gimp, Retinex plug-in
  78. // Fabien Pelisson
  79. //
  80. // [7] Inkscape, Gaussian blur renderer
  81. // Niko Kiirala, bulia byak, Jasper van de Gronde
  82. //------------------------------------------------------------------------------
  83. //
  84. // From [1]:
  85. //
  86. // This implementation yields an infinite impulse response filter that has six
  87. // MADDs (Multiply/Add) per dimension independent of the value of sigma in the
  88. // Gaussian kernel. In contrast to the Deriche implementation (1987), the
  89. // coefficients of our recursive filter have a simple, closed-form solution for a
  90. // desired value of the Gaussian sigma. Our implementation is, in general, faster
  91. // than:
  92. //
  93. // (1) an implementation based upon direct convolution with samples of a Gaussian,
  94. // (2) repeated convolutions with a kernel such as the uniform filter, and
  95. // (3) an FFT implementation of a Gaussian filter.
  96. //
  97. //------------------------------------------------------------------------------
  98. procedure RecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat);
  99. procedure RecursiveGaussianBlurRadius(Src, Dst: TBitmap32; Radius: TFloat);
  100. procedure RecursiveGaussianBlurGamma(Src, Dst: TBitmap32; Sigma: TFloat);
  101. procedure RecursiveGaussianBlurRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat);
  102. procedure RecursiveGaussianBlurHorizontalRadius(Src, Dst: TBitmap32; Radius: TFloat);
  103. procedure RecursiveGaussianBlurHorizontalRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat);
  104. //------------------------------------------------------------------------------
  105. //
  106. // Recursive Gaussian Blur internal API types
  107. //
  108. //------------------------------------------------------------------------------
  109. // For now this implementation only works when TFloat=Single since the SSE
  110. // and transpose code has been written for 32-bit values.
  111. //------------------------------------------------------------------------------
  112. type
  113. TQuadFloatRec = packed record
  114. v0: TFloat;
  115. v1: TFloat;
  116. v2: TFloat;
  117. v3: TFloat;
  118. end;
  119. TQuadFloat = array[0..3] of TFloat;
  120. PQuadFloat = ^TQuadFloat;
  121. TNineFloats = array[0..8] of TFloat;
  122. //------------------------------------------------------------------------------
  123. //
  124. // Recursive Gaussian Blur internal bindings
  125. //
  126. //------------------------------------------------------------------------------
  127. type
  128. TIIR_BlurFilterBackwardProc = procedure (pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat);
  129. TIIR_BlurFilterForwardProc = procedure (pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat);
  130. TIIR_BlurApplyEdgeCorrection = procedure (iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats);
  131. var
  132. IIR_BlurFilterBackward: TIIR_BlurFilterBackwardProc = nil;
  133. IIR_BlurFilterForward: TIIR_BlurFilterForwardProc = nil;
  134. IIR_BlurApplyEdgeCorrection: TIIR_BlurApplyEdgeCorrection = nil;
  135. //------------------------------------------------------------------------------
  136. //------------------------------------------------------------------------------
  137. //------------------------------------------------------------------------------
  138. implementation
  139. uses
  140. SyncObjs, // TCriticalSection
  141. Math,
  142. GR32.Blur,
  143. GR32_Bindings,
  144. GR32.Transpose,
  145. GR32.Math.Complex,
  146. GR32_OrdinalMaps,
  147. GR32_Blend,
  148. GR32_LowLevel,
  149. GR32_Math,
  150. GR32_System,
  151. GR32_Gamma;
  152. // Ensure that we use the GR32.TFloat and not FPC's Math.TFloat (which is an alias for Double!)
  153. type
  154. TFloat = GR32.TFloat;
  155. PFloat = ^TFloat;
  156. const
  157. OneOver255: TFloat = 1/255;
  158. //------------------------------------------------------------------------------
  159. //
  160. // PremultiplyLUT
  161. //
  162. //------------------------------------------------------------------------------
  163. // Lookup tables for alpha premultiplication.
  164. //
  165. // MulDiv255[a,b] = a * b / 255 Used for premultiplication
  166. // Mul255Div[a,b] = Round(a * 255 / b) Used for unpremultiplication
  167. //
  168. // where
  169. //
  170. // a: Color value
  171. // b: Alpha value
  172. //
  173. // PremultiplyLUT is used for pre- and unpremultiplication.
  174. // GammaPremultiplyLUT rolls the gamma correction and pre-/unpremultiplication
  175. // operations into one for a significant gain in precision at no extra cost in
  176. // performance.
  177. //
  178. //------------------------------------------------------------------------------
  179. {$ifdef IIR_BLUR_DIV_LUT}
  180. type
  181. PPremultiplyLUT = ^TPremultiplyLUT;
  182. TPremultiplyLUT = record
  183. strict private
  184. class constructor Create;
  185. class destructor Destroy;
  186. strict private class var
  187. FLock: TCriticalSection;
  188. FPremultiplyLUT: PPremultiplyLUT;
  189. FGammaPremultiplyLUT: PPremultiplyLUT;
  190. strict private
  191. FsRGB: boolean;
  192. FGamma: Double;
  193. FGammaInv: Double;
  194. procedure SetGamma(const GammaValue: Double; sRGB: boolean);
  195. procedure GammaChangedHandler;
  196. public
  197. Mul255Div: array[byte, byte] of byte;
  198. MulDiv255: array[byte, byte] of TFloat;
  199. public class var
  200. public
  201. class function PremultiplyLUT: PPremultiplyLUT; static;
  202. class function GammaPremultiplyLUT: PPremultiplyLUT; static;
  203. property sRGB: boolean read FsRGB;
  204. property Gamma: Double read FGamma;
  205. end;
  206. //------------------------------------------------------------------------------
  207. class constructor TPremultiplyLUT.Create;
  208. begin
  209. FLock := TCriticalSection.Create;
  210. FPremultiplyLUT := nil;
  211. FGammaPremultiplyLUT := nil;
  212. end;
  213. class destructor TPremultiplyLUT.Destroy;
  214. begin
  215. FLock.Free;
  216. if (FPremultiplyLUT <> nil) then
  217. Dispose(FPremultiplyLUT);
  218. if (FGammaPremultiplyLUT <> nil) then
  219. begin
  220. UnregisterGammaChangeNotification(FGammaPremultiplyLUT.GammaChangedHandler);
  221. Dispose(FGammaPremultiplyLUT);
  222. end;
  223. end;
  224. //------------------------------------------------------------------------------
  225. class function TPremultiplyLUT.PremultiplyLUT: PPremultiplyLUT;
  226. var
  227. AlphaValue, ColorValue: Integer;
  228. begin
  229. if (FPremultiplyLUT = nil) then
  230. begin
  231. FLock.Acquire;
  232. if (FPremultiplyLUT = nil) then
  233. begin
  234. New(FPremultiplyLUT);
  235. for ColorValue := 0 to 255 do
  236. begin
  237. FPremultiplyLUT.Mul255Div[ColorValue, 0] := 0;
  238. FPremultiplyLUT.MulDiv255[ColorValue, 0] := 0.0;
  239. for AlphaValue := 1 to 255 do
  240. begin
  241. FPremultiplyLUT.Mul255Div[ColorValue, AlphaValue] := Clamp(Round(ColorValue * 255 / AlphaValue));
  242. FPremultiplyLUT.MulDiv255[ColorValue, AlphaValue] := ColorValue * AlphaValue * OneOver255;;
  243. end;
  244. end;
  245. end;
  246. FLock.Release;
  247. end;
  248. Result := FPremultiplyLUT;
  249. end;
  250. //------------------------------------------------------------------------------
  251. class function TPremultiplyLUT.GammaPremultiplyLUT: PPremultiplyLUT;
  252. begin
  253. if (FGammaPremultiplyLUT = nil) then
  254. begin
  255. FLock.Acquire;
  256. if (FGammaPremultiplyLUT = nil) then
  257. begin
  258. New(FGammaPremultiplyLUT);
  259. RegisterGammaChangeNotification(FGammaPremultiplyLUT.GammaChangedHandler);
  260. FGammaPremultiplyLUT.SetGamma(GAMMA_VALUE, GAMMA_IS_SRGB);
  261. end;
  262. FLock.Release;
  263. end;
  264. Result := FGammaPremultiplyLUT;
  265. end;
  266. procedure TPremultiplyLUT.GammaChangedHandler;
  267. begin
  268. SetGamma(GAMMA_VALUE, GAMMA_IS_SRGB);
  269. end;
  270. procedure TPremultiplyLUT.SetGamma(const GammaValue: Double; sRGB: boolean);
  271. var
  272. AlphaValue, ColorValue: Integer;
  273. n: Single;
  274. ColorLinear, ColorRGB: TFloat;
  275. begin
  276. if (FsRGB = sRGB) and ((FsRGB) or (GammaValue = FGamma)) then
  277. exit;
  278. FsRGB := sRGB;
  279. if (not FsRGB) then
  280. begin
  281. FGamma := GammaValue;
  282. FGammaInv := 1 / FGamma;
  283. end;
  284. for ColorValue := 0 to 255 do
  285. begin
  286. Mul255Div[ColorValue, 0] := 0;
  287. MulDiv255[ColorValue, 0] := 0.0;
  288. // sRGB -> Linear RGB / 255
  289. ColorLinear := ColorValue * OneOver255;
  290. if (FsRGB) then
  291. begin
  292. if (ColorLinear >= 0.04045) then
  293. ColorLinear := Power((ColorLinear + 0.055) * (1 / 1.055), 2.4)
  294. else
  295. ColorLinear := ColorLinear * (1 / 12.92);
  296. end else
  297. ColorLinear := Power(ColorLinear, FGamma);
  298. // ColorValue: Color, AlphaValue: Alpha
  299. for AlphaValue := 1 to 255 do
  300. begin
  301. // Linear RGB -> Premultiplied, Linear RGB
  302. n := ColorLinear * AlphaValue;
  303. MulDiv255[ColorValue, AlphaValue] := n;
  304. // Premultiplied, Linear RGB -> Unpremultiplied, Linear RGB
  305. n := ColorValue / AlphaValue;
  306. // Linear RGB -> sRGB / 255
  307. if (FsRGB) then
  308. begin
  309. if (n >= 0.0031308) then
  310. ColorRGB := 1.055 * Power(n, 1 / 2.4) - 0.055
  311. else
  312. ColorRGB := n * 12.92;
  313. end else
  314. ColorRGB := Power(n, FGammaInv);
  315. Mul255Div[ColorValue, AlphaValue] := Clamp(Round(ColorRGB * 255));
  316. end;
  317. end;
  318. end;
  319. {$endif IIR_BLUR_DIV_LUT}
  320. //------------------------------------------------------------------------------
  321. //
  322. // Recursive Gaussian Blur
  323. //
  324. //------------------------------------------------------------------------------
  325. //------------------------------------------------------------------------------
  326. // Low level, SIMD implementations
  327. //------------------------------------------------------------------------------
  328. {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
  329. procedure BlurFilterForward_SSE41(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat); {$IFDEF FPC}assembler;{$ENDIF}
  330. // Parameters (x86):
  331. // EAX <- pIn
  332. // EDX <- pOut
  333. // ECX <- B
  334. // Stack[0] <- Width
  335. // Stack[1] <- v
  336. // Preserves: (none)
  337. //
  338. // Parameters (x64):
  339. // RCX <- pIn
  340. // RDX <- pOut
  341. // R8 <- B
  342. // R9 <- Width
  343. // Stack[0] <- v
  344. // Preserves: (none)
  345. //
  346. // SSE register usage:
  347. // XMM0: B3 | B2 | B1 | B0
  348. // XMM1: * | v1 | v2 | v3
  349. // XMM2, XMM3: misc use
  350. asm
  351. {$if defined(TARGET_x64)}
  352. {$IFNDEF FPC}
  353. // nothing
  354. {$ELSE}
  355. // nothing
  356. {$ENDIF}
  357. {$elseif defined(TARGET_x86)}
  358. // nothing
  359. {$else}
  360. {$message fatal 'Unsupported target'}
  361. {$ifend}
  362. MOVUPS XMM0, TQuadFloatRec PTR [B] // XMM0 <- B3 | B2 | B1 | B0
  363. (*
  364. ** Initialization of forward pass
  365. **
  366. ** v[1] := pIn^ / (B[0] + B[1] + B[2] + B[3]);
  367. ** v[2] := v[1];
  368. ** v[3] := v[1];
  369. *)
  370. MOVAPS XMM2, XMM0 // XMM2 <- B3 | B2 | B1 | B0
  371. // XMM2[0] <- (B + B1 + B2 + B3)
  372. // TODO : HADDPS is slow according to this:
  373. // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
  374. // However, my own benchmarks indicate that the alternative is slower...
  375. {$ifdef IIR_USE_HADDPS}
  376. HADDPS XMM2, XMM2 // XMM2[0..3] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3]
  377. HADDPS XMM2, XMM2
  378. {$else IIR_USE_HADDPS}
  379. MOVHLPS XMM3, XMM2 // XMM3[0,1] <- XMM2[2,3]
  380. ADDPS XMM2, XMM3 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3]
  381. PSHUFD XMM3, XMM2, $01
  382. ADDSS XMM2, XMM3
  383. {$endif IIR_USE_HADDPS}
  384. MOVSS XMM1, [pIn] // XMM1[0] <- pIn^
  385. DIVSS XMM1, XMM2 // XMM1[0] <- XMM1[0] / XMM2[0]
  386. SHUFPS XMM1, XMM1, $00 // XMM1[0..3] <- XMM1[0]
  387. // Iterate over all cols using the "negative offset technique"
  388. {$if defined(TARGET_x86)}
  389. // We're now done with the value in the "B" register so it's repurposed for the "Width" counter.
  390. MOV ECX, Width
  391. LEA pIn, [pIn+ECX*4]
  392. LEA pOut, [pOut+ECX*4]
  393. NEG ECX
  394. {$elseif defined(TARGET_x64)}
  395. LEA pIn, [pIn+R9*4]
  396. LEA pOut, [pOut+R9*4]
  397. NEG R9
  398. {$ifend}
  399. @LOOP:
  400. (*
  401. ** v0 := (pIn^ * B0) + (v1 * B1) + (v2 * B2) + (v3 * B3);
  402. *)
  403. {$if defined(TARGET_x86)}
  404. MOVSS XMM2, [pIn+ECX*4] // XMM2[0] <- pIn
  405. {$elseif defined(TARGET_x64)}
  406. MOVSS XMM2, TFloat PTR [pIn+R9*4] // XMM2[0] <- pIn
  407. {$ifend}
  408. MOVSS XMM1, XMM2 // XMM1[0] <- XMM2[0]
  409. MOVAPS XMM2, XMM1 // XMM2 <- XMM1
  410. // TODO : DPPS might be slow on some CPUs:
  411. // https://stackoverflow.com/questions/37879678/dot-product-performance-with-sse-instructions-is-dpps-worth-using
  412. // On my system it's significantly faster than the alternative
  413. {$ifdef IIR_USE_DPPS}
  414. DPPS XMM2, XMM0, $FF
  415. {$else IIR_USE_DPPS}
  416. // Multiply B[] with v[]
  417. MULPS XMM2, XMM0 // XMM2 <- XMM2 * XMM0
  418. // Add the four results of the previous vectorized multiplication
  419. MOVHLPS XMM3, XMM2 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3]
  420. ADDPS XMM2, XMM3
  421. PSHUFD XMM3, XMM2, $01
  422. ADDSS XMM2, XMM3
  423. {$endif IIR_USE_DPPS}
  424. MOVSS XMM1, XMM2 // XMM1[0] <- v0
  425. // pOut^ := v0;
  426. {$if defined(TARGET_x86)}
  427. MOVSS [pOut+ECX*4], XMM2
  428. {$elseif defined(TARGET_x64)}
  429. MOVSS TFloat PTR [pOut+R9*4], XMM2
  430. {$ifend}
  431. (*
  432. ** Shift cached output values
  433. ** v3 := v2;
  434. ** v2 := v1;
  435. ** v1 := v0;
  436. **)
  437. SHUFPS XMM1, XMM1, $93 // XMM1[0,1,2,3] <- XMM1[3,0,1,2]
  438. // Loop to next col
  439. {$if defined(TARGET_x86)}
  440. INC ECX
  441. {$elseif defined(TARGET_x64)}
  442. INC R9
  443. {$ifend}
  444. JNZ @LOOP
  445. // Update v1, v2, v3 for backward filter
  446. {$if defined(TARGET_x86)}
  447. MOV EDX, v
  448. MOVUPS [EDX], XMM1
  449. {$elseif defined(TARGET_x64)}
  450. // TODO : For some reason this doesn't work ("v" isn't updated):
  451. // MOVUPS [TQuadFloat PTR v], XMM1
  452. MOV RDX, v
  453. MOVUPS [RDX], XMM1
  454. {$ifend}
  455. {$if defined(TARGET_x64)}
  456. {$IFDEF FPC}
  457. // nothing
  458. {$ENDIF}
  459. {$elseif defined(TARGET_x86)}
  460. // nothing
  461. {$ifend}
  462. end;
  463. //------------------------------------------------------------------------------
  464. procedure BlurFilterBackward_SSE41(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; const v: TQuadFloat); {$IFDEF FPC}assembler;{$ENDIF}
  465. // Parameters (x86):
  466. // EAX <- pIn
  467. // EDX <- pOut
  468. // ECX <- B
  469. // Stack[0] <- Width
  470. // Stack[1] <- v
  471. // Preserves: (none)
  472. //
  473. // Parameters (x64):
  474. // RCX <- pIn
  475. // RDX <- pOut
  476. // R8 <- B
  477. // R9 <- Width
  478. // Stack[0] <- v
  479. // Preserves: (none)
  480. //
  481. // SSE register usage:
  482. // XMM0: B3 | B2 | B1 | B0
  483. // XMM1: * | v1 | v2 | v3
  484. // XMM2, XMM3: misc use
  485. asm
  486. MOVUPS XMM0, TQuadFloatRec PTR [B] // XMM0 <- B3 | B2 | B1 | B0
  487. {$if defined(TARGET_x86)}
  488. MOV ECX, v
  489. MOVUPS XMM1, [ECX] // XMM1 <- v3 | v2 | v1 | *
  490. {$elseif defined(TARGET_x64)}
  491. // TODO : For some reason this doesn't work ("v" isn't loaded):
  492. // MOVUPS XMM1, [TQuadFloat PTR v] // XMM1 <- v3 | v2 | v1 | *
  493. MOV R8, QWORD PTR v
  494. MOVUPS XMM1, [R8] // XMM1 <- v3 | v2 | v1 | *
  495. {$ifend}
  496. // Iterate over all cols
  497. // Note: We're iterating backwards from the last to the first col.
  498. // On entry, pIn and pOut points to the last col. Here we adjust them to point to the first col
  499. // so we can use the col counter as an offset.
  500. {$if defined(TARGET_x86)}
  501. // We're now done with the value in the "B" register so it's repurposed for the "Width" counter.
  502. MOV ECX, Width
  503. // Although the compiler accepts [RegA-RegB*4] it assembles it to [RegA+RegB*4] so we need
  504. // to temporarily negate the offset value
  505. NEG ECX
  506. LEA pIn, [pIn+ECX*4]
  507. LEA pOut, [pOut+ECX*4]
  508. NEG ECX
  509. {$elseif defined(TARGET_x64)}
  510. NEG R9
  511. LEA pIn, [pIn+R9*4]
  512. LEA pOut, [pOut+R9*4]
  513. NEG R9
  514. {$ifend}
  515. @LOOP:
  516. (*
  517. ** v0 := (B0 * pIn^) + (v1 * B1) + (v2 * B2) + (v3 * B3);
  518. *)
  519. // Multiply B[] with v[]
  520. {$if defined(TARGET_x86)}
  521. INSERTPS XMM1, [pIn+ECX*4], $00 // XMM1[0] <- pIn
  522. {$elseif defined(TARGET_x64)}
  523. INSERTPS XMM1, TFloat PTR [pIn+R9*4], $00 // XMM1[0] <- pIn
  524. {$ifend}
  525. MOVAPS XMM2, XMM1 // XMM2 <- XMM1
  526. {$ifdef IIR_USE_DPPS}
  527. DPPS XMM2, XMM0, $FF // XMM2 <- XMM2 dot XMM1
  528. {$else IIR_USE_DPPS}
  529. MULPS XMM2, XMM0 // XMM2 <- XMM2 * XMM0
  530. // Add the four results of the previous vectorized multiplication
  531. MOVHLPS XMM3, XMM2 // XMM2[0] <- XMM2[0] + XMM2[1] + XMM2[2] + XMM2[3]
  532. ADDPS XMM2, XMM3
  533. PSHUFD XMM3, XMM2, $01
  534. ADDSS XMM2, XMM3
  535. {$endif IIR_USE_DPPS}
  536. MOVSS XMM1, XMM2 // XMM1[0] <- v0
  537. // pOut^ := v0;
  538. {$if defined(TARGET_x86)}
  539. MOVSS [pOut+ECX*4], XMM2
  540. {$elseif defined(TARGET_x64)}
  541. MOVSS TFloat PTR [pOut+R9*4], XMM2
  542. {$ifend}
  543. (*
  544. ** Shift cached output values
  545. ** v3 := v2;
  546. ** v2 := v1;
  547. ** v1 := v0;
  548. **)
  549. SHUFPS XMM1, XMM1, $93 // XMM0 <- v2 | v1 | v0 | v3
  550. // Loop to next (previous actually) col
  551. {$if defined(TARGET_x86)}
  552. DEC ECX
  553. {$elseif defined(TARGET_x64)}
  554. DEC R9
  555. {$ifend}
  556. JNZ @LOOP
  557. end;
  558. //------------------------------------------------------------------------------
  559. procedure BlurApplyEdgeCorrection_SSE41(iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats); // {$IFDEF FPC}assembler;{$ENDIF}
  560. // Parameters (x86):
  561. // EAX <- iPlus
  562. // EDX <- B0
  563. // ECX <- v
  564. // Stack[0] <- M
  565. // Preserves: (none)
  566. //
  567. // Parameters (x64):
  568. // RCX <- iPlus
  569. // RDX <- B0
  570. // R8 <- v
  571. // R9 <- M
  572. // Preserves: XMM4
  573. //
  574. // SSE register usage:
  575. // XMM0: B3 | B2 | B1 | B0
  576. // XMM1: * | v1 | v2 | v3
  577. // XMM2, XMM3, XMM4: misc use
  578. {$if defined(TARGET_x64) and defined(FPC)}begin{$ifend}
  579. asm
  580. {$if defined(TARGET_x64)}
  581. {$IFNDEF FPC}
  582. .SAVENV XMM4
  583. {$ENDIF}
  584. {$elseif defined(TARGET_x86)}
  585. // nothing
  586. {$else}
  587. {$message fatal 'Unsupported target'}
  588. {$ifend}
  589. (*
  590. ** unp0 := v[1] - uPlus;
  591. ** unp1 := v[2] - uPlus;
  592. ** unp2 := v[3] - uPlus;
  593. *)
  594. // XMM1 = [v3 | v2 | v1 | 0]
  595. MOVUPS XMM1, TQuadFloat PTR [v] // XMM1 <- v3 | v2 | v1 | v0
  596. // Save v0
  597. MOVAPS XMM5, XMM1
  598. // Now get rid of XMM1[0]
  599. SHUFPS XMM1, XMM1, $39 // XMM1 <- v0 | v3 | v2 | v1
  600. INSERTPS XMM1, XMM1, $F8 // XMM1 <- 0 | v3 | v2 | v1
  601. // XMM0 = [0 | u+ | u+ | u+]
  602. MOVSS XMM0, [iPlus] // XMM0[0] <- u+
  603. SHUFPS XMM0, XMM0, $00 // XMM0 <- u+ | u+ | u+ | u+
  604. // XMM1 = XMM1*XMM0 = [0, unp2, unp1, unp0]
  605. SUBPS XMM1, XMM0 // XMM1 <- XMM0 - u+
  606. // XMM1 = [0, unp2,unp1,unp0]
  607. INSERTPS XMM1, XMM1, $F8 // XMM1[3] <- 0
  608. (*
  609. ** v[1] := ( (M0 * unp0) + (M1 * unp1) + (M2 * unp2) ) * B0 + vPlus;
  610. ** v[2] := ( (M3 * unp0) + (M4 * unp1) + (M5 * unp2) ) * B0 + vPlus;
  611. ** v[3] := ( (M6 * unp0) + (M7 * unp1) + (M8 * unp2) ) * B0 + vPlus;
  612. *)
  613. // XMM2 = [0 | M2 | M1 | M0]
  614. // XMM3 = [0 | M5 | M4 | M3]
  615. // XMM4 = [0 | M8 | M7 | M6]
  616. {$if defined(TARGET_x86)}
  617. MOV EAX, M
  618. MOVUPS XMM2, [EAX] // XMM1 <- * | M2 | M1 | M0
  619. {$elseif defined(TARGET_x64)}
  620. MOVUPS XMM2, TQuadFloat PTR [M] // XMM1 <- * | M2 | M1 | M0
  621. {$ifend}
  622. INSERTPS XMM2, XMM2, $F8 // XMM2 <- 0 | M2 | M1 | M0
  623. {$if defined(TARGET_x86)}
  624. ADD EAX, 12//SizeOf(TFloat)*3
  625. MOVUPS XMM3, [EAX] // XMM3 <- * | M5 | M4 | M3
  626. {$elseif defined(TARGET_x64)}
  627. ADD M, 12//SizeOf(TFloat)*3
  628. MOVUPS XMM3, TQuadFloat PTR [M] // XMM3 <- * | M5 | M4 | M3
  629. {$ifend}
  630. INSERTPS XMM3, XMM3, $F8 // XMM3 <- 0 | M5 | M4 | M3
  631. {$if defined(TARGET_x86)}
  632. ADD EAX, 12//SizeOf(TFloat)*3
  633. MOVUPS XMM4, [EAX] // XMM4 <- * | M8 | M7 | M6
  634. {$elseif defined(TARGET_x64)}
  635. ADD M, 12//SizeOf(TFloat)*3
  636. MOVUPS XMM4, TQuadFloat PTR [M] // XMM4 <- * | M8 | M7 | M6
  637. {$ifend}
  638. INSERTPS XMM4, XMM4, $F8 // XMM4 <- 0 | M8 | M7 | M6
  639. // XMM2[1] = [M2, M1, M0]*[unp2, unp1, unp0] = (M0 * unp0) + (M1 * unp1) + (M2 * unp2)
  640. // TODO : {$ifdef IIR_USE_DPPS}
  641. DPPS XMM2, XMM1, $72 // XMM2[1] <- [M2, M1, M0]*[unp2, unp1, unp0]
  642. // XMM3[2] = [M5, M4, M3]*[unp2, unp1, unp0] = (M3 * unp0) + (M4 * unp1) + (M5 * unp2)
  643. // TODO : {$ifdef IIR_USE_DPPS}
  644. DPPS XMM3, XMM1, $74 // XMM3[2] <- [M5, M4, M3]*[unp2, unp1, unp0]
  645. // XMM4[3] = [M8, M7, M6]*[unp2, unp1, unp0] = (M6 * unp0) + (M7 * unp1) + (M8 * unp2)
  646. // TODO : {$ifdef IIR_USE_DPPS}
  647. DPPS XMM4, XMM1, $78 // XMM4[3] <- [M8, M7, M6]*[unp2, unp1, unp0]
  648. // XMM2[1..3] = [ XMM2[1] | XMM3[2] | XMM4[3] ]
  649. INSERTPS XMM2, XMM3, $A0 // XMM2[2] <- XMM3[2]
  650. INSERTPS XMM2, XMM4, $F0 // XMM2[3] <- XMM4[3]
  651. // XMM1 = B[0]
  652. {$IFNDEF FPC}
  653. MOVSS XMM1, TFloat PTR [B] // XMM1[1] <- B0
  654. {$ELSE}
  655. {$if defined(TARGET_x64)}
  656. MOVSS XMM1, TFloat PTR [RDX] // XMM1[1] <- B0
  657. {$else}
  658. MOVSS XMM1, TFloat PTR [EDX] // XMM1[1] <- B0
  659. {$ifend}
  660. {$ENDIF}
  661. SHUFPS XMM1, XMM1, 0 // XMM1 <- B0 | B0 | B0 | B0
  662. // XMM2 = XMM2 * B0
  663. MULPS XMM2, XMM1 // XMM2 <- XMM2 * B0
  664. // XMM2 = XMM2 + v+
  665. ADDPS XMM2, XMM0 // XMM2 <- XMM2 + v+
  666. INSERTPS XMM2, XMM5, $00 // XMM2[0] <- XMM5[0]
  667. MOVUPS TQuadFloat PTR [v], XMM2 // XMM2 -> v
  668. {$if defined(TARGET_x64) and defined(FPC)}end['XMM4'];{$ifend}
  669. end;
  670. {$ifend}
  671. //------------------------------------------------------------------------------
  672. // Low level, Pascal implementations
  673. //------------------------------------------------------------------------------
  674. procedure BlurFilterForward_Pas(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var vv: TQuadFloat); inline;
  675. var
  676. i: integer;
  677. v: TQuadFloat;
  678. begin
  679. // Initialization of forward pass
  680. // [2] Equation (20)
  681. //
  682. // in[1]
  683. // w[1] = w[2] = w[3] = ──────────────────
  684. // 1+ fd1 + fd2 + fd3
  685. //
  686. v[1] := pIn[0] / (B[0] + B[1] + B[2] + B[3]);
  687. v[2] := v[1];
  688. v[3] := v[1];
  689. // Forward pass
  690. // [1] Equation (9a)
  691. //
  692. // B[1]*v[n-1] + B[2]*v[n-2] + B[3]*v[n-3] ∑ B[1..3]
  693. // w[n] = B * in[n] + ─────────────────────────────────────── , B = 1 - ───────── ([1] equation 10)
  694. // B[0] B[0]
  695. //
  696. for i := 0 to Width-1 do
  697. begin
  698. // Note: In this implementation B[1..3] has already been scaled by 1/b0 and B[0] instead contains B.
  699. v[0] := (pIn[i] * B[0]) + (v[1] * B[1]) + (v[2] * B[2]) + (v[3] * B[3]);
  700. pOut[i] := v[0];
  701. // Shift cached output values
  702. v[3] := v[2];
  703. v[2] := v[1];
  704. v[1] := v[0];
  705. end;
  706. vv := v;
  707. end;
  708. //------------------------------------------------------------------------------
  709. procedure BlurFilterBackward_Pas(pIn, pOut: PFloatArray; const B: TQuadFloat; Width: Cardinal; var v: TQuadFloat); inline;
  710. var
  711. i: integer;
  712. begin
  713. Dec(PFloat(pIn), Width-1);
  714. Dec(PFloat(pOut), Width-1);
  715. // Backward pass
  716. // [1] Equation (9b)
  717. //
  718. // B[1]*out[n+1] + B[2]*out[n+2] + B[3]*out[n+3] ∑ B[1..3]
  719. // out[n] = B * w[n] + ───────────────────────────────────────────── , B = 1 - ───────── ([1] equation 10)
  720. // B[0] B[0]
  721. //
  722. for i := Width-1 downto 0 do // Using negative array index works too but will give the purists a heart attack
  723. begin
  724. v[0] := (B[0] * pIn[i]) + (v[1] * B[1]) + (v[2] * B[2]) + (v[3] * B[3]);
  725. pOut[i] := v[0];
  726. // Shift cached output values
  727. v[3] := v[2];
  728. v[2] := v[1];
  729. v[1] := v[0];
  730. end;
  731. end;
  732. //------------------------------------------------------------------------------
  733. procedure BlurApplyEdgeCorrection_Pas(iPlus: PFloat; const B: TQuadFloat; var v: TQuadFloat; const M: TNineFloats);
  734. var
  735. uPlus, vPlus: TFloat;
  736. unp0, unp1, unp2: TFloat;
  737. begin
  738. // [3] Equation (15)
  739. // iPlus = infinite stream of input values = value of rightmost pixel in our case
  740. // [3] Equation (15a)
  741. //
  742. // i+ i+
  743. // u+ = ────────────── = ────
  744. // 1 - ∑ b1..b3 B
  745. //
  746. // Note: B = B[0] = 1-(B[1]+B[2]+B[3]), [1] Equation (10)
  747. // See calculation of B[0] in ComputeCoefficients*
  748. uPlus := iPlus^; // We have already scaled by B
  749. // [3] Equation (15b)
  750. //
  751. // u+ u+
  752. // v+ = ────────────── = ────
  753. // 1 - ∑ b1..b3 B
  754. //
  755. vPlus := uPlus; // We have already scaled by B
  756. // [3] Equation (14)
  757. //
  758. // ┌ ┐ ┌ ┐
  759. // │v[2] - u+│ │ v+ │
  760. // v[] = M[] * │v[1] - u+│ + │ v+ │
  761. // │v[0] - u+│ │ v+ │
  762. // └ ┘ └ ┘
  763. //
  764. unp0 := v[1] - uPlus;
  765. unp1 := v[2] - uPlus;
  766. unp2 := v[3] - uPlus;
  767. v[1] := ( (M[0] * unp0) + (M[1] * unp1) + (M[2] * unp2) ) * B[0] + vPlus;
  768. v[2] := ( (M[3] * unp0) + (M[4] * unp1) + (M[5] * unp2) ) * B[0] + vPlus;
  769. v[3] := ( (M[6] * unp0) + (M[7] * unp1) + (M[8] * unp2) ) * B[0] + vPlus;
  770. end;
  771. //------------------------------------------------------------------------------
  772. // Coefficient calculation
  773. //------------------------------------------------------------------------------
  774. // Uses algorithm from Inkscape to calculate coefficients (Adapted from [7]. Based on [4]).
  775. // (Iterative and supposedly even better approximation than [2]).
  776. // Note: My (anme) own tests shows that [2] produces a much more precise approximation
  777. // of the gaussian curve.
  778. procedure ComputeCoefficientsInkscape(Sigma: TFloat; var B: TQuadFloat);
  779. var
  780. d1_org: TComplex;
  781. d3_org: Double;
  782. qLow, qHigh: Double;
  783. SigmaSqr: Double;
  784. s, q: Double;
  785. d1: TComplex;
  786. d3: Double;
  787. sSqr: Double;
  788. AbsD1Sqr: Double;
  789. re2d1: Double;
  790. B0: Double;
  791. Limit: Double;
  792. begin
  793. (*
  794. The use of ComputeCoefficientsInkscape is currently disabled by default because:
  795. - ComputeCoefficientsInkscape is only valid for Sigma >= 20 and the transition
  796. between the default coefficient and ComputeCoefficientsInkscape isn't smooth.
  797. *)
  798. // [4] Table 1, 3rd order filter
  799. d1_org := TComplex.From(1.40098, 1.00236);
  800. d3_org := 1.85132;
  801. SigmaSqr := Sqr(Sigma);
  802. // [4] suggests using "a few iterations of a linear extrapolation scheme" to find
  803. // the value of q that yields a filter h[n] with the specified variance.
  804. // Here we use a binary search instead.
  805. qLow := 1; // Don't go lower than sigma=2 (we'd probably want a normal convolution in that case anyway)
  806. qHigh := 2 * Sigma;
  807. {$if (SizeOf(TFloat) > SizeOf(Single))}
  808. Limit := Sigma / (1 shl 30)
  809. {$else}
  810. Limit := Sigma / (1 shl 20);
  811. {$ifend}
  812. repeat
  813. q := (qLow + qHigh) / 2;
  814. // Compute scaled filter coefficients
  815. // [4] Equation (20)
  816. // 1 jθ(d) 1 jθ(d)
  817. // ─── ───── ─── ───── -2
  818. // var(h[n]) = ∑ 2|d| q * e q * ( |d| q - e q )
  819. //
  820. d1 := TComplex.Power(d1_org, 1 / q);
  821. d3 := Power(d3_org, 1 / q);
  822. // Compute actual sigma^2
  823. // [4] Equation (10)
  824. //
  825. // 2d
  826. // ∑ n²*h[n] = ∑ ────────
  827. // (d - 1)²
  828. //
  829. sSqr := 2 * (2 * (d1 / TComplex.Sqr(d1 - 1)).Real + d3 / Sqr(d3 - 1));
  830. if (sSqr < SigmaSqr) then
  831. qLow := q
  832. else
  833. qHigh := q;
  834. s := qHigh - qLow;
  835. until (s <= Limit);
  836. q := (qLow + qHigh) / 2;
  837. // Compute z-poles
  838. // [4] Equation (20)
  839. d1 := TComplex.Power(d1_org, 1 / q);
  840. d3 := Power(d3_org, 1 / q);
  841. AbsD1Sqr := TComplex.AbsSqr(d1); // d1*d2 = d1*conj(d1) = |d1|^2 = std::norm(d1)
  842. re2d1 := 2 * d1.Real; // d1+d2 = d1+conj(d1) = 2*real(d1)
  843. // Compute filter coefficients
  844. // [4] Equation (13)
  845. // Note: For performance b1..b3 are divided b0 to avoid doing so in the inner loop.
  846. // Additionally B is stored in B[0] since we no longer need b0.
  847. B0 := 1 / (AbsD1Sqr * d3);
  848. B[1] := (AbsD1Sqr + d3 * re2d1) * B0;
  849. B[2] := -(d3 + re2d1) * B0;
  850. B[3] := B0;
  851. // B: Normalization constant
  852. // [1] Equation (10)
  853. B[0] := 1.0 - (B[1] + B[2] + B[3]);
  854. end;
  855. //------------------------------------------------------------------------------
  856. // Uses algorithm from [2] to calculate coefficients (more precise approximation).
  857. // Not stable for sigma >= 20 (Radius >= 50) - produces "crazy" results.
  858. (*
  859. The use of ComputeCoefficientsGabor is currently disabled by default because:
  860. a) ComputeCoefficientsGaussianDerivative produces almost the same values for
  861. Radius=1 as ComputeCoefficientsGabor does for Radius=2 which makes no
  862. sense to the user.
  863. b) ComputeCoefficientsGabor doesn't support Sigma < 0.5 so we can't just use
  864. it instead of ComputeCoefficientsGaussianDerivative.
  865. * Radius: 1; Sigma: 0.300386630413846
  866. ComputeCoefficientsGaussianDerivative
  867. B[0.840293169021606, 0.170402005314827, -0.0110350390896201, 0.00033983588218689]
  868. * Radius: 2; Sigma: 0.300386630413846) 0.600773260827692
  869. ComputeCoefficientsGabor
  870. B[0.845094859600067, 0.164933204650879, -0.0103367269039154, 0.000308647722704336]
  871. *)
  872. procedure ComputeCoefficientsGabor(Sigma: TFloat; var B: TQuadFloat);
  873. var
  874. q, qq, qqq: Double;
  875. B0: Double;
  876. const
  877. // [2] Equation (7)
  878. m0 = 1.16680;
  879. m1 = 1.10783;
  880. m2 = 1.40586;
  881. m1m1 = m1*m1;
  882. m2m2 = m2*m2;
  883. begin
  884. // Compute q from Standard Deviation σ
  885. // [2] Equation (16)
  886. //
  887. // ┌
  888. // │ 0.9804(σ -3.556) + 2.5091 for σ >= 3.556
  889. // q = ┤
  890. // │ 0.0561σ² +0.5784σ - 0.2568 for 0.5 <= σ <= 3.556
  891. // └
  892. // Note: Doesn't work well for values smaller than 0.5
  893. if (Sigma < 3.556) then
  894. q := (0.0561 * Sigma + 0.5784) * Sigma - 0.2568 // = 0.0561*Sqr(Sigma) + 0.5784*Sigma - 0.2568
  895. else
  896. q := 0.9804 * (Sigma - 3.556) + 2.5091;
  897. qq := Sqr(q);
  898. qqq := qq * q;
  899. // Compute filter coefficients
  900. // [2] Equation (15)
  901. //
  902. // b0 = 1 [*] scale = (m0 + q)(m1² + m2² + 2m1q + q²) [*] Something is missing here in the original paper
  903. // b1 = -q(2m0m1 + m1² + m2² + (2m0 + 4m1)q + 3q²) / scale
  904. // b2 = q²(m0 + 2m1 + 3q) / scale
  905. // b3 = -q³ / scale
  906. //
  907. // Note: We have reversed the sign of b1, b2 & b3 from [2]
  908. // Note: For performance we scale B[1..3] by 1/b0 to avoid doing so in the inner loop.
  909. // Additionally we store B in B[0].
  910. B0 := 1 / ((m0 + q) * (m1m1 + m2m2 + 2 * m1 * q + qq));
  911. B[1] := (q * (2 * m0 * m1 + m1m1 + m2m2 + (2 * m0 + 4 * m1) * q + 3 * qq)) * B0;
  912. B[2] := (-qq * (m0 + 2 * m1 + 3 * q)) * B0;
  913. B[3] := (qqq) * B0;
  914. // [2] Equation (19)
  915. //
  916. // B = (m0(m1² + m2²)/scale)²
  917. //
  918. B[0] := (m0 * (m1m1 + m2m2)) * B0;
  919. end;
  920. //------------------------------------------------------------------------------
  921. // Uses algorithm from [4] to calculate coefficients
  922. procedure ComputeCoefficientsGaussianDerivative(Sigma: TFloat; var B: TQuadFloat);
  923. var
  924. q, qq, qqq: Double;
  925. B0: Double;
  926. // Sigma2, Sigma3, Sigma4: Double;
  927. begin
  928. // Compute q from Standard Deviation
  929. (*
  930. if (Sigma <= 4) and (Sigma >= 1) then
  931. begin
  932. // [5] Equation (22)
  933. // Valid for the range σ = [1,4]
  934. Sigma2 := Sqr(Sigma);
  935. Sigma3 := Sigma2 * Sigma;
  936. Sigma4 := Sigma3 * Sigma;
  937. q := 0.0001 * Sigma4 - 0.0021 * Sigma3 + 0.0207 * Sigma2 + 0.3797 * Sigma + 0.1763;
  938. end else
  939. // Well, this sucks: The transition between the case above (Radius=13, Sigma=3.9) and the one
  940. // below (Radius=14, Sigma=4.2) isn't smooth; There's a visible "jump" in the amount of blur
  941. // when we transition between them.
  942. *)
  943. if (Sigma >= 2.5) then
  944. // [1] Equation (11b)
  945. q := 0.98711 * Sigma - 0.96330
  946. else
  947. if (Sigma >= 0.5) then
  948. // [1] Equation (11b)
  949. q := 3.97156 - 4.14554 * Sqrt(1.0 - 0.26891 * Sigma)
  950. else
  951. // [6]
  952. q := 0.1147705018520355224609375;
  953. qq := Sqr(q);
  954. qqq := qq * q;
  955. // Compute filter coefficients
  956. // [1] Equation (8c)
  957. B0 := 1 / (1.57825 + (2.44413 * q) + (1.4281 * qq) + (0.422205 * qqq));
  958. B[1] := ( (2.44413 * q) + (2.85619 * qq) + (1.26661 * qqq)) * B0;
  959. B[2] := (-( (1.4281 * qq) + (1.26661 * qqq))) * B0;
  960. B[3] := ( 0.422205 * qqq) * B0;
  961. // B: Normalization constant
  962. // [1] Equation (10)
  963. B[0] := 1.0 - (B[1] + B[2] + B[3]);
  964. end;
  965. //------------------------------------------------------------------------------
  966. // Edge correction matrix [3]
  967. //------------------------------------------------------------------------------
  968. procedure ComputeBoundaryCorrectionMatrix(const B: TQuadFloat; var M: TNineFloats);
  969. var
  970. Scale: Double;
  971. B3B3, B2B2: Double;
  972. B3B1, B3B2: Double;
  973. begin
  974. // Compute edge correction matrix M[3,3]
  975. // [3] Equation (5)
  976. //
  977. //
  978. // 1
  979. // Scale = ────────────────────────────────────────
  980. // (1+a1−a2+a3)(1−a1−a2−a3)(1+a2+(a1−a3)a3)
  981. //
  982. // ┌ ┐
  983. // │ −a3a1+1−a3²-a2 (a3+a1)(a2+a3a1) a3(a1+a3a2) │
  984. // M = Scale * │ a1+a3a2 −(a2−1)(a2+a3a1) −(a3a1+a3²+a2−1)a3 │
  985. // │ a3a1+a2+a1²−a2² a1a2+a3a2²−a1a3²−a3³−a3a2+a3 a3(a1+a3a2) │
  986. // └ ┘
  987. //
  988. Scale := 1 / ((1 + B[1] - B[2] + B[3]) * (1 - B[1] - B[2] - B[3]) * (1 + B[2] + (B[1] - B[3]) * B[3]));
  989. B3B3 := Sqr(B[3]);
  990. B2B2 := Sqr(B[2]);
  991. B3B1 := B[3] * B[1];
  992. B3B2 := B[3] * B[2];
  993. M[0] := Scale * (-B3B1 + 1 - B3B3 - B[2]); // [0,0]
  994. M[1] := Scale * (B[3] + B[1]) * (B[2] + B3B1); // [0,1]
  995. M[3] := Scale * (B[1] + B3B2); // [1,0]
  996. M[2] := M[3] * B[3]; // [0,2]
  997. M[4] := -Scale * (B[2] - 1) * (B[2] + B3B1); // [1,1]
  998. M[5] := -Scale * B[3] * (B3B1 + B3B3 + B[2] - 1); // [1,2]
  999. M[6] := Scale * (B3B1 + B[2] + Sqr(B[1]) - B2B2); // [2,0]
  1000. M[7] := Scale * (B[1] * B[2] + B[3] * B2B2 - B3B3 * (B[1]+B[3]) - B3B2 + B[3]); // [2,1]
  1001. M[8] := Scale * B[3] * (B[1] + B3B2); // [2,2]
  1002. end;
  1003. //------------------------------------------------------------------------------
  1004. // Single channel buffer I/O
  1005. //------------------------------------------------------------------------------
  1006. // Note: Although the layout of the Graphics32 TColor32Entry type depends on the
  1007. // RGBA_FORMAT define, the position of the Alpha channel in the record is
  1008. // constant which is what is important for us here.
  1009. //------------------------------------------------------------------------------
  1010. procedure LoadChannel(Src: TBitmap32; Channel: integer; Buffer: PFloatArray
  1011. {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif});
  1012. var
  1013. DestValues: PFloatArray;
  1014. SourceValues: PColor32EntryArray;
  1015. AlphaValues: PColor32EntryArray;
  1016. Count: integer;
  1017. i: integer;
  1018. Alpha: Byte;
  1019. begin
  1020. Count := Src.Height * Src.Width;
  1021. DestValues := Buffer;
  1022. SourceValues := PColor32EntryArray(@(PByte(Src.Bits)[Channel]));
  1023. if (Channel = Ord(ccAlpha)) then
  1024. begin
  1025. for i := 0 to Count-1 do
  1026. DestValues[i] := SourceValues[i].Planes[0];
  1027. end else
  1028. begin
  1029. AlphaValues := PColor32EntryArray(@(PColor32Entry(Src.Bits).A));
  1030. for i := 0 to Count-1 do
  1031. begin
  1032. Alpha := AlphaValues[i].Planes[0];
  1033. // Important:
  1034. // 1) First convert to linear color space.
  1035. // 2) Then alpha premultiply.
  1036. if (Alpha = 255) then // Most common case first
  1037. {$ifdef IIR_BLUR_DIV_LUT}
  1038. DestValues[i] := PremultiplyLUT.MulDiv255[SourceValues[i].Planes[0], 255]
  1039. {$else IIR_BLUR_DIV_LUT}
  1040. DestValues[i] := GAMMA_DECODING_TABLE[SourceValues[i].Planes[0]]
  1041. {$endif IIR_BLUR_DIV_LUT}
  1042. else
  1043. if (Alpha = 0) then
  1044. DestValues[i] := 0
  1045. else
  1046. // Premultiply: ColorPremult := Color * Alpha / 255
  1047. {$ifdef IIR_BLUR_DIV_LUT}
  1048. DestValues[i] := PremultiplyLUT.MulDiv255[SourceValues[i].Planes[0], Alpha];
  1049. {$else IIR_BLUR_DIV_LUT}
  1050. DestValues[i] := GAMMA_DECODING_TABLE[SourceValues[i].Planes[0]] * Alpha * OneOver255;
  1051. {$endif IIR_BLUR_DIV_LUT}
  1052. end;
  1053. end;
  1054. end;
  1055. //------------------------------------------------------------------------------
  1056. procedure SaveChannel(Dst: TBitmap32; Channel: integer; Buffer: PFloatArray
  1057. {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif});
  1058. var
  1059. SourceValues: PFloatArray;
  1060. DestValues: PColor32EntryArray;
  1061. AlphaValues: PColor32EntryArray;
  1062. Count: integer;
  1063. i: integer;
  1064. n: TFloat;
  1065. Color, Alpha: Byte;
  1066. begin
  1067. Count := Dst.Height * Dst.Width;
  1068. SourceValues := Buffer;
  1069. DestValues := PColor32EntryArray(@(PByte(Dst.Bits)[Channel]));
  1070. if (Channel = Ord(ccAlpha)) then
  1071. begin
  1072. for i := 0 to Count-1 do
  1073. begin
  1074. Alpha := Clamp(FastRound(SourceValues[i]));
  1075. DestValues[i].Planes[0] := Alpha;
  1076. end;
  1077. end else
  1078. begin
  1079. AlphaValues := PColor32EntryArray(@(PColor32Entry(Dst.Bits).A));
  1080. for i := 0 to Count-1 do
  1081. begin
  1082. Alpha := AlphaValues[i].Planes[0];
  1083. // Important:
  1084. // 1) First alpha unpremultiply.
  1085. // 2) Then convert to gamma/sRGB color space.
  1086. if (Alpha = 255) then // Most common case first
  1087. begin
  1088. {$ifdef IIR_BLUR_DIV_LUT}
  1089. n := SourceValues[i];
  1090. Color := Clamp(FastRound(n));
  1091. Color := PremultiplyLUT.Mul255Div[Color, 255];
  1092. {$else IIR_BLUR_DIV_LUT}
  1093. Color := FastRound(SourceValues[i]);
  1094. Color := GAMMA_ENCODING_TABLE[Color];
  1095. {$endif IIR_BLUR_DIV_LUT}
  1096. end else
  1097. if (Alpha = 0) then
  1098. Color := 0
  1099. else
  1100. begin
  1101. // Unpremultiply: Color := ColorPremult * 255 / AlphaValues
  1102. {$ifdef IIR_BLUR_DIV_LUT}
  1103. n := SourceValues[i];
  1104. Color := Clamp(FastRound(n)); // Warning: Loss of precision due to rounding before division
  1105. Color := PremultiplyLUT.Mul255Div[Color, Alpha];
  1106. {$else IIR_BLUR_DIV_LUT}
  1107. // Note: We're doing a double Round() here because SourceValues[i]/Alpha can be greater than 1
  1108. // since we're dividing an unrounded value by a rounded value.
  1109. // For example if both values are equal (e.g. source was $FFFFFFFF) and the 'alpha' value was rounded
  1110. // down, then we will be dividing a larger value (e.g. 25.4) with a smaller (e.g. 25).
  1111. // To avoid this rounding error we would have to keep a copy of the unrounded alpha values and
  1112. // divide with those.
  1113. Color := FastRound(FastRound(SourceValues[i]) / Alpha * 255 );
  1114. Color := GAMMA_ENCODING_TABLE[Color];
  1115. {$endif IIR_BLUR_DIV_LUT}
  1116. end;
  1117. DestValues[i].Planes[0] := Color;
  1118. end;
  1119. end;
  1120. end;
  1121. //------------------------------------------------------------------------------
  1122. // Buffer management
  1123. //------------------------------------------------------------------------------
  1124. function AllocateChannelBuffer(out Buffer: pointer; Size: integer): PFloatArray;
  1125. const
  1126. Alignment: NativeUInt = 64*1024;
  1127. begin
  1128. {$ifdef IIR_BLUR_ALIGN_BUFFERS}
  1129. // Allocate a cache-friendly channel buffer
  1130. // TODO : Although this gives us a small performance improvement it is most likely not worth the effort
  1131. Inc(Size, 2 * Alignment);
  1132. GetMem(Buffer, Size);
  1133. Result := Buffer;
  1134. Inc(PByte(Result), Alignment);
  1135. // Ensure data alignment by X bytes
  1136. Result := Pointer(NativeUInt(Result) and (NativeUInt(-1) xor (Alignment - 1)));
  1137. // Make sure data is not aligned by 2*A bytes
  1138. NativeUInt(Result) := NativeUInt(Result) or Alignment;
  1139. {$else IIR_BLUR_ALIGN_BUFFERS}
  1140. GetMem(Buffer, Size);
  1141. Result := Buffer;
  1142. {$endif IIR_BLUR_ALIGN_BUFFERS}
  1143. end;
  1144. //------------------------------------------------------------------------------
  1145. // Core function
  1146. //------------------------------------------------------------------------------
  1147. procedure InternalRecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat; TwoDimensional: boolean
  1148. {$ifdef IIR_BLUR_DIV_LUT}; const PremultiplyLUT: TPremultiplyLUT{$endif});
  1149. var
  1150. B: TQuadFloat;
  1151. M: TNineFloats; // [3] Fix boundary
  1152. Buffer, TransposedBuffer: PFloatArray;
  1153. RowBuffer: PFloatArray;
  1154. procedure BlurRow(Row: integer; Width, Height: integer; InBuffer, OutBuffer: PFloatArray);
  1155. var
  1156. InOffset, OutOffset: integer;
  1157. pIn, pOut: PFloatArray;
  1158. v: TQuadFloat;
  1159. begin
  1160. (*
  1161. ** Causal filter (forward)
  1162. *)
  1163. InOffset := Row * Width;
  1164. pIn := PFloatArray(@InBuffer[InOffset]);
  1165. pOut := RowBuffer;
  1166. IIR_BlurFilterForward(pIn, pOut, B, Width, v);
  1167. (*
  1168. ** Anti-causal filter (backward)
  1169. *)
  1170. OutOffset := InOffset + Width-1;
  1171. pIn := @RowBuffer[Width-1];
  1172. pOut := @OutBuffer[OutOffset];
  1173. {$ifdef IIR_BLUR_EDGE_CORRECTION}
  1174. (*
  1175. ** [3] Apply Triggs/Sdika edge correction
  1176. *)
  1177. begin
  1178. IIR_BlurApplyEdgeCorrection(@InBuffer[InOffset + Width-1], B, v, M);
  1179. pOut[0] := v[1];
  1180. Dec(PFloat(pOut));
  1181. Dec(PFloat(pIn));
  1182. Dec(Width); // Adjust backward loop count
  1183. end;
  1184. {$else IIR_BLUR_EDGE_CORRECTION}
  1185. (*
  1186. ** Regular initialization of backward pass. Not used with [3]
  1187. *)
  1188. // [2] Equation (21)
  1189. v[1] := pIn[0] / (B[0] + B[1] + B[2] + B[3]);
  1190. v[2] := v[1];
  1191. v[3] := v[1];
  1192. {$endif IIR_BLUR_EDGE_CORRECTION}
  1193. IIR_BlurFilterBackward(pIn, pOut, B, Width, v);
  1194. end;
  1195. procedure BlurBuffer;
  1196. var
  1197. i: integer;
  1198. begin
  1199. (*
  1200. ** A note about transpose:
  1201. ** An earlier version performed the transpose operation between the horizontal and vertical pass, as part of the
  1202. ** backward (anti-causal) row filter.
  1203. ** Tests has shown that this is at least 25% slower than performing the transpose in one go, after the complete
  1204. ** horizontal and vertical passes, using a blocked transpose algorithm.
  1205. *)
  1206. for i := 0 to Src.Height-1 do
  1207. BlurRow(i, Src.Width, Src.Height, Buffer, Buffer);
  1208. Transpose32(@Buffer[0], @TransposedBuffer[0], Src.Width, Src.Height);
  1209. for i := 0 to Src.Width-1 do
  1210. BlurRow(i, Src.Height, Src.Width, TransposedBuffer, TransposedBuffer);
  1211. Transpose32(@TransposedBuffer[0], @Buffer[0], Src.Height, Src.Width);
  1212. end;
  1213. procedure BlurHorizontal;
  1214. var
  1215. i: integer;
  1216. begin
  1217. for i := 0 to Src.Height-1 do
  1218. BlurRow(i, Src.Width, Src.Height, Buffer, Buffer);
  1219. end;
  1220. var
  1221. Channel: integer;
  1222. MaxRows: integer;
  1223. UnAlignRowBuffer: pointer;
  1224. UnAlignBuffer: pointer;
  1225. UnAlignTransposedBuffer: pointer;
  1226. begin
  1227. if (Sigma < GaussianRadiusToSigma) then
  1228. begin
  1229. Src.CopyMapTo(Dst);
  1230. exit;
  1231. end;
  1232. Dst.SetSize(Src.Width, Src.Height);
  1233. {$ifdef IIR_BLUR_INKSCAPE_COEFFICIENTS}
  1234. if (Sigma >= 20) then
  1235. ComputeCoefficientsInkscape(Sigma, B)
  1236. else
  1237. {$endif IIR_BLUR_INKSCAPE_COEFFICIENTS}
  1238. {$ifdef IIR_BLUR_GABOR_COEFFICIENTS}
  1239. if (Sigma >= 0.5) then
  1240. ComputeCoefficientsGabor(Sigma, B)
  1241. else
  1242. {$endif IIR_BLUR_GABOR_COEFFICIENTS}
  1243. ComputeCoefficientsGaussianDerivative(Sigma, B);
  1244. // [3] Fix boundary
  1245. ComputeBoundaryCorrectionMatrix(B, M);
  1246. MaxRows := Max(Src.Height, Src.Width);
  1247. UnAlignBuffer := nil;
  1248. UnAlignTransposedBuffer := nil;
  1249. UnAlignRowBuffer := nil;
  1250. try
  1251. Buffer := AllocateChannelBuffer(UnAlignBuffer, Src.Width * Src.Height * SizeOf(TFloat));
  1252. if (TwoDimensional) then
  1253. TransposedBuffer := AllocateChannelBuffer(UnAlignTransposedBuffer, Src.Width * Src.Height * SizeOf(TFloat));
  1254. RowBuffer := AllocateChannelBuffer(UnAlignRowBuffer, MaxRows * SizeOf(TFloat));
  1255. // Start with A channel so unpremult of RGB channels uses correct value
  1256. for Channel := 3 downto 0 do
  1257. begin
  1258. LoadChannel(Src, Channel, Buffer
  1259. {$ifdef IIR_BLUR_DIV_LUT}, PremultiplyLUT{$endif});
  1260. if (TwoDimensional) then
  1261. BlurBuffer
  1262. else
  1263. BlurHorizontal;
  1264. SaveChannel(Dst, Channel, Buffer
  1265. {$ifdef IIR_BLUR_DIV_LUT}, PremultiplyLUT{$endif});
  1266. end;
  1267. finally
  1268. Freemem(UnAlignRowBuffer);
  1269. Freemem(UnAlignTransposedBuffer);
  1270. Freemem(UnAlignBuffer);
  1271. end;
  1272. end;
  1273. //------------------------------------------------------------------------------
  1274. procedure RecursiveGaussianBlur(Src, Dst: TBitmap32; Sigma: TFloat);
  1275. begin
  1276. InternalRecursiveGaussianBlur(Src, Dst, Sigma, True
  1277. {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif});
  1278. end;
  1279. procedure RecursiveGaussianBlurRadius(Src, Dst: TBitmap32; Radius: TFloat);
  1280. begin
  1281. InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, True
  1282. {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif});
  1283. end;
  1284. procedure RecursiveGaussianBlurGamma(Src, Dst: TBitmap32; Sigma: TFloat);
  1285. begin
  1286. InternalRecursiveGaussianBlur(Src, Dst, Sigma, True
  1287. {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif});
  1288. end;
  1289. procedure RecursiveGaussianBlurRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat);
  1290. begin
  1291. InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, True
  1292. {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif});
  1293. end;
  1294. //------------------------------------------------------------------------------
  1295. procedure RecursiveGaussianBlurHorizontalRadius(Src, Dst: TBitmap32; Radius: TFloat);
  1296. begin
  1297. InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, False
  1298. {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.PremultiplyLUT^{$endif});
  1299. end;
  1300. procedure RecursiveGaussianBlurHorizontalRadiusGamma(Src, Dst: TBitmap32; Radius: TFloat);
  1301. begin
  1302. InternalRecursiveGaussianBlur(Src, Dst, Radius * GaussianRadiusToSigma, False
  1303. {$ifdef IIR_BLUR_DIV_LUT}, TPremultiplyLUT.GammaPremultiplyLUT^{$endif});
  1304. end;
  1305. //------------------------------------------------------------------------------
  1306. //
  1307. // Registration of internal bindings
  1308. //
  1309. //------------------------------------------------------------------------------
  1310. procedure RegisterBindings;
  1311. begin
  1312. BlurRegistry.RegisterBinding(@@IIR_BlurFilterForward, 'IIR_BlurFilterForward');
  1313. BlurRegistry.RegisterBinding(@@IIR_BlurFilterBackward, 'IIR_BlurFilterBackward');
  1314. BlurRegistry.RegisterBinding(@@IIR_BlurApplyEdgeCorrection, 'IIR_BlurApplyEdgeCorrection');
  1315. {$ifdef IIR_BLUR_DEFAULT}
  1316. // Register as default Blur32 implementation
  1317. BlurRegistry[@@Blur32Proc].Add( @RecursiveGaussianBlurRadius, [isPascal]).Name := 'RecursiveGaussianBlurRadius';
  1318. BlurRegistry[@@GammaBlur32Proc].Add( @RecursiveGaussianBlurRadiusGamma, [isPascal]).Name := 'RecursiveGaussianBlurRadiusGamma';
  1319. {$endif IIR_BLUR_DEFAULT}
  1320. BlurRegistry[@@HorizontalBlur32].Add( @RecursiveGaussianBlurHorizontalRadius, [isPascal]).Name := 'RecursiveGaussianBlurHorizontalRadius';
  1321. BlurRegistry[@@GammaHorizontalBlur32].Add( @RecursiveGaussianBlurHorizontalRadiusGamma, [isPascal]).Name := 'RecursiveGaussianBlurHorizontalRadiusGamma';
  1322. BlurRegistry[@@IIR_BlurFilterForward].Add( @BlurFilterForward_Pas, [isPascal]).Name := 'BlurFilterForward_Pas';
  1323. BlurRegistry[@@IIR_BlurFilterBackward].Add( @BlurFilterBackward_Pas, [isPascal]).Name := 'BlurFilterBackward_Pas';
  1324. BlurRegistry[@@IIR_BlurApplyEdgeCorrection].Add(@BlurApplyEdgeCorrection_Pas, [isPascal]).Name := 'BlurApplyEdgeCorrection_Pas';
  1325. {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
  1326. {$ifdef IIR_BLUR_SIMD}
  1327. BlurRegistry[@@IIR_BlurFilterForward].Add( @BlurFilterForward_SSE41,
  1328. {$if defined(IIR_USE_DPPS)}
  1329. [isSSE41]).Name := 'BlurFilterForward_SSE41';
  1330. {$elseif defined(IIR_USE_HADDPS)}
  1331. [isSSE3]).Name := 'BlurFilterForward_SSE41';
  1332. {$else}
  1333. [isSSE2]).Name := 'BlurFilterForward_SSE41';
  1334. {$ifend}
  1335. BlurRegistry[@@IIR_BlurFilterBackward].Add( @BlurFilterBackward_SSE41, [isSSE41]).Name := 'BlurFilterBackward_SSE41';
  1336. {$endif IIR_BLUR_SIMD}
  1337. {$ifdef IIR_BLUR_EDGE_CORRECTION_SIMD}
  1338. BlurRegistry[@@IIR_BlurApplyEdgeCorrection].Add(@BlurApplyEdgeCorrection_SSE41, [isSSE41]).Name := 'BlurApplyEdgeCorrection_SSE41';
  1339. {$endif IIR_BLUR_EDGE_CORRECTION_SIMD}
  1340. BlendRegistry.RebindAll;
  1341. {$ifend}
  1342. end;
  1343. //------------------------------------------------------------------------------
  1344. //------------------------------------------------------------------------------
  1345. //------------------------------------------------------------------------------
  1346. initialization
  1347. RegisterBindings;
  1348. end.