GR32_Math.pas 29 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213
  1. unit GR32_Math;
  2. (* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1 or LGPL 2.1 with linking exception
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * Alternatively, the contents of this file may be used under the terms of the
  16. * Free Pascal modified version of the GNU Lesser General Public License
  17. * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
  18. * of this license are applicable instead of those above.
  19. * Please see the file LICENSE.txt for additional information concerning this
  20. * license.
  21. *
  22. * The Original Code is Additional Math Routines for Graphics32
  23. *
  24. * The Initial Developer of the Original Code is
  25. * Mattias Andersson <[email protected]>
  26. * (parts of this unit were moved here from GR32_System.pas and GR32.pas by Alex A. Denisov)
  27. *
  28. * Portions created by the Initial Developer are Copyright (C) 2005-2009
  29. * the Initial Developer. All Rights Reserved.
  30. *
  31. * Contributor(s):
  32. * Michael Hansen <[email protected]>
  33. *
  34. * ***** END LICENSE BLOCK ***** *)
  35. interface
  36. {$I GR32.inc}
  37. uses
  38. GR32, GR32_Bindings
  39. {$IFDEF FPC}{$IFDEF TARGET_X64}, GR32_Math_FPC{$ENDIF}{$ENDIF};
  40. { Fixed point math routines }
  41. function FixedFloor(A: TFixed): Integer;
  42. function FixedCeil(A: TFixed): Integer;
  43. function FixedMul(A, B: TFixed): TFixed;
  44. function FixedDiv(A, B: TFixed): TFixed;
  45. function OneOver(Value: TFixed): TFixed;
  46. function FixedRound(A: TFixed): Integer;
  47. function FixedSqr(Value: TFixed): TFixed;
  48. function FixedSqrtLP(Value: TFixed): TFixed; // 8-bit precision
  49. function FixedSqrtHP(Value: TFixed): TFixed; // 16-bit precision
  50. // Fixed point interpolation
  51. function FixedCombine(W, X, Y: TFixed): TFixed;
  52. { Trigonometric routines }
  53. procedure SinCos(const Theta: TFloat; out Sin, Cos: TFloat); overload;
  54. procedure SinCos(const Theta, Radius: Single; out Sin, Cos: Single); overload;
  55. procedure SinCos(const Theta, ScaleX, ScaleY: TFloat; out Sin, Cos: Single); overload;
  56. function Hypot(const X, Y: TFloat): TFloat; overload;
  57. function Hypot(const X, Y: Integer): Integer; overload;
  58. function FastSqrt(const Value: TFloat): TFloat;
  59. function FastSqrtBab1(const Value: TFloat): TFloat;
  60. function FastSqrtBab2(const Value: TFloat): TFloat;
  61. function FastInvSqrt(const Value: Single): Single; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF} overload;
  62. { Misc. Routines }
  63. { MulDiv a faster implementation of Windows.MulDiv funtion }
  64. function MulDiv(Multiplicand, Multiplier, Divisor: Integer): Integer;
  65. // tells if X is a power of 2, returns true when X = 1,2,4,8,16 etc.
  66. function IsPowerOf2(Value: Integer): Boolean; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
  67. // returns X rounded down to the nearest power of two
  68. function PrevPowerOf2(Value: Integer): Integer;
  69. // returns X rounded down to the nearest power of two, i.e. 5 -> 8, 7 -> 8, 15 -> 16
  70. function NextPowerOf2(Value: Integer): Integer;
  71. // fast average without overflow, useful for e.g. fixed point math
  72. function Average(A, B: Integer): Integer;
  73. // fast sign function
  74. function Sign(Value: Integer): Integer;
  75. function FloatMod(x, y: Double): Double; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
  76. function DivMod(Dividend, Divisor: Integer; var Remainder: Integer): Integer;
  77. type
  78. TCumSumProc = procedure(Values: PSingleArray; Count: Integer);
  79. var
  80. CumSum: TCumSumProc;
  81. var
  82. MathRegistry: TFunctionRegistry;
  83. implementation
  84. uses
  85. Math, GR32_System;
  86. {$IFDEF PUREPASCAL}
  87. const
  88. FixedOneS: Single = 65536;
  89. {$ENDIF}
  90. {$IFDEF FPC}
  91. {$IFDEF TARGET_X64}
  92. function Ceil(X: Single): Integer;
  93. begin
  94. Result := Trunc(X);
  95. if (X - Result) > 0 then
  96. Inc(Result);
  97. end;
  98. function Floor(X: Single): Integer;
  99. begin
  100. Result := Trunc(X);
  101. if (X - Result) < 0 then
  102. Dec(Result);
  103. end;
  104. {$ENDIF}
  105. {$ENDIF}
  106. { Fixed-point math }
  107. function FixedFloor(A: TFixed): Integer;
  108. {$IFDEF PUREPASCAL}
  109. begin
  110. Result := A div FIXEDONE;
  111. {$ELSE}
  112. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  113. asm
  114. {$IFDEF TARGET_x86}
  115. SAR EAX, 16
  116. {$ENDIF}
  117. {$IFDEF TARGET_x64}
  118. MOV EAX, ECX
  119. SAR EAX, 16
  120. {$ENDIF}
  121. {$ENDIF}
  122. end;
  123. function FixedCeil(A: TFixed): Integer;
  124. {$IFDEF PUREPASCAL}
  125. begin
  126. Result := (A + $FFFF) div FIXEDONE;
  127. {$ELSE}
  128. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  129. asm
  130. {$IFDEF TARGET_x86}
  131. ADD EAX, $0000FFFF
  132. SAR EAX, 16
  133. {$ENDIF}
  134. {$IFDEF TARGET_x64}
  135. MOV EAX, ECX
  136. ADD EAX, $0000FFFF
  137. SAR EAX, 16
  138. {$ENDIF}
  139. {$ENDIF}
  140. end;
  141. function FixedRound(A: TFixed): Integer;
  142. {$IFDEF PUREPASCAL}
  143. begin
  144. Result := (A + $7FFF) div FIXEDONE;
  145. {$ELSE}
  146. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  147. asm
  148. {$IFDEF TARGET_x86}
  149. ADD EAX, $00007FFF
  150. SAR EAX, 16
  151. {$ENDIF}
  152. {$IFDEF TARGET_x64}
  153. MOV EAX, ECX
  154. ADD EAX, $00007FFF
  155. SAR EAX, 16
  156. {$ENDIF}
  157. {$ENDIF}
  158. end;
  159. function FixedMul(A, B: TFixed): TFixed;
  160. {$IFDEF PUREPASCAL}
  161. begin
  162. Result := Round(A * FixedToFloat * B);
  163. {$ELSE}
  164. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  165. asm
  166. {$IFDEF TARGET_x86}
  167. IMUL EDX
  168. SHRD EAX, EDX, 16
  169. {$ENDIF}
  170. {$IFDEF TARGET_x64}
  171. MOV EAX, ECX
  172. IMUL EDX
  173. SHRD EAX, EDX, 16
  174. {$ENDIF}
  175. {$ENDIF}
  176. end;
  177. function FixedDiv(A, B: TFixed): TFixed;
  178. {$IFDEF PUREPASCAL}
  179. begin
  180. Result := Round(A / B * FixedOne);
  181. {$ELSE}
  182. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  183. asm
  184. {$IFDEF TARGET_x86}
  185. MOV ECX, B
  186. CDQ
  187. SHLD EDX, EAX, 16
  188. SHL EAX, 16
  189. IDIV ECX
  190. {$ENDIF}
  191. {$IFDEF TARGET_x64}
  192. MOV EAX, ECX
  193. MOV ECX, EDX
  194. CDQ
  195. SHLD EDX, EAX, 16
  196. SHL EAX, 16
  197. IDIV ECX
  198. {$ENDIF}
  199. {$ENDIF}
  200. end;
  201. function OneOver(Value: TFixed): TFixed;
  202. {$IFDEF PUREPASCAL}
  203. const
  204. Dividend: Single = 4294967296; // FixedOne * FixedOne
  205. begin
  206. Result := Round(Dividend / Value);
  207. {$ELSE}
  208. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  209. asm
  210. {$IFDEF TARGET_x86}
  211. MOV ECX, Value
  212. XOR EAX, EAX
  213. MOV EDX, 1
  214. IDIV ECX
  215. {$ENDIF}
  216. {$IFDEF TARGET_x64}
  217. XOR EAX, EAX
  218. MOV EDX, 1
  219. IDIV ECX
  220. {$ENDIF}
  221. {$ENDIF}
  222. end;
  223. function FixedSqr(Value: TFixed): TFixed;
  224. {$IFDEF PUREPASCAL}
  225. begin
  226. Result := Round(Value * FixedToFloat * Value);
  227. {$ELSE}
  228. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  229. asm
  230. {$IFDEF TARGET_x86}
  231. IMUL EAX
  232. SHRD EAX, EDX, 16
  233. {$ENDIF}
  234. {$IFDEF TARGET_x64}
  235. MOV EAX, Value
  236. IMUL EAX
  237. SHRD EAX, EDX, 16
  238. {$ENDIF}
  239. {$ENDIF}
  240. end;
  241. function FixedSqrtLP(Value: TFixed): TFixed;
  242. {$IFDEF PUREPASCAL}
  243. begin
  244. Result := Round(Sqrt(Value * FixedOneS));
  245. {$ELSE}
  246. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  247. asm
  248. {$IFDEF TARGET_x86}
  249. PUSH EBX
  250. MOV ECX, EAX
  251. XOR EAX, EAX
  252. MOV EBX, $40000000
  253. @SqrtLP1:
  254. MOV EDX, ECX
  255. SUB EDX, EBX
  256. JL @SqrtLP2
  257. SUB EDX, EAX
  258. JL @SqrtLP2
  259. MOV ECX,EDX
  260. SHR EAX, 1
  261. OR EAX, EBX
  262. SHR EBX, 2
  263. JNZ @SqrtLP1
  264. SHL EAX, 8
  265. JMP @SqrtLP3
  266. @SqrtLP2:
  267. SHR EAX, 1
  268. SHR EBX, 2
  269. JNZ @SqrtLP1
  270. SHL EAX, 8
  271. @SqrtLP3:
  272. POP EBX
  273. {$ENDIF}
  274. {$IFDEF TARGET_x64}
  275. PUSH RBX
  276. XOR EAX, EAX
  277. MOV EBX, $40000000
  278. @SqrtLP1:
  279. MOV EDX, ECX
  280. SUB EDX, EBX
  281. JL @SqrtLP2
  282. SUB EDX, EAX
  283. JL @SqrtLP2
  284. MOV ECX,EDX
  285. SHR EAX, 1
  286. OR EAX, EBX
  287. SHR EBX, 2
  288. JNZ @SqrtLP1
  289. SHL EAX, 8
  290. JMP @SqrtLP3
  291. @SqrtLP2:
  292. SHR EAX, 1
  293. SHR EBX, 2
  294. JNZ @SqrtLP1
  295. SHL EAX, 8
  296. @SqrtLP3:
  297. POP RBX
  298. {$ENDIF}
  299. {$ENDIF}
  300. end;
  301. function FixedSqrtHP(Value: TFixed): TFixed;
  302. {$IFDEF PUREPASCAL}
  303. begin
  304. Result := Round(Sqrt(Value * FixedOneS));
  305. {$ELSE}
  306. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  307. asm
  308. {$IFDEF TARGET_x86}
  309. PUSH EBX
  310. MOV ECX, EAX
  311. XOR EAX, EAX
  312. MOV EBX, $40000000
  313. @SqrtHP1:
  314. MOV EDX, ECX
  315. SUB EDX, EBX
  316. jb @SqrtHP2
  317. SUB EDX, EAX
  318. jb @SqrtHP2
  319. MOV ECX,EDX
  320. SHR EAX, 1
  321. OR EAX, EBX
  322. SHR EBX, 2
  323. JNZ @SqrtHP1
  324. JZ @SqrtHP5
  325. @SqrtHP2:
  326. SHR EAX, 1
  327. SHR EBX, 2
  328. JNZ @SqrtHP1
  329. @SqrtHP5:
  330. MOV EBX, $00004000
  331. SHL EAX, 16
  332. SHL ECX, 16
  333. @SqrtHP3:
  334. MOV EDX, ECX
  335. SUB EDX, EBX
  336. jb @SqrtHP4
  337. SUB EDX, EAX
  338. jb @SqrtHP4
  339. MOV ECX, EDX
  340. SHR EAX, 1
  341. OR EAX, EBX
  342. SHR EBX, 2
  343. JNZ @SqrtHP3
  344. JMP @SqrtHP6
  345. @SqrtHP4:
  346. SHR EAX, 1
  347. SHR EBX, 2
  348. JNZ @SqrtHP3
  349. @SqrtHP6:
  350. POP EBX
  351. {$ENDIF}
  352. {$IFDEF TARGET_x64}
  353. PUSH RBX
  354. XOR EAX, EAX
  355. MOV EBX, $40000000
  356. @SqrtHP1:
  357. MOV EDX, ECX
  358. SUB EDX, EBX
  359. jb @SqrtHP2
  360. SUB EDX, EAX
  361. jb @SqrtHP2
  362. MOV ECX,EDX
  363. SHR EAX, 1
  364. OR EAX, EBX
  365. SHR EBX, 2
  366. JNZ @SqrtHP1
  367. JZ @SqrtHP5
  368. @SqrtHP2:
  369. SHR EAX, 1
  370. SHR EBX, 2
  371. JNZ @SqrtHP1
  372. @SqrtHP5:
  373. MOV EBX, $00004000
  374. SHL EAX, 16
  375. SHL ECX, 16
  376. @SqrtHP3:
  377. MOV EDX, ECX
  378. SUB EDX, EBX
  379. jb @SqrtHP4
  380. SUB EDX, EAX
  381. jb @SqrtHP4
  382. MOV ECX, EDX
  383. SHR EAX, 1
  384. OR EAX, EBX
  385. SHR EBX, 2
  386. JNZ @SqrtHP3
  387. JMP @SqrtHP6
  388. @SqrtHP4:
  389. SHR EAX, 1
  390. SHR EBX, 2
  391. JNZ @SqrtHP3
  392. @SqrtHP6:
  393. POP RBX
  394. {$ENDIF}
  395. {$ENDIF}
  396. end;
  397. function FixedCombine(W, X, Y: TFixed): TFixed;
  398. // EAX <- W, EDX <- X, ECX <- Y
  399. // combine fixed value X and fixed value Y with the weight of X given in W
  400. // Result Z = W * X + (1 - W) * Y = Y + (X - Y) * W
  401. // Fixed Point Version: Result Z = Y + (X - Y) * W / 65536
  402. {$IFDEF PUREPASCAL}
  403. begin
  404. Result := Round(Y + (X - Y) * FixedToFloat * W);
  405. {$ELSE}
  406. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  407. asm
  408. {$IFDEF TARGET_x86}
  409. SUB EDX, ECX
  410. IMUL EDX
  411. SHRD EAX, EDX, 16
  412. ADD EAX, ECX
  413. {$ENDIF}
  414. {$IFDEF TARGET_x64}
  415. MOV EAX, ECX
  416. SUB EDX, R8D
  417. IMUL EDX
  418. SHRD EAX, EDX, 16
  419. ADD EAX, R8D
  420. {$ENDIF}
  421. {$ENDIF}
  422. end;
  423. { Trigonometry }
  424. procedure SinCos(const Theta: TFloat; out Sin, Cos: TFloat);
  425. {$IFDEF NATIVE_SINCOS}
  426. var
  427. S, C: Extended;
  428. begin
  429. Math.SinCos(Theta, S, C);
  430. Sin := S;
  431. Cos := C;
  432. {$ELSE}
  433. {$IFDEF TARGET_x64}
  434. var
  435. Temp: TFloat;
  436. {$ENDIF}
  437. asm
  438. {$IFDEF TARGET_x86}
  439. FLD Theta
  440. FSINCOS
  441. FSTP DWORD PTR [EDX] // cosine
  442. FSTP DWORD PTR [EAX] // sine
  443. {$ENDIF}
  444. {$IFDEF TARGET_x64}
  445. MOVD Temp, Theta
  446. FLD Temp
  447. FSINCOS
  448. FSTP [Sin] // cosine
  449. FSTP [Cos] // sine
  450. {$ENDIF}
  451. {$ENDIF}
  452. end;
  453. procedure SinCos(const Theta, Radius: TFloat; out Sin, Cos: TFloat);
  454. {$IFDEF NATIVE_SINCOS}
  455. var
  456. S, C: Extended;
  457. begin
  458. Math.SinCos(Theta, S, C);
  459. Sin := S * Radius;
  460. Cos := C * Radius;
  461. {$ELSE}
  462. {$IFDEF TARGET_x64}
  463. var
  464. Temp: TFloat;
  465. {$ENDIF}
  466. asm
  467. {$IFDEF TARGET_x86}
  468. FLD Theta
  469. FSINCOS
  470. FMUL Radius
  471. FSTP DWORD PTR [EDX] // cosine
  472. FMUL Radius
  473. FSTP DWORD PTR [EAX] // sine
  474. {$ENDIF}
  475. {$IFDEF TARGET_x64}
  476. MOVD Temp, Theta
  477. FLD Temp
  478. MOVD Temp, Radius
  479. FSINCOS
  480. FMUL Temp
  481. FSTP [Cos]
  482. FMUL Temp
  483. FSTP [Sin]
  484. {$ENDIF}
  485. {$ENDIF}
  486. end;
  487. procedure SinCos(const Theta, ScaleX, ScaleY: TFloat; out Sin, Cos: Single); overload;
  488. {$IFDEF NATIVE_SINCOS}
  489. var
  490. S, C: Extended;
  491. begin
  492. Math.SinCos(Theta, S, C);
  493. Sin := S * ScaleX;
  494. Cos := C * ScaleY;
  495. {$ELSE}
  496. {$IFDEF TARGET_x64}
  497. var
  498. Temp: TFloat;
  499. {$ENDIF}
  500. asm
  501. {$IFDEF TARGET_x86}
  502. FLD Theta
  503. FSINCOS
  504. FMUL ScaleX
  505. FSTP DWORD PTR [EDX] // cosine
  506. FMUL ScaleY
  507. FSTP DWORD PTR [EAX] // sine
  508. {$ENDIF}
  509. {$IFDEF TARGET_x64}
  510. MOVD Temp, Theta
  511. FLD Temp
  512. FSINCOS
  513. MOVD Temp, ScaleX
  514. FMUL Temp
  515. FSTP [Cos]
  516. MOVD Temp, ScaleY
  517. FMUL Temp
  518. FSTP [Sin]
  519. {$ENDIF}
  520. {$ENDIF}
  521. end;
  522. function Hypot(const X, Y: TFloat): TFloat;
  523. {$IFDEF PUREPASCAL}
  524. begin
  525. Result := Sqrt(Sqr(X) + Sqr(Y));
  526. {$ELSE}
  527. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  528. asm
  529. {$IFDEF TARGET_x86}
  530. FLD X
  531. FMUL ST,ST
  532. FLD Y
  533. FMUL ST,ST
  534. FADDP ST(1),ST
  535. FSQRT
  536. FWAIT
  537. {$ENDIF}
  538. {$IFDEF TARGET_x64}
  539. MULSS XMM0, XMM0
  540. MULSS XMM1, XMM1
  541. ADDSS XMM0, XMM1
  542. SQRTSS XMM0, XMM0
  543. {$ENDIF}
  544. {$ENDIF}
  545. end;
  546. function Hypot(const X, Y: Integer): Integer;
  547. //{$IFDEF PUREPASCAL}
  548. begin
  549. Result := Round(Math.Hypot(X, Y));
  550. (*
  551. {$ELSE}
  552. {$IFDEF FPC}assembler;{$ENDIF}
  553. asm
  554. {$IFDEF TARGET_x64}
  555. IMUL RAX, RCX, RDX
  556. {$ELSE}
  557. FLD X
  558. FMUL ST,ST
  559. FLD Y
  560. FMUL ST,ST
  561. FADDP ST(1),ST
  562. FSQRT
  563. FISTP [ESP - 4]
  564. MOV EAX, [ESP - 4]
  565. FWAIT
  566. {$ENDIF}
  567. {$ENDIF}
  568. *)
  569. end;
  570. function FastSqrt(const Value: TFloat): TFloat;
  571. // see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
  572. {$IFDEF PUREPASCAL}
  573. var
  574. I: Integer absolute Value;
  575. J: Integer absolute Result;
  576. begin
  577. J := (I - $3F800000) div 2 + $3F800000;
  578. {$ELSE}
  579. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  580. asm
  581. {$IFDEF TARGET_x86}
  582. MOV EAX, DWORD PTR Value
  583. SUB EAX, $3F800000
  584. SAR EAX, 1
  585. ADD EAX, $3F800000
  586. MOV DWORD PTR [ESP - 4], EAX
  587. FLD DWORD PTR [ESP - 4]
  588. {$ENDIF}
  589. {$IFDEF TARGET_x64}
  590. SQRTSS XMM0, XMM0
  591. {$ENDIF}
  592. {$ENDIF}
  593. end;
  594. function FastSqrtBab1(const Value: TFloat): TFloat;
  595. // see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
  596. // additionally one babylonian step added
  597. {$IFNDEF PUREPASCAL}
  598. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  599. {$ENDIF}
  600. const
  601. CHalf : TFloat = 0.5;
  602. {$IFDEF PUREPASCAL}
  603. var
  604. I: Integer absolute Value;
  605. J: Integer absolute Result;
  606. begin
  607. J := (I - $3F800000) div 2 + $3F800000;
  608. Result := CHalf * (Result + Value / Result);
  609. {$ELSE}
  610. asm
  611. {$IFDEF TARGET_x86}
  612. MOV EAX, Value
  613. SUB EAX, $3F800000
  614. SAR EAX, 1
  615. ADD EAX, $3F800000
  616. MOV DWORD PTR [ESP - 4], EAX
  617. FLD Value
  618. FDIV DWORD PTR [ESP - 4]
  619. FADD DWORD PTR [ESP - 4]
  620. FMUL CHalf
  621. {$ENDIF}
  622. {$IFDEF TARGET_x64}
  623. SQRTSS XMM0, XMM0
  624. {$ENDIF}
  625. {$ENDIF}
  626. end;
  627. function FastSqrtBab2(const Value: TFloat): TFloat;
  628. // see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
  629. // additionally two babylonian steps added
  630. {$IFDEF PUREPASCAL}
  631. const
  632. CQuarter : TFloat = 0.25;
  633. var
  634. J: Integer absolute Result;
  635. begin
  636. Result := Value;
  637. J := ((J - (1 shl 23)) shr 1) + (1 shl 29);
  638. Result := Result + Value / Result;
  639. Result := CQuarter * Result + Value / Result;
  640. {$ELSE}
  641. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  642. const
  643. CHalf : TFloat = 0.5;
  644. asm
  645. {$IFDEF TARGET_x86}
  646. MOV EAX, Value
  647. SUB EAX, $3F800000
  648. SAR EAX, 1
  649. ADD EAX, $3F800000
  650. MOV DWORD PTR [ESP - 4], EAX
  651. FLD Value
  652. FDIV DWORD PTR [ESP - 4]
  653. FADD DWORD PTR [ESP - 4]
  654. FMUL CHalf
  655. {$ENDIF}
  656. {$IFDEF TARGET_x64}
  657. MOVD EAX, Value
  658. SUB EAX, $3F800000
  659. SAR EAX, 1
  660. ADD EAX, $3F800000
  661. MOVD XMM1, EAX
  662. DIVSS XMM0, XMM1
  663. ADDSS XMM0, XMM1
  664. MOVD XMM1, [RIP + CHalf]
  665. MULSS XMM0, XMM1
  666. {$ENDIF}
  667. {$ENDIF}
  668. end;
  669. function FastInvSqrt(const Value: Single): Single;
  670. var
  671. IntCst : Cardinal absolute result;
  672. begin
  673. Result := Value;
  674. IntCst := ($BE6EB50C - IntCst) shr 1;
  675. Result := 0.5 * Result * (3 - Value * Sqr(Result));
  676. end;
  677. { Misc. }
  678. function MulDiv(Multiplicand, Multiplier, Divisor: Integer): Integer;
  679. {$IFDEF PUREPASCAL}
  680. begin
  681. Result := Int64(Multiplicand) * Int64(Multiplier) div Divisor;
  682. {$ELSE}
  683. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  684. asm
  685. {$IFDEF TARGET_x86}
  686. PUSH EBX // Imperative save
  687. PUSH ESI // of EBX and ESI
  688. MOV EBX, EAX // Result will be negative or positive so set rounding direction
  689. XOR EBX, EDX // Negative: substract 1 in case of rounding
  690. XOR EBX, ECX // Positive: add 1
  691. OR EAX, EAX // Make all operands positive, ready for unsigned operations
  692. JNS @m1Ok // minimizing branching
  693. NEG EAX
  694. @m1Ok:
  695. OR EDX, EDX
  696. JNS @m2Ok
  697. NEG EDX
  698. @m2Ok:
  699. OR ECX, ECX
  700. JNS @DivOk
  701. NEG ECX
  702. @DivOK:
  703. MUL EDX // Unsigned multiply (Multiplicand*Multiplier)
  704. MOV ESI, EDX // Check for overflow, by comparing
  705. SHL ESI, 1 // 2 times the high-order 32 bits of the product (EDX)
  706. CMP ESI, ECX // with the Divisor.
  707. JAE @Overfl // If equal or greater than overflow with division anticipated
  708. DIV ECX // Unsigned divide of product by Divisor
  709. SUB ECX, EDX // Check if the result must be adjusted by adding or substracting
  710. CMP ECX, EDX // 1 (*.5 -> nearest integer), by comparing the difference of
  711. JA @NoAdd // Divisor and remainder with the remainder. If it is greater then
  712. INC EAX // no rounding needed; add 1 to result otherwise
  713. @NoAdd:
  714. OR EBX, EDX // From unsigned operations back the to original sign of the result
  715. JNS @Exit // must be positive
  716. NEG EAX // must be negative
  717. JMP @Exit
  718. @Overfl:
  719. OR EAX, -1 // 3 bytes alternative for MOV EAX,-1. Windows.MulDiv "overflow"
  720. // and "zero-divide" return value
  721. @Exit:
  722. POP ESI // Restore
  723. POP EBX // esi and EBX
  724. {$ENDIF}
  725. {$IFDEF TARGET_x64}
  726. MOV EAX, ECX // Result will be negative or positive so set rounding direction
  727. XOR ECX, EDX // Negative: substract 1 in case of rounding
  728. XOR ECX, R8D // Positive: add 1
  729. OR EAX, EAX // Make all operands positive, ready for unsigned operations
  730. JNS @m1Ok // minimizing branching
  731. NEG EAX
  732. @m1Ok:
  733. OR EDX, EDX
  734. JNS @m2Ok
  735. NEG EDX
  736. @m2Ok:
  737. OR R8D, R8D
  738. JNS @DivOk
  739. NEG R8D
  740. @DivOK:
  741. MUL EDX // Unsigned multiply (Multiplicand*Multiplier)
  742. MOV R9D, EDX // Check for overflow, by comparing
  743. SHL R9D, 1 // 2 times the high-order 32 bits of the product (EDX)
  744. CMP R9D, R8D // with the Divisor.
  745. JAE @Overfl // If equal or greater than overflow with division anticipated
  746. DIV R8D // Unsigned divide of product by Divisor
  747. SUB R8D, EDX // Check if the result must be adjusted by adding or substracting
  748. CMP R8D, EDX // 1 (*.5 -> nearest integer), by comparing the difference of
  749. JA @NoAdd // Divisor and remainder with the remainder. If it is greater then
  750. INC EAX // no rounding needed; add 1 to result otherwise
  751. @NoAdd:
  752. OR ECX, EDX // From unsigned operations back the to original sign of the result
  753. JNS @Exit // must be positive
  754. NEG EAX // must be negative
  755. JMP @Exit
  756. @Overfl:
  757. OR EAX, -1 // 3 bytes alternative for MOV EAX,-1. Windows.MulDiv "overflow"
  758. // and "zero-divide" return value
  759. @Exit:
  760. {$ENDIF}
  761. {$ENDIF}
  762. end;
  763. function IsPowerOf2(Value: Integer): Boolean;
  764. //returns true when X = 1,2,4,8,16 etc.
  765. begin
  766. Result := Value and (Value - 1) = 0;
  767. end;
  768. function PrevPowerOf2(Value: Integer): Integer;
  769. //returns X rounded down to the power of two
  770. {$IFDEF PUREPASCAL}
  771. begin
  772. Result := 1;
  773. while Value shr 1 > 0 do
  774. Result := Result shl 1;
  775. {$ELSE}
  776. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  777. asm
  778. {$IFDEF TARGET_x86}
  779. BSR ECX, EAX
  780. SHR EAX, CL
  781. SHL EAX, CL
  782. {$ENDIF}
  783. {$IFDEF TARGET_x64}
  784. MOV EAX, Value
  785. BSR ECX, EAX
  786. SHR EAX, CL
  787. SHL EAX, CL
  788. {$ENDIF}
  789. {$ENDIF}
  790. end;
  791. function NextPowerOf2(Value: Integer): Integer;
  792. //returns X rounded up to the power of two, i.e. 5 -> 8, 7 -> 8, 15 -> 16
  793. {$IFDEF PUREPASCAL}
  794. begin
  795. Result := 2;
  796. while Value shr 1 > 0 do
  797. Result := Result shl 1;
  798. {$ELSE}
  799. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  800. asm
  801. {$IFDEF TARGET_x86}
  802. DEC EAX
  803. JLE @1
  804. BSR ECX, EAX
  805. MOV EAX, 2
  806. SHL EAX, CL
  807. RET
  808. @1:
  809. MOV EAX, 1
  810. {$ENDIF}
  811. {$IFDEF TARGET_x64}
  812. MOV EAX, Value
  813. DEC EAX
  814. JLE @1
  815. BSR ECX, EAX
  816. MOV EAX, 2
  817. SHL EAX, CL
  818. RET
  819. @1:
  820. MOV EAX, 1
  821. {$ENDIF}
  822. {$ENDIF}
  823. end;
  824. function Average(A, B: Integer): Integer;
  825. //fast average without overflow, useful e.g. for fixed point math
  826. //(A + B)/2 = (A and B) + (A xor B)/2
  827. {$IFDEF PUREPASCAL}
  828. begin
  829. Result := (A and B) + (A xor B) div 2;
  830. {$ELSE}
  831. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  832. asm
  833. {$IFDEF TARGET_x86}
  834. MOV ECX, EDX
  835. XOR EDX, EAX
  836. SAR EDX, 1
  837. AND EAX, ECX
  838. ADD EAX, EDX
  839. {$ENDIF}
  840. {$IFDEF TARGET_x64}
  841. MOV EAX, A
  842. MOV ECX, EDX
  843. XOR EDX, EAX
  844. SAR EDX, 1
  845. AND EAX, ECX
  846. ADD EAX, EDX
  847. {$ENDIF}
  848. {$ENDIF}
  849. end;
  850. function Sign(Value: Integer): Integer;
  851. {$IFDEF PUREPASCAL}
  852. begin
  853. //Assumes 32 bit integer
  854. Result := (- Value) shr 31 - (Value shr 31);
  855. {$ELSE}
  856. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  857. asm
  858. {$IFDEF TARGET_x64}
  859. MOV EAX, Value
  860. {$ENDIF}
  861. CDQ
  862. NEG EAX
  863. ADC EDX, EDX
  864. MOV EAX, EDX
  865. {$ENDIF}
  866. end;
  867. function FloatMod(x, y: Double): Double;
  868. begin
  869. if (y = 0) then
  870. Result := X
  871. else
  872. Result := x - y * Floor(x / y);
  873. end;
  874. function DivMod(Dividend, Divisor: Integer; var Remainder: Integer): Integer;
  875. {$IFDEF PUREPASCAL}
  876. begin
  877. Result := Dividend div Divisor;
  878. Remainder := Dividend mod Divisor;
  879. {$ELSE}
  880. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  881. asm
  882. {$IFDEF TARGET_x86}
  883. PUSH EDX
  884. CDQ
  885. IDIV DWORD PTR [ESP]
  886. ADD ESP, $04
  887. MOV DWORD PTR [ECX], edx
  888. {$ENDIF}
  889. {$IFDEF TARGET_x64}
  890. MOV RAX, RCX
  891. MOV R9, RDX
  892. CDQ
  893. IDIV R9
  894. MOV DWORD PTR [R8], EDX
  895. {$ENDIF}
  896. {$ENDIF}
  897. end;
  898. procedure CumSum_Pas(Values: PSingleArray; Count: Integer);
  899. var
  900. I: Integer;
  901. V: TFloat;
  902. begin
  903. V := Values[0];
  904. for I := 1 to Count - 1 do
  905. begin
  906. if PInteger(@Values[I])^ <> 0 then
  907. V := V + Values[I];
  908. Values[I] := V;
  909. end;
  910. end;
  911. {$IFNDEF PUREPASCAL}
  912. // Aligned SSE2 version -- Credits: Sanyin <[email protected]>
  913. procedure CumSum_SSE2(Values: PSingleArray; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  914. asm
  915. {$IFDEF TARGET_x86}
  916. MOV ECX,EDX
  917. CMP ECX,2 // if count < 2, exit
  918. JL @END
  919. CMP ECX,32 // if count < 32, avoid SSE2 overhead
  920. JL @SMALL
  921. {--- align memory ---}
  922. PUSH EBX
  923. PXOR XMM4,XMM4
  924. MOV EBX,EAX
  925. AND EBX,15 // get aligned count
  926. JZ @ENDALIGNING // already aligned
  927. ADD EBX,-16
  928. NEG EBX // get bytes to advance
  929. JZ @ENDALIGNING // already aligned
  930. MOV ECX,EBX
  931. SAR ECX,2 // div with 4 to get cnt
  932. SUB EDX,ECX
  933. ADD EAX,4
  934. DEC ECX
  935. JZ @SETUPLAST // one element
  936. @ALIGNINGLOOP:
  937. FLD DWORD PTR [EAX-4]
  938. FADD DWORD PTR [EAX]
  939. FSTP DWORD PTR [EAX]
  940. ADD EAX,4
  941. DEC ECX
  942. JNZ @ALIGNINGLOOP
  943. @SETUPLAST:
  944. MOVUPS XMM4,[EAX-4]
  945. PSLLDQ XMM4,12
  946. PSRLDQ XMM4,12
  947. @ENDALIGNING:
  948. POP EBX
  949. PUSH EBX
  950. MOV ECX,EDX
  951. SAR ECX,2
  952. @LOOP:
  953. MOVAPS XMM0,[EAX]
  954. PXOR XMM5,XMM5
  955. PCMPEQD XMM5,XMM0
  956. PMOVMSKB EBX,XMM5
  957. CMP EBX,$0000FFFF
  958. JNE @NORMAL
  959. PSHUFD XMM0,XMM4,0
  960. JMP @SKIP
  961. @NORMAL:
  962. ADDPS XMM0,XMM4
  963. PSHUFD XMM1,XMM0,$e4
  964. PSLLDQ XMM1,4
  965. PSHUFD XMM2,XMM1,$90
  966. PSHUFD XMM3,XMM1,$40
  967. ADDPS XMM2,XMM3
  968. ADDPS XMM1,XMM2
  969. ADDPS XMM0,XMM1
  970. PSHUFLW XMM4,XMM0,$E4
  971. PSRLDQ XMM4,12
  972. @SKIP:
  973. PREFETCHNTA [eax+16*16*2]
  974. MOVAPS [EAX],XMM0
  975. ADD EAX,16
  976. SUB ECX,1
  977. JNZ @LOOP
  978. POP EBX
  979. MOV ECX,EDX
  980. SAR ECX,2
  981. SHL ECX,2
  982. SUB EDX,ECX
  983. MOV ECX,EDX
  984. JZ @END
  985. @LOOP2:
  986. FLD DWORD PTR [EAX-4]
  987. FADD DWORD PTR [EAX]
  988. FSTP DWORD PTR [EAX]
  989. ADD EAX,4
  990. DEC ECX
  991. JNZ @LOOP2
  992. JMP @END
  993. @SMALL:
  994. MOV ECX,EDX
  995. ADD EAX,4
  996. DEC ECX
  997. @LOOP3:
  998. FLD DWORD PTR [EAX-4]
  999. FADD DWORD PTR [EAX]
  1000. FSTP DWORD PTR [EAX]
  1001. ADD EAX,4
  1002. DEC ECX
  1003. JNZ @LOOP3
  1004. {$ENDIF}
  1005. {$IFDEF TARGET_x64}
  1006. CMP EDX,2 // if count < 2, exit
  1007. JL @END
  1008. MOV RAX,RCX
  1009. MOV ECX,EDX
  1010. CMP ECX,32 // if count < 32, avoid SSE2 overhead
  1011. JL @SMALL
  1012. {--- align memory ---}
  1013. PXOR XMM4,XMM4
  1014. MOV R8D,EAX
  1015. AND R8D,15 // get aligned count
  1016. JZ @ENDALIGNING // already aligned
  1017. ADD R8D,-16
  1018. NEG R8D // get bytes to advance
  1019. JZ @ENDALIGNING // already aligned
  1020. MOV ECX,R8D
  1021. SAR ECX,2 // div with 4 to get cnt
  1022. SUB EDX,ECX
  1023. ADD RAX,4
  1024. DEC ECX
  1025. JZ @SETUPLAST // one element
  1026. @ALIGNINGLOOP:
  1027. FLD DWORD PTR [RAX - 4]
  1028. FADD DWORD PTR [RAX]
  1029. FSTP DWORD PTR [RAX]
  1030. ADD RAX,4
  1031. DEC ECX
  1032. JNZ @ALIGNINGLOOP
  1033. @SETUPLAST:
  1034. MOVUPS XMM4,[RAX - 4]
  1035. PSLLDQ XMM4,12
  1036. PSRLDQ XMM4,12
  1037. @ENDALIGNING:
  1038. MOV ECX,EDX
  1039. SAR ECX,2
  1040. @LOOP:
  1041. MOVAPS XMM0,[RAX]
  1042. PXOR XMM5,XMM5
  1043. PCMPEQD XMM5,XMM0
  1044. PMOVMSKB R8D,XMM5
  1045. CMP R8D,$0000FFFF
  1046. JNE @NORMAL
  1047. PSHUFD XMM0,XMM4,0
  1048. JMP @SKIP
  1049. @NORMAL:
  1050. ADDPS XMM0,XMM4
  1051. PSHUFD XMM1,XMM0,$e4
  1052. PSLLDQ XMM1,4
  1053. PSHUFD XMM2,XMM1,$90
  1054. PSHUFD XMM3,XMM1,$40
  1055. ADDPS XMM2,XMM3
  1056. ADDPS XMM1,XMM2
  1057. ADDPS XMM0,XMM1
  1058. PSHUFLW XMM4,XMM0,$E4
  1059. PSRLDQ XMM4,12
  1060. @SKIP:
  1061. PREFETCHNTA [RAX + 32 * 2]
  1062. MOVAPS [RAX],XMM0
  1063. ADD RAX,16
  1064. SUB ECX,1
  1065. JNZ @LOOP
  1066. MOV ECX,EDX
  1067. SAR ECX,2
  1068. SHL ECX,2
  1069. SUB EDX,ECX
  1070. MOV ECX,EDX
  1071. JZ @END
  1072. @LOOP2:
  1073. FLD DWORD PTR [RAX - 4]
  1074. FADD DWORD PTR [RAX]
  1075. FSTP DWORD PTR [RAX]
  1076. ADD RAX,4
  1077. DEC ECX
  1078. JNZ @LOOP2
  1079. JMP @END
  1080. @SMALL:
  1081. ADD RAX,4
  1082. DEC ECX
  1083. @LOOP3:
  1084. FLD DWORD PTR [RAX - 4]
  1085. FADD DWORD PTR [RAX]
  1086. FSTP DWORD PTR [RAX]
  1087. ADD RAX,4
  1088. DEC ECX
  1089. JNZ @LOOP3
  1090. {$ENDIF}
  1091. @END:
  1092. end;
  1093. {$ENDIF}
  1094. const
  1095. FID_CUMSUM = 0;
  1096. const
  1097. MathBindingFlagPascal = $0001;
  1098. procedure RegisterBindings;
  1099. begin
  1100. MathRegistry := NewRegistry('GR32_Math bindings');
  1101. MathRegistry.RegisterBinding(FID_CUMSUM, @@CumSum);
  1102. // pure pascal
  1103. MathRegistry.Add(FID_CUMSUM, @CumSum_Pas, [], MathBindingFlagPascal);
  1104. {$IFNDEF PUREPASCAL}
  1105. {$IFNDEF OMIT_SSE2}
  1106. // SSE2
  1107. MathRegistry.Add(FID_CUMSUM, @CumSum_SSE2, [ciSSE2]);
  1108. {$ENDIF}
  1109. {$ENDIF}
  1110. MathRegistry.RebindAll;
  1111. end;
  1112. initialization
  1113. RegisterBindings;
  1114. end.