GR32_Math.pas 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215
  1. unit GR32_Math;
  2. (* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1 or LGPL 2.1 with linking exception
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * Alternatively, the contents of this file may be used under the terms of the
  16. * Free Pascal modified version of the GNU Lesser General Public License
  17. * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
  18. * of this license are applicable instead of those above.
  19. * Please see the file LICENSE.txt for additional information concerning this
  20. * license.
  21. *
  22. * The Original Code is Additional Math Routines for Graphics32
  23. *
  24. * The Initial Developer of the Original Code is
  25. * Mattias Andersson <[email protected]>
  26. * (parts of this unit were moved here from GR32_System.pas and GR32.pas by Alex A. Denisov)
  27. *
  28. * Portions created by the Initial Developer are Copyright (C) 2005-2009
  29. * the Initial Developer. All Rights Reserved.
  30. *
  31. * Contributor(s):
  32. * Michael Hansen <[email protected]>
  33. *
  34. * ***** END LICENSE BLOCK ***** *)
  35. interface
  36. {$I GR32.inc}
  37. uses GR32;
  38. { Fixed point math routines }
  39. function FixedFloor(A: TFixed): Integer;
  40. function FixedCeil(A: TFixed): Integer;
  41. function FixedMul(A, B: TFixed): TFixed;
  42. function FixedDiv(A, B: TFixed): TFixed;
  43. function OneOver(Value: TFixed): TFixed;
  44. function FixedRound(A: TFixed): Integer;
  45. function FixedSqr(Value: TFixed): TFixed;
  46. function FixedSqrtLP(Value: TFixed): TFixed; // 8-bit precision
  47. function FixedSqrtHP(Value: TFixed): TFixed; // 16-bit precision
  48. // Fixed point interpolation
  49. function FixedCombine(W, X, Y: TFixed): TFixed;
  50. { Trigonometric routines }
  51. procedure SinCos(const Theta: TFloat; out Sin, Cos: TFloat); overload;
  52. procedure SinCos(const Theta, Radius: Single; out Sin, Cos: Single); overload;
  53. procedure SinCos(const Theta, ScaleX, ScaleY: TFloat; out Sin, Cos: Single); overload;
  54. function Hypot(const X, Y: TFloat): TFloat; overload;
  55. function Hypot(const X, Y: Integer): Integer; overload;
  56. function FastSqrt(const Value: TFloat): TFloat;
  57. function FastSqrtBab1(const Value: TFloat): TFloat;
  58. function FastSqrtBab2(const Value: TFloat): TFloat;
  59. function FastInvSqrt(const Value: Single): Single; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF} overload;
  60. { Misc. Routines }
  61. { MulDiv a faster implementation of Windows.MulDiv funtion }
  62. function MulDiv(Multiplicand, Multiplier, Divisor: Integer): Integer;
  63. // tells if X is a power of 2, returns true when X = 1,2,4,8,16 etc.
  64. function IsPowerOf2(Value: Integer): Boolean; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
  65. // returns X rounded down to the nearest power of two
  66. function PrevPowerOf2(Value: Integer): Integer;
  67. // returns X rounded down to the nearest power of two, i.e. 5 -> 8, 7 -> 8, 15 -> 16
  68. function NextPowerOf2(Value: Integer): Integer;
  69. // fast average without overflow, useful for e.g. fixed point math
  70. function Average(A, B: Integer): Integer;
  71. // fast sign function
  72. function Sign(Value: Integer): Integer;
  73. function FloatMod(x, y: Double): Double; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
  74. function DivMod(Dividend, Divisor: Integer; var Remainder: Integer): Integer;
  75. {$IFDEF FPC}
  76. {$IFDEF TARGET_X64}
  77. (*
  78. FPC has no similar {$EXCESSPRECISION OFF} directive,
  79. but we can easily emulate that by overriding some internal math functions
  80. *)
  81. function PI: Single; [internproc: fpc_in_pi_real];
  82. //function Abs(D: Single): Single; [internproc: fpc_in_abs_real];
  83. //function Sqr(D: Single): Single; [internproc: fpc_in_sqr_real];
  84. function Sqrt(D: Single): Single; [internproc: fpc_in_sqrt_real];
  85. function ArcTan(D: Single): Single; [internproc: fpc_in_arctan_real];
  86. function Ln(D: Single): Single; [internproc: fpc_in_ln_real];
  87. function Sin(D: Single): Single; [internproc: fpc_in_sin_real];
  88. function Cos(D: Single): Single; [internproc: fpc_in_cos_real];
  89. function Exp(D: Single): Single; [internproc: fpc_in_exp_real];
  90. function Round(D: Single): Int64; [internproc: fpc_in_round_real];
  91. function Frac(D: Single): Single; [internproc: fpc_in_frac_real];
  92. function Int(D: Single): Single; [internproc: fpc_in_int_real];
  93. function Trunc(D: Single): Int64; [internproc: fpc_in_trunc_real];
  94. function Ceil(X: Single): Integer; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
  95. function Floor(X: Single): Integer; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
  96. {$ENDIF}
  97. {$ENDIF}
  98. type
  99. TCumSumProc = procedure(Values: PSingleArray; Count: Integer);
  100. var
  101. CumSum: TCumSumProc;
  102. implementation
  103. uses
  104. Math, GR32_System;
  105. {$IFDEF PUREPASCAL}
  106. const
  107. FixedOneS: Single = 65536;
  108. {$ENDIF}
  109. {$IFDEF FPC}
  110. {$IFDEF TARGET_X64}
  111. function Ceil(X: Single): Integer;
  112. begin
  113. Result := Trunc(X);
  114. if (X - Result) > 0 then
  115. Inc(Result);
  116. end;
  117. function Floor(X: Single): Integer;
  118. begin
  119. Result := Trunc(X);
  120. if (X - Result) < 0 then
  121. Dec(Result);
  122. end;
  123. {$ENDIF}
  124. {$ENDIF}
  125. { Fixed-point math }
  126. function FixedFloor(A: TFixed): Integer;
  127. {$IFDEF PUREPASCAL}
  128. begin
  129. Result := A div FIXEDONE;
  130. {$ELSE}
  131. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  132. asm
  133. {$IFDEF TARGET_x86}
  134. SAR EAX, 16
  135. {$ENDIF}
  136. {$IFDEF TARGET_x64}
  137. MOV EAX, ECX
  138. SAR EAX, 16
  139. {$ENDIF}
  140. {$ENDIF}
  141. end;
  142. function FixedCeil(A: TFixed): Integer;
  143. {$IFDEF PUREPASCAL}
  144. begin
  145. Result := (A + $FFFF) div FIXEDONE;
  146. {$ELSE}
  147. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  148. asm
  149. {$IFDEF TARGET_x86}
  150. ADD EAX, $0000FFFF
  151. SAR EAX, 16
  152. {$ENDIF}
  153. {$IFDEF TARGET_x64}
  154. MOV EAX, ECX
  155. ADD EAX, $0000FFFF
  156. SAR EAX, 16
  157. {$ENDIF}
  158. {$ENDIF}
  159. end;
  160. function FixedRound(A: TFixed): Integer;
  161. {$IFDEF PUREPASCAL}
  162. begin
  163. Result := (A + $7FFF) div FIXEDONE;
  164. {$ELSE}
  165. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  166. asm
  167. {$IFDEF TARGET_x86}
  168. ADD EAX, $00007FFF
  169. SAR EAX, 16
  170. {$ENDIF}
  171. {$IFDEF TARGET_x64}
  172. MOV EAX, ECX
  173. ADD EAX, $00007FFF
  174. SAR EAX, 16
  175. {$ENDIF}
  176. {$ENDIF}
  177. end;
  178. function FixedMul(A, B: TFixed): TFixed;
  179. {$IFDEF PUREPASCAL}
  180. begin
  181. Result := Round(A * FixedToFloat * B);
  182. {$ELSE}
  183. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  184. asm
  185. {$IFDEF TARGET_x86}
  186. IMUL EDX
  187. SHRD EAX, EDX, 16
  188. {$ENDIF}
  189. {$IFDEF TARGET_x64}
  190. MOV EAX, ECX
  191. IMUL EDX
  192. SHRD EAX, EDX, 16
  193. {$ENDIF}
  194. {$ENDIF}
  195. end;
  196. function FixedDiv(A, B: TFixed): TFixed;
  197. {$IFDEF PUREPASCAL}
  198. begin
  199. Result := Round(A / B * FixedOne);
  200. {$ELSE}
  201. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  202. asm
  203. {$IFDEF TARGET_x86}
  204. MOV ECX, B
  205. CDQ
  206. SHLD EDX, EAX, 16
  207. SHL EAX, 16
  208. IDIV ECX
  209. {$ENDIF}
  210. {$IFDEF TARGET_x64}
  211. MOV EAX, ECX
  212. MOV ECX, EDX
  213. CDQ
  214. SHLD EDX, EAX, 16
  215. SHL EAX, 16
  216. IDIV ECX
  217. {$ENDIF}
  218. {$ENDIF}
  219. end;
  220. function OneOver(Value: TFixed): TFixed;
  221. {$IFDEF PUREPASCAL}
  222. const
  223. Dividend: Single = 4294967296; // FixedOne * FixedOne
  224. begin
  225. Result := Round(Dividend / Value);
  226. {$ELSE}
  227. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  228. asm
  229. {$IFDEF TARGET_x86}
  230. MOV ECX, Value
  231. XOR EAX, EAX
  232. MOV EDX, 1
  233. IDIV ECX
  234. {$ENDIF}
  235. {$IFDEF TARGET_x64}
  236. XOR EAX, EAX
  237. MOV EDX, 1
  238. IDIV ECX
  239. {$ENDIF}
  240. {$ENDIF}
  241. end;
  242. function FixedSqr(Value: TFixed): TFixed;
  243. {$IFDEF PUREPASCAL}
  244. begin
  245. Result := Round(Value * FixedToFloat * Value);
  246. {$ELSE}
  247. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  248. asm
  249. {$IFDEF TARGET_x86}
  250. IMUL EAX
  251. SHRD EAX, EDX, 16
  252. {$ENDIF}
  253. {$IFDEF TARGET_x64}
  254. MOV EAX, Value
  255. IMUL EAX
  256. SHRD EAX, EDX, 16
  257. {$ENDIF}
  258. {$ENDIF}
  259. end;
  260. function FixedSqrtLP(Value: TFixed): TFixed;
  261. {$IFDEF PUREPASCAL}
  262. begin
  263. Result := Round(Sqrt(Value * FixedOneS));
  264. {$ELSE}
  265. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  266. asm
  267. {$IFDEF TARGET_x86}
  268. PUSH EBX
  269. MOV ECX, EAX
  270. XOR EAX, EAX
  271. MOV EBX, $40000000
  272. @SqrtLP1:
  273. MOV EDX, ECX
  274. SUB EDX, EBX
  275. JL @SqrtLP2
  276. SUB EDX, EAX
  277. JL @SqrtLP2
  278. MOV ECX,EDX
  279. SHR EAX, 1
  280. OR EAX, EBX
  281. SHR EBX, 2
  282. JNZ @SqrtLP1
  283. SHL EAX, 8
  284. JMP @SqrtLP3
  285. @SqrtLP2:
  286. SHR EAX, 1
  287. SHR EBX, 2
  288. JNZ @SqrtLP1
  289. SHL EAX, 8
  290. @SqrtLP3:
  291. POP EBX
  292. {$ENDIF}
  293. {$IFDEF TARGET_x64}
  294. PUSH RBX
  295. XOR EAX, EAX
  296. MOV EBX, $40000000
  297. @SqrtLP1:
  298. MOV EDX, ECX
  299. SUB EDX, EBX
  300. JL @SqrtLP2
  301. SUB EDX, EAX
  302. JL @SqrtLP2
  303. MOV ECX,EDX
  304. SHR EAX, 1
  305. OR EAX, EBX
  306. SHR EBX, 2
  307. JNZ @SqrtLP1
  308. SHL EAX, 8
  309. JMP @SqrtLP3
  310. @SqrtLP2:
  311. SHR EAX, 1
  312. SHR EBX, 2
  313. JNZ @SqrtLP1
  314. SHL EAX, 8
  315. @SqrtLP3:
  316. POP RBX
  317. {$ENDIF}
  318. {$ENDIF}
  319. end;
  320. function FixedSqrtHP(Value: TFixed): TFixed;
  321. {$IFDEF PUREPASCAL}
  322. begin
  323. Result := Round(Sqrt(Value * FixedOneS));
  324. {$ELSE}
  325. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  326. asm
  327. {$IFDEF TARGET_x86}
  328. PUSH EBX
  329. MOV ECX, EAX
  330. XOR EAX, EAX
  331. MOV EBX, $40000000
  332. @SqrtHP1:
  333. MOV EDX, ECX
  334. SUB EDX, EBX
  335. jb @SqrtHP2
  336. SUB EDX, EAX
  337. jb @SqrtHP2
  338. MOV ECX,EDX
  339. SHR EAX, 1
  340. OR EAX, EBX
  341. SHR EBX, 2
  342. JNZ @SqrtHP1
  343. JZ @SqrtHP5
  344. @SqrtHP2:
  345. SHR EAX, 1
  346. SHR EBX, 2
  347. JNZ @SqrtHP1
  348. @SqrtHP5:
  349. MOV EBX, $00004000
  350. SHL EAX, 16
  351. SHL ECX, 16
  352. @SqrtHP3:
  353. MOV EDX, ECX
  354. SUB EDX, EBX
  355. jb @SqrtHP4
  356. SUB EDX, EAX
  357. jb @SqrtHP4
  358. MOV ECX, EDX
  359. SHR EAX, 1
  360. OR EAX, EBX
  361. SHR EBX, 2
  362. JNZ @SqrtHP3
  363. JMP @SqrtHP6
  364. @SqrtHP4:
  365. SHR EAX, 1
  366. SHR EBX, 2
  367. JNZ @SqrtHP3
  368. @SqrtHP6:
  369. POP EBX
  370. {$ENDIF}
  371. {$IFDEF TARGET_x64}
  372. PUSH RBX
  373. XOR EAX, EAX
  374. MOV EBX, $40000000
  375. @SqrtHP1:
  376. MOV EDX, ECX
  377. SUB EDX, EBX
  378. jb @SqrtHP2
  379. SUB EDX, EAX
  380. jb @SqrtHP2
  381. MOV ECX,EDX
  382. SHR EAX, 1
  383. OR EAX, EBX
  384. SHR EBX, 2
  385. JNZ @SqrtHP1
  386. JZ @SqrtHP5
  387. @SqrtHP2:
  388. SHR EAX, 1
  389. SHR EBX, 2
  390. JNZ @SqrtHP1
  391. @SqrtHP5:
  392. MOV EBX, $00004000
  393. SHL EAX, 16
  394. SHL ECX, 16
  395. @SqrtHP3:
  396. MOV EDX, ECX
  397. SUB EDX, EBX
  398. jb @SqrtHP4
  399. SUB EDX, EAX
  400. jb @SqrtHP4
  401. MOV ECX, EDX
  402. SHR EAX, 1
  403. OR EAX, EBX
  404. SHR EBX, 2
  405. JNZ @SqrtHP3
  406. JMP @SqrtHP6
  407. @SqrtHP4:
  408. SHR EAX, 1
  409. SHR EBX, 2
  410. JNZ @SqrtHP3
  411. @SqrtHP6:
  412. POP RBX
  413. {$ENDIF}
  414. {$ENDIF}
  415. end;
  416. function FixedCombine(W, X, Y: TFixed): TFixed;
  417. // EAX <- W, EDX <- X, ECX <- Y
  418. // combine fixed value X and fixed value Y with the weight of X given in W
  419. // Result Z = W * X + (1 - W) * Y = Y + (X - Y) * W
  420. // Fixed Point Version: Result Z = Y + (X - Y) * W / 65536
  421. {$IFDEF PUREPASCAL}
  422. begin
  423. Result := Round(Y + (X - Y) * FixedToFloat * W);
  424. {$ELSE}
  425. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  426. asm
  427. {$IFDEF TARGET_x86}
  428. SUB EDX, ECX
  429. IMUL EDX
  430. SHRD EAX, EDX, 16
  431. ADD EAX, ECX
  432. {$ENDIF}
  433. {$IFDEF TARGET_x64}
  434. MOV EAX, ECX
  435. SUB EDX, R8D
  436. IMUL EDX
  437. SHRD EAX, EDX, 16
  438. ADD EAX, R8D
  439. {$ENDIF}
  440. {$ENDIF}
  441. end;
  442. { Trigonometry }
  443. procedure SinCos(const Theta: TFloat; out Sin, Cos: TFloat);
  444. {$IFDEF NATIVE_SINCOS}
  445. var
  446. S, C: Extended;
  447. begin
  448. Math.SinCos(Theta, S, C);
  449. Sin := S;
  450. Cos := C;
  451. {$ELSE}
  452. {$IFDEF TARGET_x64}
  453. var
  454. Temp: TFloat;
  455. {$ENDIF}
  456. asm
  457. {$IFDEF TARGET_x86}
  458. FLD Theta
  459. FSINCOS
  460. FSTP DWORD PTR [EDX] // cosine
  461. FSTP DWORD PTR [EAX] // sine
  462. {$ENDIF}
  463. {$IFDEF TARGET_x64}
  464. MOVD Temp, Theta
  465. FLD Temp
  466. FSINCOS
  467. FSTP [Sin] // cosine
  468. FSTP [Cos] // sine
  469. {$ENDIF}
  470. {$ENDIF}
  471. end;
  472. procedure SinCos(const Theta, Radius: TFloat; out Sin, Cos: TFloat);
  473. {$IFDEF NATIVE_SINCOS}
  474. var
  475. S, C: Extended;
  476. begin
  477. Math.SinCos(Theta, S, C);
  478. Sin := S * Radius;
  479. Cos := C * Radius;
  480. {$ELSE}
  481. {$IFDEF TARGET_x64}
  482. var
  483. Temp: TFloat;
  484. {$ENDIF}
  485. asm
  486. {$IFDEF TARGET_x86}
  487. FLD Theta
  488. FSINCOS
  489. FMUL Radius
  490. FSTP DWORD PTR [EDX] // cosine
  491. FMUL Radius
  492. FSTP DWORD PTR [EAX] // sine
  493. {$ENDIF}
  494. {$IFDEF TARGET_x64}
  495. MOVD Temp, Theta
  496. FLD Temp
  497. MOVD Temp, Radius
  498. FSINCOS
  499. FMUL Temp
  500. FSTP [Cos]
  501. FMUL Temp
  502. FSTP [Sin]
  503. {$ENDIF}
  504. {$ENDIF}
  505. end;
  506. procedure SinCos(const Theta, ScaleX, ScaleY: TFloat; out Sin, Cos: Single); overload;
  507. {$IFDEF NATIVE_SINCOS}
  508. var
  509. S, C: Extended;
  510. begin
  511. Math.SinCos(Theta, S, C);
  512. Sin := S * ScaleX;
  513. Cos := C * ScaleY;
  514. {$ELSE}
  515. {$IFDEF TARGET_x64}
  516. var
  517. Temp: TFloat;
  518. {$ENDIF}
  519. asm
  520. {$IFDEF TARGET_x86}
  521. FLD Theta
  522. FSINCOS
  523. FMUL ScaleX
  524. FSTP DWORD PTR [EDX] // cosine
  525. FMUL ScaleY
  526. FSTP DWORD PTR [EAX] // sine
  527. {$ENDIF}
  528. {$IFDEF TARGET_x64}
  529. MOVD Temp, Theta
  530. FLD Temp
  531. FSINCOS
  532. MOVD Temp, ScaleX
  533. FMUL Temp
  534. FSTP [Cos]
  535. MOVD Temp, ScaleY
  536. FMUL Temp
  537. FSTP [Sin]
  538. {$ENDIF}
  539. {$ENDIF}
  540. end;
  541. function Hypot(const X, Y: TFloat): TFloat;
  542. {$IFDEF PUREPASCAL}
  543. begin
  544. Result := Sqrt(Sqr(X) + Sqr(Y));
  545. {$ELSE}
  546. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  547. asm
  548. {$IFDEF TARGET_x86}
  549. FLD X
  550. FMUL ST,ST
  551. FLD Y
  552. FMUL ST,ST
  553. FADDP ST(1),ST
  554. FSQRT
  555. FWAIT
  556. {$ENDIF}
  557. {$IFDEF TARGET_x64}
  558. MULSS XMM0, XMM0
  559. MULSS XMM1, XMM1
  560. ADDSS XMM0, XMM1
  561. SQRTSS XMM0, XMM0
  562. {$ENDIF}
  563. {$ENDIF}
  564. end;
  565. function Hypot(const X, Y: Integer): Integer;
  566. //{$IFDEF PUREPASCAL}
  567. begin
  568. Result := Round(Math.Hypot(X, Y));
  569. (*
  570. {$ELSE}
  571. asm
  572. {$IFDEF TARGET_x64}
  573. IMUL RAX, RCX, RDX
  574. {$ELSE}
  575. FLD X
  576. FMUL ST,ST
  577. FLD Y
  578. FMUL ST,ST
  579. FADDP ST(1),ST
  580. FSQRT
  581. FISTP [ESP - 4]
  582. MOV EAX, [ESP - 4]
  583. FWAIT
  584. {$ENDIF}
  585. {$ENDIF}
  586. *)
  587. end;
  588. function FastSqrt(const Value: TFloat): TFloat;
  589. // see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
  590. {$IFDEF PUREPASCAL}
  591. var
  592. I: Integer absolute Value;
  593. J: Integer absolute Result;
  594. begin
  595. J := (I - $3F800000) div 2 + $3F800000;
  596. {$ELSE}
  597. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  598. asm
  599. {$IFDEF TARGET_x86}
  600. MOV EAX, DWORD PTR Value
  601. SUB EAX, $3F800000
  602. SAR EAX, 1
  603. ADD EAX, $3F800000
  604. MOV DWORD PTR [ESP - 4], EAX
  605. FLD DWORD PTR [ESP - 4]
  606. {$ENDIF}
  607. {$IFDEF TARGET_x64}
  608. SQRTSS XMM0, XMM0
  609. {$ENDIF}
  610. {$ENDIF}
  611. end;
  612. function FastSqrtBab1(const Value: TFloat): TFloat;
  613. // see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
  614. // additionally one babylonian step added
  615. {$IFNDEF PUREPASCAL}
  616. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  617. {$ENDIF}
  618. const
  619. CHalf : TFloat = 0.5;
  620. {$IFDEF PUREPASCAL}
  621. var
  622. I: Integer absolute Value;
  623. J: Integer absolute Result;
  624. begin
  625. J := (I - $3F800000) div 2 + $3F800000;
  626. Result := CHalf * (Result + Value / Result);
  627. {$ELSE}
  628. asm
  629. {$IFDEF TARGET_x86}
  630. MOV EAX, Value
  631. SUB EAX, $3F800000
  632. SAR EAX, 1
  633. ADD EAX, $3F800000
  634. MOV DWORD PTR [ESP - 4], EAX
  635. FLD Value
  636. FDIV DWORD PTR [ESP - 4]
  637. FADD DWORD PTR [ESP - 4]
  638. FMUL CHalf
  639. {$ENDIF}
  640. {$IFDEF TARGET_x64}
  641. SQRTSS XMM0, XMM0
  642. {$ENDIF}
  643. {$ENDIF}
  644. end;
  645. function FastSqrtBab2(const Value: TFloat): TFloat;
  646. // see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
  647. // additionally two babylonian steps added
  648. {$IFDEF PUREPASCAL}
  649. const
  650. CQuarter : TFloat = 0.25;
  651. var
  652. J: Integer absolute Result;
  653. begin
  654. Result := Value;
  655. J := ((J - (1 shl 23)) shr 1) + (1 shl 29);
  656. Result := Result + Value / Result;
  657. Result := CQuarter * Result + Value / Result;
  658. {$ELSE}
  659. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  660. const
  661. CHalf : TFloat = 0.5;
  662. asm
  663. {$IFDEF TARGET_x86}
  664. MOV EAX, Value
  665. SUB EAX, $3F800000
  666. SAR EAX, 1
  667. ADD EAX, $3F800000
  668. MOV DWORD PTR [ESP - 4], EAX
  669. FLD Value
  670. FDIV DWORD PTR [ESP - 4]
  671. FADD DWORD PTR [ESP - 4]
  672. FMUL CHalf
  673. {$ENDIF}
  674. {$IFDEF TARGET_x64}
  675. MOVD EAX, Value
  676. SUB EAX, $3F800000
  677. SAR EAX, 1
  678. ADD EAX, $3F800000
  679. MOVD XMM1, EAX
  680. DIVSS XMM0, XMM1
  681. ADDSS XMM0, XMM1
  682. MOVD XMM1, [RIP + CHalf]
  683. MULSS XMM0, XMM1
  684. {$ENDIF}
  685. {$ENDIF}
  686. end;
  687. function FastInvSqrt(const Value: Single): Single;
  688. var
  689. IntCst : Cardinal absolute result;
  690. begin
  691. Result := Value;
  692. IntCst := ($BE6EB50C - IntCst) shr 1;
  693. Result := 0.5 * Result * (3 - Value * Sqr(Result));
  694. end;
  695. { Misc. }
  696. function MulDiv(Multiplicand, Multiplier, Divisor: Integer): Integer;
  697. {$IFDEF PUREPASCAL}
  698. begin
  699. Result := Int64(Multiplicand) * Int64(Multiplier) div Divisor;
  700. {$ELSE}
  701. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  702. asm
  703. {$IFDEF TARGET_x86}
  704. PUSH EBX // Imperative save
  705. PUSH ESI // of EBX and ESI
  706. MOV EBX, EAX // Result will be negative or positive so set rounding direction
  707. XOR EBX, EDX // Negative: substract 1 in case of rounding
  708. XOR EBX, ECX // Positive: add 1
  709. OR EAX, EAX // Make all operands positive, ready for unsigned operations
  710. JNS @m1Ok // minimizing branching
  711. NEG EAX
  712. @m1Ok:
  713. OR EDX, EDX
  714. JNS @m2Ok
  715. NEG EDX
  716. @m2Ok:
  717. OR ECX, ECX
  718. JNS @DivOk
  719. NEG ECX
  720. @DivOK:
  721. MUL EDX // Unsigned multiply (Multiplicand*Multiplier)
  722. MOV ESI, EDX // Check for overflow, by comparing
  723. SHL ESI, 1 // 2 times the high-order 32 bits of the product (EDX)
  724. CMP ESI, ECX // with the Divisor.
  725. JAE @Overfl // If equal or greater than overflow with division anticipated
  726. DIV ECX // Unsigned divide of product by Divisor
  727. SUB ECX, EDX // Check if the result must be adjusted by adding or substracting
  728. CMP ECX, EDX // 1 (*.5 -> nearest integer), by comparing the difference of
  729. JA @NoAdd // Divisor and remainder with the remainder. If it is greater then
  730. INC EAX // no rounding needed; add 1 to result otherwise
  731. @NoAdd:
  732. OR EBX, EDX // From unsigned operations back the to original sign of the result
  733. JNS @Exit // must be positive
  734. NEG EAX // must be negative
  735. JMP @Exit
  736. @Overfl:
  737. OR EAX, -1 // 3 bytes alternative for MOV EAX,-1. Windows.MulDiv "overflow"
  738. // and "zero-divide" return value
  739. @Exit:
  740. POP ESI // Restore
  741. POP EBX // esi and EBX
  742. {$ENDIF}
  743. {$IFDEF TARGET_x64}
  744. MOV EAX, ECX // Result will be negative or positive so set rounding direction
  745. XOR ECX, EDX // Negative: substract 1 in case of rounding
  746. XOR ECX, R8D // Positive: add 1
  747. OR EAX, EAX // Make all operands positive, ready for unsigned operations
  748. JNS @m1Ok // minimizing branching
  749. NEG EAX
  750. @m1Ok:
  751. OR EDX, EDX
  752. JNS @m2Ok
  753. NEG EDX
  754. @m2Ok:
  755. OR R8D, R8D
  756. JNS @DivOk
  757. NEG R8D
  758. @DivOK:
  759. MUL EDX // Unsigned multiply (Multiplicand*Multiplier)
  760. MOV R9D, EDX // Check for overflow, by comparing
  761. SHL R9D, 1 // 2 times the high-order 32 bits of the product (EDX)
  762. CMP R9D, R8D // with the Divisor.
  763. JAE @Overfl // If equal or greater than overflow with division anticipated
  764. DIV R8D // Unsigned divide of product by Divisor
  765. SUB R8D, EDX // Check if the result must be adjusted by adding or substracting
  766. CMP R8D, EDX // 1 (*.5 -> nearest integer), by comparing the difference of
  767. JA @NoAdd // Divisor and remainder with the remainder. If it is greater then
  768. INC EAX // no rounding needed; add 1 to result otherwise
  769. @NoAdd:
  770. OR ECX, EDX // From unsigned operations back the to original sign of the result
  771. JNS @Exit // must be positive
  772. NEG EAX // must be negative
  773. JMP @Exit
  774. @Overfl:
  775. OR EAX, -1 // 3 bytes alternative for MOV EAX,-1. Windows.MulDiv "overflow"
  776. // and "zero-divide" return value
  777. @Exit:
  778. {$ENDIF}
  779. {$ENDIF}
  780. end;
  781. function IsPowerOf2(Value: Integer): Boolean;
  782. //returns true when X = 1,2,4,8,16 etc.
  783. begin
  784. Result := Value and (Value - 1) = 0;
  785. end;
  786. function PrevPowerOf2(Value: Integer): Integer;
  787. //returns X rounded down to the power of two
  788. {$IFDEF PUREPASCAL}
  789. begin
  790. Result := 1;
  791. while Value shr 1 > 0 do
  792. Result := Result shl 1;
  793. {$ELSE}
  794. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  795. asm
  796. {$IFDEF TARGET_x86}
  797. BSR ECX, EAX
  798. SHR EAX, CL
  799. SHL EAX, CL
  800. {$ENDIF}
  801. {$IFDEF TARGET_x64}
  802. MOV EAX, Value
  803. BSR ECX, EAX
  804. SHR EAX, CL
  805. SHL EAX, CL
  806. {$ENDIF}
  807. {$ENDIF}
  808. end;
  809. function NextPowerOf2(Value: Integer): Integer;
  810. //returns X rounded up to the power of two, i.e. 5 -> 8, 7 -> 8, 15 -> 16
  811. {$IFDEF PUREPASCAL}
  812. begin
  813. Result := 2;
  814. while Value shr 1 > 0 do
  815. Result := Result shl 1;
  816. {$ELSE}
  817. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  818. asm
  819. {$IFDEF TARGET_x86}
  820. DEC EAX
  821. JLE @1
  822. BSR ECX, EAX
  823. MOV EAX, 2
  824. SHL EAX, CL
  825. RET
  826. @1:
  827. MOV EAX, 1
  828. {$ENDIF}
  829. {$IFDEF TARGET_x64}
  830. MOV EAX, Value
  831. DEC EAX
  832. JLE @1
  833. BSR ECX, EAX
  834. MOV EAX, 2
  835. SHL EAX, CL
  836. RET
  837. @1:
  838. MOV EAX, 1
  839. {$ENDIF}
  840. {$ENDIF}
  841. end;
  842. function Average(A, B: Integer): Integer;
  843. //fast average without overflow, useful e.g. for fixed point math
  844. //(A + B)/2 = (A and B) + (A xor B)/2
  845. {$IFDEF PUREPASCAL}
  846. begin
  847. Result := (A and B) + (A xor B) div 2;
  848. {$ELSE}
  849. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  850. asm
  851. {$IFDEF TARGET_x86}
  852. MOV ECX, EDX
  853. XOR EDX, EAX
  854. SAR EDX, 1
  855. AND EAX, ECX
  856. ADD EAX, EDX
  857. {$ENDIF}
  858. {$IFDEF TARGET_x64}
  859. MOV EAX, A
  860. MOV ECX, EDX
  861. XOR EDX, EAX
  862. SAR EDX, 1
  863. AND EAX, ECX
  864. ADD EAX, EDX
  865. {$ENDIF}
  866. {$ENDIF}
  867. end;
  868. function Sign(Value: Integer): Integer;
  869. {$IFDEF PUREPASCAL}
  870. begin
  871. //Assumes 32 bit integer
  872. Result := (- Value) shr 31 - (Value shr 31);
  873. {$ELSE}
  874. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  875. asm
  876. {$IFDEF TARGET_x64}
  877. MOV EAX, Value
  878. {$ENDIF}
  879. CDQ
  880. NEG EAX
  881. ADC EDX, EDX
  882. MOV EAX, EDX
  883. {$ENDIF}
  884. end;
  885. function FloatMod(x, y: Double): Double;
  886. begin
  887. if (y = 0) then
  888. Result := X
  889. else
  890. Result := x - y * Floor(x / y);
  891. end;
  892. function DivMod(Dividend, Divisor: Integer; var Remainder: Integer): Integer;
  893. {$IFDEF PUREPASCAL}
  894. begin
  895. Result := Dividend div Divisor;
  896. Remainder := Dividend mod Divisor;
  897. {$ELSE}
  898. {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  899. asm
  900. {$IFDEF TARGET_x86}
  901. PUSH EDX
  902. CDQ
  903. IDIV DWORD PTR [ESP]
  904. ADD ESP, $04
  905. MOV DWORD PTR [ECX], edx
  906. {$ENDIF}
  907. {$IFDEF TARGET_x64}
  908. MOV RAX, RCX
  909. MOV R9, RDX
  910. CDQ
  911. IDIV R9
  912. MOV DWORD PTR [R8], EDX
  913. {$ENDIF}
  914. {$ENDIF}
  915. end;
  916. procedure CumSum_Pas(Values: PSingleArray; Count: Integer);
  917. var
  918. I: Integer;
  919. V: TFloat;
  920. begin
  921. V := Values[0];
  922. for I := 1 to Count - 1 do
  923. begin
  924. if PInteger(@Values[I])^ <> 0 then
  925. V := V + Values[I];
  926. Values[I] := V;
  927. end;
  928. end;
  929. {$IFNDEF PUREPASCAL}
  930. // Aligned SSE2 version -- Credits: Sanyin <[email protected]>
  931. procedure CumSum_SSE2(Values: PSingleArray; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
  932. asm
  933. {$IFDEF TARGET_x86}
  934. MOV ECX,EDX
  935. CMP ECX,2 // if count < 2, exit
  936. JL @END
  937. CMP ECX,32 // if count < 32, avoid SSE2 overhead
  938. JL @SMALL
  939. {--- align memory ---}
  940. PUSH EBX
  941. PXOR XMM4,XMM4
  942. MOV EBX,EAX
  943. AND EBX,15 // get aligned count
  944. JZ @ENDALIGNING // already aligned
  945. ADD EBX,-16
  946. NEG EBX // get bytes to advance
  947. JZ @ENDALIGNING // already aligned
  948. MOV ECX,EBX
  949. SAR ECX,2 // div with 4 to get cnt
  950. SUB EDX,ECX
  951. ADD EAX,4
  952. DEC ECX
  953. JZ @SETUPLAST // one element
  954. @ALIGNINGLOOP:
  955. FLD DWORD PTR [EAX-4]
  956. FADD DWORD PTR [EAX]
  957. FSTP DWORD PTR [EAX]
  958. ADD EAX,4
  959. DEC ECX
  960. JNZ @ALIGNINGLOOP
  961. @SETUPLAST:
  962. MOVUPS XMM4,[EAX-4]
  963. PSLLDQ XMM4,12
  964. PSRLDQ XMM4,12
  965. @ENDALIGNING:
  966. POP EBX
  967. PUSH EBX
  968. MOV ECX,EDX
  969. SAR ECX,2
  970. @LOOP:
  971. MOVAPS XMM0,[EAX]
  972. PXOR XMM5,XMM5
  973. PCMPEQD XMM5,XMM0
  974. PMOVMSKB EBX,XMM5
  975. CMP EBX,$0000FFFF
  976. JNE @NORMAL
  977. PSHUFD XMM0,XMM4,0
  978. JMP @SKIP
  979. @NORMAL:
  980. ADDPS XMM0,XMM4
  981. PSHUFD XMM1,XMM0,$e4
  982. PSLLDQ XMM1,4
  983. PSHUFD XMM2,XMM1,$90
  984. PSHUFD XMM3,XMM1,$40
  985. ADDPS XMM2,XMM3
  986. ADDPS XMM1,XMM2
  987. ADDPS XMM0,XMM1
  988. PSHUFLW XMM4,XMM0,$E4
  989. PSRLDQ XMM4,12
  990. @SKIP:
  991. PREFETCHNTA [eax+16*16*2]
  992. MOVAPS [EAX],XMM0
  993. ADD EAX,16
  994. SUB ECX,1
  995. JNZ @LOOP
  996. POP EBX
  997. MOV ECX,EDX
  998. SAR ECX,2
  999. SHL ECX,2
  1000. SUB EDX,ECX
  1001. MOV ECX,EDX
  1002. JZ @END
  1003. @LOOP2:
  1004. FLD DWORD PTR [EAX-4]
  1005. FADD DWORD PTR [EAX]
  1006. FSTP DWORD PTR [EAX]
  1007. ADD EAX,4
  1008. DEC ECX
  1009. JNZ @LOOP2
  1010. JMP @END
  1011. @SMALL:
  1012. MOV ECX,EDX
  1013. ADD EAX,4
  1014. DEC ECX
  1015. @LOOP3:
  1016. FLD DWORD PTR [EAX-4]
  1017. FADD DWORD PTR [EAX]
  1018. FSTP DWORD PTR [EAX]
  1019. ADD EAX,4
  1020. DEC ECX
  1021. JNZ @LOOP3
  1022. {$ENDIF}
  1023. {$IFDEF TARGET_x64}
  1024. CMP EDX,2 // if count < 2, exit
  1025. JL @END
  1026. MOV EAX,ECX
  1027. MOV ECX,EDX
  1028. CMP ECX,32 // if count < 32, avoid SSE2 overhead
  1029. JL @SMALL
  1030. {--- align memory ---}
  1031. PXOR XMM4,XMM4
  1032. MOV R8D,EAX
  1033. AND R8D,15 // get aligned count
  1034. JZ @ENDALIGNING // already aligned
  1035. ADD R8D,-16
  1036. NEG R8D // get bytes to advance
  1037. JZ @ENDALIGNING // already aligned
  1038. MOV ECX,R8D
  1039. SAR ECX,2 // div with 4 to get cnt
  1040. SUB EDX,ECX
  1041. ADD EAX,4
  1042. DEC ECX
  1043. JZ @SETUPLAST // one element
  1044. @ALIGNINGLOOP:
  1045. FLD DWORD PTR [RAX - 4]
  1046. FADD DWORD PTR [RAX]
  1047. FSTP DWORD PTR [RAX]
  1048. ADD EAX,4
  1049. DEC ECX
  1050. JNZ @ALIGNINGLOOP
  1051. @SETUPLAST:
  1052. MOVUPS XMM4,[RAX - 4]
  1053. PSLLDQ XMM4,12
  1054. PSRLDQ XMM4,12
  1055. @ENDALIGNING:
  1056. MOV ECX,EDX
  1057. SAR ECX,2
  1058. @LOOP:
  1059. MOVAPS XMM0,[RAX]
  1060. PXOR XMM5,XMM5
  1061. PCMPEQD XMM5,XMM0
  1062. PMOVMSKB R8D,XMM5
  1063. CMP R8D,$0000FFFF
  1064. JNE @NORMAL
  1065. PSHUFD XMM0,XMM4,0
  1066. JMP @SKIP
  1067. @NORMAL:
  1068. ADDPS XMM0,XMM4
  1069. PSHUFD XMM1,XMM0,$e4
  1070. PSLLDQ XMM1,4
  1071. PSHUFD XMM2,XMM1,$90
  1072. PSHUFD XMM3,XMM1,$40
  1073. ADDPS XMM2,XMM3
  1074. ADDPS XMM1,XMM2
  1075. ADDPS XMM0,XMM1
  1076. PSHUFLW XMM4,XMM0,$E4
  1077. PSRLDQ XMM4,12
  1078. @SKIP:
  1079. PREFETCHNTA [RAX + 32 * 2]
  1080. MOVAPS [RAX],XMM0
  1081. ADD EAX,16
  1082. SUB ECX,1
  1083. JNZ @LOOP
  1084. MOV ECX,EDX
  1085. SAR ECX,2
  1086. SHL ECX,2
  1087. SUB EDX,ECX
  1088. MOV ECX,EDX
  1089. JZ @END
  1090. @LOOP2:
  1091. FLD DWORD PTR [RAX - 4]
  1092. FADD DWORD PTR [RAX]
  1093. FSTP DWORD PTR [RAX]
  1094. ADD EAX,4
  1095. DEC ECX
  1096. JNZ @LOOP2
  1097. JMP @END
  1098. @SMALL:
  1099. MOV ECX,EDX
  1100. ADD EAX,4
  1101. DEC ECX
  1102. @LOOP3:
  1103. FLD DWORD PTR [RAX - 4]
  1104. FADD DWORD PTR [RAX]
  1105. FSTP DWORD PTR [RAX]
  1106. ADD EAX,4
  1107. DEC ECX
  1108. JNZ @LOOP3
  1109. {$ENDIF}
  1110. @END:
  1111. end;
  1112. {$ENDIF}
  1113. initialization
  1114. {$IFNDEF PUREPASCAL}
  1115. if HasInstructionSet(ciSSE2) then
  1116. CumSum := CumSum_SSE2
  1117. else
  1118. {$ENDIF}
  1119. CumSum := CumSum_Pas;
  1120. end.