softfpu.pp 158 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. *}
  28. unit softfpu;
  29. { Overflow checking must be disabled,
  30. since some operations expect overflow!
  31. }
  32. {$Q-}
  33. {$ifdef fpc}
  34. {$define hascompilerproc}
  35. {$endif}
  36. {$ifdef fpc}
  37. {$goto on}
  38. {$endif}
  39. interface
  40. {
  41. -------------------------------------------------------------------------------
  42. Software IEC/IEEE floating-point types.
  43. -------------------------------------------------------------------------------
  44. }
  45. TYPE
  46. float32 = longword;
  47. flag = byte;
  48. uint8 = byte;
  49. int8 = shortint;
  50. uint16 = word;
  51. int16 = integer;
  52. uint32 = longword;
  53. int32 = longint;
  54. bits8 = byte;
  55. sbits8 = shortint;
  56. bits16 = word;
  57. sbits16 = integer;
  58. sbits32 = longint;
  59. bits32 = longword;
  60. {$ifndef fpc}
  61. qword = int64;
  62. {$endif}
  63. uint64 = qword;
  64. bits64 = qword;
  65. sbits64 = int64;
  66. {$ifdef ENDIAN_LITTLE}
  67. float64 = packed record
  68. low: bits32;
  69. high: bits32;
  70. end;
  71. int64rec = packed record
  72. low: bits32;
  73. high: bits32;
  74. end;
  75. {$else}
  76. float64 = packed record
  77. high,low : bits32;
  78. end;
  79. int64rec = packed record
  80. high,low : bits32;
  81. end;
  82. {$endif}
  83. {*
  84. -------------------------------------------------------------------------------
  85. Returns 1 if the double-precision floating-point value `a' is less than
  86. the corresponding value `b', and 0 otherwise. The comparison is performed
  87. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  88. -------------------------------------------------------------------------------
  89. *}
  90. Function float64_lt(a: float64;b: float64): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
  91. {*
  92. -------------------------------------------------------------------------------
  93. Returns 1 if the double-precision floating-point value `a' is less than
  94. or equal to the corresponding value `b', and 0 otherwise. The comparison
  95. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  96. Arithmetic.
  97. -------------------------------------------------------------------------------
  98. *}
  99. Function float64_le(a: float64;b: float64): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
  100. {*
  101. -------------------------------------------------------------------------------
  102. Returns 1 if the double-precision floating-point value `a' is equal to
  103. the corresponding value `b', and 0 otherwise. The comparison is performed
  104. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  105. -------------------------------------------------------------------------------
  106. *}
  107. Function float64_eq(a: float64;b: float64): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
  108. {*
  109. -------------------------------------------------------------------------------
  110. Returns the square root of the double-precision floating-point value `a'.
  111. The operation is performed according to the IEC/IEEE Standard for Binary
  112. Floating-Point Arithmetic.
  113. -------------------------------------------------------------------------------
  114. *}
  115. Procedure float64_sqrt( a: float64; var out: float64 ); {$ifdef hascompilerproc} compilerproc; {$endif}
  116. {*
  117. -------------------------------------------------------------------------------
  118. Returns the remainder of the double-precision floating-point value `a'
  119. with respect to the corresponding value `b'. The operation is performed
  120. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  121. -------------------------------------------------------------------------------
  122. *}
  123. Procedure float64_rem(a: float64; b : float64; var out: float64); {$ifdef hascompilerproc} compilerproc; {$endif}
  124. {*
  125. -------------------------------------------------------------------------------
  126. Returns the result of dividing the double-precision floating-point value `a'
  127. by the corresponding value `b'. The operation is performed according to the
  128. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  129. -------------------------------------------------------------------------------
  130. *}
  131. Procedure float64_div(a: float64; b : float64 ; var out: float64 ); {$ifdef hascompilerproc} compilerproc; {$endif}
  132. {*
  133. -------------------------------------------------------------------------------
  134. Returns the result of multiplying the double-precision floating-point values
  135. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  136. for Binary Floating-Point Arithmetic.
  137. -------------------------------------------------------------------------------
  138. *}
  139. Procedure float64_mul( a: float64; b:float64; Var out: float64); {$ifdef hascompilerproc} compilerproc; {$endif}
  140. {*
  141. -------------------------------------------------------------------------------
  142. Returns the result of subtracting the double-precision floating-point values
  143. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  144. for Binary Floating-Point Arithmetic.
  145. -------------------------------------------------------------------------------
  146. *}
  147. Procedure float64_sub(a: float64; b : float64; var out: float64); {$ifdef hascompilerproc} compilerproc; {$endif}
  148. {*
  149. -------------------------------------------------------------------------------
  150. Returns the result of adding the double-precision floating-point values `a'
  151. and `b'. The operation is performed according to the IEC/IEEE Standard for
  152. Binary Floating-Point Arithmetic.
  153. -------------------------------------------------------------------------------
  154. *}
  155. Procedure float64_add( a: float64; b : float64; Var out : float64); {$ifdef hascompilerproc} compilerproc; {$endif}
  156. {*
  157. -------------------------------------------------------------------------------
  158. Rounds the double-precision floating-point value `a' to an integer,
  159. and returns the result as a double-precision floating-point value. The
  160. operation is performed according to the IEC/IEEE Standard for Binary
  161. Floating-Point Arithmetic.
  162. -------------------------------------------------------------------------------
  163. *}
  164. Procedure float64_round_to_int(a: float64; var out: float64 ); {$ifdef hascompilerproc} compilerproc; {$endif}
  165. {*
  166. -------------------------------------------------------------------------------
  167. Returns the result of converting the double-precision floating-point value
  168. `a' to the single-precision floating-point format. The conversion is
  169. performed according to the IEC/IEEE Standard for Binary Floating-Point
  170. Arithmetic.
  171. -------------------------------------------------------------------------------
  172. *}
  173. Function float64_to_float32(a: float64 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  174. {*
  175. -------------------------------------------------------------------------------
  176. Returns the result of converting the double-precision floating-point value
  177. `a' to the 32-bit two's complement integer format. The conversion is
  178. performed according to the IEC/IEEE Standard for Binary Floating-Point
  179. Arithmetic, except that the conversion is always rounded toward zero.
  180. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  181. the conversion overflows, the largest integer with the same sign as `a' is
  182. returned.
  183. -------------------------------------------------------------------------------
  184. *}
  185. Function float64_to_int32_round_to_zero(a: float64 ): int32; {$ifdef hascompilerproc} compilerproc; {$endif}
  186. {*
  187. -------------------------------------------------------------------------------
  188. Returns the result of converting the double-precision floating-point value
  189. `a' to the 32-bit two's complement integer format. The conversion is
  190. performed according to the IEC/IEEE Standard for Binary Floating-Point
  191. Arithmetic---which means in particular that the conversion is rounded
  192. according to the current rounding mode. If `a' is a NaN, the largest
  193. positive integer is returned. Otherwise, if the conversion overflows, the
  194. largest integer with the same sign as `a' is returned.
  195. -------------------------------------------------------------------------------
  196. *}
  197. Function float64_to_int32(a: float64): int32; {$ifdef hascompilerproc} compilerproc; {$endif}
  198. {*
  199. -------------------------------------------------------------------------------
  200. Returns 1 if the single-precision floating-point value `a' is less than
  201. the corresponding value `b', and 0 otherwise. The comparison is performed
  202. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  203. -------------------------------------------------------------------------------
  204. *}
  205. Function float32_lt( a:float32 ; b : float32): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
  206. {*
  207. -------------------------------------------------------------------------------
  208. Returns 1 if the single-precision floating-point value `a' is less than
  209. or equal to the corresponding value `b', and 0 otherwise. The comparison
  210. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  211. Arithmetic.
  212. -------------------------------------------------------------------------------
  213. *}
  214. Function float32_le( a: float32; b : float32 ):flag; {$ifdef hascompilerproc} compilerproc; {$endif}
  215. {*
  216. -------------------------------------------------------------------------------
  217. Returns 1 if the single-precision floating-point value `a' is equal to
  218. the corresponding value `b', and 0 otherwise. The comparison is performed
  219. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  220. -------------------------------------------------------------------------------
  221. *}
  222. Function float32_eq( a:float32; b:float32): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
  223. {*
  224. -------------------------------------------------------------------------------
  225. Returns the square root of the single-precision floating-point value `a'.
  226. The operation is performed according to the IEC/IEEE Standard for Binary
  227. Floating-Point Arithmetic.
  228. -------------------------------------------------------------------------------
  229. *}
  230. Function float32_sqrt(a: float32 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  231. {*
  232. -------------------------------------------------------------------------------
  233. Returns the remainder of the single-precision floating-point value `a'
  234. with respect to the corresponding value `b'. The operation is performed
  235. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  236. -------------------------------------------------------------------------------
  237. *}
  238. Function float32_rem(a: float32; b: float32 ):float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  239. {*
  240. -------------------------------------------------------------------------------
  241. Returns the result of dividing the single-precision floating-point value `a'
  242. by the corresponding value `b'. The operation is performed according to the
  243. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  244. -------------------------------------------------------------------------------
  245. *}
  246. Function float32_div(a: float32;b: float32 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  247. {*
  248. -------------------------------------------------------------------------------
  249. Returns the result of multiplying the single-precision floating-point values
  250. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  251. for Binary Floating-Point Arithmetic.
  252. -------------------------------------------------------------------------------
  253. *}
  254. Function float32_mul(a: float32; b: float32 ) : float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  255. {*
  256. -------------------------------------------------------------------------------
  257. Returns the result of subtracting the single-precision floating-point values
  258. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  259. for Binary Floating-Point Arithmetic.
  260. -------------------------------------------------------------------------------
  261. *}
  262. Function float32_sub( a: float32 ; b:float32 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  263. {*
  264. -------------------------------------------------------------------------------
  265. Returns the result of adding the single-precision floating-point values `a'
  266. and `b'. The operation is performed according to the IEC/IEEE Standard for
  267. Binary Floating-Point Arithmetic.
  268. -------------------------------------------------------------------------------
  269. *}
  270. Function float32_add( a: float32; b:float32 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  271. {*
  272. -------------------------------------------------------------------------------
  273. Rounds the single-precision floating-point value `a' to an integer,
  274. and returns the result as a single-precision floating-point value. The
  275. operation is performed according to the IEC/IEEE Standard for Binary
  276. Floating-Point Arithmetic.
  277. -------------------------------------------------------------------------------
  278. *}
  279. Function float32_round_to_int( a: float32): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  280. {*
  281. -------------------------------------------------------------------------------
  282. Returns the result of converting the single-precision floating-point value
  283. `a' to the double-precision floating-point format. The conversion is
  284. performed according to the IEC/IEEE Standard for Binary Floating-Point
  285. Arithmetic.
  286. -------------------------------------------------------------------------------
  287. *}
  288. Procedure float32_to_float64( a : float32; var out: Float64); {$ifdef hascompilerproc} compilerproc; {$endif}
  289. {*
  290. -------------------------------------------------------------------------------
  291. Returns the result of converting the single-precision floating-point value
  292. `a' to the 32-bit two's complement integer format. The conversion is
  293. performed according to the IEC/IEEE Standard for Binary Floating-Point
  294. Arithmetic, except that the conversion is always rounded toward zero.
  295. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  296. the conversion overflows, the largest integer with the same sign as `a' is
  297. returned.
  298. -------------------------------------------------------------------------------
  299. *}
  300. Function float32_to_int32_round_to_zero( a: Float32 ): int32; {$ifdef hascompilerproc} compilerproc; {$endif}
  301. {*
  302. -------------------------------------------------------------------------------
  303. Returns the result of converting the single-precision floating-point value
  304. `a' to the 32-bit two's complement integer format. The conversion is
  305. performed according to the IEC/IEEE Standard for Binary Floating-Point
  306. Arithmetic---which means in particular that the conversion is rounded
  307. according to the current rounding mode. If `a' is a NaN, the largest
  308. positive integer is returned. Otherwise, if the conversion overflows, the
  309. largest integer with the same sign as `a' is returned.
  310. -------------------------------------------------------------------------------
  311. *}
  312. Function float32_to_int32( a : float32) : int32; {$ifdef hascompilerproc} compilerproc; {$endif}
  313. {*
  314. -------------------------------------------------------------------------------
  315. Returns the result of converting the 32-bit two's complement integer `a' to
  316. the double-precision floating-point format. The conversion is performed
  317. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  318. -------------------------------------------------------------------------------
  319. *}
  320. Procedure int32_to_float64( a: int32; var c: float64 ); {$ifdef hascompilerproc} compilerproc; {$endif}
  321. {*
  322. -------------------------------------------------------------------------------
  323. Returns the result of converting the 32-bit two's complement integer `a' to
  324. the single-precision floating-point format. The conversion is performed
  325. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  326. -------------------------------------------------------------------------------
  327. *}
  328. Function int32_to_float32( a: int32): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  329. {*----------------------------------------------------------------------------
  330. | Returns the result of converting the 64-bit two's complement integer `a'
  331. | to the double-precision floating-point format. The conversion is performed
  332. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  333. *----------------------------------------------------------------------------*}
  334. function int64_to_float64( a: int64 ): float64; {$ifdef hascompilerproc} compilerproc; {$endif}
  335. {*----------------------------------------------------------------------------
  336. | Returns the result of converting the 64-bit two's complement integer `a'
  337. | to the single-precision floating-point format. The conversion is performed
  338. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  339. *----------------------------------------------------------------------------*}
  340. function int64_to_float32( a: int64 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
  341. CONST
  342. {-------------------------------------------------------------------------------
  343. Software IEC/IEEE floating-point underflow tininess-detection mode.
  344. -------------------------------------------------------------------------------
  345. *}
  346. float_tininess_after_rounding = 0;
  347. float_tininess_before_rounding = 1;
  348. {*
  349. -------------------------------------------------------------------------------
  350. Software IEC/IEEE floating-point rounding mode.
  351. -------------------------------------------------------------------------------
  352. *}
  353. {
  354. Round to nearest.
  355. This is the default mode. It should be used unless there is a specific
  356. need for one of the others. In this mode results are rounded to the
  357. nearest representable value. If the result is midway between two
  358. representable values, the even representable is chosen. Even here
  359. means the lowest-order bit is zero. This rounding mode prevents
  360. statistical bias and guarantees numeric stability: round-off errors
  361. in a lengthy calculation will remain smaller than half of FLT_EPSILON.
  362. Round toward plus Infinity.
  363. All results are rounded to the smallest representable value which is
  364. greater than the result.
  365. Round toward minus Infinity.
  366. All results are rounded to the largest representable value which is
  367. less than the result.
  368. Round toward zero.
  369. All results are rounded to the largest representable value whose
  370. magnitude is less than that of the result. In other words, if the
  371. result is negative it is rounded up; if it is positive, it is
  372. rounded down.
  373. }
  374. float_round_nearest_even = 0;
  375. float_round_down = 1;
  376. float_round_up = 2;
  377. float_round_to_zero = 3;
  378. {*
  379. -------------------------------------------------------------------------------
  380. Software IEC/IEEE floating-point exception flags.
  381. -------------------------------------------------------------------------------
  382. *}
  383. float_flag_invalid = 1;
  384. float_flag_divbyzero = 4;
  385. float_flag_overflow = 8;
  386. float_flag_underflow = 16;
  387. float_flag_inexact = 32;
  388. {*
  389. -------------------------------------------------------------------------------
  390. Floating-point rounding mode and exception flags.
  391. -------------------------------------------------------------------------------
  392. *}
  393. const
  394. float_rounding_mode : Byte = float_round_nearest_even;
  395. float_exception_flags : Byte = 0;
  396. {*
  397. -------------------------------------------------------------------------------
  398. Underflow tininess-detection mode, statically initialized to default value.
  399. (The declaration in `softfloat.h' must match the `int8' type here.)
  400. -------------------------------------------------------------------------------
  401. *}
  402. const float_detect_tininess: int8 = float_tininess_after_rounding;
  403. implementation
  404. {*
  405. -------------------------------------------------------------------------------
  406. Raises the exceptions specified by `flags'. Floating-point traps can be
  407. defined here if desired. It is currently not possible for such a trap
  408. to substitute a result value. If traps are not implemented, this routine
  409. should be simply `float_exception_flags |= flags;'.
  410. -------------------------------------------------------------------------------
  411. *}
  412. procedure float_raise( i: shortint );
  413. Begin
  414. float_exception_flags := float_exception_flags or i;
  415. if (float_exception_flags and float_flag_invalid) <> 0 then
  416. RunError(207)
  417. else
  418. if (float_exception_flags and float_flag_divbyzero) <> 0 then
  419. RunError(200)
  420. else
  421. if (float_exception_flags and float_flag_overflow) <> 0 then
  422. RunError(205)
  423. else
  424. if (float_exception_flags and float_flag_underflow) <> 0 then
  425. RunError(206);
  426. end;
  427. (*****************************************************************************)
  428. (*----------------------------------------------------------------------------*)
  429. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  430. (* division and square root approximations. (Can be specialized to target if *)
  431. (* desired.) *)
  432. (* ---------------------------------------------------------------------------*)
  433. (*****************************************************************************)
  434. {*
  435. -------------------------------------------------------------------------------
  436. Shifts `a' right by the number of bits given in `count'. If any nonzero
  437. bits are shifted off, they are ``jammed'' into the least significant bit of
  438. the result by setting the least significant bit to 1. The value of `count'
  439. can be arbitrarily large; in particular, if `count' is greater than 32, the
  440. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  441. The result is stored in the location pointed to by `zPtr'.
  442. -------------------------------------------------------------------------------
  443. *}
  444. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  445. var
  446. z: Bits32;
  447. Begin
  448. if ( count = 0 ) then
  449. z := a
  450. else
  451. if ( count < 32 ) then
  452. Begin
  453. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  454. End
  455. else
  456. Begin
  457. z := bits32( a <> 0 );
  458. End;
  459. zPtr := z;
  460. End;
  461. {*
  462. -------------------------------------------------------------------------------
  463. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  464. number of bits given in `count'. Any bits shifted off are lost. The value
  465. of `count' can be arbitrarily large; in particular, if `count' is greater
  466. than 64, the result will be 0. The result is broken into two 32-bit pieces
  467. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  468. -------------------------------------------------------------------------------
  469. *}
  470. Procedure
  471. shift64Right(
  472. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  473. Var
  474. z0, z1: bits32;
  475. negCount : int8;
  476. Begin
  477. negCount := ( - count ) AND 31;
  478. if ( count = 0 ) then
  479. Begin
  480. z1 := a1;
  481. z0 := a0;
  482. End
  483. else if ( count < 32 ) then
  484. Begin
  485. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  486. z0 := a0 shr count;
  487. End
  488. else
  489. Begin
  490. if (count < 64) then
  491. z1 := ( a0 shr ( count AND 31 ) )
  492. else
  493. z1 := 0;
  494. z0 := 0;
  495. End;
  496. z1Ptr := z1;
  497. z0Ptr := z0;
  498. End;
  499. {*
  500. -------------------------------------------------------------------------------
  501. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  502. number of bits given in `count'. If any nonzero bits are shifted off, they
  503. are ``jammed'' into the least significant bit of the result by setting the
  504. least significant bit to 1. The value of `count' can be arbitrarily large;
  505. in particular, if `count' is greater than 64, the result will be either 0
  506. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  507. nonzero. The result is broken into two 32-bit pieces which are stored at
  508. the locations pointed to by `z0Ptr' and `z1Ptr'.
  509. -------------------------------------------------------------------------------
  510. *}
  511. Procedure
  512. shift64RightJamming(
  513. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  514. VAR
  515. z0, z1 : bits32;
  516. negCount : int8;
  517. Begin
  518. negCount := ( - count ) AND 31;
  519. if ( count = 0 ) then
  520. Begin
  521. z1 := a1;
  522. z0 := a0;
  523. End
  524. else
  525. if ( count < 32 ) then
  526. Begin
  527. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  528. z0 := a0 shr count;
  529. End
  530. else
  531. Begin
  532. if ( count = 32 ) then
  533. Begin
  534. z1 := a0 OR bits32( a1 <> 0 );
  535. End
  536. else
  537. if ( count < 64 ) Then
  538. Begin
  539. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  540. End
  541. else
  542. Begin
  543. z1 := bits32( ( a0 OR a1 ) <> 0 );
  544. End;
  545. z0 := 0;
  546. End;
  547. z1Ptr := z1;
  548. z0Ptr := z0;
  549. End;
  550. {*
  551. -------------------------------------------------------------------------------
  552. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  553. by 32 _plus_ the number of bits given in `count'. The shifted result is
  554. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  555. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  556. off form a third 32-bit result as follows: The _last_ bit shifted off is
  557. the most-significant bit of the extra result, and the other 31 bits of the
  558. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  559. were all zero. This extra result is stored in the location pointed to by
  560. `z2Ptr'. The value of `count' can be arbitrarily large.
  561. (This routine makes more sense if `a0', `a1', and `a2' are considered
  562. to form a fixed-point value with binary point between `a1' and `a2'. This
  563. fixed-point value is shifted right by the number of bits given in `count',
  564. and the integer part of the result is returned at the locations pointed to
  565. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  566. corrupted as described above, and is returned at the location pointed to by
  567. `z2Ptr'.)
  568. -------------------------------------------------------------------------------
  569. }
  570. Procedure
  571. shift64ExtraRightJamming(
  572. a0: bits32;
  573. a1: bits32;
  574. a2: bits32;
  575. count: int16;
  576. VAR z0Ptr: bits32;
  577. VAR z1Ptr: bits32;
  578. VAR z2Ptr: bits32
  579. );
  580. Var
  581. z0, z1, z2: bits32;
  582. negCount : int8;
  583. Begin
  584. negCount := ( - count ) AND 31;
  585. if ( count = 0 ) then
  586. Begin
  587. z2 := a2;
  588. z1 := a1;
  589. z0 := a0;
  590. End
  591. else
  592. Begin
  593. if ( count < 32 ) Then
  594. Begin
  595. z2 := a1 shl negCount;
  596. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  597. z0 := a0 shr count;
  598. End
  599. else
  600. Begin
  601. if ( count = 32 ) then
  602. Begin
  603. z2 := a1;
  604. z1 := a0;
  605. End
  606. else
  607. Begin
  608. a2 := a2 or a1;
  609. if ( count < 64 ) then
  610. Begin
  611. z2 := a0 shl negCount;
  612. z1 := a0 shr ( count AND 31 );
  613. End
  614. else
  615. Begin
  616. if count = 64 then
  617. z2 := a0
  618. else
  619. z2 := bits32(a0 <> 0);
  620. z1 := 0;
  621. End;
  622. End;
  623. z0 := 0;
  624. End;
  625. z2 := z2 or bits32( a2 <> 0 );
  626. End;
  627. z2Ptr := z2;
  628. z1Ptr := z1;
  629. z0Ptr := z0;
  630. End;
  631. {*
  632. -------------------------------------------------------------------------------
  633. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  634. number of bits given in `count'. Any bits shifted off are lost. The value
  635. of `count' must be less than 32. The result is broken into two 32-bit
  636. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  637. -------------------------------------------------------------------------------
  638. *}
  639. Procedure
  640. shortShift64Left(
  641. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  642. Begin
  643. z1Ptr := a1 shl count;
  644. if count = 0 then
  645. z0Ptr := a0
  646. else
  647. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  648. End;
  649. {*
  650. -------------------------------------------------------------------------------
  651. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  652. by the number of bits given in `count'. Any bits shifted off are lost.
  653. The value of `count' must be less than 32. The result is broken into three
  654. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  655. `z1Ptr', and `z2Ptr'.
  656. -------------------------------------------------------------------------------
  657. *}
  658. Procedure
  659. shortShift96Left(
  660. a0: bits32;
  661. a1: bits32;
  662. a2: bits32;
  663. count: int16;
  664. VAR z0Ptr: bits32;
  665. VAR z1Ptr: bits32;
  666. VAR z2Ptr: bits32
  667. );
  668. Var
  669. z0, z1, z2: bits32;
  670. negCount: int8;
  671. Begin
  672. z2 := a2 shl count;
  673. z1 := a1 shl count;
  674. z0 := a0 shl count;
  675. if ( 0 < count ) then
  676. Begin
  677. negCount := ( ( - count ) AND 31 );
  678. z1 := z1 or (a2 shr negCount);
  679. z0 := z0 or (a1 shr negCount);
  680. End;
  681. z2Ptr := z2;
  682. z1Ptr := z1;
  683. z0Ptr := z0;
  684. End;
  685. {*
  686. -------------------------------------------------------------------------------
  687. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  688. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  689. any carry out is lost. The result is broken into two 32-bit pieces which
  690. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  691. -------------------------------------------------------------------------------
  692. *}
  693. Procedure
  694. add64(
  695. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  696. Var
  697. z1: bits32;
  698. Begin
  699. z1 := a1 + b1;
  700. z1Ptr := z1;
  701. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  702. End;
  703. {*
  704. -------------------------------------------------------------------------------
  705. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  706. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  707. modulo 2^96, so any carry out is lost. The result is broken into three
  708. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  709. `z1Ptr', and `z2Ptr'.
  710. -------------------------------------------------------------------------------
  711. *}
  712. Procedure
  713. add96(
  714. a0: bits32;
  715. a1: bits32;
  716. a2: bits32;
  717. b0: bits32;
  718. b1: bits32;
  719. b2: bits32;
  720. VAR z0Ptr: bits32;
  721. VAR z1Ptr: bits32;
  722. VAR z2Ptr: bits32
  723. );
  724. var
  725. z0, z1, z2: bits32;
  726. carry0, carry1: int8;
  727. Begin
  728. z2 := a2 + b2;
  729. carry1 := int8( z2 < a2 );
  730. z1 := a1 + b1;
  731. carry0 := int8( z1 < a1 );
  732. z0 := a0 + b0;
  733. z1 := z1 + carry1;
  734. z0 := z0 + bits32( z1 < carry1 );
  735. z0 := z0 + carry0;
  736. z2Ptr := z2;
  737. z1Ptr := z1;
  738. z0Ptr := z0;
  739. End;
  740. {*
  741. -------------------------------------------------------------------------------
  742. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  743. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  744. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  745. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  746. `z1Ptr'.
  747. -------------------------------------------------------------------------------
  748. *}
  749. Procedure
  750. sub64(
  751. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  752. Begin
  753. z1Ptr := a1 - b1;
  754. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  755. End;
  756. {*
  757. -------------------------------------------------------------------------------
  758. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  759. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  760. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  761. into three 32-bit pieces which are stored at the locations pointed to by
  762. `z0Ptr', `z1Ptr', and `z2Ptr'.
  763. -------------------------------------------------------------------------------
  764. *}
  765. Procedure
  766. sub96(
  767. a0:bits32;
  768. a1:bits32;
  769. a2:bits32;
  770. b0:bits32;
  771. b1:bits32;
  772. b2:bits32;
  773. VAR z0Ptr:bits32;
  774. VAR z1Ptr:bits32;
  775. VAR z2Ptr:bits32
  776. );
  777. Var
  778. z0, z1, z2: bits32;
  779. borrow0, borrow1: int8;
  780. Begin
  781. z2 := a2 - b2;
  782. borrow1 := int8( a2 < b2 );
  783. z1 := a1 - b1;
  784. borrow0 := int8( a1 < b1 );
  785. z0 := a0 - b0;
  786. z0 := z0 - bits32( z1 < borrow1 );
  787. z1 := z1 - borrow1;
  788. z0 := z0 -borrow0;
  789. z2Ptr := z2;
  790. z1Ptr := z1;
  791. z0Ptr := z0;
  792. End;
  793. {*
  794. -------------------------------------------------------------------------------
  795. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  796. into two 32-bit pieces which are stored at the locations pointed to by
  797. `z0Ptr' and `z1Ptr'.
  798. -------------------------------------------------------------------------------
  799. *}
  800. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  801. :bits32 );
  802. Var
  803. aHigh, aLow, bHigh, bLow: bits16;
  804. z0, zMiddleA, zMiddleB, z1: bits32;
  805. Begin
  806. aLow := a and $ffff;
  807. aHigh := a shr 16;
  808. bLow := b and $ffff;
  809. bHigh := b shr 16;
  810. z1 := ( bits32( aLow) ) * bLow;
  811. zMiddleA := ( bits32 (aLow) ) * bHigh;
  812. zMiddleB := ( bits32 (aHigh) ) * bLow;
  813. z0 := ( bits32 (aHigh) ) * bHigh;
  814. zMiddleA := zMiddleA + zMiddleB;
  815. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  816. zMiddleA := zmiddleA shl 16;
  817. z1 := z1 + zMiddleA;
  818. z0 := z0 + bits32( z1 < zMiddleA );
  819. z1Ptr := z1;
  820. z0Ptr := z0;
  821. End;
  822. {*
  823. -------------------------------------------------------------------------------
  824. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  825. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  826. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  827. `z2Ptr'.
  828. -------------------------------------------------------------------------------
  829. *}
  830. Procedure
  831. mul64By32To96(
  832. a0:bits32;
  833. a1:bits32;
  834. b:bits32;
  835. VAR z0Ptr:bits32;
  836. VAR z1Ptr:bits32;
  837. VAR z2Ptr:bits32
  838. );
  839. Var
  840. z0, z1, z2, more1: bits32;
  841. Begin
  842. mul32To64( a1, b, z1, z2 );
  843. mul32To64( a0, b, z0, more1 );
  844. add64( z0, more1, 0, z1, z0, z1 );
  845. z2Ptr := z2;
  846. z1Ptr := z1;
  847. z0Ptr := z0;
  848. End;
  849. {*
  850. -------------------------------------------------------------------------------
  851. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  852. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  853. product. The product is broken into four 32-bit pieces which are stored at
  854. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  855. -------------------------------------------------------------------------------
  856. *}
  857. Procedure
  858. mul64To128(
  859. a0:bits32;
  860. a1:bits32;
  861. b0:bits32;
  862. b1:bits32;
  863. VAR z0Ptr:bits32;
  864. VAR z1Ptr:bits32;
  865. VAR z2Ptr:bits32;
  866. VAR z3Ptr:bits32
  867. );
  868. Var
  869. z0, z1, z2, z3: bits32;
  870. more1, more2: bits32;
  871. Begin
  872. mul32To64( a1, b1, z2, z3 );
  873. mul32To64( a1, b0, z1, more2 );
  874. add64( z1, more2, 0, z2, z1, z2 );
  875. mul32To64( a0, b0, z0, more1 );
  876. add64( z0, more1, 0, z1, z0, z1 );
  877. mul32To64( a0, b1, more1, more2 );
  878. add64( more1, more2, 0, z2, more1, z2 );
  879. add64( z0, z1, 0, more1, z0, z1 );
  880. z3Ptr := z3;
  881. z2Ptr := z2;
  882. z1Ptr := z1;
  883. z0Ptr := z0;
  884. End;
  885. {*
  886. -------------------------------------------------------------------------------
  887. Returns an approximation to the 32-bit integer quotient obtained by dividing
  888. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  889. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  890. toward zero, the approximation returned lies between q and q + 2 inclusive.
  891. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  892. unsigned integer is returned.
  893. -------------------------------------------------------------------------------
  894. *}
  895. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  896. Var
  897. b0, b1: bits32;
  898. rem0, rem1, term0, term1: bits32;
  899. z: bits32;
  900. Begin
  901. if ( b <= a0 ) then
  902. Begin
  903. estimateDiv64To32 := $FFFFFFFF;
  904. exit;
  905. End;
  906. b0 := b shr 16;
  907. if ( b0 shl 16 <= a0 ) then
  908. z:= $FFFF0000
  909. else
  910. z:= ( a0 div b0 ) shl 16;
  911. mul32To64( b, z, term0, term1 );
  912. sub64( a0, a1, term0, term1, rem0, rem1 );
  913. while ( ( sbits32 (rem0) ) < 0 ) do
  914. Begin
  915. z := z - $10000;
  916. b1 := b shl 16;
  917. add64( rem0, rem1, b0, b1, rem0, rem1 );
  918. End;
  919. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  920. if ( b0 shl 16 <= rem0 ) then
  921. z := z or $FFFF
  922. else
  923. z := z or (rem0 div b0);
  924. estimateDiv64To32 := z;
  925. End;
  926. {*
  927. -------------------------------------------------------------------------------
  928. Returns an approximation to the square root of the 32-bit significand given
  929. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  930. `aExp' (the least significant bit) is 1, the integer returned approximates
  931. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  932. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  933. case, the approximation returned lies strictly within +/-2 of the exact
  934. value.
  935. -------------------------------------------------------------------------------
  936. *}
  937. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  938. const sqrtOddAdjustments: array[0..15] of bits16 = (
  939. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  940. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  941. );
  942. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  943. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  944. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  945. );
  946. Var
  947. index: int8;
  948. z: bits32;
  949. Begin
  950. index := ( a shr 27 ) AND 15;
  951. if ( aExp AND 1 ) <> 0 then
  952. Begin
  953. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  954. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  955. a := a shr 1;
  956. End
  957. else
  958. Begin
  959. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  960. z := a div z + z;
  961. if ( $20000 <= z ) then
  962. z := $FFFF8000
  963. else
  964. z := ( z shl 15 );
  965. if ( z <= a ) then
  966. Begin
  967. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  968. exit;
  969. End;
  970. End;
  971. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  972. End;
  973. {*
  974. -------------------------------------------------------------------------------
  975. Returns the number of leading 0 bits before the most-significant 1 bit of
  976. `a'. If `a' is zero, 32 is returned.
  977. -------------------------------------------------------------------------------
  978. *}
  979. Function countLeadingZeros32( a:bits32 ): int8;
  980. const countLeadingZerosHigh:array[0..255] of int8 = (
  981. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  982. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  983. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  984. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  985. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  986. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  987. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  988. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  989. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  990. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  991. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  992. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  993. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  994. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  995. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  996. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  997. );
  998. Var
  999. shiftCount: int8;
  1000. Begin
  1001. shiftCount := 0;
  1002. if ( a < $10000 ) then
  1003. Begin
  1004. shiftCount := shiftcount + 16;
  1005. a := a shl 16;
  1006. End;
  1007. if ( a < $1000000 ) then
  1008. Begin
  1009. shiftCount := shiftcount + 8;
  1010. a := a shl 8;
  1011. end;
  1012. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1013. countLeadingZeros32:= shiftCount;
  1014. End;
  1015. {*----------------------------------------------------------------------------
  1016. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1017. | `a'. If `a' is zero, 64 is returned.
  1018. *----------------------------------------------------------------------------*}
  1019. function countLeadingZeros64( a : bits64): int8;
  1020. var
  1021. shiftcount : int8;
  1022. Begin
  1023. shiftCount := 0;
  1024. if ( a < (bits64(1) shl 32 )) then
  1025. shiftCount := shiftcount + 32
  1026. else
  1027. a := a shr 32;
  1028. shiftCount := shiftCount + countLeadingZeros32( a );
  1029. countLeadingZeros64:= shiftCount;
  1030. End;
  1031. {*
  1032. -------------------------------------------------------------------------------
  1033. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1034. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1035. returns 0.
  1036. -------------------------------------------------------------------------------
  1037. *}
  1038. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1039. Begin
  1040. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1041. End;
  1042. {*
  1043. -------------------------------------------------------------------------------
  1044. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1045. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1046. Otherwise, returns 0.
  1047. -------------------------------------------------------------------------------
  1048. *}
  1049. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1050. Begin
  1051. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1052. End;
  1053. {*
  1054. -------------------------------------------------------------------------------
  1055. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1056. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1057. returns 0.
  1058. -------------------------------------------------------------------------------
  1059. *}
  1060. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1061. Begin
  1062. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1063. End;
  1064. {*
  1065. -------------------------------------------------------------------------------
  1066. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1067. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1068. returns 0.
  1069. -------------------------------------------------------------------------------
  1070. *}
  1071. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1072. Begin
  1073. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1074. End;
  1075. (*****************************************************************************)
  1076. (* End Low-Level arithmetic *)
  1077. (*****************************************************************************)
  1078. {*
  1079. -------------------------------------------------------------------------------
  1080. Functions and definitions to determine: (1) whether tininess for underflow
  1081. is detected before or after rounding by default, (2) what (if anything)
  1082. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1083. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1084. are propagated from function inputs to output. These details are ENDIAN
  1085. specific
  1086. -------------------------------------------------------------------------------
  1087. *}
  1088. {$IFDEF ENDIAN_LITTLE}
  1089. {*
  1090. -------------------------------------------------------------------------------
  1091. Internal canonical NaN format.
  1092. -------------------------------------------------------------------------------
  1093. *}
  1094. TYPE
  1095. commonNaNT = packed record
  1096. sign: flag;
  1097. high, low : bits32;
  1098. end;
  1099. {*
  1100. -------------------------------------------------------------------------------
  1101. The pattern for a default generated single-precision NaN.
  1102. -------------------------------------------------------------------------------
  1103. *}
  1104. const float32_default_nan = $FFC00000;
  1105. {*
  1106. -------------------------------------------------------------------------------
  1107. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1108. otherwise returns 0.
  1109. -------------------------------------------------------------------------------
  1110. *}
  1111. Function float32_is_nan( a : float32 ): flag;
  1112. Begin
  1113. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1114. End;
  1115. {*
  1116. -------------------------------------------------------------------------------
  1117. Returns 1 if the single-precision floating-point value `a' is a signaling
  1118. NaN; otherwise returns 0.
  1119. -------------------------------------------------------------------------------
  1120. *}
  1121. Function float32_is_signaling_nan( a : float32 ): flag;
  1122. Begin
  1123. float32_is_signaling_nan := flag
  1124. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1125. End;
  1126. {*
  1127. -------------------------------------------------------------------------------
  1128. Returns the result of converting the single-precision floating-point NaN
  1129. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1130. exception is raised.
  1131. -------------------------------------------------------------------------------
  1132. *}
  1133. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1134. var
  1135. z : commonNaNT ;
  1136. Begin
  1137. if ( float32_is_signaling_nan( a ) <> 0) then
  1138. float_raise( float_flag_invalid );
  1139. z.sign := a shr 31;
  1140. z.low := 0;
  1141. z.high := a shl 9;
  1142. c := z;
  1143. End;
  1144. {*
  1145. -------------------------------------------------------------------------------
  1146. Returns the result of converting the canonical NaN `a' to the single-
  1147. precision floating-point format.
  1148. -------------------------------------------------------------------------------
  1149. *}
  1150. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1151. Begin
  1152. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1153. End;
  1154. {*
  1155. -------------------------------------------------------------------------------
  1156. Takes two single-precision floating-point values `a' and `b', one of which
  1157. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1158. signaling NaN, the invalid exception is raised.
  1159. -------------------------------------------------------------------------------
  1160. *}
  1161. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1162. Var
  1163. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1164. label returnLargerSignificand;
  1165. Begin
  1166. aIsNaN := float32_is_nan( a );
  1167. aIsSignalingNaN := float32_is_signaling_nan( a );
  1168. bIsNaN := float32_is_nan( b );
  1169. bIsSignalingNaN := float32_is_signaling_nan( b );
  1170. a := a or $00400000;
  1171. b := b or $00400000;
  1172. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1173. float_raise( float_flag_invalid );
  1174. if ( aIsSignalingNaN )<> 0 then
  1175. Begin
  1176. if ( bIsSignalingNaN ) <> 0 then
  1177. goto returnLargerSignificand;
  1178. if bIsNan <> 0 then
  1179. propagateFloat32NaN := b
  1180. else
  1181. propagateFloat32NaN := a;
  1182. exit;
  1183. End
  1184. else if ( aIsNaN <> 0) then
  1185. Begin
  1186. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1187. Begin
  1188. propagateFloat32NaN := a;
  1189. exit;
  1190. End;
  1191. returnLargerSignificand:
  1192. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1193. Begin
  1194. propagateFloat32NaN := b;
  1195. exit;
  1196. End;
  1197. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1198. Begin
  1199. propagateFloat32NaN := a;
  1200. End;
  1201. if a < b then
  1202. propagateFloat32NaN := a
  1203. else
  1204. propagateFloat32NaN := b;
  1205. exit;
  1206. End
  1207. else
  1208. Begin
  1209. propagateFloat32NaN := b;
  1210. exit;
  1211. End;
  1212. End;
  1213. {*
  1214. -------------------------------------------------------------------------------
  1215. The pattern for a default generated double-precision NaN. The `high' and
  1216. `low' values hold the most- and least-significant bits, respectively.
  1217. -------------------------------------------------------------------------------
  1218. *}
  1219. const
  1220. float64_default_nan_high = $FFF80000;
  1221. float64_default_nan_low = $00000000;
  1222. {*
  1223. -------------------------------------------------------------------------------
  1224. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1225. otherwise returns 0.
  1226. -------------------------------------------------------------------------------
  1227. *}
  1228. Function float64_is_nan( a : float64 ) : flag;
  1229. Begin
  1230. float64_is_nan :=
  1231. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1232. and ( a.low or ( a.high and $000FFFFF ) );
  1233. End;
  1234. {*
  1235. -------------------------------------------------------------------------------
  1236. Returns 1 if the double-precision floating-point value `a' is a signaling
  1237. NaN; otherwise returns 0.
  1238. -------------------------------------------------------------------------------
  1239. *}
  1240. Function float64_is_signaling_nan( a : float64 ): flag;
  1241. Begin
  1242. float64_is_signaling_nan :=
  1243. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1244. and ( a.low or ( a.high and $0007FFFF ) );
  1245. End;
  1246. {*
  1247. -------------------------------------------------------------------------------
  1248. Returns the result of converting the double-precision floating-point NaN
  1249. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1250. exception is raised.
  1251. -------------------------------------------------------------------------------
  1252. *}
  1253. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1254. Var
  1255. z : commonNaNT;
  1256. Begin
  1257. if ( float64_is_signaling_nan( a )<>0 ) then
  1258. float_raise( float_flag_invalid );
  1259. z.sign := a.high shr 31;
  1260. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1261. c := z;
  1262. End;
  1263. {*
  1264. -------------------------------------------------------------------------------
  1265. Returns the result of converting the canonical NaN `a' to the double-
  1266. precision floating-point format.
  1267. -------------------------------------------------------------------------------
  1268. *}
  1269. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1270. Var
  1271. z: float64;
  1272. Begin
  1273. shift64Right( a.high, a.low, 12, z.high, z.low );
  1274. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1275. c := z;
  1276. End;
  1277. {*
  1278. -------------------------------------------------------------------------------
  1279. Takes two double-precision floating-point values `a' and `b', one of which
  1280. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1281. signaling NaN, the invalid exception is raised.
  1282. -------------------------------------------------------------------------------
  1283. *}
  1284. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1285. Var
  1286. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1287. label returnLargerSignificand;
  1288. Begin
  1289. aIsNaN := float64_is_nan( a );
  1290. aIsSignalingNaN := float64_is_signaling_nan( a );
  1291. bIsNaN := float64_is_nan( b );
  1292. bIsSignalingNaN := float64_is_signaling_nan( b );
  1293. a.high := a.high or $00080000;
  1294. b.high := b.high or $00080000;
  1295. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1296. float_raise( float_flag_invalid );
  1297. if ( aIsSignalingNaN )<>0 then
  1298. Begin
  1299. if ( bIsSignalingNaN )<>0 then
  1300. goto returnLargerSignificand;
  1301. if bIsNan <> 0 then
  1302. c := b
  1303. else
  1304. c := a;
  1305. exit;
  1306. End
  1307. else if ( aIsNaN )<> 0 then
  1308. Begin
  1309. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1310. Begin
  1311. c := a;
  1312. exit;
  1313. End;
  1314. returnLargerSignificand:
  1315. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1316. Begin
  1317. c := b;
  1318. exit;
  1319. End;
  1320. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1321. Begin
  1322. c := a;
  1323. exit;
  1324. End;
  1325. if a.high < b.high then
  1326. c := a
  1327. else
  1328. c := b;
  1329. exit;
  1330. End
  1331. else
  1332. Begin
  1333. c := b;
  1334. exit;
  1335. End;
  1336. End;
  1337. {$ELSE}
  1338. { Big endian code }
  1339. (*----------------------------------------------------------------------------
  1340. | Internal canonical NaN format.
  1341. *----------------------------------------------------------------------------*)
  1342. type
  1343. commonNANT = packed record
  1344. sign : flag;
  1345. high, low : bits32;
  1346. end;
  1347. (*----------------------------------------------------------------------------
  1348. | The pattern for a default generated single-precision NaN.
  1349. *----------------------------------------------------------------------------*)
  1350. const float32_default_nan = $7FFFFFFF;
  1351. (*----------------------------------------------------------------------------
  1352. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1353. | otherwise returns 0.
  1354. *----------------------------------------------------------------------------*)
  1355. function float32_is_nan(a: float32): flag;
  1356. begin
  1357. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1358. end;
  1359. (*----------------------------------------------------------------------------
  1360. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1361. | NaN; otherwise returns 0.
  1362. *----------------------------------------------------------------------------*)
  1363. function float32_is_signaling_nan(a: float32):flag;
  1364. begin
  1365. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1366. end;
  1367. (*----------------------------------------------------------------------------
  1368. | Returns the result of converting the single-precision floating-point NaN
  1369. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1370. | exception is raised.
  1371. *----------------------------------------------------------------------------*)
  1372. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1373. var
  1374. z: commonNANT;
  1375. begin
  1376. if float32_is_signaling_nan(a)<>0 then
  1377. float_raise(float_flag_invalid);
  1378. z.sign := a shr 31;
  1379. z.low := 0;
  1380. z.high := a shl 9;
  1381. c:=z;
  1382. end;
  1383. (*----------------------------------------------------------------------------
  1384. | Returns the result of converting the canonical NaN `a' to the single-
  1385. | precision floating-point format.
  1386. *----------------------------------------------------------------------------*)
  1387. function CommonNanToFloat32(a : CommonNaNT): float32;
  1388. begin
  1389. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1390. end;
  1391. (*----------------------------------------------------------------------------
  1392. | Takes two single-precision floating-point values `a' and `b', one of which
  1393. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1394. | signaling NaN, the invalid exception is raised.
  1395. *----------------------------------------------------------------------------*)
  1396. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1397. var
  1398. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1399. begin
  1400. aIsNaN := float32_is_nan( a );
  1401. aIsSignalingNaN := float32_is_signaling_nan( a );
  1402. bIsNaN := float32_is_nan( b );
  1403. bIsSignalingNaN := float32_is_signaling_nan( b );
  1404. a := a or $00400000;
  1405. b := b or $00400000;
  1406. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1407. float_raise( float_flag_invalid );
  1408. if bIsSignalingNaN<>0 then
  1409. propagateFloat32Nan := b
  1410. else if aIsSignalingNan<>0 then
  1411. propagateFloat32Nan := a
  1412. else if bIsNan<>0 then
  1413. propagateFloat32Nan := b
  1414. else
  1415. propagateFloat32Nan := a;
  1416. end;
  1417. (*----------------------------------------------------------------------------
  1418. | The pattern for a default generated double-precision NaN. The `high' and
  1419. | `low' values hold the most- and least-significant bits, respectively.
  1420. *----------------------------------------------------------------------------*)
  1421. const
  1422. float64_default_nan_high = $7FFFFFFF;
  1423. float64_default_nan_low = $FFFFFFFF;
  1424. (*----------------------------------------------------------------------------
  1425. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  1426. | otherwise returns 0.
  1427. *----------------------------------------------------------------------------*)
  1428. function float64_is_nan(a: float64): flag;
  1429. begin
  1430. float64_is_nan := flag (
  1431. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1432. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  1433. end;
  1434. (*----------------------------------------------------------------------------
  1435. | Returns 1 if the double-precision floating-point value `a' is a signaling
  1436. | NaN; otherwise returns 0.
  1437. *----------------------------------------------------------------------------*)
  1438. function float64_is_signaling_nan( a:float64): flag;
  1439. begin
  1440. float64_is_signaling_nan := flag
  1441. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1442. and ( (a.low<>0) or ( boolean(( a.high and $0007FFFF )<>0)) );
  1443. end;
  1444. (*----------------------------------------------------------------------------
  1445. | Returns the result of converting the double-precision floating-point NaN
  1446. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1447. | exception is raised.
  1448. *----------------------------------------------------------------------------*)
  1449. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1450. var
  1451. z : commonNaNT;
  1452. begin
  1453. if ( float64_is_signaling_nan( a )<>0 ) then
  1454. float_raise( float_flag_invalid );
  1455. z.sign := a.high shr 31;
  1456. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1457. c:=z;
  1458. end;
  1459. (*----------------------------------------------------------------------------
  1460. | Returns the result of converting the canonical NaN `a' to the double-
  1461. | precision floating-point format.
  1462. *----------------------------------------------------------------------------*)
  1463. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1464. var
  1465. z: float64;
  1466. begin
  1467. shift64Right( a.high, a.low, 12, z.high, z.low );
  1468. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1469. c:=z;
  1470. end;
  1471. (*----------------------------------------------------------------------------
  1472. | Takes two double-precision floating-point values `a' and `b', one of which
  1473. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1474. | signaling NaN, the invalid exception is raised.
  1475. *----------------------------------------------------------------------------*)
  1476. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1477. var
  1478. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  1479. begin
  1480. aIsNaN := float64_is_nan( a );
  1481. aIsSignalingNaN := float64_is_signaling_nan( a );
  1482. bIsNaN := float64_is_nan( b );
  1483. bIsSignalingNaN := float64_is_signaling_nan( b );
  1484. a.high := a.high or $00080000;
  1485. b.high := b.high or $00080000;
  1486. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  1487. float_raise( float_flag_invalid );
  1488. if bIsSignalingNaN<>0 then
  1489. c := b
  1490. else if aIsSignalingNan<>0 then
  1491. c := a
  1492. else if bIsNan<>0 then
  1493. c := b
  1494. else
  1495. c := a;
  1496. end;
  1497. {$ENDIF}
  1498. (****************************************************************************)
  1499. (* END ENDIAN SPECIFIC CODE *)
  1500. (****************************************************************************)
  1501. {*
  1502. -------------------------------------------------------------------------------
  1503. Returns the fraction bits of the single-precision floating-point value `a'.
  1504. -------------------------------------------------------------------------------
  1505. *}
  1506. Function ExtractFloat32Frac(a : Float32) : Bits32;
  1507. Begin
  1508. ExtractFloat32Frac := A AND $007FFFFF;
  1509. End;
  1510. {*
  1511. -------------------------------------------------------------------------------
  1512. Returns the exponent bits of the single-precision floating-point value `a'.
  1513. -------------------------------------------------------------------------------
  1514. *}
  1515. Function extractFloat32Exp( a: float32 ): Int16;
  1516. Begin
  1517. extractFloat32Exp := (a shr 23) AND $FF;
  1518. End;
  1519. {*
  1520. -------------------------------------------------------------------------------
  1521. Returns the sign bit of the single-precision floating-point value `a'.
  1522. -------------------------------------------------------------------------------
  1523. *}
  1524. Function extractFloat32Sign( a: float32 ): Flag;
  1525. Begin
  1526. extractFloat32Sign := a shr 31;
  1527. End;
  1528. {*
  1529. -------------------------------------------------------------------------------
  1530. Normalizes the subnormal single-precision floating-point value represented
  1531. by the denormalized significand `aSig'. The normalized exponent and
  1532. significand are stored at the locations pointed to by `zExpPtr' and
  1533. `zSigPtr', respectively.
  1534. -------------------------------------------------------------------------------
  1535. *}
  1536. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  1537. Var
  1538. ShiftCount : BYTE;
  1539. Begin
  1540. shiftCount := countLeadingZeros32( aSig ) - 8;
  1541. zSigPtr := aSig shl shiftCount;
  1542. zExpPtr := 1 - shiftCount;
  1543. End;
  1544. {*
  1545. -------------------------------------------------------------------------------
  1546. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  1547. single-precision floating-point value, returning the result. After being
  1548. shifted into the proper positions, the three fields are simply added
  1549. together to form the result. This means that any integer portion of `zSig'
  1550. will be added into the exponent. Since a properly normalized significand
  1551. will have an integer portion equal to 1, the `zExp' input should be 1 less
  1552. than the desired result exponent whenever `zSig' is a complete, normalized
  1553. significand.
  1554. -------------------------------------------------------------------------------
  1555. *}
  1556. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  1557. Begin
  1558. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  1559. + zSig;
  1560. End;
  1561. {*
  1562. -------------------------------------------------------------------------------
  1563. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1564. and significand `zSig', and returns the proper single-precision floating-
  1565. point value corresponding to the abstract input. Ordinarily, the abstract
  1566. value is simply rounded and packed into the single-precision format, with
  1567. the inexact exception raised if the abstract input cannot be represented
  1568. exactly. However, if the abstract value is too large, the overflow and
  1569. inexact exceptions are raised and an infinity or maximal finite value is
  1570. returned. If the abstract value is too small, the input value is rounded to
  1571. a subnormal number, and the underflow and inexact exceptions are raised if
  1572. the abstract input cannot be represented exactly as a subnormal single-
  1573. precision floating-point number.
  1574. The input significand `zSig' has its binary point between bits 30
  1575. and 29, which is 7 bits to the left of the usual location. This shifted
  1576. significand must be normalized or smaller. If `zSig' is not normalized,
  1577. `zExp' must be 0; in that case, the result returned is a subnormal number,
  1578. and it must not require rounding. In the usual case that `zSig' is
  1579. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  1580. The handling of underflow and overflow follows the IEC/IEEE Standard for
  1581. Binary Floating-Point Arithmetic.
  1582. -------------------------------------------------------------------------------
  1583. *}
  1584. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  1585. Var
  1586. roundingMode : BYTE;
  1587. roundNearestEven : Flag;
  1588. roundIncrement, roundBits : BYTE;
  1589. IsTiny : Flag;
  1590. Begin
  1591. roundingMode := float_rounding_mode;
  1592. if (roundingMode = float_round_nearest_even) then
  1593. Begin
  1594. roundNearestEven := Flag(TRUE);
  1595. end
  1596. else
  1597. roundNearestEven := Flag(FALSE);
  1598. roundIncrement := $40;
  1599. if ( Boolean(roundNearestEven) = FALSE) then
  1600. Begin
  1601. if ( roundingMode = float_round_to_zero ) Then
  1602. Begin
  1603. roundIncrement := 0;
  1604. End
  1605. else
  1606. Begin
  1607. roundIncrement := $7F;
  1608. if ( zSign <> 0 ) then
  1609. Begin
  1610. if roundingMode = float_round_up then roundIncrement := 0;
  1611. End
  1612. else
  1613. Begin
  1614. if roundingMode = float_round_down then roundIncrement := 0;
  1615. End;
  1616. End
  1617. End;
  1618. roundBits := zSig AND $7F;
  1619. if ($FD <= bits16 (zExp) ) then
  1620. Begin
  1621. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  1622. Begin
  1623. float_raise( float_flag_overflow OR float_flag_inexact );
  1624. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  1625. exit;
  1626. End;
  1627. if ( zExp < 0 ) then
  1628. Begin
  1629. isTiny :=
  1630. flag(( float_detect_tininess = float_tininess_before_rounding )
  1631. OR ( zExp < -1 )
  1632. OR ( (zSig + roundIncrement) < $80000000 ));
  1633. shift32RightJamming( zSig, - zExp, zSig );
  1634. zExp := 0;
  1635. roundBits := zSig AND $7F;
  1636. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  1637. float_raise( float_flag_underflow );
  1638. End;
  1639. End;
  1640. if ( roundBits )<> 0 then
  1641. float_exception_flags := float_flag_inexact OR float_exception_flags;
  1642. zSig := ( zSig + roundIncrement ) shr 7;
  1643. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  1644. if ( zSig = 0 ) then zExp := 0;
  1645. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  1646. exit;
  1647. End;
  1648. {*
  1649. -------------------------------------------------------------------------------
  1650. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1651. and significand `zSig', and returns the proper single-precision floating-
  1652. point value corresponding to the abstract input. This routine is just like
  1653. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  1654. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  1655. floating-point exponent.
  1656. -------------------------------------------------------------------------------
  1657. *}
  1658. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  1659. Var
  1660. ShiftCount : int8;
  1661. Begin
  1662. shiftCount := countLeadingZeros32( zSig ) - 1;
  1663. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  1664. End;
  1665. {*
  1666. -------------------------------------------------------------------------------
  1667. Returns the least-significant 32 fraction bits of the double-precision
  1668. floating-point value `a'.
  1669. -------------------------------------------------------------------------------
  1670. *}
  1671. Function extractFloat64Frac( a: float64 ): bits32;
  1672. Begin
  1673. extractFloat64Frac := a.low;
  1674. End;
  1675. {*
  1676. -------------------------------------------------------------------------------
  1677. Returns the most-significant 20 fraction bits of the double-precision
  1678. floating-point value `a'.
  1679. -------------------------------------------------------------------------------
  1680. *}
  1681. Function extractFloat64Frac0(a: float64): bits32;
  1682. Begin
  1683. extractFloat64Frac0 := a.high and $000FFFFF;
  1684. End;
  1685. {*
  1686. -------------------------------------------------------------------------------
  1687. Returns the least-significant 32 fraction bits of the double-precision
  1688. floating-point value `a'.
  1689. -------------------------------------------------------------------------------
  1690. *}
  1691. Function extractFloat64Frac1(a: float64): bits32;
  1692. Begin
  1693. extractFloat64Frac1 := a.low;
  1694. End;
  1695. {*
  1696. -------------------------------------------------------------------------------
  1697. Returns the exponent bits of the double-precision floating-point value `a'.
  1698. -------------------------------------------------------------------------------
  1699. *}
  1700. Function extractFloat64Exp(a: float64): int16;
  1701. Begin
  1702. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  1703. End;
  1704. {*
  1705. -------------------------------------------------------------------------------
  1706. Returns the sign bit of the double-precision floating-point value `a'.
  1707. -------------------------------------------------------------------------------
  1708. *}
  1709. Function extractFloat64Sign(a: float64) : flag;
  1710. Begin
  1711. extractFloat64Sign := a.high shr 31;
  1712. End;
  1713. {*
  1714. -------------------------------------------------------------------------------
  1715. Normalizes the subnormal double-precision floating-point value represented
  1716. by the denormalized significand formed by the concatenation of `aSig0' and
  1717. `aSig1'. The normalized exponent is stored at the location pointed to by
  1718. `zExpPtr'. The most significant 21 bits of the normalized significand are
  1719. stored at the location pointed to by `zSig0Ptr', and the least significant
  1720. 32 bits of the normalized significand are stored at the location pointed to
  1721. by `zSig1Ptr'.
  1722. -------------------------------------------------------------------------------
  1723. *}
  1724. Procedure normalizeFloat64Subnormal(
  1725. aSig0: bits32;
  1726. aSig1: bits32;
  1727. VAR zExpPtr : Int16;
  1728. VAR zSig0Ptr : Bits32;
  1729. VAR zSig1Ptr : Bits32
  1730. );
  1731. Var
  1732. ShiftCount : Int8;
  1733. Begin
  1734. if ( aSig0 = 0 ) then
  1735. Begin
  1736. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  1737. if ( shiftCount < 0 ) then
  1738. Begin
  1739. zSig0Ptr := aSig1 shr ( - shiftCount );
  1740. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  1741. End
  1742. else
  1743. Begin
  1744. zSig0Ptr := aSig1 shl shiftCount;
  1745. zSig1Ptr := 0;
  1746. End;
  1747. zExpPtr := - shiftCount - 31;
  1748. End
  1749. else
  1750. Begin
  1751. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  1752. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  1753. zExpPtr := 1 - shiftCount;
  1754. End;
  1755. End;
  1756. {*
  1757. -------------------------------------------------------------------------------
  1758. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  1759. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  1760. point value, returning the result. After being shifted into the proper
  1761. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  1762. together to form the most significant 32 bits of the result. This means
  1763. that any integer portion of `zSig0' will be added into the exponent. Since
  1764. a properly normalized significand will have an integer portion equal to 1,
  1765. the `zExp' input should be 1 less than the desired result exponent whenever
  1766. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  1767. -------------------------------------------------------------------------------
  1768. *}
  1769. Procedure
  1770. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  1771. var
  1772. z: Float64;
  1773. Begin
  1774. z.low := zSig1;
  1775. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  1776. c := z;
  1777. End;
  1778. {*
  1779. -------------------------------------------------------------------------------
  1780. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1781. and extended significand formed by the concatenation of `zSig0', `zSig1',
  1782. and `zSig2', and returns the proper double-precision floating-point value
  1783. corresponding to the abstract input. Ordinarily, the abstract value is
  1784. simply rounded and packed into the double-precision format, with the inexact
  1785. exception raised if the abstract input cannot be represented exactly.
  1786. However, if the abstract value is too large, the overflow and inexact
  1787. exceptions are raised and an infinity or maximal finite value is returned.
  1788. If the abstract value is too small, the input value is rounded to a
  1789. subnormal number, and the underflow and inexact exceptions are raised if the
  1790. abstract input cannot be represented exactly as a subnormal double-precision
  1791. floating-point number.
  1792. The input significand must be normalized or smaller. If the input
  1793. significand is not normalized, `zExp' must be 0; in that case, the result
  1794. returned is a subnormal number, and it must not require rounding. In the
  1795. usual case that the input significand is normalized, `zExp' must be 1 less
  1796. than the ``true'' floating-point exponent. The handling of underflow and
  1797. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1798. -------------------------------------------------------------------------------
  1799. *}
  1800. Procedure
  1801. roundAndPackFloat64(
  1802. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  1803. Var
  1804. roundingMode : Int8;
  1805. roundNearestEven, increment, isTiny : Flag;
  1806. Begin
  1807. roundingMode := float_rounding_mode;
  1808. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  1809. increment := flag( sbits32 (zSig2) < 0 );
  1810. if ( roundNearestEven = flag(FALSE) ) then
  1811. Begin
  1812. if ( roundingMode = float_round_to_zero ) then
  1813. increment := 0
  1814. else
  1815. Begin
  1816. if ( zSign )<> 0 then
  1817. Begin
  1818. increment := flag( roundingMode = float_round_down ) and zSig2;
  1819. End
  1820. else
  1821. Begin
  1822. increment := flag( roundingMode = float_round_up ) and zSig2;
  1823. End
  1824. End
  1825. End;
  1826. if ( $7FD <= bits16 (zExp) ) then
  1827. Begin
  1828. if (( $7FD < zExp )
  1829. or (( zExp = $7FD )
  1830. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  1831. and (increment<>0)
  1832. )
  1833. ) then
  1834. Begin
  1835. float_raise( float_flag_overflow OR float_flag_inexact );
  1836. if (( roundingMode = float_round_to_zero )
  1837. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  1838. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  1839. ) then
  1840. Begin
  1841. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  1842. exit;
  1843. End;
  1844. packFloat64( zSign, $7FF, 0, 0, c );
  1845. exit;
  1846. End;
  1847. if ( zExp < 0 ) then
  1848. Begin
  1849. isTiny :=
  1850. flag( float_detect_tininess = float_tininess_before_rounding )
  1851. or flag( zExp < -1 )
  1852. or flag(increment = 0)
  1853. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  1854. shift64ExtraRightJamming(
  1855. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  1856. zExp := 0;
  1857. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  1858. if ( roundNearestEven )<>0 then
  1859. Begin
  1860. increment := flag( sbits32 (zSig2) < 0 );
  1861. End
  1862. else
  1863. Begin
  1864. if ( zSign )<>0 then
  1865. Begin
  1866. increment := flag( roundingMode = float_round_down ) and zSig2;
  1867. End
  1868. else
  1869. Begin
  1870. increment := flag( roundingMode = float_round_up ) and zSig2;
  1871. End
  1872. End;
  1873. End;
  1874. End;
  1875. if ( zSig2 )<>0 then
  1876. float_exception_flags := float_exception_flags OR float_flag_inexact;
  1877. if ( increment )<>0 then
  1878. Begin
  1879. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  1880. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  1881. End
  1882. else
  1883. Begin
  1884. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  1885. End;
  1886. packFloat64( zSign, zExp, zSig0, zSig1, c );
  1887. End;
  1888. {*
  1889. -------------------------------------------------------------------------------
  1890. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1891. and significand formed by the concatenation of `zSig0' and `zSig1', and
  1892. returns the proper double-precision floating-point value corresponding
  1893. to the abstract input. This routine is just like `roundAndPackFloat64'
  1894. except that the input significand has fewer bits and does not have to be
  1895. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  1896. point exponent.
  1897. -------------------------------------------------------------------------------
  1898. *}
  1899. Procedure
  1900. normalizeRoundAndPackFloat64(
  1901. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  1902. Var
  1903. shiftCount : int8;
  1904. zSig2 : bits32;
  1905. Begin
  1906. if ( zSig0 = 0 ) then
  1907. Begin
  1908. zSig0 := zSig1;
  1909. zSig1 := 0;
  1910. zExp := zExp -32;
  1911. End;
  1912. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  1913. if ( 0 <= shiftCount ) then
  1914. Begin
  1915. zSig2 := 0;
  1916. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  1917. End
  1918. else
  1919. Begin
  1920. shift64ExtraRightJamming
  1921. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  1922. End;
  1923. zExp := zExp - shiftCount;
  1924. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  1925. End;
  1926. {*
  1927. -------------------------------------------------------------------------------
  1928. Returns the result of converting the 32-bit two's complement integer `a' to
  1929. the single-precision floating-point format. The conversion is performed
  1930. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1931. -------------------------------------------------------------------------------
  1932. *}
  1933. Function int32_to_float32( a: int32): float32; {$ifdef fpc}[public,Alias:'INT32_TO_FLOAT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  1934. Var
  1935. zSign : Flag;
  1936. Begin
  1937. if ( a = 0 ) then
  1938. Begin
  1939. int32_to_float32 := 0;
  1940. exit;
  1941. End;
  1942. if ( a = sbits32 ($80000000) ) then
  1943. Begin
  1944. int32_to_float32 := packFloat32( 1, $9E, 0 );
  1945. exit;
  1946. end;
  1947. zSign := flag( a < 0 );
  1948. If zSign<>0 then
  1949. a := -a;
  1950. int32_to_float32:=
  1951. normalizeRoundAndPackFloat32( zSign, $9C, a );
  1952. End;
  1953. {*
  1954. -------------------------------------------------------------------------------
  1955. Returns the result of converting the 32-bit two's complement integer `a' to
  1956. the double-precision floating-point format. The conversion is performed
  1957. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1958. -------------------------------------------------------------------------------
  1959. *}
  1960. Procedure int32_to_float64( a: int32; var c: float64 );{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  1961. var
  1962. zSign : flag;
  1963. absA : bits32;
  1964. shiftCount : int8;
  1965. zSig0, zSig1 : bits32;
  1966. Begin
  1967. if ( a = 0 ) then
  1968. Begin
  1969. packFloat64( 0, 0, 0, 0, c );
  1970. exit;
  1971. end;
  1972. zSign := flag( a < 0 );
  1973. if ZSign<>0 then
  1974. AbsA := -a
  1975. else
  1976. AbsA := a;
  1977. shiftCount := countLeadingZeros32( absA ) - 11;
  1978. if ( 0 <= shiftCount ) then
  1979. Begin
  1980. zSig0 := absA shl shiftCount;
  1981. zSig1 := 0;
  1982. End
  1983. else
  1984. Begin
  1985. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  1986. End;
  1987. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1,c );
  1988. End;
  1989. {*
  1990. -------------------------------------------------------------------------------
  1991. Returns the result of converting the single-precision floating-point value
  1992. `a' to the 32-bit two's complement integer format. The conversion is
  1993. performed according to the IEC/IEEE Standard for Binary Floating-Point
  1994. Arithmetic---which means in particular that the conversion is rounded
  1995. according to the current rounding mode. If `a' is a NaN, the largest
  1996. positive integer is returned. Otherwise, if the conversion overflows, the
  1997. largest integer with the same sign as `a' is returned.
  1998. -------------------------------------------------------------------------------
  1999. *}
  2000. Function float32_to_int32( a : float32) : int32;{$ifdef fpc} [public,Alias:'FLOAT32_TO_INT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2001. Var
  2002. aSign: flag;
  2003. aExp, shiftCount: int16;
  2004. aSig, aSigExtra: bits32;
  2005. z: int32;
  2006. roundingMode: int8;
  2007. Begin
  2008. aSig := extractFloat32Frac( a );
  2009. aExp := extractFloat32Exp( a );
  2010. aSign := extractFloat32Sign( a );
  2011. shiftCount := aExp - $96;
  2012. if ( 0 <= shiftCount ) then
  2013. Begin
  2014. if ( $9E <= aExp ) then
  2015. Begin
  2016. if ( a <> $CF000000 ) then
  2017. Begin
  2018. float_raise( float_flag_invalid );
  2019. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2020. Begin
  2021. float32_to_int32 := $7FFFFFFF;
  2022. exit;
  2023. End;
  2024. End;
  2025. float32_to_int32 := sbits32 ($80000000);
  2026. exit;
  2027. End;
  2028. z := ( aSig or $00800000 ) shl shiftCount;
  2029. if ( aSign<>0 ) then z := - z;
  2030. End
  2031. else
  2032. Begin
  2033. if ( aExp < $7E ) then
  2034. Begin
  2035. aSigExtra := aExp OR aSig;
  2036. z := 0;
  2037. End
  2038. else
  2039. Begin
  2040. aSig := aSig OR $00800000;
  2041. aSigExtra := aSig shl ( shiftCount and 31 );
  2042. z := aSig shr ( - shiftCount );
  2043. End;
  2044. if ( aSigExtra<>0 ) then
  2045. float_exception_flags := float_exception_flags
  2046. or float_flag_inexact;
  2047. roundingMode := float_rounding_mode;
  2048. if ( roundingMode = float_round_nearest_even ) then
  2049. Begin
  2050. if ( sbits32 (aSigExtra) < 0 ) then
  2051. Begin
  2052. Inc(z);
  2053. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2054. z := z and not 1;
  2055. End;
  2056. if ( aSign<>0 ) then
  2057. z := - z;
  2058. End
  2059. else
  2060. Begin
  2061. aSigExtra := flag( aSigExtra <> 0 );
  2062. if ( aSign<>0 ) then
  2063. Begin
  2064. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2065. z := - z;
  2066. End
  2067. else
  2068. Begin
  2069. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2070. End
  2071. End;
  2072. End;
  2073. float32_to_int32 := z;
  2074. End;
  2075. {*
  2076. -------------------------------------------------------------------------------
  2077. Returns the result of converting the single-precision floating-point value
  2078. `a' to the 32-bit two's complement integer format. The conversion is
  2079. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2080. Arithmetic, except that the conversion is always rounded toward zero.
  2081. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2082. the conversion overflows, the largest integer with the same sign as `a' is
  2083. returned.
  2084. -------------------------------------------------------------------------------
  2085. *}
  2086. Function float32_to_int32_round_to_zero( a: Float32 ): int32;
  2087. {$ifdef fpc}[public,Alias:'FLOAT32_TO_INT32_ROUND_TO_ZERO'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2088. Var
  2089. aSign : flag;
  2090. aExp, shiftCount : int16;
  2091. aSig : bits32;
  2092. z : int32;
  2093. Begin
  2094. aSig := extractFloat32Frac( a );
  2095. aExp := extractFloat32Exp( a );
  2096. aSign := extractFloat32Sign( a );
  2097. shiftCount := aExp - $9E;
  2098. if ( 0 <= shiftCount ) then
  2099. Begin
  2100. if ( a <> $CF000000 ) then
  2101. Begin
  2102. float_raise( float_flag_invalid );
  2103. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2104. Begin
  2105. float32_to_int32_round_to_zero := $7FFFFFFF;
  2106. exit;
  2107. end;
  2108. End;
  2109. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2110. exit;
  2111. End
  2112. else
  2113. if ( aExp <= $7E ) then
  2114. Begin
  2115. if ( aExp or aSig )<>0 then
  2116. float_exception_flags :=
  2117. float_exception_flags or float_flag_inexact;
  2118. float32_to_int32_round_to_zero := 0;
  2119. exit;
  2120. End;
  2121. aSig := ( aSig or $00800000 ) shl 8;
  2122. z := aSig shr ( - shiftCount );
  2123. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2124. Begin
  2125. float_exception_flags :=
  2126. float_exception_flags or float_flag_inexact;
  2127. End;
  2128. if ( aSign<>0 ) then z := - z;
  2129. float32_to_int32_round_to_zero := z;
  2130. End;
  2131. {*
  2132. -------------------------------------------------------------------------------
  2133. Returns the result of converting the single-precision floating-point value
  2134. `a' to the double-precision floating-point format. The conversion is
  2135. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2136. Arithmetic.
  2137. -------------------------------------------------------------------------------
  2138. *}
  2139. Procedure float32_to_float64( a : float32; var out: Float64);
  2140. {$ifdef fpc}[public,Alias:'FLOAT32_TO_FLOAT64'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2141. Var
  2142. aSign : flag;
  2143. aExp : int16;
  2144. aSig, zSig0, zSig1: bits32;
  2145. tmp : CommonNanT;
  2146. Begin
  2147. aSig := extractFloat32Frac( a );
  2148. aExp := extractFloat32Exp( a );
  2149. aSign := extractFloat32Sign( a );
  2150. if ( aExp = $FF ) then
  2151. Begin
  2152. if ( aSig<>0 ) then
  2153. Begin
  2154. float32ToCommonNaN(a, tmp);
  2155. commonNaNToFloat64(tmp , out);
  2156. exit;
  2157. End;
  2158. packFloat64( aSign, $7FF, 0, 0, out );
  2159. exit;
  2160. End;
  2161. if ( aExp = 0 ) then
  2162. Begin
  2163. if ( aSig = 0 ) then
  2164. Begin
  2165. packFloat64( aSign, 0, 0, 0, out );
  2166. exit;
  2167. end;
  2168. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2169. Dec(aExp);
  2170. End;
  2171. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2172. packFloat64( aSign, aExp + $380, zSig0, zSig1, out );
  2173. End;
  2174. {*
  2175. -------------------------------------------------------------------------------
  2176. Rounds the single-precision floating-point value `a' to an integer,
  2177. and returns the result as a single-precision floating-point value. The
  2178. operation is performed according to the IEC/IEEE Standard for Binary
  2179. Floating-Point Arithmetic.
  2180. -------------------------------------------------------------------------------
  2181. *}
  2182. Function float32_round_to_int( a: float32): float32;
  2183. {$ifdef fpc}[public,Alias:'FLOAT32_ROUND_TO_INT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2184. Var
  2185. aSign: flag;
  2186. aExp: int16;
  2187. lastBitMask, roundBitsMask: bits32;
  2188. roundingMode: int8;
  2189. z: float32;
  2190. Begin
  2191. aExp := extractFloat32Exp( a );
  2192. if ( $96 <= aExp ) then
  2193. Begin
  2194. if ( ( aExp = $FF ) and (extractFloat32Frac( a )<>0) ) then
  2195. Begin
  2196. float32_round_to_int:= propagateFloat32NaN( a, a );
  2197. exit;
  2198. End;
  2199. float32_round_to_int:=a;
  2200. exit;
  2201. End;
  2202. if ( aExp <= $7E ) then
  2203. Begin
  2204. if ( bits32 ( a shl 1 ) = 0 ) then
  2205. Begin
  2206. float32_round_to_int:=a;
  2207. exit;
  2208. end;
  2209. float_exception_flags
  2210. := float_exception_flags OR float_flag_inexact;
  2211. aSign := extractFloat32Sign( a );
  2212. case ( float_rounding_mode ) of
  2213. float_round_nearest_even:
  2214. Begin
  2215. if ( ( aExp = $7E ) and (extractFloat32Frac( a )<>0) ) then
  2216. Begin
  2217. float32_round_to_int := packFloat32( aSign, $7F, 0 );
  2218. exit;
  2219. End;
  2220. End;
  2221. float_round_down:
  2222. Begin
  2223. if aSign <> 0 then
  2224. float32_round_to_int := $BF800000
  2225. else
  2226. float32_round_to_int := 0;
  2227. exit;
  2228. End;
  2229. float_round_up:
  2230. Begin
  2231. if aSign <> 0 then
  2232. float32_round_to_int := $80000000
  2233. else
  2234. float32_round_to_int := $3F800000;
  2235. exit;
  2236. End;
  2237. end;
  2238. float32_round_to_int := packFloat32( aSign, 0, 0 );
  2239. End;
  2240. lastBitMask := 1;
  2241. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2242. lastBitMask := lastBitMask shl ($96 - aExp);
  2243. roundBitsMask := lastBitMask - 1;
  2244. z := a;
  2245. roundingMode := float_rounding_mode;
  2246. if ( roundingMode = float_round_nearest_even ) then
  2247. Begin
  2248. z := z + (lastBitMask shr 1);
  2249. if ( ( z and roundBitsMask ) = 0 ) then
  2250. z := z and not lastBitMask;
  2251. End
  2252. else if ( roundingMode <> float_round_to_zero ) then
  2253. Begin
  2254. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2255. Begin
  2256. z := z + roundBitsMask;
  2257. End;
  2258. End;
  2259. z := z and not roundBitsMask;
  2260. if ( z <> a ) then
  2261. float_exception_flags := float_exception_flags or float_flag_inexact;
  2262. float32_round_to_int := z;
  2263. End;
  2264. {*
  2265. -------------------------------------------------------------------------------
  2266. Returns the result of adding the absolute values of the single-precision
  2267. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2268. before being returned. `zSign' is ignored if the result is a NaN.
  2269. The addition is performed according to the IEC/IEEE Standard for Binary
  2270. Floating-Point Arithmetic.
  2271. -------------------------------------------------------------------------------
  2272. *}
  2273. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2274. Var
  2275. aExp, bExp, zExp: int16;
  2276. aSig, bSig, zSig: bits32;
  2277. expDiff: int16;
  2278. label roundAndPack;
  2279. Begin
  2280. aSig:=extractFloat32Frac( a );
  2281. aExp:=extractFloat32Exp( a );
  2282. bSig:=extractFloat32Frac( b );
  2283. bExp := extractFloat32Exp( b );
  2284. expDiff := aExp - bExp;
  2285. aSig := aSig shl 6;
  2286. bSig := bSig shl 6;
  2287. if ( 0 < expDiff ) then
  2288. Begin
  2289. if ( aExp = $FF ) then
  2290. Begin
  2291. if ( aSig <> 0) then
  2292. Begin
  2293. addFloat32Sigs := propagateFloat32NaN( a, b );
  2294. exit;
  2295. End;
  2296. addFloat32Sigs := a;
  2297. exit;
  2298. End;
  2299. if ( bExp = 0 ) then
  2300. Begin
  2301. Dec(expDiff);
  2302. End
  2303. else
  2304. Begin
  2305. bSig := bSig or $20000000;
  2306. End;
  2307. shift32RightJamming( bSig, expDiff, bSig );
  2308. zExp := aExp;
  2309. End
  2310. else
  2311. If ( expDiff < 0 ) then
  2312. Begin
  2313. if ( bExp = $FF ) then
  2314. Begin
  2315. if ( bSig<>0 ) then
  2316. Begin
  2317. addFloat32Sigs := propagateFloat32NaN( a, b );
  2318. exit;
  2319. end;
  2320. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  2321. exit;
  2322. End;
  2323. if ( aExp = 0 ) then
  2324. Begin
  2325. Inc(expDiff);
  2326. End
  2327. else
  2328. Begin
  2329. aSig := aSig OR $20000000;
  2330. End;
  2331. shift32RightJamming( aSig, - expDiff, aSig );
  2332. zExp := bExp;
  2333. End
  2334. else
  2335. Begin
  2336. if ( aExp = $FF ) then
  2337. Begin
  2338. if ( aSig OR bSig )<> 0 then
  2339. Begin
  2340. addFloat32Sigs := propagateFloat32NaN( a, b );
  2341. exit;
  2342. end;
  2343. addFloat32Sigs := a;
  2344. exit;
  2345. End;
  2346. if ( aExp = 0 ) then
  2347. Begin
  2348. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  2349. exit;
  2350. end;
  2351. zSig := $40000000 + aSig + bSig;
  2352. zExp := aExp;
  2353. goto roundAndPack;
  2354. End;
  2355. aSig := aSig OR $20000000;
  2356. zSig := ( aSig + bSig ) shl 1;
  2357. Dec(zExp);
  2358. if ( sbits32 (zSig) < 0 ) then
  2359. Begin
  2360. zSig := aSig + bSig;
  2361. Inc(zExp);
  2362. End;
  2363. roundAndPack:
  2364. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  2365. End;
  2366. {*
  2367. -------------------------------------------------------------------------------
  2368. Returns the result of subtracting the absolute values of the single-
  2369. precision floating-point values `a' and `b'. If `zSign' is 1, the
  2370. difference is negated before being returned. `zSign' is ignored if the
  2371. result is a NaN. The subtraction is performed according to the IEC/IEEE
  2372. Standard for Binary Floating-Point Arithmetic.
  2373. -------------------------------------------------------------------------------
  2374. *}
  2375. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  2376. Var
  2377. aExp, bExp, zExp: int16;
  2378. aSig, bSig, zSig: bits32;
  2379. expDiff : int16;
  2380. label aExpBigger;
  2381. label bExpBigger;
  2382. label aBigger;
  2383. label bBigger;
  2384. label normalizeRoundAndPack;
  2385. Begin
  2386. aSig := extractFloat32Frac( a );
  2387. aExp := extractFloat32Exp( a );
  2388. bSig := extractFloat32Frac( b );
  2389. bExp := extractFloat32Exp( b );
  2390. expDiff := aExp - bExp;
  2391. aSig := aSig shl 7;
  2392. bSig := bSig shl 7;
  2393. if ( 0 < expDiff ) then goto aExpBigger;
  2394. if ( expDiff < 0 ) then goto bExpBigger;
  2395. if ( aExp = $FF ) then
  2396. Begin
  2397. if ( aSig OR bSig )<> 0 then
  2398. Begin
  2399. subFloat32Sigs := propagateFloat32NaN( a, b );
  2400. exit;
  2401. End;
  2402. float_raise( float_flag_invalid );
  2403. subFloat32Sigs := float32_default_nan;
  2404. exit;
  2405. End;
  2406. if ( aExp = 0 ) then
  2407. Begin
  2408. aExp := 1;
  2409. bExp := 1;
  2410. End;
  2411. if ( bSig < aSig ) Then goto aBigger;
  2412. if ( aSig < bSig ) Then goto bBigger;
  2413. subFloat32Sigs := packFloat32( flag(float_rounding_mode = float_round_down), 0, 0 );
  2414. exit;
  2415. bExpBigger:
  2416. if ( bExp = $FF ) then
  2417. Begin
  2418. if ( bSig<>0 ) then
  2419. Begin
  2420. subFloat32Sigs := propagateFloat32NaN( a, b );
  2421. exit;
  2422. End;
  2423. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  2424. exit;
  2425. End;
  2426. if ( aExp = 0 ) then
  2427. Begin
  2428. Inc(expDiff);
  2429. End
  2430. else
  2431. Begin
  2432. aSig := aSig OR $40000000;
  2433. End;
  2434. shift32RightJamming( aSig, - expDiff, aSig );
  2435. bSig := bSig OR $40000000;
  2436. bBigger:
  2437. zSig := bSig - aSig;
  2438. zExp := bExp;
  2439. zSign := zSign xor 1;
  2440. goto normalizeRoundAndPack;
  2441. aExpBigger:
  2442. if ( aExp = $FF ) then
  2443. Begin
  2444. if ( aSig <> 0) then
  2445. Begin
  2446. subFloat32Sigs := propagateFloat32NaN( a, b );
  2447. exit;
  2448. End;
  2449. subFloat32Sigs := a;
  2450. exit;
  2451. End;
  2452. if ( bExp = 0 ) then
  2453. Begin
  2454. Dec(expDiff);
  2455. End
  2456. else
  2457. Begin
  2458. bSig := bSig OR $40000000;
  2459. End;
  2460. shift32RightJamming( bSig, expDiff, bSig );
  2461. aSig := aSig OR $40000000;
  2462. aBigger:
  2463. zSig := aSig - bSig;
  2464. zExp := aExp;
  2465. normalizeRoundAndPack:
  2466. Dec(zExp);
  2467. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  2468. End;
  2469. {*
  2470. -------------------------------------------------------------------------------
  2471. Returns the result of adding the single-precision floating-point values `a'
  2472. and `b'. The operation is performed according to the IEC/IEEE Standard for
  2473. Binary Floating-Point Arithmetic.
  2474. -------------------------------------------------------------------------------
  2475. *}
  2476. Function float32_add( a: float32; b:float32 ): float32;{$ifdef fpc} [public,Alias:'FLOAT32_ADD'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2477. Var
  2478. aSign, bSign: Flag;
  2479. Begin
  2480. aSign := extractFloat32Sign( a );
  2481. bSign := extractFloat32Sign( b );
  2482. if ( aSign = bSign ) then
  2483. Begin
  2484. float32_add := addFloat32Sigs( a, b, aSign );
  2485. End
  2486. else
  2487. Begin
  2488. float32_add := subFloat32Sigs( a, b, aSign );
  2489. End;
  2490. End;
  2491. {*
  2492. -------------------------------------------------------------------------------
  2493. Returns the result of subtracting the single-precision floating-point values
  2494. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2495. for Binary Floating-Point Arithmetic.
  2496. -------------------------------------------------------------------------------
  2497. *}
  2498. Function float32_sub( a: float32 ; b:float32 ): float32;{$ifdef fpc} [public,Alias:'FLOAT32_SUB'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2499. Var
  2500. aSign, bSign: flag;
  2501. Begin
  2502. aSign := extractFloat32Sign( a );
  2503. bSign := extractFloat32Sign( b );
  2504. if ( aSign = bSign ) then
  2505. Begin
  2506. float32_sub := subFloat32Sigs( a, b, aSign );
  2507. End
  2508. else
  2509. Begin
  2510. float32_sub := addFloat32Sigs( a, b, aSign );
  2511. End;
  2512. End;
  2513. {*
  2514. -------------------------------------------------------------------------------
  2515. Returns the result of multiplying the single-precision floating-point values
  2516. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2517. for Binary Floating-Point Arithmetic.
  2518. -------------------------------------------------------------------------------
  2519. *}
  2520. Function float32_mul(a: float32; b: float32 ) : float32;{$ifdef fpc} [public,Alias:'FLOAT32_MUL'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2521. Var
  2522. aSign, bSign, zSign: flag;
  2523. aExp, bExp, zExp : int16;
  2524. aSig, bSig, zSig0, zSig1: bits32;
  2525. Begin
  2526. aSig := extractFloat32Frac( a );
  2527. aExp := extractFloat32Exp( a );
  2528. aSign := extractFloat32Sign( a );
  2529. bSig := extractFloat32Frac( b );
  2530. bExp := extractFloat32Exp( b );
  2531. bSign := extractFloat32Sign( b );
  2532. zSign := aSign xor bSign;
  2533. if ( aExp = $FF ) then
  2534. Begin
  2535. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  2536. Begin
  2537. float32_mul := propagateFloat32NaN( a, b );
  2538. End;
  2539. if ( ( bExp OR bSig ) = 0 ) then
  2540. Begin
  2541. float_raise( float_flag_invalid );
  2542. float32_mul := float32_default_nan;
  2543. exit;
  2544. End;
  2545. float32_mul := packFloat32( zSign, $FF, 0 );
  2546. exit;
  2547. End;
  2548. if ( bExp = $FF ) then
  2549. Begin
  2550. if ( bSig <> 0 ) then
  2551. Begin
  2552. float32_mul := propagateFloat32NaN( a, b );
  2553. exit;
  2554. End;
  2555. if ( ( aExp OR aSig ) = 0 ) then
  2556. Begin
  2557. float_raise( float_flag_invalid );
  2558. float32_mul := float32_default_nan;
  2559. exit;
  2560. End;
  2561. float32_mul := packFloat32( zSign, $FF, 0 );
  2562. exit;
  2563. End;
  2564. if ( aExp = 0 ) then
  2565. Begin
  2566. if ( aSig = 0 ) then
  2567. Begin
  2568. float32_mul := packFloat32( zSign, 0, 0 );
  2569. exit;
  2570. End;
  2571. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2572. End;
  2573. if ( bExp = 0 ) then
  2574. Begin
  2575. if ( bSig = 0 ) then
  2576. Begin
  2577. float32_mul := packFloat32( zSign, 0, 0 );
  2578. exit;
  2579. End;
  2580. normalizeFloat32Subnormal( bSig, bExp, bSig );
  2581. End;
  2582. zExp := aExp + bExp - $7F;
  2583. aSig := ( aSig OR $00800000 ) shl 7;
  2584. bSig := ( bSig OR $00800000 ) shl 8;
  2585. mul32To64( aSig, bSig, zSig0, zSig1 );
  2586. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  2587. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  2588. Begin
  2589. zSig0 := zSig0 shl 1;
  2590. Dec(zExp);
  2591. End;
  2592. float32_mul := roundAndPackFloat32( zSign, zExp, zSig0 );
  2593. End;
  2594. {*
  2595. -------------------------------------------------------------------------------
  2596. Returns the result of dividing the single-precision floating-point value `a'
  2597. by the corresponding value `b'. The operation is performed according to the
  2598. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2599. -------------------------------------------------------------------------------
  2600. *}
  2601. Function float32_div(a: float32;b: float32 ): float32;{$ifdef fpc} [public,Alias:'FLOAT32_DIV'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2602. Var
  2603. aSign, bSign, zSign: flag;
  2604. aExp, bExp, zExp: int16;
  2605. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  2606. Begin
  2607. aSig := extractFloat32Frac( a );
  2608. aExp := extractFloat32Exp( a );
  2609. aSign := extractFloat32Sign( a );
  2610. bSig := extractFloat32Frac( b );
  2611. bExp := extractFloat32Exp( b );
  2612. bSign := extractFloat32Sign( b );
  2613. zSign := aSign xor bSign;
  2614. if ( aExp = $FF ) then
  2615. Begin
  2616. if ( aSig <> 0 ) then
  2617. Begin
  2618. float32_div := propagateFloat32NaN( a, b );
  2619. exit;
  2620. End;
  2621. if ( bExp = $FF ) then
  2622. Begin
  2623. if ( bSig <> 0) then
  2624. Begin
  2625. float32_div := propagateFloat32NaN( a, b );
  2626. End;
  2627. float_raise( float_flag_invalid );
  2628. float32_div := float32_default_nan;
  2629. exit;
  2630. End;
  2631. float32_div := packFloat32( zSign, $FF, 0 );
  2632. exit;
  2633. End;
  2634. if ( bExp = $FF ) then
  2635. Begin
  2636. if ( bSig <> 0) then
  2637. Begin
  2638. float32_div := propagateFloat32NaN( a, b );
  2639. exit;
  2640. End;
  2641. float32_div := packFloat32( zSign, 0, 0 );
  2642. exit;
  2643. End;
  2644. if ( bExp = 0 ) Then
  2645. Begin
  2646. if ( bSig = 0 ) Then
  2647. Begin
  2648. if ( ( aExp OR aSig ) = 0 ) then
  2649. Begin
  2650. float_raise( float_flag_invalid );
  2651. float32_div := float32_default_nan;
  2652. exit;
  2653. End;
  2654. float_raise( float_flag_divbyzero );
  2655. float32_div := packFloat32( zSign, $FF, 0 );
  2656. exit;
  2657. End;
  2658. normalizeFloat32Subnormal( bSig, bExp, bSig );
  2659. End;
  2660. if ( aExp = 0 ) Then
  2661. Begin
  2662. if ( aSig = 0 ) Then
  2663. Begin
  2664. float32_div := packFloat32( zSign, 0, 0 );
  2665. exit;
  2666. End;
  2667. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2668. End;
  2669. zExp := aExp - bExp + $7D;
  2670. aSig := ( aSig OR $00800000 ) shl 7;
  2671. bSig := ( bSig OR $00800000 ) shl 8;
  2672. if ( bSig <= ( aSig + aSig ) ) then
  2673. Begin
  2674. aSig := aSig shr 1;
  2675. Inc(zExp);
  2676. End;
  2677. zSig := estimateDiv64To32( aSig, 0, bSig );
  2678. if ( ( zSig and $3F ) <= 2 ) then
  2679. Begin
  2680. mul32To64( bSig, zSig, term0, term1 );
  2681. sub64( aSig, 0, term0, term1, rem0, rem1 );
  2682. while ( sbits32 (rem0) < 0 ) do
  2683. Begin
  2684. Dec(zSig);
  2685. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  2686. End;
  2687. zSig := zSig or bits32( rem1 <> 0 );
  2688. End;
  2689. float32_div := roundAndPackFloat32( zSign, zExp, zSig );
  2690. End;
  2691. {*
  2692. -------------------------------------------------------------------------------
  2693. Returns the remainder of the single-precision floating-point value `a'
  2694. with respect to the corresponding value `b'. The operation is performed
  2695. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2696. -------------------------------------------------------------------------------
  2697. *}
  2698. Function float32_rem(a: float32; b: float32 ):float32;{$ifdef fpc} [public,Alias:'FLOAT32_REM'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2699. Var
  2700. aSign, bSign, zSign: flag;
  2701. aExp, bExp, expDiff: int16;
  2702. aSig, bSig, q, allZero, alternateASig: bits32;
  2703. sigMean: sbits32;
  2704. Begin
  2705. aSig := extractFloat32Frac( a );
  2706. aExp := extractFloat32Exp( a );
  2707. aSign := extractFloat32Sign( a );
  2708. bSig := extractFloat32Frac( b );
  2709. bExp := extractFloat32Exp( b );
  2710. bSign := extractFloat32Sign( b );
  2711. if ( aExp = $FF ) then
  2712. Begin
  2713. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  2714. Begin
  2715. float32_rem := propagateFloat32NaN( a, b );
  2716. exit;
  2717. End;
  2718. float_raise( float_flag_invalid );
  2719. float32_rem := float32_default_nan;
  2720. exit;
  2721. End;
  2722. if ( bExp = $FF ) then
  2723. Begin
  2724. if ( bSig <> 0 ) then
  2725. Begin
  2726. float32_rem := propagateFloat32NaN( a, b );
  2727. exit;
  2728. End;
  2729. float32_rem := a;
  2730. exit;
  2731. End;
  2732. if ( bExp = 0 ) then
  2733. Begin
  2734. if ( bSig = 0 ) then
  2735. Begin
  2736. float_raise( float_flag_invalid );
  2737. float32_rem := float32_default_nan;
  2738. exit;
  2739. End;
  2740. normalizeFloat32Subnormal( bSig, bExp, bSig );
  2741. End;
  2742. if ( aExp = 0 ) then
  2743. Begin
  2744. if ( aSig = 0 ) then
  2745. Begin
  2746. float32_rem := a;
  2747. exit;
  2748. End;
  2749. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2750. End;
  2751. expDiff := aExp - bExp;
  2752. aSig := ( aSig OR $00800000 ) shl 8;
  2753. bSig := ( bSig OR $00800000 ) shl 8;
  2754. if ( expDiff < 0 ) then
  2755. Begin
  2756. if ( expDiff < -1 ) then
  2757. Begin
  2758. float32_rem := a;
  2759. exit;
  2760. End;
  2761. aSig := aSig shr 1;
  2762. End;
  2763. q := bits32( bSig <= aSig );
  2764. if ( q <> 0) then
  2765. aSig := aSig - bSig;
  2766. expDiff := expDiff - 32;
  2767. while ( 0 < expDiff ) do
  2768. Begin
  2769. q := estimateDiv64To32( aSig, 0, bSig );
  2770. if (2 < q) then
  2771. q := q - 2
  2772. else
  2773. q := 0;
  2774. aSig := - ( ( bSig shr 2 ) * q );
  2775. expDiff := expDiff - 30;
  2776. End;
  2777. expDiff := expDiff + 32;
  2778. if ( 0 < expDiff ) then
  2779. Begin
  2780. q := estimateDiv64To32( aSig, 0, bSig );
  2781. if (2 < q) then
  2782. q := q - 2
  2783. else
  2784. q := 0;
  2785. q := q shr (32 - expDiff);
  2786. bSig := bSig shr 2;
  2787. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  2788. End
  2789. else
  2790. Begin
  2791. aSig := aSig shr 2;
  2792. bSig := bSig shr 2;
  2793. End;
  2794. Repeat
  2795. alternateASig := aSig;
  2796. Inc(q);
  2797. aSig := aSig - bSig;
  2798. Until not ( 0 <= sbits32 (aSig) );
  2799. sigMean := aSig + alternateASig;
  2800. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  2801. Begin
  2802. aSig := alternateASig;
  2803. End;
  2804. zSign := flag( sbits32 (aSig) < 0 );
  2805. if ( zSign<>0 ) then
  2806. aSig := - aSig;
  2807. float32_rem := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  2808. End;
  2809. {*
  2810. -------------------------------------------------------------------------------
  2811. Returns the square root of the single-precision floating-point value `a'.
  2812. The operation is performed according to the IEC/IEEE Standard for Binary
  2813. Floating-Point Arithmetic.
  2814. -------------------------------------------------------------------------------
  2815. *}
  2816. Function float32_sqrt(a: float32 ): float32;{$ifdef fpc} [public,Alias:'FLOAT32_SQRT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2817. Var
  2818. aSign : flag;
  2819. aExp, zExp : int16;
  2820. aSig, zSig, rem0, rem1, term0, term1: bits32;
  2821. label roundAndPack;
  2822. Begin
  2823. aSig := extractFloat32Frac( a );
  2824. aExp := extractFloat32Exp( a );
  2825. aSign := extractFloat32Sign( a );
  2826. if ( aExp = $FF ) then
  2827. Begin
  2828. if ( aSig <> 0) then
  2829. Begin
  2830. float32_sqrt := propagateFloat32NaN( a, 0 );
  2831. exit;
  2832. End;
  2833. if ( aSign = 0) then
  2834. Begin
  2835. float32_sqrt := a;
  2836. exit;
  2837. End;
  2838. float_raise( float_flag_invalid );
  2839. float32_sqrt := float32_default_nan;
  2840. exit;
  2841. End;
  2842. if ( aSign <> 0) then
  2843. Begin
  2844. if ( ( aExp OR aSig ) = 0 ) then
  2845. Begin
  2846. float32_sqrt := a;
  2847. exit;
  2848. End;
  2849. float_raise( float_flag_invalid );
  2850. float32_sqrt := float32_default_nan;
  2851. exit;
  2852. End;
  2853. if ( aExp = 0 ) then
  2854. Begin
  2855. if ( aSig = 0 ) then
  2856. Begin
  2857. float32_sqrt := 0;
  2858. exit;
  2859. End;
  2860. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2861. End;
  2862. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  2863. aSig := ( aSig OR $00800000 ) shl 8;
  2864. zSig := estimateSqrt32( aExp, aSig ) + 2;
  2865. if ( ( zSig and $7F ) <= 5 ) then
  2866. Begin
  2867. if ( zSig < 2 ) then
  2868. Begin
  2869. zSig := $7FFFFFFF;
  2870. goto roundAndPack;
  2871. End
  2872. else
  2873. Begin
  2874. aSig := aSig shr (aExp and 1);
  2875. mul32To64( zSig, zSig, term0, term1 );
  2876. sub64( aSig, 0, term0, term1, rem0, rem1 );
  2877. while ( sbits32 (rem0) < 0 ) do
  2878. Begin
  2879. Dec(zSig);
  2880. shortShift64Left( 0, zSig, 1, term0, term1 );
  2881. term1 := term1 or 1;
  2882. add64( rem0, rem1, term0, term1, rem0, rem1 );
  2883. End;
  2884. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  2885. End;
  2886. End;
  2887. shift32RightJamming( zSig, 1, zSig );
  2888. roundAndPack:
  2889. float32_sqrt := roundAndPackFloat32( 0, zExp, zSig );
  2890. End;
  2891. {*
  2892. -------------------------------------------------------------------------------
  2893. Returns 1 if the single-precision floating-point value `a' is equal to
  2894. the corresponding value `b', and 0 otherwise. The comparison is performed
  2895. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2896. -------------------------------------------------------------------------------
  2897. *}
  2898. Function float32_eq( a:float32; b:float32): flag;{$ifdef fpc} [public,Alias:'FLOAT32_EQ'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2899. Begin
  2900. if ((( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0))
  2901. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  2902. ) then
  2903. Begin
  2904. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  2905. Begin
  2906. float_raise( float_flag_invalid );
  2907. End;
  2908. float32_eq := 0;
  2909. exit;
  2910. End;
  2911. float32_eq := flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  2912. End;
  2913. {*
  2914. -------------------------------------------------------------------------------
  2915. Returns 1 if the single-precision floating-point value `a' is less than
  2916. or equal to the corresponding value `b', and 0 otherwise. The comparison
  2917. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2918. Arithmetic.
  2919. -------------------------------------------------------------------------------
  2920. *}
  2921. Function float32_le( a: float32; b : float32 ):flag;{$ifdef fpc} [public,Alias:'FLOAT32_LE'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2922. var
  2923. aSign, bSign: flag;
  2924. Begin
  2925. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  2926. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  2927. ) then
  2928. Begin
  2929. float_raise( float_flag_invalid );
  2930. float32_le := 0;
  2931. exit;
  2932. End;
  2933. aSign := extractFloat32Sign( a );
  2934. bSign := extractFloat32Sign( b );
  2935. if ( aSign <> bSign ) then
  2936. Begin
  2937. float32_le := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  2938. exit;
  2939. End;
  2940. float32_le := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  2941. End;
  2942. {*
  2943. -------------------------------------------------------------------------------
  2944. Returns 1 if the single-precision floating-point value `a' is less than
  2945. the corresponding value `b', and 0 otherwise. The comparison is performed
  2946. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2947. -------------------------------------------------------------------------------
  2948. *}
  2949. Function float32_lt( a:float32 ; b : float32): flag;{$ifdef fpc} [public,Alias:'FLOAT32_LT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  2950. var
  2951. aSign, bSign: flag;
  2952. Begin
  2953. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <>0))
  2954. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <>0) )
  2955. ) then
  2956. Begin
  2957. float_raise( float_flag_invalid );
  2958. float32_lt :=0;
  2959. exit;
  2960. End;
  2961. aSign := extractFloat32Sign( a );
  2962. bSign := extractFloat32Sign( b );
  2963. if ( aSign <> bSign ) then
  2964. Begin
  2965. float32_lt := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  2966. exit;
  2967. End;
  2968. float32_lt := flag(flag( a <> b ) AND flag( aSign xor flag( a < b ) ));
  2969. End;
  2970. {*
  2971. -------------------------------------------------------------------------------
  2972. Returns 1 if the single-precision floating-point value `a' is equal to
  2973. the corresponding value `b', and 0 otherwise. The invalid exception is
  2974. raised if either operand is a NaN. Otherwise, the comparison is performed
  2975. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2976. -------------------------------------------------------------------------------
  2977. *}
  2978. Function float32_eq_signaling( a: float32; b: float32) : flag;
  2979. Begin
  2980. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  2981. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  2982. ) then
  2983. Begin
  2984. float_raise( float_flag_invalid );
  2985. float32_eq_signaling := 0;
  2986. exit;
  2987. End;
  2988. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  2989. End;
  2990. {*
  2991. -------------------------------------------------------------------------------
  2992. Returns 1 if the single-precision floating-point value `a' is less than or
  2993. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  2994. cause an exception. Otherwise, the comparison is performed according to the
  2995. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2996. -------------------------------------------------------------------------------
  2997. *}
  2998. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  2999. Var
  3000. aSign, bSign: flag;
  3001. aExp, bExp: int16;
  3002. Begin
  3003. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3004. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3005. ) then
  3006. Begin
  3007. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3008. Begin
  3009. float_raise( float_flag_invalid );
  3010. End;
  3011. float32_le_quiet := 0;
  3012. exit;
  3013. End;
  3014. aSign := extractFloat32Sign( a );
  3015. bSign := extractFloat32Sign( b );
  3016. if ( aSign <> bSign ) then
  3017. Begin
  3018. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3019. exit;
  3020. End;
  3021. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3022. End;
  3023. {*
  3024. -------------------------------------------------------------------------------
  3025. Returns 1 if the single-precision floating-point value `a' is less than
  3026. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3027. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3028. Standard for Binary Floating-Point Arithmetic.
  3029. -------------------------------------------------------------------------------
  3030. *}
  3031. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3032. Var
  3033. aSign, bSign: flag;
  3034. Begin
  3035. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3036. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3037. ) then
  3038. Begin
  3039. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3040. Begin
  3041. float_raise( float_flag_invalid );
  3042. End;
  3043. float32_lt_quiet := 0;
  3044. exit;
  3045. End;
  3046. aSign := extractFloat32Sign( a );
  3047. bSign := extractFloat32Sign( b );
  3048. if ( aSign <> bSign ) then
  3049. Begin
  3050. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3051. exit;
  3052. End;
  3053. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3054. End;
  3055. {*
  3056. -------------------------------------------------------------------------------
  3057. Returns the result of converting the double-precision floating-point value
  3058. `a' to the 32-bit two's complement integer format. The conversion is
  3059. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3060. Arithmetic---which means in particular that the conversion is rounded
  3061. according to the current rounding mode. If `a' is a NaN, the largest
  3062. positive integer is returned. Otherwise, if the conversion overflows, the
  3063. largest integer with the same sign as `a' is returned.
  3064. -------------------------------------------------------------------------------
  3065. *}
  3066. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3067. var
  3068. aSign: flag;
  3069. aExp, shiftCount: int16;
  3070. aSig0, aSig1, absZ, aSigExtra: bits32;
  3071. z: int32;
  3072. roundingMode: int8;
  3073. label invalid;
  3074. Begin
  3075. aSig1 := extractFloat64Frac1( a );
  3076. aSig0 := extractFloat64Frac0( a );
  3077. aExp := extractFloat64Exp( a );
  3078. aSign := extractFloat64Sign( a );
  3079. shiftCount := aExp - $413;
  3080. if ( 0 <= shiftCount ) then
  3081. Begin
  3082. if ( $41E < aExp ) then
  3083. Begin
  3084. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3085. aSign := 0;
  3086. goto invalid;
  3087. End;
  3088. shortShift64Left(
  3089. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3090. if ( $80000000 < absZ ) then
  3091. goto invalid;
  3092. End
  3093. else
  3094. Begin
  3095. aSig1 := flag( aSig1 <> 0 );
  3096. if ( aExp < $3FE ) then
  3097. Begin
  3098. aSigExtra := aExp OR aSig0 OR aSig1;
  3099. absZ := 0;
  3100. End
  3101. else
  3102. Begin
  3103. aSig0 := aSig0 OR $00100000;
  3104. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3105. absZ := aSig0 shr ( - shiftCount );
  3106. End;
  3107. End;
  3108. roundingMode := float_rounding_mode;
  3109. if ( roundingMode = float_round_nearest_even ) then
  3110. Begin
  3111. if ( sbits32(aSigExtra) < 0 ) then
  3112. Begin
  3113. Inc(absZ);
  3114. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3115. absZ := absZ and not 1;
  3116. End;
  3117. if aSign <> 0 then
  3118. z := - absZ
  3119. else
  3120. z := absZ;
  3121. End
  3122. else
  3123. Begin
  3124. aSigExtra := bits32( aSigExtra <> 0 );
  3125. if ( aSign <> 0) then
  3126. Begin
  3127. z := - ( absZ
  3128. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3129. End
  3130. else
  3131. Begin
  3132. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3133. End
  3134. End;
  3135. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3136. Begin
  3137. invalid:
  3138. float_raise( float_flag_invalid );
  3139. if (aSign <> 0 ) then
  3140. float64_to_int32 := sbits32 ($80000000)
  3141. else
  3142. float64_to_int32 := $7FFFFFFF;
  3143. exit;
  3144. End;
  3145. if ( aSigExtra <> 0) then
  3146. float_exception_flags := float_exception_flags or float_flag_inexact;
  3147. float64_to_int32 := z;
  3148. End;
  3149. {*
  3150. -------------------------------------------------------------------------------
  3151. Returns the result of converting the double-precision floating-point value
  3152. `a' to the 32-bit two's complement integer format. The conversion is
  3153. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3154. Arithmetic, except that the conversion is always rounded toward zero.
  3155. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3156. the conversion overflows, the largest integer with the same sign as `a' is
  3157. returned.
  3158. -------------------------------------------------------------------------------
  3159. *}
  3160. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3161. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3162. Var
  3163. aSign: flag;
  3164. aExp, shiftCount: int16;
  3165. aSig0, aSig1, absZ, aSigExtra: bits32;
  3166. z: int32;
  3167. label invalid;
  3168. Begin
  3169. aSig1 := extractFloat64Frac1( a );
  3170. aSig0 := extractFloat64Frac0( a );
  3171. aExp := extractFloat64Exp( a );
  3172. aSign := extractFloat64Sign( a );
  3173. shiftCount := aExp - $413;
  3174. if ( 0 <= shiftCount ) then
  3175. Begin
  3176. if ( $41E < aExp ) then
  3177. Begin
  3178. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3179. aSign := 0;
  3180. goto invalid;
  3181. End;
  3182. shortShift64Left(
  3183. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3184. End
  3185. else
  3186. Begin
  3187. if ( aExp < $3FF ) then
  3188. Begin
  3189. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3190. Begin
  3191. float_exception_flags :=
  3192. float_exception_flags or float_flag_inexact;
  3193. End;
  3194. float64_to_int32_round_to_zero := 0;
  3195. exit;
  3196. End;
  3197. aSig0 := aSig0 or $00100000;
  3198. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3199. absZ := aSig0 shr ( - shiftCount );
  3200. End;
  3201. if aSign <> 0 then
  3202. z := - absZ
  3203. else
  3204. z := absZ;
  3205. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3206. Begin
  3207. invalid:
  3208. float_raise( float_flag_invalid );
  3209. if (aSign <> 0) then
  3210. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3211. else
  3212. float64_to_int32_round_to_zero := $7FFFFFFF;
  3213. exit;
  3214. End;
  3215. if ( aSigExtra <> 0) then
  3216. float_exception_flags := float_exception_flags or float_flag_inexact;
  3217. float64_to_int32_round_to_zero := z;
  3218. End;
  3219. {*
  3220. -------------------------------------------------------------------------------
  3221. Returns the result of converting the double-precision floating-point value
  3222. `a' to the single-precision floating-point format. The conversion is
  3223. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3224. Arithmetic.
  3225. -------------------------------------------------------------------------------
  3226. *}
  3227. Function float64_to_float32(a: float64 ): float32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_FLOAT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3228. Var
  3229. aSign: flag;
  3230. aExp: int16;
  3231. aSig0, aSig1, zSig: bits32;
  3232. allZero: bits32;
  3233. tmp : CommonNanT;
  3234. Begin
  3235. aSig1 := extractFloat64Frac1( a );
  3236. aSig0 := extractFloat64Frac0( a );
  3237. aExp := extractFloat64Exp( a );
  3238. aSign := extractFloat64Sign( a );
  3239. if ( aExp = $7FF ) then
  3240. Begin
  3241. if ( aSig0 OR aSig1 ) <> 0 then
  3242. Begin
  3243. float64ToCommonNaN( a, tmp );
  3244. float64_to_float32 := commonNaNToFloat32( tmp );
  3245. exit;
  3246. End;
  3247. float64_to_float32 := packFloat32( aSign, $FF, 0 );
  3248. exit;
  3249. End;
  3250. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3251. if ( aExp <> 0) then
  3252. zSig := zSig OR $40000000;
  3253. float64_to_float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3254. End;
  3255. {*
  3256. -------------------------------------------------------------------------------
  3257. Rounds the double-precision floating-point value `a' to an integer,
  3258. and returns the result as a double-precision floating-point value. The
  3259. operation is performed according to the IEC/IEEE Standard for Binary
  3260. Floating-Point Arithmetic.
  3261. -------------------------------------------------------------------------------
  3262. *}
  3263. Procedure float64_round_to_int(a: float64; var out: float64 );{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3264. Var
  3265. aSign: flag;
  3266. aExp: int16;
  3267. lastBitMask, roundBitsMask: bits32;
  3268. roundingMode: int8;
  3269. z: float64;
  3270. Begin
  3271. aExp := extractFloat64Exp( a );
  3272. if ( $413 <= aExp ) then
  3273. Begin
  3274. if ( $433 <= aExp ) then
  3275. Begin
  3276. if ( ( aExp = $7FF )
  3277. AND
  3278. (
  3279. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3280. ) <>0)
  3281. ) then
  3282. Begin
  3283. propagateFloat64NaN( a, a, out );
  3284. exit;
  3285. End;
  3286. out := a;
  3287. exit;
  3288. End;
  3289. lastBitMask := 1;
  3290. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3291. roundBitsMask := lastBitMask - 1;
  3292. z := a;
  3293. roundingMode := float_rounding_mode;
  3294. if ( roundingMode = float_round_nearest_even ) then
  3295. Begin
  3296. if ( lastBitMask <> 0) then
  3297. Begin
  3298. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3299. if ( ( z.low and roundBitsMask ) = 0 ) then
  3300. z.low := z.low and not lastBitMask;
  3301. End
  3302. else
  3303. Begin
  3304. if ( sbits32 (z.low) < 0 ) then
  3305. Begin
  3306. Inc(z.high);
  3307. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3308. z.high := z.high and not 1;
  3309. End;
  3310. End;
  3311. End
  3312. else if ( roundingMode <> float_round_to_zero ) then
  3313. Begin
  3314. if ( extractFloat64Sign( z )
  3315. xor flag( roundingMode = float_round_up ) )<> 0 then
  3316. Begin
  3317. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3318. End;
  3319. End;
  3320. z.low := z.low and not roundBitsMask;
  3321. End
  3322. else
  3323. Begin
  3324. if ( aExp <= $3FE ) then
  3325. Begin
  3326. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  3327. Begin
  3328. out := a;
  3329. exit;
  3330. End;
  3331. float_exception_flags := float_exception_flags or
  3332. float_flag_inexact;
  3333. aSign := extractFloat64Sign( a );
  3334. case ( float_rounding_mode ) of
  3335. float_round_nearest_even:
  3336. Begin
  3337. if ( ( aExp = $3FE )
  3338. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  3339. ) then
  3340. Begin
  3341. packFloat64( aSign, $3FF, 0, 0, out );
  3342. exit;
  3343. End;
  3344. End;
  3345. float_round_down:
  3346. Begin
  3347. if aSign<>0 then
  3348. packFloat64( 1, $3FF, 0, 0, out )
  3349. else
  3350. packFloat64( 0, 0, 0, 0, out );
  3351. exit;
  3352. End;
  3353. float_round_up:
  3354. Begin
  3355. if aSign <> 0 then
  3356. packFloat64( 1, 0, 0, 0, out )
  3357. else
  3358. packFloat64( 0, $3FF, 0, 0, out );
  3359. exit;
  3360. End;
  3361. end;
  3362. packFloat64( aSign, 0, 0, 0, out );
  3363. exit;
  3364. End;
  3365. lastBitMask := 1;
  3366. lastBitMask := lastBitMask shl ($413 - aExp);
  3367. roundBitsMask := lastBitMask - 1;
  3368. z.low := 0;
  3369. z.high := a.high;
  3370. roundingMode := float_rounding_mode;
  3371. if ( roundingMode = float_round_nearest_even ) then
  3372. Begin
  3373. z.high := z.high + lastBitMask shr 1;
  3374. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  3375. Begin
  3376. z.high := z.high and not lastBitMask;
  3377. End;
  3378. End
  3379. else if ( roundingMode <> float_round_to_zero ) then
  3380. Begin
  3381. if ( extractFloat64Sign( z )
  3382. xor flag( roundingMode = float_round_up ) )<> 0 then
  3383. Begin
  3384. z.high := z.high or bits32( a.low <> 0 );
  3385. z.high := z.high + roundBitsMask;
  3386. End;
  3387. End;
  3388. z.high := z.high and not roundBitsMask;
  3389. End;
  3390. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  3391. Begin
  3392. float_exception_flags :=
  3393. float_exception_flags or float_flag_inexact;
  3394. End;
  3395. out := z;
  3396. End;
  3397. {*
  3398. -------------------------------------------------------------------------------
  3399. Returns the result of adding the absolute values of the double-precision
  3400. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3401. before being returned. `zSign' is ignored if the result is a NaN.
  3402. The addition is performed according to the IEC/IEEE Standard for Binary
  3403. Floating-Point Arithmetic.
  3404. -------------------------------------------------------------------------------
  3405. *}
  3406. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  3407. Var
  3408. aExp, bExp, zExp: int16;
  3409. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  3410. expDiff: int16;
  3411. label shiftRight1;
  3412. label roundAndPack;
  3413. Begin
  3414. aSig1 := extractFloat64Frac1( a );
  3415. aSig0 := extractFloat64Frac0( a );
  3416. aExp := extractFloat64Exp( a );
  3417. bSig1 := extractFloat64Frac1( b );
  3418. bSig0 := extractFloat64Frac0( b );
  3419. bExp := extractFloat64Exp( b );
  3420. expDiff := aExp - bExp;
  3421. if ( 0 < expDiff ) then
  3422. Begin
  3423. if ( aExp = $7FF ) then
  3424. Begin
  3425. if ( aSig0 OR aSig1 ) <> 0 then
  3426. Begin
  3427. propagateFloat64NaN( a, b, out );
  3428. exit;
  3429. end;
  3430. out := a;
  3431. exit;
  3432. End;
  3433. if ( bExp = 0 ) then
  3434. Begin
  3435. Dec(expDiff);
  3436. End
  3437. else
  3438. Begin
  3439. bSig0 := bSig0 or $00100000;
  3440. End;
  3441. shift64ExtraRightJamming(
  3442. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  3443. zExp := aExp;
  3444. End
  3445. else if ( expDiff < 0 ) then
  3446. Begin
  3447. if ( bExp = $7FF ) then
  3448. Begin
  3449. if ( bSig0 OR bSig1 ) <> 0 then
  3450. Begin
  3451. propagateFloat64NaN( a, b, out );
  3452. exit;
  3453. End;
  3454. packFloat64( zSign, $7FF, 0, 0, out );
  3455. End;
  3456. if ( aExp = 0 ) then
  3457. Begin
  3458. Inc(expDiff);
  3459. End
  3460. else
  3461. Begin
  3462. aSig0 := aSig0 or $00100000;
  3463. End;
  3464. shift64ExtraRightJamming(
  3465. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  3466. zExp := bExp;
  3467. End
  3468. else
  3469. Begin
  3470. if ( aExp = $7FF ) then
  3471. Begin
  3472. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  3473. Begin
  3474. propagateFloat64NaN( a, b, out );
  3475. exit;
  3476. End;
  3477. out := a;
  3478. exit;
  3479. End;
  3480. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3481. if ( aExp = 0 ) then
  3482. Begin
  3483. packFloat64( zSign, 0, zSig0, zSig1, out );
  3484. exit;
  3485. End;
  3486. zSig2 := 0;
  3487. zSig0 := zSig0 or $00200000;
  3488. zExp := aExp;
  3489. goto shiftRight1;
  3490. End;
  3491. aSig0 := aSig0 or $00100000;
  3492. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3493. Dec(zExp);
  3494. if ( zSig0 < $00200000 ) then
  3495. goto roundAndPack;
  3496. Inc(zExp);
  3497. shiftRight1:
  3498. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  3499. roundAndPack:
  3500. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  3501. End;
  3502. {*
  3503. -------------------------------------------------------------------------------
  3504. Returns the result of subtracting the absolute values of the double-
  3505. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3506. difference is negated before being returned. `zSign' is ignored if the
  3507. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3508. Standard for Binary Floating-Point Arithmetic.
  3509. -------------------------------------------------------------------------------
  3510. *}
  3511. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  3512. Var
  3513. aExp, bExp, zExp: int16;
  3514. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  3515. expDiff: int16;
  3516. z: float64;
  3517. label aExpBigger;
  3518. label bExpBigger;
  3519. label aBigger;
  3520. label bBigger;
  3521. label normalizeRoundAndPack;
  3522. Begin
  3523. aSig1 := extractFloat64Frac1( a );
  3524. aSig0 := extractFloat64Frac0( a );
  3525. aExp := extractFloat64Exp( a );
  3526. bSig1 := extractFloat64Frac1( b );
  3527. bSig0 := extractFloat64Frac0( b );
  3528. bExp := extractFloat64Exp( b );
  3529. expDiff := aExp - bExp;
  3530. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  3531. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  3532. if ( 0 < expDiff ) then goto aExpBigger;
  3533. if ( expDiff < 0 ) then goto bExpBigger;
  3534. if ( aExp = $7FF ) then
  3535. Begin
  3536. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  3537. Begin
  3538. propagateFloat64NaN( a, b, out );
  3539. exit;
  3540. End;
  3541. float_raise( float_flag_invalid );
  3542. z.low := float64_default_nan_low;
  3543. z.high := float64_default_nan_high;
  3544. out := z;
  3545. exit;
  3546. End;
  3547. if ( aExp = 0 ) then
  3548. Begin
  3549. aExp := 1;
  3550. bExp := 1;
  3551. End;
  3552. if ( bSig0 < aSig0 ) then goto aBigger;
  3553. if ( aSig0 < bSig0 ) then goto bBigger;
  3554. if ( bSig1 < aSig1 ) then goto aBigger;
  3555. if ( aSig1 < bSig1 ) then goto bBigger;
  3556. packFloat64( flag(float_rounding_mode = float_round_down), 0, 0, 0 , out);
  3557. exit;
  3558. bExpBigger:
  3559. if ( bExp = $7FF ) then
  3560. Begin
  3561. if ( bSig0 OR bSig1 ) <> 0 then
  3562. Begin
  3563. propagateFloat64NaN( a, b, out );
  3564. exit;
  3565. End;
  3566. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  3567. exit;
  3568. End;
  3569. if ( aExp = 0 ) then
  3570. Begin
  3571. Inc(expDiff);
  3572. End
  3573. else
  3574. Begin
  3575. aSig0 := aSig0 or $40000000;
  3576. End;
  3577. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  3578. bSig0 := bSig0 or $40000000;
  3579. bBigger:
  3580. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  3581. zExp := bExp;
  3582. zSign := zSign xor 1;
  3583. goto normalizeRoundAndPack;
  3584. aExpBigger:
  3585. if ( aExp = $7FF ) then
  3586. Begin
  3587. if ( aSig0 OR aSig1 ) <> 0 then
  3588. Begin
  3589. propagateFloat64NaN( a, b, out );
  3590. exit;
  3591. End;
  3592. out := a;
  3593. exit;
  3594. End;
  3595. if ( bExp = 0 ) then
  3596. Begin
  3597. Dec(expDiff);
  3598. End
  3599. else
  3600. Begin
  3601. bSig0 := bSig0 or $40000000;
  3602. End;
  3603. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  3604. aSig0 := aSig0 or $40000000;
  3605. aBigger:
  3606. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3607. zExp := aExp;
  3608. normalizeRoundAndPack:
  3609. Dec(zExp);
  3610. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  3611. End;
  3612. {*
  3613. -------------------------------------------------------------------------------
  3614. Returns the result of adding the double-precision floating-point values `a'
  3615. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3616. Binary Floating-Point Arithmetic.
  3617. -------------------------------------------------------------------------------
  3618. *}
  3619. Procedure float64_add( a: float64; b : float64; Var out : float64);
  3620. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3621. Var
  3622. aSign, bSign: flag;
  3623. Begin
  3624. aSign := extractFloat64Sign( a );
  3625. bSign := extractFloat64Sign( b );
  3626. if ( aSign = bSign ) then
  3627. Begin
  3628. addFloat64Sigs( a, b, aSign, out );
  3629. End
  3630. else
  3631. Begin
  3632. subFloat64Sigs( a, b, aSign, out );
  3633. End;
  3634. End;
  3635. {*
  3636. -------------------------------------------------------------------------------
  3637. Returns the result of subtracting the double-precision floating-point values
  3638. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3639. for Binary Floating-Point Arithmetic.
  3640. -------------------------------------------------------------------------------
  3641. *}
  3642. Procedure float64_sub(a: float64; b : float64; var out: float64);
  3643. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3644. Var
  3645. aSign, bSign: flag;
  3646. Begin
  3647. aSign := extractFloat64Sign( a );
  3648. bSign := extractFloat64Sign( b );
  3649. if ( aSign = bSign ) then
  3650. Begin
  3651. subFloat64Sigs( a, b, aSign, out );
  3652. End
  3653. else
  3654. Begin
  3655. addFloat64Sigs( a, b, aSign, out );
  3656. End;
  3657. End;
  3658. {*
  3659. -------------------------------------------------------------------------------
  3660. Returns the result of multiplying the double-precision floating-point values
  3661. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3662. for Binary Floating-Point Arithmetic.
  3663. -------------------------------------------------------------------------------
  3664. *}
  3665. Procedure float64_mul( a: float64; b:float64; Var out: float64);
  3666. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3667. Var
  3668. aSign, bSign, zSign: flag;
  3669. aExp, bExp, zExp: int16;
  3670. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  3671. z: float64;
  3672. label invalid;
  3673. Begin
  3674. aSig1 := extractFloat64Frac1( a );
  3675. aSig0 := extractFloat64Frac0( a );
  3676. aExp := extractFloat64Exp( a );
  3677. aSign := extractFloat64Sign( a );
  3678. bSig1 := extractFloat64Frac1( b );
  3679. bSig0 := extractFloat64Frac0( b );
  3680. bExp := extractFloat64Exp( b );
  3681. bSign := extractFloat64Sign( b );
  3682. zSign := aSign xor bSign;
  3683. if ( aExp = $7FF ) then
  3684. Begin
  3685. if ( (( aSig0 OR aSig1 ) <>0)
  3686. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  3687. Begin
  3688. propagateFloat64NaN( a, b, out );
  3689. exit;
  3690. End;
  3691. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  3692. packFloat64( zSign, $7FF, 0, 0, out );
  3693. exit;
  3694. End;
  3695. if ( bExp = $7FF ) then
  3696. Begin
  3697. if ( bSig0 OR bSig1 )<> 0 then
  3698. Begin
  3699. propagateFloat64NaN( a, b, out );
  3700. exit;
  3701. End;
  3702. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  3703. Begin
  3704. invalid:
  3705. float_raise( float_flag_invalid );
  3706. z.low := float64_default_nan_low;
  3707. z.high := float64_default_nan_high;
  3708. out := z;
  3709. exit;
  3710. End;
  3711. packFloat64( zSign, $7FF, 0, 0, out );
  3712. exit;
  3713. End;
  3714. if ( aExp = 0 ) then
  3715. Begin
  3716. if ( ( aSig0 OR aSig1 ) = 0 ) then
  3717. Begin
  3718. packFloat64( zSign, 0, 0, 0, out );
  3719. exit;
  3720. End;
  3721. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  3722. End;
  3723. if ( bExp = 0 ) then
  3724. Begin
  3725. if ( ( bSig0 OR bSig1 ) = 0 ) then
  3726. Begin
  3727. packFloat64( zSign, 0, 0, 0, out );
  3728. exit;
  3729. End;
  3730. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  3731. End;
  3732. zExp := aExp + bExp - $400;
  3733. aSig0 := aSig0 or $00100000;
  3734. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  3735. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  3736. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  3737. zSig2 := zSig2 or flag( zSig3 <> 0 );
  3738. if ( $00200000 <= zSig0 ) then
  3739. Begin
  3740. shift64ExtraRightJamming(
  3741. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  3742. Inc(zExp);
  3743. End;
  3744. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  3745. End;
  3746. {*
  3747. -------------------------------------------------------------------------------
  3748. Returns the result of dividing the double-precision floating-point value `a'
  3749. by the corresponding value `b'. The operation is performed according to the
  3750. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3751. -------------------------------------------------------------------------------
  3752. *}
  3753. Procedure float64_div(a: float64; b : float64 ; var out: float64 );
  3754. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3755. Var
  3756. aSign, bSign, zSign: flag;
  3757. aExp, bExp, zExp: int16;
  3758. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  3759. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  3760. z: float64;
  3761. label invalid;
  3762. Begin
  3763. aSig1 := extractFloat64Frac1( a );
  3764. aSig0 := extractFloat64Frac0( a );
  3765. aExp := extractFloat64Exp( a );
  3766. aSign := extractFloat64Sign( a );
  3767. bSig1 := extractFloat64Frac1( b );
  3768. bSig0 := extractFloat64Frac0( b );
  3769. bExp := extractFloat64Exp( b );
  3770. bSign := extractFloat64Sign( b );
  3771. zSign := aSign xor bSign;
  3772. if ( aExp = $7FF ) then
  3773. Begin
  3774. if ( aSig0 OR aSig1 )<> 0 then
  3775. Begin
  3776. propagateFloat64NaN( a, b, out );
  3777. exit;
  3778. end;
  3779. if ( bExp = $7FF ) then
  3780. Begin
  3781. if ( bSig0 OR bSig1 )<>0 then
  3782. Begin
  3783. propagateFloat64NaN( a, b, out );
  3784. exit;
  3785. End;
  3786. goto invalid;
  3787. End;
  3788. packFloat64( zSign, $7FF, 0, 0, out );
  3789. exit;
  3790. End;
  3791. if ( bExp = $7FF ) then
  3792. Begin
  3793. if ( bSig0 OR bSig1 )<> 0 then
  3794. Begin
  3795. propagateFloat64NaN( a, b, out );
  3796. exit;
  3797. End;
  3798. packFloat64( zSign, 0, 0, 0, out );
  3799. exit;
  3800. End;
  3801. if ( bExp = 0 ) then
  3802. Begin
  3803. if ( ( bSig0 OR bSig1 ) = 0 ) then
  3804. Begin
  3805. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  3806. Begin
  3807. invalid:
  3808. float_raise( float_flag_invalid );
  3809. z.low := float64_default_nan_low;
  3810. z.high := float64_default_nan_high;
  3811. out := z;
  3812. exit;
  3813. End;
  3814. float_raise( float_flag_divbyzero );
  3815. packFloat64( zSign, $7FF, 0, 0, out );
  3816. exit;
  3817. End;
  3818. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  3819. End;
  3820. if ( aExp = 0 ) then
  3821. Begin
  3822. if ( ( aSig0 OR aSig1 ) = 0 ) then
  3823. Begin
  3824. packFloat64( zSign, 0, 0, 0, out );
  3825. exit;
  3826. End;
  3827. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  3828. End;
  3829. zExp := aExp - bExp + $3FD;
  3830. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  3831. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  3832. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  3833. Begin
  3834. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  3835. Inc(zExp);
  3836. End;
  3837. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  3838. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  3839. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  3840. while ( sbits32 (rem0) < 0 ) do
  3841. Begin
  3842. Dec(zSig0);
  3843. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  3844. End;
  3845. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  3846. if ( ( zSig1 and $3FF ) <= 4 ) then
  3847. Begin
  3848. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  3849. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  3850. while ( sbits32 (rem1) < 0 ) do
  3851. Begin
  3852. Dec(zSig1);
  3853. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  3854. End;
  3855. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  3856. End;
  3857. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  3858. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  3859. End;
  3860. {*
  3861. -------------------------------------------------------------------------------
  3862. Returns the remainder of the double-precision floating-point value `a'
  3863. with respect to the corresponding value `b'. The operation is performed
  3864. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3865. -------------------------------------------------------------------------------
  3866. *}
  3867. Procedure float64_rem(a: float64; b : float64; var out: float64);
  3868. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  3869. Var
  3870. aSign, bSign, zSign: flag;
  3871. aExp, bExp, expDiff: int16;
  3872. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  3873. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  3874. sigMean0: sbits32;
  3875. z: float64;
  3876. label invalid;
  3877. Begin
  3878. aSig1 := extractFloat64Frac1( a );
  3879. aSig0 := extractFloat64Frac0( a );
  3880. aExp := extractFloat64Exp( a );
  3881. aSign := extractFloat64Sign( a );
  3882. bSig1 := extractFloat64Frac1( b );
  3883. bSig0 := extractFloat64Frac0( b );
  3884. bExp := extractFloat64Exp( b );
  3885. bSign := extractFloat64Sign( b );
  3886. if ( aExp = $7FF ) then
  3887. Begin
  3888. if ((( aSig0 OR aSig1 )<>0)
  3889. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  3890. Begin
  3891. propagateFloat64NaN( a, b, out );
  3892. exit;
  3893. End;
  3894. goto invalid;
  3895. End;
  3896. if ( bExp = $7FF ) then
  3897. Begin
  3898. if ( bSig0 OR bSig1 ) <> 0 then
  3899. Begin
  3900. propagateFloat64NaN( a, b, out );
  3901. exit;
  3902. End;
  3903. out := a;
  3904. exit;
  3905. End;
  3906. if ( bExp = 0 ) then
  3907. Begin
  3908. if ( ( bSig0 OR bSig1 ) = 0 ) then
  3909. Begin
  3910. invalid:
  3911. float_raise( float_flag_invalid );
  3912. z.low := float64_default_nan_low;
  3913. z.high := float64_default_nan_high;
  3914. out := z;
  3915. exit;
  3916. End;
  3917. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  3918. End;
  3919. if ( aExp = 0 ) then
  3920. Begin
  3921. if ( ( aSig0 OR aSig1 ) = 0 ) then
  3922. Begin
  3923. out := a;
  3924. exit;
  3925. End;
  3926. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  3927. End;
  3928. expDiff := aExp - bExp;
  3929. if ( expDiff < -1 ) then
  3930. Begin
  3931. out := a;
  3932. exit;
  3933. End;
  3934. shortShift64Left(
  3935. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  3936. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  3937. q := le64( bSig0, bSig1, aSig0, aSig1 );
  3938. if ( q )<>0 then
  3939. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  3940. expDiff := expDiff - 32;
  3941. while ( 0 < expDiff ) do
  3942. Begin
  3943. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  3944. if 4 < q then
  3945. q:= q - 4
  3946. else
  3947. q := 0;
  3948. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  3949. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  3950. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  3951. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  3952. expDiff := expDiff - 29;
  3953. End;
  3954. if ( -32 < expDiff ) then
  3955. Begin
  3956. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  3957. if 4 < q then
  3958. q := q - 4
  3959. else
  3960. q := 0;
  3961. q := q shr (- expDiff);
  3962. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  3963. expDiff := expDiff + 24;
  3964. if ( expDiff < 0 ) then
  3965. Begin
  3966. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  3967. End
  3968. else
  3969. Begin
  3970. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  3971. End;
  3972. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  3973. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  3974. End
  3975. else
  3976. Begin
  3977. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  3978. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  3979. End;
  3980. Repeat
  3981. alternateASig0 := aSig0;
  3982. alternateASig1 := aSig1;
  3983. Inc(q);
  3984. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  3985. Until not ( 0 <= sbits32 (aSig0) );
  3986. add64(
  3987. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  3988. if ( ( sigMean0 < 0 )
  3989. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  3990. Begin
  3991. aSig0 := alternateASig0;
  3992. aSig1 := alternateASig1;
  3993. End;
  3994. zSign := flag( sbits32 (aSig0) < 0 );
  3995. if ( zSign <> 0 ) then
  3996. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  3997. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, out );
  3998. End;
  3999. {*
  4000. -------------------------------------------------------------------------------
  4001. Returns the square root of the double-precision floating-point value `a'.
  4002. The operation is performed according to the IEC/IEEE Standard for Binary
  4003. Floating-Point Arithmetic.
  4004. -------------------------------------------------------------------------------
  4005. *}
  4006. Procedure float64_sqrt( a: float64; var out: float64 );
  4007. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  4008. Var
  4009. aSign: flag;
  4010. aExp, zExp: int16;
  4011. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4012. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4013. z: float64;
  4014. label invalid;
  4015. Begin
  4016. aSig1 := extractFloat64Frac1( a );
  4017. aSig0 := extractFloat64Frac0( a );
  4018. aExp := extractFloat64Exp( a );
  4019. aSign := extractFloat64Sign( a );
  4020. if ( aExp = $7FF ) then
  4021. Begin
  4022. if ( aSig0 OR aSig1 ) <> 0 then
  4023. Begin
  4024. propagateFloat64NaN( a, a, out );
  4025. exit;
  4026. End;
  4027. if ( aSign = 0) then
  4028. Begin
  4029. out := a;
  4030. exit;
  4031. End;
  4032. goto invalid;
  4033. End;
  4034. if ( aSign <> 0 ) then
  4035. Begin
  4036. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4037. Begin
  4038. out := a;
  4039. exit;
  4040. End;
  4041. invalid:
  4042. float_raise( float_flag_invalid );
  4043. z.low := float64_default_nan_low;
  4044. z.high := float64_default_nan_high;
  4045. out := z;
  4046. exit;
  4047. End;
  4048. if ( aExp = 0 ) then
  4049. Begin
  4050. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4051. Begin
  4052. packFloat64( 0, 0, 0, 0, out );
  4053. exit;
  4054. End;
  4055. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4056. End;
  4057. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4058. aSig0 := aSig0 or $00100000;
  4059. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4060. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4061. if ( zSig0 = 0 ) then
  4062. zSig0 := $7FFFFFFF;
  4063. doubleZSig0 := zSig0 + zSig0;
  4064. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4065. mul32To64( zSig0, zSig0, term0, term1 );
  4066. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4067. while ( sbits32 (rem0) < 0 ) do
  4068. Begin
  4069. Dec(zSig0);
  4070. doubleZSig0 := doubleZSig0 - 2;
  4071. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4072. End;
  4073. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4074. if ( ( zSig1 and $1FF ) <= 5 ) then
  4075. Begin
  4076. if ( zSig1 = 0 ) then
  4077. zSig1 := 1;
  4078. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4079. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4080. mul32To64( zSig1, zSig1, term2, term3 );
  4081. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4082. while ( sbits32 (rem1) < 0 ) do
  4083. Begin
  4084. Dec(zSig1);
  4085. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4086. term3 := term3 or 1;
  4087. term2 := term2 or doubleZSig0;
  4088. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4089. End;
  4090. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4091. End;
  4092. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4093. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4094. End;
  4095. {*
  4096. -------------------------------------------------------------------------------
  4097. Returns 1 if the double-precision floating-point value `a' is equal to
  4098. the corresponding value `b', and 0 otherwise. The comparison is performed
  4099. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4100. -------------------------------------------------------------------------------
  4101. *}
  4102. Function float64_eq(a: float64; b: float64): flag;
  4103. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  4104. Begin
  4105. if
  4106. (
  4107. ( extractFloat64Exp( a ) = $7FF )
  4108. AND
  4109. (
  4110. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4111. )
  4112. )
  4113. OR (
  4114. ( extractFloat64Exp( b ) = $7FF )
  4115. AND (
  4116. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4117. )
  4118. )
  4119. ) then
  4120. Begin
  4121. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4122. float_raise( float_flag_invalid );
  4123. float64_eq := 0;
  4124. exit;
  4125. End;
  4126. float64_eq := flag(
  4127. ( a.low = b.low )
  4128. AND ( ( a.high = b.high )
  4129. OR ( ( a.low = 0 )
  4130. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4131. ));
  4132. End;
  4133. {*
  4134. -------------------------------------------------------------------------------
  4135. Returns 1 if the double-precision floating-point value `a' is less than
  4136. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4137. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4138. Arithmetic.
  4139. -------------------------------------------------------------------------------
  4140. *}
  4141. Function float64_le(a: float64;b: float64): flag;
  4142. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  4143. Var
  4144. aSign, bSign: flag;
  4145. Begin
  4146. if
  4147. (
  4148. ( extractFloat64Exp( a ) = $7FF )
  4149. AND
  4150. (
  4151. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4152. )
  4153. )
  4154. OR (
  4155. ( extractFloat64Exp( b ) = $7FF )
  4156. AND (
  4157. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4158. )
  4159. )
  4160. ) then
  4161. Begin
  4162. float_raise( float_flag_invalid );
  4163. float64_le := 0;
  4164. exit;
  4165. End;
  4166. aSign := extractFloat64Sign( a );
  4167. bSign := extractFloat64Sign( b );
  4168. if ( aSign <> bSign ) then
  4169. Begin
  4170. float64_le := flag(
  4171. (aSign <> 0)
  4172. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4173. = 0 ));
  4174. exit;
  4175. End;
  4176. if aSign <> 0 then
  4177. float64_le := le64( b.high, b.low, a.high, a.low )
  4178. else
  4179. float64_le := le64( a.high, a.low, b.high, b.low );
  4180. End;
  4181. {*
  4182. -------------------------------------------------------------------------------
  4183. Returns 1 if the double-precision floating-point value `a' is less than
  4184. the corresponding value `b', and 0 otherwise. The comparison is performed
  4185. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4186. -------------------------------------------------------------------------------
  4187. *}
  4188. Function float64_lt(a: float64;b: float64): flag;
  4189. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  4190. Var
  4191. aSign, bSign: flag;
  4192. Begin
  4193. if
  4194. (
  4195. ( extractFloat64Exp( a ) = $7FF )
  4196. AND
  4197. (
  4198. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4199. )
  4200. )
  4201. OR (
  4202. ( extractFloat64Exp( b ) = $7FF )
  4203. AND (
  4204. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4205. )
  4206. )
  4207. ) then
  4208. Begin
  4209. float_raise( float_flag_invalid );
  4210. float64_lt := 0;
  4211. exit;
  4212. End;
  4213. aSign := extractFloat64Sign( a );
  4214. bSign := extractFloat64Sign( b );
  4215. if ( aSign <> bSign ) then
  4216. Begin
  4217. float64_lt := flag(
  4218. (aSign <> 0)
  4219. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4220. <> 0 ));
  4221. exit;
  4222. End;
  4223. if aSign <> 0 then
  4224. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4225. else
  4226. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4227. End;
  4228. {*
  4229. -------------------------------------------------------------------------------
  4230. Returns 1 if the double-precision floating-point value `a' is equal to
  4231. the corresponding value `b', and 0 otherwise. The invalid exception is
  4232. raised if either operand is a NaN. Otherwise, the comparison is performed
  4233. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4234. -------------------------------------------------------------------------------
  4235. *}
  4236. Function float64_eq_signaling( a: float64; b: float64): flag;
  4237. Begin
  4238. if
  4239. (
  4240. ( extractFloat64Exp( a ) = $7FF )
  4241. AND
  4242. (
  4243. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4244. )
  4245. )
  4246. OR (
  4247. ( extractFloat64Exp( b ) = $7FF )
  4248. AND (
  4249. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4250. )
  4251. )
  4252. ) then
  4253. Begin
  4254. float_raise( float_flag_invalid );
  4255. float64_eq_signaling := 0;
  4256. exit;
  4257. End;
  4258. float64_eq_signaling := flag(
  4259. ( a.low = b.low )
  4260. AND ( ( a.high = b.high )
  4261. OR ( ( a.low = 0 )
  4262. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4263. ));
  4264. End;
  4265. {*
  4266. -------------------------------------------------------------------------------
  4267. Returns 1 if the double-precision floating-point value `a' is less than or
  4268. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4269. cause an exception. Otherwise, the comparison is performed according to the
  4270. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4271. -------------------------------------------------------------------------------
  4272. *}
  4273. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4274. Var
  4275. aSign, bSign : flag;
  4276. Begin
  4277. if
  4278. (
  4279. ( extractFloat64Exp( a ) = $7FF )
  4280. AND
  4281. (
  4282. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4283. )
  4284. )
  4285. OR (
  4286. ( extractFloat64Exp( b ) = $7FF )
  4287. AND (
  4288. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4289. )
  4290. )
  4291. ) then
  4292. Begin
  4293. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4294. float_raise( float_flag_invalid );
  4295. float64_le_quiet := 0;
  4296. exit;
  4297. End;
  4298. aSign := extractFloat64Sign( a );
  4299. bSign := extractFloat64Sign( b );
  4300. if ( aSign <> bSign ) then
  4301. Begin
  4302. float64_le_quiet := flag
  4303. ((aSign <> 0)
  4304. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4305. = 0 ));
  4306. exit;
  4307. End;
  4308. if aSign <> 0 then
  4309. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4310. else
  4311. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4312. End;
  4313. {*
  4314. -------------------------------------------------------------------------------
  4315. Returns 1 if the double-precision floating-point value `a' is less than
  4316. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4317. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4318. Standard for Binary Floating-Point Arithmetic.
  4319. -------------------------------------------------------------------------------
  4320. *}
  4321. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  4322. Var
  4323. aSign, bSign: flag;
  4324. Begin
  4325. if
  4326. (
  4327. ( extractFloat64Exp( a ) = $7FF )
  4328. AND
  4329. (
  4330. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4331. )
  4332. )
  4333. OR (
  4334. ( extractFloat64Exp( b ) = $7FF )
  4335. AND (
  4336. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4337. )
  4338. )
  4339. ) then
  4340. Begin
  4341. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4342. float_raise( float_flag_invalid );
  4343. float64_lt_quiet := 0;
  4344. exit;
  4345. End;
  4346. aSign := extractFloat64Sign( a );
  4347. bSign := extractFloat64Sign( b );
  4348. if ( aSign <> bSign ) then
  4349. Begin
  4350. float64_lt_quiet := flag(
  4351. (aSign<>0)
  4352. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4353. <> 0 ));
  4354. exit;
  4355. End;
  4356. If aSign <> 0 then
  4357. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  4358. else
  4359. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  4360. End;
  4361. {*----------------------------------------------------------------------------
  4362. | Returns the result of converting the 64-bit two's complement integer `a'
  4363. | to the single-precision floating-point format. The conversion is performed
  4364. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4365. *----------------------------------------------------------------------------*}
  4366. function int64_to_float32( a: int64 ): float32;
  4367. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  4368. var
  4369. zSign : flag;
  4370. absA : uint64;
  4371. shiftCount: int8;
  4372. zSig : bits32;
  4373. intval : int64rec;
  4374. Begin
  4375. if ( a = 0 ) then
  4376. begin
  4377. int64_to_float32 := 0;
  4378. exit;
  4379. end;
  4380. if a < 0 then
  4381. zSign := flag(TRUE)
  4382. else
  4383. zSign := flag(FALSE);
  4384. if zSign<>0 then
  4385. absA := -a
  4386. else
  4387. absA := a;
  4388. shiftCount := countLeadingZeros64( absA ) - 40;
  4389. if ( 0 <= shiftCount ) then
  4390. begin
  4391. int64_to_float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  4392. end
  4393. else
  4394. begin
  4395. shiftCount := shiftCount + 7;
  4396. if ( shiftCount < 0 ) then
  4397. begin
  4398. intval.low := int64rec(AbsA).low;
  4399. intval.high := int64rec(AbsA).high;
  4400. shift64RightJamming( intval.low, intval.high, - shiftCount,
  4401. intval.low, intval.high);
  4402. int64rec(absA).low := intval.low;
  4403. int64rec(absA).high := intval.high;
  4404. end
  4405. else
  4406. absA := absA shl shiftCount;
  4407. int64_to_float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  4408. end;
  4409. End;
  4410. {*----------------------------------------------------------------------------
  4411. | Returns the result of converting the 64-bit two's complement integer `a'
  4412. | to the double-precision floating-point format. The conversion is performed
  4413. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4414. *----------------------------------------------------------------------------*}
  4415. function int64_to_float64( a: int64 ): float64;
  4416. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
  4417. var
  4418. zSign : flag;
  4419. float_result : float64;
  4420. intval : int64rec;
  4421. AbsA : bits64;
  4422. shiftcount : int8;
  4423. zSig0, zSig1 : bits32;
  4424. Begin
  4425. if ( a = 0 ) then
  4426. Begin
  4427. packFloat64( 0, 0, 0, 0, float_result );
  4428. exit;
  4429. end;
  4430. zSign := flag( a < 0 );
  4431. if ZSign<>0 then
  4432. AbsA := -a
  4433. else
  4434. AbsA := a;
  4435. shiftCount := countLeadingZeros64( absA ) - 11;
  4436. if ( 0 <= shiftCount ) then
  4437. Begin
  4438. absA := absA shl shiftcount;
  4439. zSig0:=int64rec(absA).high;
  4440. zSig1:=int64rec(absA).low;
  4441. End
  4442. else
  4443. Begin
  4444. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  4445. End;
  4446. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  4447. int64_to_float64:= float_result;
  4448. End;
  4449. end.