softfpu.pp 279 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. { we use here a record in the function header because
  79. the record allows bitwise conversion to single }
  80. float32rec = record
  81. float32 : float32;
  82. end;
  83. flag = byte;
  84. uint8 = byte;
  85. int8 = shortint;
  86. uint16 = word;
  87. int16 = smallint;
  88. uint32 = longword;
  89. int32 = longint;
  90. bits8 = byte;
  91. sbits8 = shortint;
  92. bits16 = word;
  93. sbits16 = smallint;
  94. sbits32 = longint;
  95. bits32 = longword;
  96. {$ifndef fpc}
  97. qword = int64;
  98. {$endif}
  99. uint64 = qword;
  100. bits64 = qword;
  101. sbits64 = int64;
  102. {$ifdef ENDIAN_LITTLE}
  103. float64 = packed record
  104. low: bits32;
  105. high: bits32;
  106. end;
  107. int64rec = packed record
  108. low: bits32;
  109. high: bits32;
  110. end;
  111. floatx80 = packed record
  112. low : qword;
  113. high : word;
  114. end;
  115. float128 = packed record
  116. low : qword;
  117. high : qword;
  118. end;
  119. {$else}
  120. float64 = packed record
  121. high,low : bits32;
  122. end;
  123. int64rec = packed record
  124. high,low : bits32;
  125. end;
  126. floatx80 = packed record
  127. high : word;
  128. low : qword;
  129. end;
  130. float128 = packed record
  131. high : qword;
  132. low : qword;
  133. end;
  134. {$endif}
  135. {*
  136. -------------------------------------------------------------------------------
  137. Returns 1 if the double-precision floating-point value `a' is less than
  138. the corresponding value `b', and 0 otherwise. The comparison is performed
  139. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  140. -------------------------------------------------------------------------------
  141. *}
  142. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  143. {*
  144. -------------------------------------------------------------------------------
  145. Returns 1 if the double-precision floating-point value `a' is less than
  146. or equal to the corresponding value `b', and 0 otherwise. The comparison
  147. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  148. Arithmetic.
  149. -------------------------------------------------------------------------------
  150. *}
  151. Function float64_le(a: float64;b: float64): flag; compilerproc;
  152. {*
  153. -------------------------------------------------------------------------------
  154. Returns 1 if the double-precision floating-point value `a' is equal to
  155. the corresponding value `b', and 0 otherwise. The comparison is performed
  156. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  157. -------------------------------------------------------------------------------
  158. *}
  159. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  160. {*
  161. -------------------------------------------------------------------------------
  162. Returns the square root of the double-precision floating-point value `a'.
  163. The operation is performed according to the IEC/IEEE Standard for Binary
  164. Floating-Point Arithmetic.
  165. -------------------------------------------------------------------------------
  166. *}
  167. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  168. {*
  169. -------------------------------------------------------------------------------
  170. Returns the remainder of the double-precision floating-point value `a'
  171. with respect to the corresponding value `b'. The operation is performed
  172. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  173. -------------------------------------------------------------------------------
  174. *}
  175. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  176. {*
  177. -------------------------------------------------------------------------------
  178. Returns the result of dividing the double-precision floating-point value `a'
  179. by the corresponding value `b'. The operation is performed according to the
  180. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  181. -------------------------------------------------------------------------------
  182. *}
  183. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  184. {*
  185. -------------------------------------------------------------------------------
  186. Returns the result of multiplying the double-precision floating-point values
  187. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  188. for Binary Floating-Point Arithmetic.
  189. -------------------------------------------------------------------------------
  190. *}
  191. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  192. {*
  193. -------------------------------------------------------------------------------
  194. Returns the result of subtracting the double-precision floating-point values
  195. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  196. for Binary Floating-Point Arithmetic.
  197. -------------------------------------------------------------------------------
  198. *}
  199. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  200. {*
  201. -------------------------------------------------------------------------------
  202. Returns the result of adding the double-precision floating-point values `a'
  203. and `b'. The operation is performed according to the IEC/IEEE Standard for
  204. Binary Floating-Point Arithmetic.
  205. -------------------------------------------------------------------------------
  206. *}
  207. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  208. {*
  209. -------------------------------------------------------------------------------
  210. Rounds the double-precision floating-point value `a' to an integer,
  211. and returns the result as a double-precision floating-point value. The
  212. operation is performed according to the IEC/IEEE Standard for Binary
  213. Floating-Point Arithmetic.
  214. -------------------------------------------------------------------------------
  215. *}
  216. Function float64_round_to_int(a: float64) : float64; compilerproc;
  217. {*
  218. -------------------------------------------------------------------------------
  219. Returns the result of converting the double-precision floating-point value
  220. `a' to the single-precision floating-point format. The conversion is
  221. performed according to the IEC/IEEE Standard for Binary Floating-Point
  222. Arithmetic.
  223. -------------------------------------------------------------------------------
  224. *}
  225. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  226. {*
  227. -------------------------------------------------------------------------------
  228. Returns the result of converting the double-precision floating-point value
  229. `a' to the 32-bit two's complement integer format. The conversion is
  230. performed according to the IEC/IEEE Standard for Binary Floating-Point
  231. Arithmetic, except that the conversion is always rounded toward zero.
  232. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  233. the conversion overflows, the largest integer with the same sign as `a' is
  234. returned.
  235. -------------------------------------------------------------------------------
  236. *}
  237. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  238. {*
  239. -------------------------------------------------------------------------------
  240. Returns the result of converting the double-precision floating-point value
  241. `a' to the 32-bit two's complement integer format. The conversion is
  242. performed according to the IEC/IEEE Standard for Binary Floating-Point
  243. Arithmetic---which means in particular that the conversion is rounded
  244. according to the current rounding mode. If `a' is a NaN, the largest
  245. positive integer is returned. Otherwise, if the conversion overflows, the
  246. largest integer with the same sign as `a' is returned.
  247. -------------------------------------------------------------------------------
  248. *}
  249. Function float64_to_int32(a: float64): int32; compilerproc;
  250. {*
  251. -------------------------------------------------------------------------------
  252. Returns 1 if the single-precision floating-point value `a' is less than
  253. the corresponding value `b', and 0 otherwise. The comparison is performed
  254. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  255. -------------------------------------------------------------------------------
  256. *}
  257. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  258. {*
  259. -------------------------------------------------------------------------------
  260. Returns 1 if the single-precision floating-point value `a' is less than
  261. or equal to the corresponding value `b', and 0 otherwise. The comparison
  262. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  263. Arithmetic.
  264. -------------------------------------------------------------------------------
  265. *}
  266. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  267. {*
  268. -------------------------------------------------------------------------------
  269. Returns 1 if the single-precision floating-point value `a' is equal to
  270. the corresponding value `b', and 0 otherwise. The comparison is performed
  271. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  272. -------------------------------------------------------------------------------
  273. *}
  274. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  275. {*
  276. -------------------------------------------------------------------------------
  277. Returns the square root of the single-precision floating-point value `a'.
  278. The operation is performed according to the IEC/IEEE Standard for Binary
  279. Floating-Point Arithmetic.
  280. -------------------------------------------------------------------------------
  281. *}
  282. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  283. {*
  284. -------------------------------------------------------------------------------
  285. Returns the remainder of the single-precision floating-point value `a'
  286. with respect to the corresponding value `b'. The operation is performed
  287. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  288. -------------------------------------------------------------------------------
  289. *}
  290. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  291. {*
  292. -------------------------------------------------------------------------------
  293. Returns the result of dividing the single-precision floating-point value `a'
  294. by the corresponding value `b'. The operation is performed according to the
  295. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  296. -------------------------------------------------------------------------------
  297. *}
  298. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  299. {*
  300. -------------------------------------------------------------------------------
  301. Returns the result of multiplying the single-precision floating-point values
  302. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  303. for Binary Floating-Point Arithmetic.
  304. -------------------------------------------------------------------------------
  305. *}
  306. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  307. {*
  308. -------------------------------------------------------------------------------
  309. Returns the result of subtracting the single-precision floating-point values
  310. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  311. for Binary Floating-Point Arithmetic.
  312. -------------------------------------------------------------------------------
  313. *}
  314. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  315. {*
  316. -------------------------------------------------------------------------------
  317. Returns the result of adding the single-precision floating-point values `a'
  318. and `b'. The operation is performed according to the IEC/IEEE Standard for
  319. Binary Floating-Point Arithmetic.
  320. -------------------------------------------------------------------------------
  321. *}
  322. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  323. {*
  324. -------------------------------------------------------------------------------
  325. Rounds the single-precision floating-point value `a' to an integer,
  326. and returns the result as a single-precision floating-point value. The
  327. operation is performed according to the IEC/IEEE Standard for Binary
  328. Floating-Point Arithmetic.
  329. -------------------------------------------------------------------------------
  330. *}
  331. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  332. {*
  333. -------------------------------------------------------------------------------
  334. Returns the result of converting the single-precision floating-point value
  335. `a' to the double-precision floating-point format. The conversion is
  336. performed according to the IEC/IEEE Standard for Binary Floating-Point
  337. Arithmetic.
  338. -------------------------------------------------------------------------------
  339. *}
  340. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  341. {*
  342. -------------------------------------------------------------------------------
  343. Returns the result of converting the single-precision floating-point value
  344. `a' to the 32-bit two's complement integer format. The conversion is
  345. performed according to the IEC/IEEE Standard for Binary Floating-Point
  346. Arithmetic, except that the conversion is always rounded toward zero.
  347. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  348. the conversion overflows, the largest integer with the same sign as `a' is
  349. returned.
  350. -------------------------------------------------------------------------------
  351. *}
  352. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  353. {*
  354. -------------------------------------------------------------------------------
  355. Returns the result of converting the single-precision floating-point value
  356. `a' to the 32-bit two's complement integer format. The conversion is
  357. performed according to the IEC/IEEE Standard for Binary Floating-Point
  358. Arithmetic---which means in particular that the conversion is rounded
  359. according to the current rounding mode. If `a' is a NaN, the largest
  360. positive integer is returned. Otherwise, if the conversion overflows, the
  361. largest integer with the same sign as `a' is returned.
  362. -------------------------------------------------------------------------------
  363. *}
  364. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  365. {*
  366. -------------------------------------------------------------------------------
  367. Returns the result of converting the 32-bit two's complement integer `a' to
  368. the double-precision floating-point format. The conversion is performed
  369. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  370. -------------------------------------------------------------------------------
  371. *}
  372. Function int32_to_float64( a: int32) : float64; compilerproc;
  373. {*
  374. -------------------------------------------------------------------------------
  375. Returns the result of converting the 32-bit two's complement integer `a' to
  376. the single-precision floating-point format. The conversion is performed
  377. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  378. -------------------------------------------------------------------------------
  379. *}
  380. Function int32_to_float32( a: int32): float32rec; compilerproc;
  381. {*----------------------------------------------------------------------------
  382. | Returns the result of converting the 64-bit two's complement integer `a'
  383. | to the double-precision floating-point format. The conversion is performed
  384. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  385. *----------------------------------------------------------------------------*}
  386. Function int64_to_float64( a: int64 ): float64; compilerproc;
  387. {*----------------------------------------------------------------------------
  388. | Returns the result of converting the 64-bit two's complement integer `a'
  389. | to the single-precision floating-point format. The conversion is performed
  390. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  391. *----------------------------------------------------------------------------*}
  392. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  393. CONST
  394. {-------------------------------------------------------------------------------
  395. Software IEC/IEEE floating-point underflow tininess-detection mode.
  396. -------------------------------------------------------------------------------
  397. *}
  398. float_tininess_after_rounding = 0;
  399. float_tininess_before_rounding = 1;
  400. {*
  401. -------------------------------------------------------------------------------
  402. Software IEC/IEEE floating-point rounding mode.
  403. -------------------------------------------------------------------------------
  404. *}
  405. {
  406. Round to nearest.
  407. This is the default mode. It should be used unless there is a specific
  408. need for one of the others. In this mode results are rounded to the
  409. nearest representable value. If the result is midway between two
  410. representable values, the even representable is chosen. Even here
  411. means the lowest-order bit is zero. This rounding mode prevents
  412. statistical bias and guarantees numeric stability: round-off errors
  413. in a lengthy calculation will remain smaller than half of FLT_EPSILON.
  414. Round toward plus Infinity.
  415. All results are rounded to the smallest representable value which is
  416. greater than the result.
  417. Round toward minus Infinity.
  418. All results are rounded to the largest representable value which is
  419. less than the result.
  420. Round toward zero.
  421. All results are rounded to the largest representable value whose
  422. magnitude is less than that of the result. In other words, if the
  423. result is negative it is rounded up; if it is positive, it is
  424. rounded down.
  425. }
  426. float_round_nearest_even = 0;
  427. float_round_down = 1;
  428. float_round_up = 2;
  429. float_round_to_zero = 3;
  430. {*
  431. -------------------------------------------------------------------------------
  432. Floating-point rounding mode and exception flags.
  433. -------------------------------------------------------------------------------
  434. *}
  435. const
  436. float_rounding_mode : Byte = float_round_nearest_even;
  437. {*
  438. -------------------------------------------------------------------------------
  439. Underflow tininess-detection mode, statically initialized to default value.
  440. (The declaration in `softfloat.h' must match the `int8' type here.)
  441. -------------------------------------------------------------------------------
  442. *}
  443. const float_detect_tininess: int8 = float_tininess_after_rounding;
  444. {$endif not(defined(fpc_softfpu_implementation))}
  445. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  446. implementation
  447. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  448. {$if not(defined(fpc_softfpu_interface))}
  449. (*****************************************************************************)
  450. (*----------------------------------------------------------------------------*)
  451. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  452. (* division and square root approximations. (Can be specialized to target if *)
  453. (* desired.) *)
  454. (* ---------------------------------------------------------------------------*)
  455. (*****************************************************************************)
  456. {*----------------------------------------------------------------------------
  457. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  458. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  459. | input. If `zSign' is 1, the input is negated before being converted to an
  460. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  461. | is simply rounded to an integer, with the inexact exception raised if the
  462. | input cannot be represented exactly as an integer. However, if the fixed-
  463. | point input is too large, the invalid exception is raised and the largest
  464. | positive or negative integer is returned.
  465. *----------------------------------------------------------------------------*}
  466. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  467. var
  468. roundingMode: int8;
  469. roundNearestEven: flag;
  470. roundIncrement, roundBits: int8;
  471. z: int32;
  472. begin
  473. roundingMode := float_rounding_mode;
  474. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  475. roundIncrement := $40;
  476. if ( roundNearestEven=0 ) then
  477. begin
  478. if ( roundingMode = float_round_to_zero ) then
  479. begin
  480. roundIncrement := 0;
  481. end
  482. else begin
  483. roundIncrement := $7F;
  484. if ( zSign<>0 ) then
  485. begin
  486. if ( roundingMode = float_round_up ) then
  487. roundIncrement := 0;
  488. end
  489. else begin
  490. if ( roundingMode = float_round_down ) then
  491. roundIncrement := 0;
  492. end;
  493. end;
  494. end;
  495. roundBits := absZ and $7F;
  496. absZ := ( absZ + roundIncrement ) shr 7;
  497. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  498. z := absZ;
  499. if ( zSign<>0 ) then
  500. z := - z;
  501. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  502. begin
  503. float_raise( float_flag_invalid );
  504. if zSign<>0 then
  505. result:=sbits32($80000000)
  506. else
  507. result:=$7FFFFFFF;
  508. exit;
  509. end;
  510. if ( roundBits<>0 ) then
  511. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  512. result:=z;
  513. end;
  514. {*----------------------------------------------------------------------------
  515. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  516. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  517. | and returns the properly rounded 64-bit integer corresponding to the input.
  518. | If `zSign' is 1, the input is negated before being converted to an integer.
  519. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  520. | the inexact exception raised if the input cannot be represented exactly as
  521. | an integer. However, if the fixed-point input is too large, the invalid
  522. | exception is raised and the largest positive or negative integer is
  523. | returned.
  524. *----------------------------------------------------------------------------*}
  525. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  526. var
  527. roundingMode: int8;
  528. roundNearestEven, increment: flag;
  529. z: int64;
  530. label
  531. overflow;
  532. begin
  533. roundingMode := float_rounding_mode;
  534. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  535. increment := ord( sbits64(absZ1) < 0 );
  536. if ( roundNearestEven=0 ) then
  537. begin
  538. if ( roundingMode = float_round_to_zero ) then
  539. begin
  540. increment := 0;
  541. end
  542. else begin
  543. if ( zSign<>0 ) then
  544. begin
  545. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  546. end
  547. else begin
  548. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  549. end;
  550. end;
  551. end;
  552. if ( increment<>0 ) then
  553. begin
  554. inc(absZ0);
  555. if ( absZ0 = 0 ) then
  556. goto overflow;
  557. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  558. end;
  559. z := absZ0;
  560. if ( zSign<>0 ) then
  561. z := - z;
  562. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  563. begin
  564. overflow:
  565. float_raise( float_flag_invalid );
  566. if zSign<>0 then
  567. result:=int64($8000000000000000)
  568. else
  569. result:=int64($7FFFFFFFFFFFFFFF);
  570. end;
  571. if ( absZ1<>0 ) then
  572. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  573. result:=z;
  574. end;
  575. {*
  576. -------------------------------------------------------------------------------
  577. Shifts `a' right by the number of bits given in `count'. If any nonzero
  578. bits are shifted off, they are ``jammed'' into the least significant bit of
  579. the result by setting the least significant bit to 1. The value of `count'
  580. can be arbitrarily large; in particular, if `count' is greater than 32, the
  581. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  582. The result is stored in the location pointed to by `zPtr'.
  583. -------------------------------------------------------------------------------
  584. *}
  585. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  586. var
  587. z: Bits32;
  588. Begin
  589. if ( count = 0 ) then
  590. z := a
  591. else
  592. if ( count < 32 ) then
  593. Begin
  594. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  595. End
  596. else
  597. Begin
  598. z := bits32( a <> 0 );
  599. End;
  600. zPtr := z;
  601. End;
  602. {*----------------------------------------------------------------------------
  603. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  604. | number of bits given in `count'. Any bits shifted off are lost. The value
  605. | of `count' can be arbitrarily large; in particular, if `count' is greater
  606. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  607. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  608. *----------------------------------------------------------------------------*}
  609. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  610. var
  611. z0, z1: bits64;
  612. negCount: int8;
  613. begin
  614. negCount := ( - count ) and 63;
  615. if ( count = 0 ) then
  616. begin
  617. z1 := a1;
  618. z0 := a0;
  619. end
  620. else if ( count < 64 ) then
  621. begin
  622. z1 := ( a0 shl negCount ) or ( a1 shr count );
  623. z0 := a0 shr count;
  624. end
  625. else
  626. begin
  627. if ( count shl 64 )<>0 then
  628. z1 := a0 shr ( count and 63 )
  629. else
  630. z1 := 0;
  631. z0 := 0;
  632. end;
  633. z1Ptr := z1;
  634. z0Ptr := z0;
  635. end;
  636. {*
  637. -------------------------------------------------------------------------------
  638. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  639. number of bits given in `count'. Any bits shifted off are lost. The value
  640. of `count' can be arbitrarily large; in particular, if `count' is greater
  641. than 64, the result will be 0. The result is broken into two 32-bit pieces
  642. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  643. -------------------------------------------------------------------------------
  644. *}
  645. Procedure
  646. shift64Right(
  647. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  648. Var
  649. z0, z1: bits32;
  650. negCount : int8;
  651. Begin
  652. negCount := ( - count ) AND 31;
  653. if ( count = 0 ) then
  654. Begin
  655. z1 := a1;
  656. z0 := a0;
  657. End
  658. else if ( count < 32 ) then
  659. Begin
  660. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  661. z0 := a0 shr count;
  662. End
  663. else
  664. Begin
  665. if (count < 64) then
  666. z1 := ( a0 shr ( count AND 31 ) )
  667. else
  668. z1 := 0;
  669. z0 := 0;
  670. End;
  671. z1Ptr := z1;
  672. z0Ptr := z0;
  673. End;
  674. {*
  675. -------------------------------------------------------------------------------
  676. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  677. number of bits given in `count'. If any nonzero bits are shifted off, they
  678. are ``jammed'' into the least significant bit of the result by setting the
  679. least significant bit to 1. The value of `count' can be arbitrarily large;
  680. in particular, if `count' is greater than 64, the result will be either 0
  681. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  682. nonzero. The result is broken into two 32-bit pieces which are stored at
  683. the locations pointed to by `z0Ptr' and `z1Ptr'.
  684. -------------------------------------------------------------------------------
  685. *}
  686. Procedure
  687. shift64RightJamming(
  688. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  689. VAR
  690. z0, z1 : bits32;
  691. negCount : int8;
  692. Begin
  693. negCount := ( - count ) AND 31;
  694. if ( count = 0 ) then
  695. Begin
  696. z1 := a1;
  697. z0 := a0;
  698. End
  699. else
  700. if ( count < 32 ) then
  701. Begin
  702. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  703. z0 := a0 shr count;
  704. End
  705. else
  706. Begin
  707. if ( count = 32 ) then
  708. Begin
  709. z1 := a0 OR bits32( a1 <> 0 );
  710. End
  711. else
  712. if ( count < 64 ) Then
  713. Begin
  714. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  715. End
  716. else
  717. Begin
  718. z1 := bits32( ( a0 OR a1 ) <> 0 );
  719. End;
  720. z0 := 0;
  721. End;
  722. z1Ptr := z1;
  723. z0Ptr := z0;
  724. End;
  725. {*----------------------------------------------------------------------------
  726. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  727. | bits are shifted off, they are ``jammed'' into the least significant bit of
  728. | the result by setting the least significant bit to 1. The value of `count'
  729. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  730. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  731. | The result is stored in the location pointed to by `zPtr'.
  732. *----------------------------------------------------------------------------*}
  733. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  734. var
  735. z: bits64;
  736. begin
  737. if ( count = 0 ) then
  738. begin
  739. z := a;
  740. end
  741. else if ( count < 64 ) then
  742. begin
  743. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  744. end
  745. else
  746. begin
  747. z := ord( a <> 0 );
  748. end;
  749. zPtr := z;
  750. end;
  751. {*
  752. -------------------------------------------------------------------------------
  753. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  754. by 32 _plus_ the number of bits given in `count'. The shifted result is
  755. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  756. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  757. off form a third 32-bit result as follows: The _last_ bit shifted off is
  758. the most-significant bit of the extra result, and the other 31 bits of the
  759. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  760. were all zero. This extra result is stored in the location pointed to by
  761. `z2Ptr'. The value of `count' can be arbitrarily large.
  762. (This routine makes more sense if `a0', `a1', and `a2' are considered
  763. to form a fixed-point value with binary point between `a1' and `a2'. This
  764. fixed-point value is shifted right by the number of bits given in `count',
  765. and the integer part of the result is returned at the locations pointed to
  766. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  767. corrupted as described above, and is returned at the location pointed to by
  768. `z2Ptr'.)
  769. -------------------------------------------------------------------------------
  770. }
  771. Procedure
  772. shift64ExtraRightJamming(
  773. a0: bits32;
  774. a1: bits32;
  775. a2: bits32;
  776. count: int16;
  777. VAR z0Ptr: bits32;
  778. VAR z1Ptr: bits32;
  779. VAR z2Ptr: bits32
  780. );
  781. Var
  782. z0, z1, z2: bits32;
  783. negCount : int8;
  784. Begin
  785. negCount := ( - count ) AND 31;
  786. if ( count = 0 ) then
  787. Begin
  788. z2 := a2;
  789. z1 := a1;
  790. z0 := a0;
  791. End
  792. else
  793. Begin
  794. if ( count < 32 ) Then
  795. Begin
  796. z2 := a1 shl negCount;
  797. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  798. z0 := a0 shr count;
  799. End
  800. else
  801. Begin
  802. if ( count = 32 ) then
  803. Begin
  804. z2 := a1;
  805. z1 := a0;
  806. End
  807. else
  808. Begin
  809. a2 := a2 or a1;
  810. if ( count < 64 ) then
  811. Begin
  812. z2 := a0 shl negCount;
  813. z1 := a0 shr ( count AND 31 );
  814. End
  815. else
  816. Begin
  817. if count = 64 then
  818. z2 := a0
  819. else
  820. z2 := bits32(a0 <> 0);
  821. z1 := 0;
  822. End;
  823. End;
  824. z0 := 0;
  825. End;
  826. z2 := z2 or bits32( a2 <> 0 );
  827. End;
  828. z2Ptr := z2;
  829. z1Ptr := z1;
  830. z0Ptr := z0;
  831. End;
  832. {*
  833. -------------------------------------------------------------------------------
  834. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  835. number of bits given in `count'. Any bits shifted off are lost. The value
  836. of `count' must be less than 32. The result is broken into two 32-bit
  837. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  838. -------------------------------------------------------------------------------
  839. *}
  840. Procedure
  841. shortShift64Left(
  842. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  843. Begin
  844. z1Ptr := a1 shl count;
  845. if count = 0 then
  846. z0Ptr := a0
  847. else
  848. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  849. End;
  850. {*
  851. -------------------------------------------------------------------------------
  852. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  853. by the number of bits given in `count'. Any bits shifted off are lost.
  854. The value of `count' must be less than 32. The result is broken into three
  855. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  856. `z1Ptr', and `z2Ptr'.
  857. -------------------------------------------------------------------------------
  858. *}
  859. Procedure
  860. shortShift96Left(
  861. a0: bits32;
  862. a1: bits32;
  863. a2: bits32;
  864. count: int16;
  865. VAR z0Ptr: bits32;
  866. VAR z1Ptr: bits32;
  867. VAR z2Ptr: bits32
  868. );
  869. Var
  870. z0, z1, z2: bits32;
  871. negCount: int8;
  872. Begin
  873. z2 := a2 shl count;
  874. z1 := a1 shl count;
  875. z0 := a0 shl count;
  876. if ( 0 < count ) then
  877. Begin
  878. negCount := ( ( - count ) AND 31 );
  879. z1 := z1 or (a2 shr negCount);
  880. z0 := z0 or (a1 shr negCount);
  881. End;
  882. z2Ptr := z2;
  883. z1Ptr := z1;
  884. z0Ptr := z0;
  885. End;
  886. {*----------------------------------------------------------------------------
  887. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  888. | number of bits given in `count'. Any bits shifted off are lost. The value
  889. | of `count' must be less than 64. The result is broken into two 64-bit
  890. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  891. *----------------------------------------------------------------------------*}
  892. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);inline;
  893. begin
  894. z1Ptr := a1 shl count;
  895. if count=0 then
  896. z0Ptr:=a0
  897. else
  898. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  899. end;
  900. {*
  901. -------------------------------------------------------------------------------
  902. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  903. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  904. any carry out is lost. The result is broken into two 32-bit pieces which
  905. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  906. -------------------------------------------------------------------------------
  907. *}
  908. Procedure
  909. add64(
  910. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  911. Var
  912. z1: bits32;
  913. Begin
  914. z1 := a1 + b1;
  915. z1Ptr := z1;
  916. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  917. End;
  918. {*
  919. -------------------------------------------------------------------------------
  920. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  921. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  922. modulo 2^96, so any carry out is lost. The result is broken into three
  923. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  924. `z1Ptr', and `z2Ptr'.
  925. -------------------------------------------------------------------------------
  926. *}
  927. Procedure
  928. add96(
  929. a0: bits32;
  930. a1: bits32;
  931. a2: bits32;
  932. b0: bits32;
  933. b1: bits32;
  934. b2: bits32;
  935. VAR z0Ptr: bits32;
  936. VAR z1Ptr: bits32;
  937. VAR z2Ptr: bits32
  938. );
  939. var
  940. z0, z1, z2: bits32;
  941. carry0, carry1: int8;
  942. Begin
  943. z2 := a2 + b2;
  944. carry1 := int8( z2 < a2 );
  945. z1 := a1 + b1;
  946. carry0 := int8( z1 < a1 );
  947. z0 := a0 + b0;
  948. z1 := z1 + carry1;
  949. z0 := z0 + bits32( z1 < carry1 );
  950. z0 := z0 + carry0;
  951. z2Ptr := z2;
  952. z1Ptr := z1;
  953. z0Ptr := z0;
  954. End;
  955. {*
  956. -------------------------------------------------------------------------------
  957. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  958. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  959. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  960. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  961. `z1Ptr'.
  962. -------------------------------------------------------------------------------
  963. *}
  964. Procedure
  965. sub64(
  966. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  967. Begin
  968. z1Ptr := a1 - b1;
  969. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  970. End;
  971. {*
  972. -------------------------------------------------------------------------------
  973. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  974. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  975. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  976. into three 32-bit pieces which are stored at the locations pointed to by
  977. `z0Ptr', `z1Ptr', and `z2Ptr'.
  978. -------------------------------------------------------------------------------
  979. *}
  980. Procedure
  981. sub96(
  982. a0:bits32;
  983. a1:bits32;
  984. a2:bits32;
  985. b0:bits32;
  986. b1:bits32;
  987. b2:bits32;
  988. VAR z0Ptr:bits32;
  989. VAR z1Ptr:bits32;
  990. VAR z2Ptr:bits32
  991. );
  992. Var
  993. z0, z1, z2: bits32;
  994. borrow0, borrow1: int8;
  995. Begin
  996. z2 := a2 - b2;
  997. borrow1 := int8( a2 < b2 );
  998. z1 := a1 - b1;
  999. borrow0 := int8( a1 < b1 );
  1000. z0 := a0 - b0;
  1001. z0 := z0 - bits32( z1 < borrow1 );
  1002. z1 := z1 - borrow1;
  1003. z0 := z0 -borrow0;
  1004. z2Ptr := z2;
  1005. z1Ptr := z1;
  1006. z0Ptr := z0;
  1007. End;
  1008. {*
  1009. -------------------------------------------------------------------------------
  1010. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1011. into two 32-bit pieces which are stored at the locations pointed to by
  1012. `z0Ptr' and `z1Ptr'.
  1013. -------------------------------------------------------------------------------
  1014. *}
  1015. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1016. :bits32 );
  1017. Var
  1018. aHigh, aLow, bHigh, bLow: bits16;
  1019. z0, zMiddleA, zMiddleB, z1: bits32;
  1020. Begin
  1021. aLow := a and $ffff;
  1022. aHigh := a shr 16;
  1023. bLow := b and $ffff;
  1024. bHigh := b shr 16;
  1025. z1 := ( bits32( aLow) ) * bLow;
  1026. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1027. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1028. z0 := ( bits32 (aHigh) ) * bHigh;
  1029. zMiddleA := zMiddleA + zMiddleB;
  1030. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1031. zMiddleA := zmiddleA shl 16;
  1032. z1 := z1 + zMiddleA;
  1033. z0 := z0 + bits32( z1 < zMiddleA );
  1034. z1Ptr := z1;
  1035. z0Ptr := z0;
  1036. End;
  1037. {*
  1038. -------------------------------------------------------------------------------
  1039. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1040. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1041. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1042. `z2Ptr'.
  1043. -------------------------------------------------------------------------------
  1044. *}
  1045. Procedure
  1046. mul64By32To96(
  1047. a0:bits32;
  1048. a1:bits32;
  1049. b:bits32;
  1050. VAR z0Ptr:bits32;
  1051. VAR z1Ptr:bits32;
  1052. VAR z2Ptr:bits32
  1053. );
  1054. Var
  1055. z0, z1, z2, more1: bits32;
  1056. Begin
  1057. mul32To64( a1, b, z1, z2 );
  1058. mul32To64( a0, b, z0, more1 );
  1059. add64( z0, more1, 0, z1, z0, z1 );
  1060. z2Ptr := z2;
  1061. z1Ptr := z1;
  1062. z0Ptr := z0;
  1063. End;
  1064. {*
  1065. -------------------------------------------------------------------------------
  1066. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1067. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1068. product. The product is broken into four 32-bit pieces which are stored at
  1069. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1070. -------------------------------------------------------------------------------
  1071. *}
  1072. Procedure
  1073. mul64To128(
  1074. a0:bits32;
  1075. a1:bits32;
  1076. b0:bits32;
  1077. b1:bits32;
  1078. VAR z0Ptr:bits32;
  1079. VAR z1Ptr:bits32;
  1080. VAR z2Ptr:bits32;
  1081. VAR z3Ptr:bits32
  1082. );
  1083. Var
  1084. z0, z1, z2, z3: bits32;
  1085. more1, more2: bits32;
  1086. Begin
  1087. mul32To64( a1, b1, z2, z3 );
  1088. mul32To64( a1, b0, z1, more2 );
  1089. add64( z1, more2, 0, z2, z1, z2 );
  1090. mul32To64( a0, b0, z0, more1 );
  1091. add64( z0, more1, 0, z1, z0, z1 );
  1092. mul32To64( a0, b1, more1, more2 );
  1093. add64( more1, more2, 0, z2, more1, z2 );
  1094. add64( z0, z1, 0, more1, z0, z1 );
  1095. z3Ptr := z3;
  1096. z2Ptr := z2;
  1097. z1Ptr := z1;
  1098. z0Ptr := z0;
  1099. End;
  1100. {*
  1101. -------------------------------------------------------------------------------
  1102. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1103. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1104. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1105. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1106. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1107. unsigned integer is returned.
  1108. -------------------------------------------------------------------------------
  1109. *}
  1110. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1111. Var
  1112. b0, b1: bits32;
  1113. rem0, rem1, term0, term1: bits32;
  1114. z: bits32;
  1115. Begin
  1116. if ( b <= a0 ) then
  1117. Begin
  1118. estimateDiv64To32 := $FFFFFFFF;
  1119. exit;
  1120. End;
  1121. b0 := b shr 16;
  1122. if ( b0 shl 16 <= a0 ) then
  1123. z:= $FFFF0000
  1124. else
  1125. z:= ( a0 div b0 ) shl 16;
  1126. mul32To64( b, z, term0, term1 );
  1127. sub64( a0, a1, term0, term1, rem0, rem1 );
  1128. while ( ( sbits32 (rem0) ) < 0 ) do
  1129. Begin
  1130. z := z - $10000;
  1131. b1 := b shl 16;
  1132. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1133. End;
  1134. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1135. if ( b0 shl 16 <= rem0 ) then
  1136. z := z or $FFFF
  1137. else
  1138. z := z or (rem0 div b0);
  1139. estimateDiv64To32 := z;
  1140. End;
  1141. {*
  1142. -------------------------------------------------------------------------------
  1143. Returns an approximation to the square root of the 32-bit significand given
  1144. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1145. `aExp' (the least significant bit) is 1, the integer returned approximates
  1146. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1147. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1148. case, the approximation returned lies strictly within +/-2 of the exact
  1149. value.
  1150. -------------------------------------------------------------------------------
  1151. *}
  1152. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1153. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1154. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1155. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1156. );
  1157. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1158. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1159. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1160. );
  1161. Var
  1162. index: int8;
  1163. z: bits32;
  1164. Begin
  1165. index := ( a shr 27 ) AND 15;
  1166. if ( aExp AND 1 ) <> 0 then
  1167. Begin
  1168. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1169. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1170. a := a shr 1;
  1171. End
  1172. else
  1173. Begin
  1174. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1175. z := a div z + z;
  1176. if ( $20000 <= z ) then
  1177. z := $FFFF8000
  1178. else
  1179. z := ( z shl 15 );
  1180. if ( z <= a ) then
  1181. Begin
  1182. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1183. exit;
  1184. End;
  1185. End;
  1186. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1187. End;
  1188. {*
  1189. -------------------------------------------------------------------------------
  1190. Returns the number of leading 0 bits before the most-significant 1 bit of
  1191. `a'. If `a' is zero, 32 is returned.
  1192. -------------------------------------------------------------------------------
  1193. *}
  1194. Function countLeadingZeros32( a:bits32 ): int8;
  1195. const countLeadingZerosHigh:array[0..255] of int8 = (
  1196. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1197. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1198. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1199. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1200. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1201. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1202. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1203. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1204. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1205. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1206. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1207. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1208. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1209. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1210. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1211. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1212. );
  1213. Var
  1214. shiftCount: int8;
  1215. Begin
  1216. shiftCount := 0;
  1217. if ( a < $10000 ) then
  1218. Begin
  1219. shiftCount := shiftcount + 16;
  1220. a := a shl 16;
  1221. End;
  1222. if ( a < $1000000 ) then
  1223. Begin
  1224. shiftCount := shiftcount + 8;
  1225. a := a shl 8;
  1226. end;
  1227. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1228. countLeadingZeros32:= shiftCount;
  1229. End;
  1230. {*----------------------------------------------------------------------------
  1231. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1232. | `a'. If `a' is zero, 64 is returned.
  1233. *----------------------------------------------------------------------------*}
  1234. function countLeadingZeros64( a : bits64): int8;
  1235. var
  1236. shiftcount : int8;
  1237. Begin
  1238. shiftCount := 0;
  1239. if ( a < (bits64(1) shl 32 )) then
  1240. shiftCount := shiftcount + 32
  1241. else
  1242. a := a shr 32;
  1243. shiftCount := shiftCount + countLeadingZeros32( a );
  1244. countLeadingZeros64:= shiftCount;
  1245. End;
  1246. {*
  1247. -------------------------------------------------------------------------------
  1248. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1249. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1250. returns 0.
  1251. -------------------------------------------------------------------------------
  1252. *}
  1253. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1254. Begin
  1255. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1256. End;
  1257. {*
  1258. -------------------------------------------------------------------------------
  1259. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1260. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1261. Otherwise, returns 0.
  1262. -------------------------------------------------------------------------------
  1263. *}
  1264. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1265. Begin
  1266. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1267. End;
  1268. {*
  1269. -------------------------------------------------------------------------------
  1270. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1271. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1272. returns 0.
  1273. -------------------------------------------------------------------------------
  1274. *}
  1275. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1276. Begin
  1277. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1278. End;
  1279. {*
  1280. -------------------------------------------------------------------------------
  1281. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1282. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1283. returns 0.
  1284. -------------------------------------------------------------------------------
  1285. *}
  1286. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1287. Begin
  1288. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1289. End;
  1290. (*****************************************************************************)
  1291. (* End Low-Level arithmetic *)
  1292. (*****************************************************************************)
  1293. {*
  1294. -------------------------------------------------------------------------------
  1295. Functions and definitions to determine: (1) whether tininess for underflow
  1296. is detected before or after rounding by default, (2) what (if anything)
  1297. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1298. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1299. are propagated from function inputs to output. These details are ENDIAN
  1300. specific
  1301. -------------------------------------------------------------------------------
  1302. *}
  1303. {$IFDEF ENDIAN_LITTLE}
  1304. {*
  1305. -------------------------------------------------------------------------------
  1306. Internal canonical NaN format.
  1307. -------------------------------------------------------------------------------
  1308. *}
  1309. TYPE
  1310. commonNaNT = packed record
  1311. sign: flag;
  1312. high, low : bits32;
  1313. end;
  1314. {*
  1315. -------------------------------------------------------------------------------
  1316. The pattern for a default generated single-precision NaN.
  1317. -------------------------------------------------------------------------------
  1318. *}
  1319. const float32_default_nan = $FFC00000;
  1320. {*
  1321. -------------------------------------------------------------------------------
  1322. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1323. otherwise returns 0.
  1324. -------------------------------------------------------------------------------
  1325. *}
  1326. Function float32_is_nan( a : float32 ): flag;
  1327. Begin
  1328. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1329. End;
  1330. {*
  1331. -------------------------------------------------------------------------------
  1332. Returns 1 if the single-precision floating-point value `a' is a signaling
  1333. NaN; otherwise returns 0.
  1334. -------------------------------------------------------------------------------
  1335. *}
  1336. Function float32_is_signaling_nan( a : float32 ): flag;
  1337. Begin
  1338. float32_is_signaling_nan := flag
  1339. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1340. End;
  1341. {*
  1342. -------------------------------------------------------------------------------
  1343. Returns the result of converting the single-precision floating-point NaN
  1344. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1345. exception is raised.
  1346. -------------------------------------------------------------------------------
  1347. *}
  1348. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1349. var
  1350. z : commonNaNT ;
  1351. Begin
  1352. if ( float32_is_signaling_nan( a ) <> 0) then
  1353. float_raise( float_flag_invalid );
  1354. z.sign := a shr 31;
  1355. z.low := 0;
  1356. z.high := a shl 9;
  1357. c := z;
  1358. End;
  1359. {*
  1360. -------------------------------------------------------------------------------
  1361. Returns the result of converting the canonical NaN `a' to the single-
  1362. precision floating-point format.
  1363. -------------------------------------------------------------------------------
  1364. *}
  1365. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1366. Begin
  1367. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1368. End;
  1369. {*
  1370. -------------------------------------------------------------------------------
  1371. Takes two single-precision floating-point values `a' and `b', one of which
  1372. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1373. signaling NaN, the invalid exception is raised.
  1374. -------------------------------------------------------------------------------
  1375. *}
  1376. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1377. Var
  1378. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1379. label returnLargerSignificand;
  1380. Begin
  1381. aIsNaN := float32_is_nan( a );
  1382. aIsSignalingNaN := float32_is_signaling_nan( a );
  1383. bIsNaN := float32_is_nan( b );
  1384. bIsSignalingNaN := float32_is_signaling_nan( b );
  1385. a := a or $00400000;
  1386. b := b or $00400000;
  1387. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1388. float_raise( float_flag_invalid );
  1389. if ( aIsSignalingNaN )<> 0 then
  1390. Begin
  1391. if ( bIsSignalingNaN ) <> 0 then
  1392. goto returnLargerSignificand;
  1393. if bIsNan <> 0 then
  1394. propagateFloat32NaN := b
  1395. else
  1396. propagateFloat32NaN := a;
  1397. exit;
  1398. End
  1399. else if ( aIsNaN <> 0) then
  1400. Begin
  1401. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1402. Begin
  1403. propagateFloat32NaN := a;
  1404. exit;
  1405. End;
  1406. returnLargerSignificand:
  1407. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1408. Begin
  1409. propagateFloat32NaN := b;
  1410. exit;
  1411. End;
  1412. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1413. Begin
  1414. propagateFloat32NaN := a;
  1415. End;
  1416. if a < b then
  1417. propagateFloat32NaN := a
  1418. else
  1419. propagateFloat32NaN := b;
  1420. exit;
  1421. End
  1422. else
  1423. Begin
  1424. propagateFloat32NaN := b;
  1425. exit;
  1426. End;
  1427. End;
  1428. {*
  1429. -------------------------------------------------------------------------------
  1430. The pattern for a default generated double-precision NaN. The `high' and
  1431. `low' values hold the most- and least-significant bits, respectively.
  1432. -------------------------------------------------------------------------------
  1433. *}
  1434. const
  1435. float64_default_nan_high = $FFF80000;
  1436. float64_default_nan_low = $00000000;
  1437. {*
  1438. -------------------------------------------------------------------------------
  1439. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1440. otherwise returns 0.
  1441. -------------------------------------------------------------------------------
  1442. *}
  1443. Function float64_is_nan( a : float64 ) : flag;
  1444. Begin
  1445. float64_is_nan :=
  1446. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1447. and ( a.low or ( a.high and $000FFFFF ) );
  1448. End;
  1449. {*
  1450. -------------------------------------------------------------------------------
  1451. Returns 1 if the double-precision floating-point value `a' is a signaling
  1452. NaN; otherwise returns 0.
  1453. -------------------------------------------------------------------------------
  1454. *}
  1455. Function float64_is_signaling_nan( a : float64 ): flag;
  1456. Begin
  1457. float64_is_signaling_nan :=
  1458. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1459. and ( a.low or ( a.high and $0007FFFF ) );
  1460. End;
  1461. {*
  1462. -------------------------------------------------------------------------------
  1463. Returns the result of converting the double-precision floating-point NaN
  1464. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1465. exception is raised.
  1466. -------------------------------------------------------------------------------
  1467. *}
  1468. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1469. Var
  1470. z : commonNaNT;
  1471. Begin
  1472. if ( float64_is_signaling_nan( a )<>0 ) then
  1473. float_raise( float_flag_invalid );
  1474. z.sign := a.high shr 31;
  1475. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1476. c := z;
  1477. End;
  1478. {*
  1479. -------------------------------------------------------------------------------
  1480. Returns the result of converting the canonical NaN `a' to the double-
  1481. precision floating-point format.
  1482. -------------------------------------------------------------------------------
  1483. *}
  1484. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1485. Var
  1486. z: float64;
  1487. Begin
  1488. shift64Right( a.high, a.low, 12, z.high, z.low );
  1489. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1490. c := z;
  1491. End;
  1492. {*
  1493. -------------------------------------------------------------------------------
  1494. Takes two double-precision floating-point values `a' and `b', one of which
  1495. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1496. signaling NaN, the invalid exception is raised.
  1497. -------------------------------------------------------------------------------
  1498. *}
  1499. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1500. Var
  1501. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1502. label returnLargerSignificand;
  1503. Begin
  1504. aIsNaN := float64_is_nan( a );
  1505. aIsSignalingNaN := float64_is_signaling_nan( a );
  1506. bIsNaN := float64_is_nan( b );
  1507. bIsSignalingNaN := float64_is_signaling_nan( b );
  1508. a.high := a.high or $00080000;
  1509. b.high := b.high or $00080000;
  1510. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1511. float_raise( float_flag_invalid );
  1512. if ( aIsSignalingNaN )<>0 then
  1513. Begin
  1514. if ( bIsSignalingNaN )<>0 then
  1515. goto returnLargerSignificand;
  1516. if bIsNan <> 0 then
  1517. c := b
  1518. else
  1519. c := a;
  1520. exit;
  1521. End
  1522. else if ( aIsNaN )<> 0 then
  1523. Begin
  1524. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1525. Begin
  1526. c := a;
  1527. exit;
  1528. End;
  1529. returnLargerSignificand:
  1530. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1531. Begin
  1532. c := b;
  1533. exit;
  1534. End;
  1535. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1536. Begin
  1537. c := a;
  1538. exit;
  1539. End;
  1540. if a.high < b.high then
  1541. c := a
  1542. else
  1543. c := b;
  1544. exit;
  1545. End
  1546. else
  1547. Begin
  1548. c := b;
  1549. exit;
  1550. End;
  1551. End;
  1552. {*----------------------------------------------------------------------------
  1553. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1554. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1555. | returns 0.
  1556. *----------------------------------------------------------------------------*}
  1557. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1558. begin
  1559. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1560. end;
  1561. {*----------------------------------------------------------------------------
  1562. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1563. | otherwise returns 0.
  1564. *----------------------------------------------------------------------------*}
  1565. function float128_is_nan( a : float128): flag;
  1566. begin
  1567. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1568. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1569. end;
  1570. {*----------------------------------------------------------------------------
  1571. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1572. | signaling NaN; otherwise returns 0.
  1573. *----------------------------------------------------------------------------*}
  1574. function float128_is_signaling_nan( a : float128): flag;
  1575. begin
  1576. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1577. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1578. end;
  1579. {*----------------------------------------------------------------------------
  1580. | Returns the result of converting the quadruple-precision floating-point NaN
  1581. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1582. | exception is raised.
  1583. *----------------------------------------------------------------------------*}
  1584. function float128ToCommonNaN( a : float128): commonNaNT;
  1585. var
  1586. z: commonNaNT;
  1587. qhigh,qlow : qword;
  1588. begin
  1589. if ( float128_is_signaling_nan( a )<>0) then
  1590. float_raise( float_flag_invalid );
  1591. z.sign := a.high shr 63;
  1592. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1593. z.high:=qhigh shr 32;
  1594. z.low:=qhigh and $ffffffff;
  1595. result:=z;
  1596. end;
  1597. {*----------------------------------------------------------------------------
  1598. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1599. | precision floating-point format.
  1600. *----------------------------------------------------------------------------*}
  1601. function commonNaNToFloat128( a : commonNaNT): float128;
  1602. var
  1603. z: float128;
  1604. begin
  1605. shift128Right( a.high, a.low, 16, z.high, z.low );
  1606. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1607. result:=z;
  1608. end;
  1609. {*----------------------------------------------------------------------------
  1610. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1611. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1612. | `b' is a signaling NaN, the invalid exception is raised.
  1613. *----------------------------------------------------------------------------*}
  1614. function propagateFloat128NaN( a: float128; b : float128): float128;
  1615. var
  1616. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1617. label
  1618. returnLargerSignificand;
  1619. begin
  1620. aIsNaN := float128_is_nan( a );
  1621. aIsSignalingNaN := float128_is_signaling_nan( a );
  1622. bIsNaN := float128_is_nan( b );
  1623. bIsSignalingNaN := float128_is_signaling_nan( b );
  1624. a.high := a.high or int64( $0000800000000000 );
  1625. b.high := b.high or int64( $0000800000000000 );
  1626. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1627. float_raise( float_flag_invalid );
  1628. if ( aIsSignalingNaN )<>0 then
  1629. begin
  1630. if ( bIsSignalingNaN )<>0 then
  1631. goto returnLargerSignificand;
  1632. if bIsNaN<>0 then
  1633. result := b
  1634. else
  1635. result := a;
  1636. exit;
  1637. end
  1638. else if ( aIsNaN )<>0 then
  1639. begin
  1640. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1641. begin
  1642. result := a;
  1643. exit;
  1644. end;
  1645. returnLargerSignificand:
  1646. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1647. begin
  1648. result := b;
  1649. exit;
  1650. end;
  1651. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1652. begin
  1653. result := a;
  1654. exit
  1655. end;
  1656. if ( a.high < b.high ) then
  1657. result := a
  1658. else
  1659. result := b;
  1660. exit;
  1661. end
  1662. else
  1663. result:=b;
  1664. end;
  1665. {$ELSE}
  1666. { Big endian code }
  1667. (*----------------------------------------------------------------------------
  1668. | Internal canonical NaN format.
  1669. *----------------------------------------------------------------------------*)
  1670. type
  1671. commonNANT = packed record
  1672. sign : flag;
  1673. high, low : bits32;
  1674. end;
  1675. (*----------------------------------------------------------------------------
  1676. | The pattern for a default generated single-precision NaN.
  1677. *----------------------------------------------------------------------------*)
  1678. const float32_default_nan = $7FFFFFFF;
  1679. (*----------------------------------------------------------------------------
  1680. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1681. | otherwise returns 0.
  1682. *----------------------------------------------------------------------------*)
  1683. function float32_is_nan(a: float32): flag;
  1684. begin
  1685. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1686. end;
  1687. (*----------------------------------------------------------------------------
  1688. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1689. | NaN; otherwise returns 0.
  1690. *----------------------------------------------------------------------------*)
  1691. function float32_is_signaling_nan(a: float32):flag;
  1692. begin
  1693. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1694. end;
  1695. (*----------------------------------------------------------------------------
  1696. | Returns the result of converting the single-precision floating-point NaN
  1697. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1698. | exception is raised.
  1699. *----------------------------------------------------------------------------*)
  1700. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1701. var
  1702. z: commonNANT;
  1703. begin
  1704. if float32_is_signaling_nan(a)<>0 then
  1705. float_raise(float_flag_invalid);
  1706. z.sign := a shr 31;
  1707. z.low := 0;
  1708. z.high := a shl 9;
  1709. c:=z;
  1710. end;
  1711. (*----------------------------------------------------------------------------
  1712. | Returns the result of converting the canonical NaN `a' to the single-
  1713. | precision floating-point format.
  1714. *----------------------------------------------------------------------------*)
  1715. function CommonNanToFloat32(a : CommonNaNT): float32;
  1716. begin
  1717. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1718. end;
  1719. (*----------------------------------------------------------------------------
  1720. | Takes two single-precision floating-point values `a' and `b', one of which
  1721. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1722. | signaling NaN, the invalid exception is raised.
  1723. *----------------------------------------------------------------------------*)
  1724. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1725. var
  1726. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1727. begin
  1728. aIsNaN := float32_is_nan( a );
  1729. aIsSignalingNaN := float32_is_signaling_nan( a );
  1730. bIsNaN := float32_is_nan( b );
  1731. bIsSignalingNaN := float32_is_signaling_nan( b );
  1732. a := a or $00400000;
  1733. b := b or $00400000;
  1734. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1735. float_raise( float_flag_invalid );
  1736. if bIsSignalingNaN<>0 then
  1737. propagateFloat32Nan := b
  1738. else if aIsSignalingNan<>0 then
  1739. propagateFloat32Nan := a
  1740. else if bIsNan<>0 then
  1741. propagateFloat32Nan := b
  1742. else
  1743. propagateFloat32Nan := a;
  1744. end;
  1745. (*----------------------------------------------------------------------------
  1746. | The pattern for a default generated double-precision NaN. The `high' and
  1747. | `low' values hold the most- and least-significant bits, respectively.
  1748. *----------------------------------------------------------------------------*)
  1749. const
  1750. float64_default_nan_high = $7FFFFFFF;
  1751. float64_default_nan_low = $FFFFFFFF;
  1752. (*----------------------------------------------------------------------------
  1753. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  1754. | otherwise returns 0.
  1755. *----------------------------------------------------------------------------*)
  1756. function float64_is_nan(a: float64): flag;
  1757. begin
  1758. float64_is_nan := flag (
  1759. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1760. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  1761. end;
  1762. (*----------------------------------------------------------------------------
  1763. | Returns 1 if the double-precision floating-point value `a' is a signaling
  1764. | NaN; otherwise returns 0.
  1765. *----------------------------------------------------------------------------*)
  1766. function float64_is_signaling_nan( a:float64): flag;
  1767. begin
  1768. float64_is_signaling_nan := flag(
  1769. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1770. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  1771. end;
  1772. (*----------------------------------------------------------------------------
  1773. | Returns the result of converting the double-precision floating-point NaN
  1774. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1775. | exception is raised.
  1776. *----------------------------------------------------------------------------*)
  1777. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1778. var
  1779. z : commonNaNT;
  1780. begin
  1781. if ( float64_is_signaling_nan( a )<>0 ) then
  1782. float_raise( float_flag_invalid );
  1783. z.sign := a.high shr 31;
  1784. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1785. c:=z;
  1786. end;
  1787. (*----------------------------------------------------------------------------
  1788. | Returns the result of converting the canonical NaN `a' to the double-
  1789. | precision floating-point format.
  1790. *----------------------------------------------------------------------------*)
  1791. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1792. var
  1793. z: float64;
  1794. begin
  1795. shift64Right( a.high, a.low, 12, z.high, z.low );
  1796. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1797. c:=z;
  1798. end;
  1799. (*----------------------------------------------------------------------------
  1800. | Takes two double-precision floating-point values `a' and `b', one of which
  1801. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1802. | signaling NaN, the invalid exception is raised.
  1803. *----------------------------------------------------------------------------*)
  1804. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1805. var
  1806. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  1807. begin
  1808. aIsNaN := float64_is_nan( a );
  1809. aIsSignalingNaN := float64_is_signaling_nan( a );
  1810. bIsNaN := float64_is_nan( b );
  1811. bIsSignalingNaN := float64_is_signaling_nan( b );
  1812. a.high := a.high or $00080000;
  1813. b.high := b.high or $00080000;
  1814. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  1815. float_raise( float_flag_invalid );
  1816. if bIsSignalingNaN<>0 then
  1817. c := b
  1818. else if aIsSignalingNan<>0 then
  1819. c := a
  1820. else if bIsNan<>0 then
  1821. c := b
  1822. else
  1823. c := a;
  1824. end;
  1825. {$ENDIF}
  1826. (****************************************************************************)
  1827. (* END ENDIAN SPECIFIC CODE *)
  1828. (****************************************************************************)
  1829. {*
  1830. -------------------------------------------------------------------------------
  1831. Returns the fraction bits of the single-precision floating-point value `a'.
  1832. -------------------------------------------------------------------------------
  1833. *}
  1834. Function ExtractFloat32Frac(a : Float32) : Bits32;
  1835. Begin
  1836. ExtractFloat32Frac := A AND $007FFFFF;
  1837. End;
  1838. {*
  1839. -------------------------------------------------------------------------------
  1840. Returns the exponent bits of the single-precision floating-point value `a'.
  1841. -------------------------------------------------------------------------------
  1842. *}
  1843. Function extractFloat32Exp( a: float32 ): Int16;
  1844. Begin
  1845. extractFloat32Exp := (a shr 23) AND $FF;
  1846. End;
  1847. {*
  1848. -------------------------------------------------------------------------------
  1849. Returns the sign bit of the single-precision floating-point value `a'.
  1850. -------------------------------------------------------------------------------
  1851. *}
  1852. Function extractFloat32Sign( a: float32 ): Flag;
  1853. Begin
  1854. extractFloat32Sign := a shr 31;
  1855. End;
  1856. {*
  1857. -------------------------------------------------------------------------------
  1858. Normalizes the subnormal single-precision floating-point value represented
  1859. by the denormalized significand `aSig'. The normalized exponent and
  1860. significand are stored at the locations pointed to by `zExpPtr' and
  1861. `zSigPtr', respectively.
  1862. -------------------------------------------------------------------------------
  1863. *}
  1864. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  1865. Var
  1866. ShiftCount : BYTE;
  1867. Begin
  1868. shiftCount := countLeadingZeros32( aSig ) - 8;
  1869. zSigPtr := aSig shl shiftCount;
  1870. zExpPtr := 1 - shiftCount;
  1871. End;
  1872. {*
  1873. -------------------------------------------------------------------------------
  1874. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  1875. single-precision floating-point value, returning the result. After being
  1876. shifted into the proper positions, the three fields are simply added
  1877. together to form the result. This means that any integer portion of `zSig'
  1878. will be added into the exponent. Since a properly normalized significand
  1879. will have an integer portion equal to 1, the `zExp' input should be 1 less
  1880. than the desired result exponent whenever `zSig' is a complete, normalized
  1881. significand.
  1882. -------------------------------------------------------------------------------
  1883. *}
  1884. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  1885. Begin
  1886. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  1887. + zSig;
  1888. End;
  1889. {*
  1890. -------------------------------------------------------------------------------
  1891. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1892. and significand `zSig', and returns the proper single-precision floating-
  1893. point value corresponding to the abstract input. Ordinarily, the abstract
  1894. value is simply rounded and packed into the single-precision format, with
  1895. the inexact exception raised if the abstract input cannot be represented
  1896. exactly. However, if the abstract value is too large, the overflow and
  1897. inexact exceptions are raised and an infinity or maximal finite value is
  1898. returned. If the abstract value is too small, the input value is rounded to
  1899. a subnormal number, and the underflow and inexact exceptions are raised if
  1900. the abstract input cannot be represented exactly as a subnormal single-
  1901. precision floating-point number.
  1902. The input significand `zSig' has its binary point between bits 30
  1903. and 29, which is 7 bits to the left of the usual location. This shifted
  1904. significand must be normalized or smaller. If `zSig' is not normalized,
  1905. `zExp' must be 0; in that case, the result returned is a subnormal number,
  1906. and it must not require rounding. In the usual case that `zSig' is
  1907. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  1908. The handling of underflow and overflow follows the IEC/IEEE Standard for
  1909. Binary Floating-Point Arithmetic.
  1910. -------------------------------------------------------------------------------
  1911. *}
  1912. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  1913. Var
  1914. roundingMode : BYTE;
  1915. roundNearestEven : Flag;
  1916. roundIncrement, roundBits : BYTE;
  1917. IsTiny : Flag;
  1918. Begin
  1919. roundingMode := float_rounding_mode;
  1920. if (roundingMode = float_round_nearest_even) then
  1921. Begin
  1922. roundNearestEven := Flag(TRUE);
  1923. end
  1924. else
  1925. roundNearestEven := Flag(FALSE);
  1926. roundIncrement := $40;
  1927. if ( Boolean(roundNearestEven) = FALSE) then
  1928. Begin
  1929. if ( roundingMode = float_round_to_zero ) Then
  1930. Begin
  1931. roundIncrement := 0;
  1932. End
  1933. else
  1934. Begin
  1935. roundIncrement := $7F;
  1936. if ( zSign <> 0 ) then
  1937. Begin
  1938. if roundingMode = float_round_up then roundIncrement := 0;
  1939. End
  1940. else
  1941. Begin
  1942. if roundingMode = float_round_down then roundIncrement := 0;
  1943. End;
  1944. End
  1945. End;
  1946. roundBits := zSig AND $7F;
  1947. if ($FD <= bits16 (zExp) ) then
  1948. Begin
  1949. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  1950. Begin
  1951. float_raise( float_flag_overflow OR float_flag_inexact );
  1952. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  1953. exit;
  1954. End;
  1955. if ( zExp < 0 ) then
  1956. Begin
  1957. isTiny :=
  1958. flag(( float_detect_tininess = float_tininess_before_rounding )
  1959. OR ( zExp < -1 )
  1960. OR ( (zSig + roundIncrement) < $80000000 ));
  1961. shift32RightJamming( zSig, - zExp, zSig );
  1962. zExp := 0;
  1963. roundBits := zSig AND $7F;
  1964. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  1965. float_raise( float_flag_underflow );
  1966. End;
  1967. End;
  1968. if ( roundBits )<> 0 then
  1969. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  1970. zSig := ( zSig + roundIncrement ) shr 7;
  1971. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  1972. if ( zSig = 0 ) then zExp := 0;
  1973. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  1974. exit;
  1975. End;
  1976. {*
  1977. -------------------------------------------------------------------------------
  1978. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1979. and significand `zSig', and returns the proper single-precision floating-
  1980. point value corresponding to the abstract input. This routine is just like
  1981. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  1982. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  1983. floating-point exponent.
  1984. -------------------------------------------------------------------------------
  1985. *}
  1986. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  1987. Var
  1988. ShiftCount : int8;
  1989. Begin
  1990. shiftCount := countLeadingZeros32( zSig ) - 1;
  1991. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  1992. End;
  1993. {*
  1994. -------------------------------------------------------------------------------
  1995. Returns the least-significant 32 fraction bits of the double-precision
  1996. floating-point value `a'.
  1997. -------------------------------------------------------------------------------
  1998. *}
  1999. Function extractFloat64Frac( a: float64 ): bits32;
  2000. Begin
  2001. extractFloat64Frac := a.low;
  2002. End;
  2003. {*
  2004. -------------------------------------------------------------------------------
  2005. Returns the most-significant 20 fraction bits of the double-precision
  2006. floating-point value `a'.
  2007. -------------------------------------------------------------------------------
  2008. *}
  2009. Function extractFloat64Frac0(a: float64): bits32;
  2010. Begin
  2011. extractFloat64Frac0 := a.high and $000FFFFF;
  2012. End;
  2013. {*
  2014. -------------------------------------------------------------------------------
  2015. Returns the least-significant 32 fraction bits of the double-precision
  2016. floating-point value `a'.
  2017. -------------------------------------------------------------------------------
  2018. *}
  2019. Function extractFloat64Frac1(a: float64): bits32;
  2020. Begin
  2021. extractFloat64Frac1 := a.low;
  2022. End;
  2023. {*
  2024. -------------------------------------------------------------------------------
  2025. Returns the exponent bits of the double-precision floating-point value `a'.
  2026. -------------------------------------------------------------------------------
  2027. *}
  2028. Function extractFloat64Exp(a: float64): int16;
  2029. Begin
  2030. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2031. End;
  2032. {*
  2033. -------------------------------------------------------------------------------
  2034. Returns the sign bit of the double-precision floating-point value `a'.
  2035. -------------------------------------------------------------------------------
  2036. *}
  2037. Function extractFloat64Sign(a: float64) : flag;
  2038. Begin
  2039. extractFloat64Sign := a.high shr 31;
  2040. End;
  2041. {*
  2042. -------------------------------------------------------------------------------
  2043. Normalizes the subnormal double-precision floating-point value represented
  2044. by the denormalized significand formed by the concatenation of `aSig0' and
  2045. `aSig1'. The normalized exponent is stored at the location pointed to by
  2046. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2047. stored at the location pointed to by `zSig0Ptr', and the least significant
  2048. 32 bits of the normalized significand are stored at the location pointed to
  2049. by `zSig1Ptr'.
  2050. -------------------------------------------------------------------------------
  2051. *}
  2052. Procedure normalizeFloat64Subnormal(
  2053. aSig0: bits32;
  2054. aSig1: bits32;
  2055. VAR zExpPtr : Int16;
  2056. VAR zSig0Ptr : Bits32;
  2057. VAR zSig1Ptr : Bits32
  2058. );
  2059. Var
  2060. ShiftCount : Int8;
  2061. Begin
  2062. if ( aSig0 = 0 ) then
  2063. Begin
  2064. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2065. if ( shiftCount < 0 ) then
  2066. Begin
  2067. zSig0Ptr := aSig1 shr ( - shiftCount );
  2068. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2069. End
  2070. else
  2071. Begin
  2072. zSig0Ptr := aSig1 shl shiftCount;
  2073. zSig1Ptr := 0;
  2074. End;
  2075. zExpPtr := - shiftCount - 31;
  2076. End
  2077. else
  2078. Begin
  2079. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2080. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2081. zExpPtr := 1 - shiftCount;
  2082. End;
  2083. End;
  2084. {*
  2085. -------------------------------------------------------------------------------
  2086. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2087. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2088. point value, returning the result. After being shifted into the proper
  2089. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2090. together to form the most significant 32 bits of the result. This means
  2091. that any integer portion of `zSig0' will be added into the exponent. Since
  2092. a properly normalized significand will have an integer portion equal to 1,
  2093. the `zExp' input should be 1 less than the desired result exponent whenever
  2094. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2095. -------------------------------------------------------------------------------
  2096. *}
  2097. Procedure
  2098. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2099. var
  2100. z: Float64;
  2101. Begin
  2102. z.low := zSig1;
  2103. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2104. c := z;
  2105. End;
  2106. {*----------------------------------------------------------------------------
  2107. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2108. | double-precision floating-point value, returning the result. After being
  2109. | shifted into the proper positions, the three fields are simply added
  2110. | together to form the result. This means that any integer portion of `zSig'
  2111. | will be added into the exponent. Since a properly normalized significand
  2112. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2113. | than the desired result exponent whenever `zSig' is a complete, normalized
  2114. | significand.
  2115. *----------------------------------------------------------------------------*}
  2116. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2117. begin
  2118. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2119. end;
  2120. {*
  2121. -------------------------------------------------------------------------------
  2122. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2123. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2124. and `zSig2', and returns the proper double-precision floating-point value
  2125. corresponding to the abstract input. Ordinarily, the abstract value is
  2126. simply rounded and packed into the double-precision format, with the inexact
  2127. exception raised if the abstract input cannot be represented exactly.
  2128. However, if the abstract value is too large, the overflow and inexact
  2129. exceptions are raised and an infinity or maximal finite value is returned.
  2130. If the abstract value is too small, the input value is rounded to a
  2131. subnormal number, and the underflow and inexact exceptions are raised if the
  2132. abstract input cannot be represented exactly as a subnormal double-precision
  2133. floating-point number.
  2134. The input significand must be normalized or smaller. If the input
  2135. significand is not normalized, `zExp' must be 0; in that case, the result
  2136. returned is a subnormal number, and it must not require rounding. In the
  2137. usual case that the input significand is normalized, `zExp' must be 1 less
  2138. than the ``true'' floating-point exponent. The handling of underflow and
  2139. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2140. -------------------------------------------------------------------------------
  2141. *}
  2142. Procedure
  2143. roundAndPackFloat64(
  2144. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2145. Var
  2146. roundingMode : Int8;
  2147. roundNearestEven, increment, isTiny : Flag;
  2148. Begin
  2149. roundingMode := float_rounding_mode;
  2150. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2151. increment := flag( sbits32 (zSig2) < 0 );
  2152. if ( roundNearestEven = flag(FALSE) ) then
  2153. Begin
  2154. if ( roundingMode = float_round_to_zero ) then
  2155. increment := 0
  2156. else
  2157. Begin
  2158. if ( zSign )<> 0 then
  2159. Begin
  2160. increment := flag( roundingMode = float_round_down ) and zSig2;
  2161. End
  2162. else
  2163. Begin
  2164. increment := flag( roundingMode = float_round_up ) and zSig2;
  2165. End
  2166. End
  2167. End;
  2168. if ( $7FD <= bits16 (zExp) ) then
  2169. Begin
  2170. if (( $7FD < zExp )
  2171. or (( zExp = $7FD )
  2172. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2173. and (increment<>0)
  2174. )
  2175. ) then
  2176. Begin
  2177. float_raise( float_flag_overflow OR float_flag_inexact );
  2178. if (( roundingMode = float_round_to_zero )
  2179. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2180. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2181. ) then
  2182. Begin
  2183. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2184. exit;
  2185. End;
  2186. packFloat64( zSign, $7FF, 0, 0, c );
  2187. exit;
  2188. End;
  2189. if ( zExp < 0 ) then
  2190. Begin
  2191. isTiny :=
  2192. flag( float_detect_tininess = float_tininess_before_rounding )
  2193. or flag( zExp < -1 )
  2194. or flag(increment = 0)
  2195. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2196. shift64ExtraRightJamming(
  2197. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2198. zExp := 0;
  2199. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2200. if ( roundNearestEven )<>0 then
  2201. Begin
  2202. increment := flag( sbits32 (zSig2) < 0 );
  2203. End
  2204. else
  2205. Begin
  2206. if ( zSign )<>0 then
  2207. Begin
  2208. increment := flag( roundingMode = float_round_down ) and zSig2;
  2209. End
  2210. else
  2211. Begin
  2212. increment := flag( roundingMode = float_round_up ) and zSig2;
  2213. End
  2214. End;
  2215. End;
  2216. End;
  2217. if ( zSig2 )<>0 then
  2218. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2219. if ( increment )<>0 then
  2220. Begin
  2221. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2222. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2223. End
  2224. else
  2225. Begin
  2226. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2227. End;
  2228. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2229. End;
  2230. {*----------------------------------------------------------------------------
  2231. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2232. | and significand `zSig', and returns the proper double-precision floating-
  2233. | point value corresponding to the abstract input. Ordinarily, the abstract
  2234. | value is simply rounded and packed into the double-precision format, with
  2235. | the inexact exception raised if the abstract input cannot be represented
  2236. | exactly. However, if the abstract value is too large, the overflow and
  2237. | inexact exceptions are raised and an infinity or maximal finite value is
  2238. | returned. If the abstract value is too small, the input value is rounded
  2239. | to a subnormal number, and the underflow and inexact exceptions are raised
  2240. | if the abstract input cannot be represented exactly as a subnormal double-
  2241. | precision floating-point number.
  2242. | The input significand `zSig' has its binary point between bits 62
  2243. | and 61, which is 10 bits to the left of the usual location. This shifted
  2244. | significand must be normalized or smaller. If `zSig' is not normalized,
  2245. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2246. | and it must not require rounding. In the usual case that `zSig' is
  2247. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2248. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2249. | Binary Floating-Point Arithmetic.
  2250. *----------------------------------------------------------------------------*}
  2251. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2252. var
  2253. roundingMode: int8;
  2254. roundNearestEven: flag;
  2255. roundIncrement, roundBits: int16;
  2256. isTiny: flag;
  2257. begin
  2258. roundingMode := float_rounding_mode;
  2259. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2260. roundIncrement := $200;
  2261. if ( roundNearestEven=0 ) then
  2262. begin
  2263. if ( roundingMode = float_round_to_zero ) then
  2264. begin
  2265. roundIncrement := 0;
  2266. end
  2267. else begin
  2268. roundIncrement := $3FF;
  2269. if ( zSign<>0 ) then
  2270. begin
  2271. if ( roundingMode = float_round_up ) then
  2272. roundIncrement := 0;
  2273. end
  2274. else begin
  2275. if ( roundingMode = float_round_down ) then
  2276. roundIncrement := 0;
  2277. end
  2278. end
  2279. end;
  2280. roundBits := zSig and $3FF;
  2281. if ( $7FD <= bits16(zExp) ) then
  2282. begin
  2283. if ( ( $7FD < zExp )
  2284. or ( ( zExp = $7FD )
  2285. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2286. ) then
  2287. begin
  2288. float_raise( float_flag_overflow or float_flag_inexact );
  2289. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2290. exit;
  2291. end;
  2292. if ( zExp < 0 ) then
  2293. begin
  2294. isTiny := ord(
  2295. ( float_detect_tininess = float_tininess_before_rounding )
  2296. or ( zExp < -1 )
  2297. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2298. shift64RightJamming( zSig, - zExp, zSig );
  2299. zExp := 0;
  2300. roundBits := zSig and $3FF;
  2301. if ( isTiny and roundBits )<>0 then
  2302. float_raise( float_flag_underflow );
  2303. end
  2304. end;
  2305. if ( roundBits<>0 ) then
  2306. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2307. zSig := ( zSig + roundIncrement ) shr 10;
  2308. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2309. if ( zSig = 0 ) then
  2310. zExp := 0;
  2311. result:=packFloat64( zSign, zExp, zSig );
  2312. end;
  2313. {*
  2314. -------------------------------------------------------------------------------
  2315. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2316. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2317. returns the proper double-precision floating-point value corresponding
  2318. to the abstract input. This routine is just like `roundAndPackFloat64'
  2319. except that the input significand has fewer bits and does not have to be
  2320. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2321. point exponent.
  2322. -------------------------------------------------------------------------------
  2323. *}
  2324. Procedure
  2325. normalizeRoundAndPackFloat64(
  2326. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2327. Var
  2328. shiftCount : int8;
  2329. zSig2 : bits32;
  2330. Begin
  2331. if ( zSig0 = 0 ) then
  2332. Begin
  2333. zSig0 := zSig1;
  2334. zSig1 := 0;
  2335. zExp := zExp -32;
  2336. End;
  2337. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2338. if ( 0 <= shiftCount ) then
  2339. Begin
  2340. zSig2 := 0;
  2341. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2342. End
  2343. else
  2344. Begin
  2345. shift64ExtraRightJamming
  2346. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2347. End;
  2348. zExp := zExp - shiftCount;
  2349. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2350. End;
  2351. {*
  2352. -------------------------------------------------------------------------------
  2353. Returns the result of converting the 32-bit two's complement integer `a' to
  2354. the single-precision floating-point format. The conversion is performed
  2355. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2356. -------------------------------------------------------------------------------
  2357. *}
  2358. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2359. Var
  2360. zSign : Flag;
  2361. Begin
  2362. if ( a = 0 ) then
  2363. Begin
  2364. int32_to_float32.float32 := 0;
  2365. exit;
  2366. End;
  2367. if ( a = sbits32 ($80000000) ) then
  2368. Begin
  2369. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2370. exit;
  2371. end;
  2372. zSign := flag( a < 0 );
  2373. If zSign<>0 then
  2374. a := -a;
  2375. int32_to_float32.float32:=
  2376. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2377. End;
  2378. {*
  2379. -------------------------------------------------------------------------------
  2380. Returns the result of converting the 32-bit two's complement integer `a' to
  2381. the double-precision floating-point format. The conversion is performed
  2382. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2383. -------------------------------------------------------------------------------
  2384. *}
  2385. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2386. var
  2387. zSign : flag;
  2388. absA : bits32;
  2389. shiftCount : int8;
  2390. zSig0, zSig1 : bits32;
  2391. Begin
  2392. if ( a = 0 ) then
  2393. Begin
  2394. packFloat64( 0, 0, 0, 0, result );
  2395. exit;
  2396. end;
  2397. zSign := flag( a < 0 );
  2398. if ZSign<>0 then
  2399. AbsA := -a
  2400. else
  2401. AbsA := a;
  2402. shiftCount := countLeadingZeros32( absA ) - 11;
  2403. if ( 0 <= shiftCount ) then
  2404. Begin
  2405. zSig0 := absA shl shiftCount;
  2406. zSig1 := 0;
  2407. End
  2408. else
  2409. Begin
  2410. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2411. End;
  2412. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2413. End;
  2414. {*
  2415. -------------------------------------------------------------------------------
  2416. Returns the result of converting the single-precision floating-point value
  2417. `a' to the 32-bit two's complement integer format. The conversion is
  2418. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2419. Arithmetic---which means in particular that the conversion is rounded
  2420. according to the current rounding mode. If `a' is a NaN, the largest
  2421. positive integer is returned. Otherwise, if the conversion overflows, the
  2422. largest integer with the same sign as `a' is returned.
  2423. -------------------------------------------------------------------------------
  2424. *}
  2425. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2426. Var
  2427. aSign: flag;
  2428. aExp, shiftCount: int16;
  2429. aSig, aSigExtra: bits32;
  2430. z: int32;
  2431. roundingMode: int8;
  2432. Begin
  2433. aSig := extractFloat32Frac( a.float32 );
  2434. aExp := extractFloat32Exp( a.float32 );
  2435. aSign := extractFloat32Sign( a.float32 );
  2436. shiftCount := aExp - $96;
  2437. if ( 0 <= shiftCount ) then
  2438. Begin
  2439. if ( $9E <= aExp ) then
  2440. Begin
  2441. if ( a.float32 <> $CF000000 ) then
  2442. Begin
  2443. float_raise( float_flag_invalid );
  2444. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2445. Begin
  2446. float32_to_int32 := $7FFFFFFF;
  2447. exit;
  2448. End;
  2449. End;
  2450. float32_to_int32 := sbits32 ($80000000);
  2451. exit;
  2452. End;
  2453. z := ( aSig or $00800000 ) shl shiftCount;
  2454. if ( aSign<>0 ) then z := - z;
  2455. End
  2456. else
  2457. Begin
  2458. if ( aExp < $7E ) then
  2459. Begin
  2460. aSigExtra := aExp OR aSig;
  2461. z := 0;
  2462. End
  2463. else
  2464. Begin
  2465. aSig := aSig OR $00800000;
  2466. aSigExtra := aSig shl ( shiftCount and 31 );
  2467. z := aSig shr ( - shiftCount );
  2468. End;
  2469. if ( aSigExtra<>0 ) then
  2470. softfloat_exception_flags := softfloat_exception_flags
  2471. or float_flag_inexact;
  2472. roundingMode := float_rounding_mode;
  2473. if ( roundingMode = float_round_nearest_even ) then
  2474. Begin
  2475. if ( sbits32 (aSigExtra) < 0 ) then
  2476. Begin
  2477. Inc(z);
  2478. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2479. z := z and not 1;
  2480. End;
  2481. if ( aSign<>0 ) then
  2482. z := - z;
  2483. End
  2484. else
  2485. Begin
  2486. aSigExtra := flag( aSigExtra <> 0 );
  2487. if ( aSign<>0 ) then
  2488. Begin
  2489. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2490. z := - z;
  2491. End
  2492. else
  2493. Begin
  2494. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2495. End
  2496. End;
  2497. End;
  2498. float32_to_int32 := z;
  2499. End;
  2500. {*
  2501. -------------------------------------------------------------------------------
  2502. Returns the result of converting the single-precision floating-point value
  2503. `a' to the 32-bit two's complement integer format. The conversion is
  2504. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2505. Arithmetic, except that the conversion is always rounded toward zero.
  2506. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2507. the conversion overflows, the largest integer with the same sign as `a' is
  2508. returned.
  2509. -------------------------------------------------------------------------------
  2510. *}
  2511. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2512. Var
  2513. aSign : flag;
  2514. aExp, shiftCount : int16;
  2515. aSig : bits32;
  2516. z : int32;
  2517. Begin
  2518. aSig := extractFloat32Frac( a.float32 );
  2519. aExp := extractFloat32Exp( a.float32 );
  2520. aSign := extractFloat32Sign( a.float32 );
  2521. shiftCount := aExp - $9E;
  2522. if ( 0 <= shiftCount ) then
  2523. Begin
  2524. if ( a.float32 <> $CF000000 ) then
  2525. Begin
  2526. float_raise( float_flag_invalid );
  2527. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2528. Begin
  2529. float32_to_int32_round_to_zero := $7FFFFFFF;
  2530. exit;
  2531. end;
  2532. End;
  2533. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2534. exit;
  2535. End
  2536. else
  2537. if ( aExp <= $7E ) then
  2538. Begin
  2539. if ( aExp or aSig )<>0 then
  2540. softfloat_exception_flags :=
  2541. softfloat_exception_flags or float_flag_inexact;
  2542. float32_to_int32_round_to_zero := 0;
  2543. exit;
  2544. End;
  2545. aSig := ( aSig or $00800000 ) shl 8;
  2546. z := aSig shr ( - shiftCount );
  2547. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2548. Begin
  2549. softfloat_exception_flags :=
  2550. softfloat_exception_flags or float_flag_inexact;
  2551. End;
  2552. if ( aSign<>0 ) then z := - z;
  2553. float32_to_int32_round_to_zero := z;
  2554. End;
  2555. {*
  2556. -------------------------------------------------------------------------------
  2557. Returns the result of converting the single-precision floating-point value
  2558. `a' to the double-precision floating-point format. The conversion is
  2559. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2560. Arithmetic.
  2561. -------------------------------------------------------------------------------
  2562. *}
  2563. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2564. Var
  2565. aSign : flag;
  2566. aExp : int16;
  2567. aSig, zSig0, zSig1: bits32;
  2568. tmp : CommonNanT;
  2569. Begin
  2570. aSig := extractFloat32Frac( a.float32 );
  2571. aExp := extractFloat32Exp( a.float32 );
  2572. aSign := extractFloat32Sign( a.float32 );
  2573. if ( aExp = $FF ) then
  2574. Begin
  2575. if ( aSig<>0 ) then
  2576. Begin
  2577. float32ToCommonNaN(a.float32, tmp);
  2578. commonNaNToFloat64(tmp , result);
  2579. exit;
  2580. End;
  2581. packFloat64( aSign, $7FF, 0, 0, result);
  2582. exit;
  2583. End;
  2584. if ( aExp = 0 ) then
  2585. Begin
  2586. if ( aSig = 0 ) then
  2587. Begin
  2588. packFloat64( aSign, 0, 0, 0, result );
  2589. exit;
  2590. end;
  2591. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2592. Dec(aExp);
  2593. End;
  2594. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2595. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2596. End;
  2597. {*
  2598. -------------------------------------------------------------------------------
  2599. Rounds the single-precision floating-point value `a' to an integer,
  2600. and returns the result as a single-precision floating-point value. The
  2601. operation is performed according to the IEC/IEEE Standard for Binary
  2602. Floating-Point Arithmetic.
  2603. -------------------------------------------------------------------------------
  2604. *}
  2605. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2606. Var
  2607. aSign: flag;
  2608. aExp: int16;
  2609. lastBitMask, roundBitsMask: bits32;
  2610. roundingMode: int8;
  2611. z: float32;
  2612. Begin
  2613. aExp := extractFloat32Exp( a.float32 );
  2614. if ( $96 <= aExp ) then
  2615. Begin
  2616. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2617. Begin
  2618. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2619. exit;
  2620. End;
  2621. float32_round_to_int:=a;
  2622. exit;
  2623. End;
  2624. if ( aExp <= $7E ) then
  2625. Begin
  2626. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2627. Begin
  2628. float32_round_to_int:=a;
  2629. exit;
  2630. end;
  2631. softfloat_exception_flags
  2632. := softfloat_exception_flags OR float_flag_inexact;
  2633. aSign := extractFloat32Sign( a.float32 );
  2634. case ( float_rounding_mode ) of
  2635. float_round_nearest_even:
  2636. Begin
  2637. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2638. Begin
  2639. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2640. exit;
  2641. End;
  2642. End;
  2643. float_round_down:
  2644. Begin
  2645. if aSign <> 0 then
  2646. float32_round_to_int.float32 := $BF800000
  2647. else
  2648. float32_round_to_int.float32 := 0;
  2649. exit;
  2650. End;
  2651. float_round_up:
  2652. Begin
  2653. if aSign <> 0 then
  2654. float32_round_to_int.float32 := $80000000
  2655. else
  2656. float32_round_to_int.float32 := $3F800000;
  2657. exit;
  2658. End;
  2659. end;
  2660. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2661. End;
  2662. lastBitMask := 1;
  2663. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2664. lastBitMask := lastBitMask shl ($96 - aExp);
  2665. roundBitsMask := lastBitMask - 1;
  2666. z := a.float32;
  2667. roundingMode := float_rounding_mode;
  2668. if ( roundingMode = float_round_nearest_even ) then
  2669. Begin
  2670. z := z + (lastBitMask shr 1);
  2671. if ( ( z and roundBitsMask ) = 0 ) then
  2672. z := z and not lastBitMask;
  2673. End
  2674. else if ( roundingMode <> float_round_to_zero ) then
  2675. Begin
  2676. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2677. Begin
  2678. z := z + roundBitsMask;
  2679. End;
  2680. End;
  2681. z := z and not roundBitsMask;
  2682. if ( z <> a.float32 ) then
  2683. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2684. float32_round_to_int.float32 := z;
  2685. End;
  2686. {*
  2687. -------------------------------------------------------------------------------
  2688. Returns the result of adding the absolute values of the single-precision
  2689. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2690. before being returned. `zSign' is ignored if the result is a NaN.
  2691. The addition is performed according to the IEC/IEEE Standard for Binary
  2692. Floating-Point Arithmetic.
  2693. -------------------------------------------------------------------------------
  2694. *}
  2695. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2696. Var
  2697. aExp, bExp, zExp: int16;
  2698. aSig, bSig, zSig: bits32;
  2699. expDiff: int16;
  2700. label roundAndPack;
  2701. Begin
  2702. aSig:=extractFloat32Frac( a );
  2703. aExp:=extractFloat32Exp( a );
  2704. bSig:=extractFloat32Frac( b );
  2705. bExp := extractFloat32Exp( b );
  2706. expDiff := aExp - bExp;
  2707. aSig := aSig shl 6;
  2708. bSig := bSig shl 6;
  2709. if ( 0 < expDiff ) then
  2710. Begin
  2711. if ( aExp = $FF ) then
  2712. Begin
  2713. if ( aSig <> 0) then
  2714. Begin
  2715. addFloat32Sigs := propagateFloat32NaN( a, b );
  2716. exit;
  2717. End;
  2718. addFloat32Sigs := a;
  2719. exit;
  2720. End;
  2721. if ( bExp = 0 ) then
  2722. Begin
  2723. Dec(expDiff);
  2724. End
  2725. else
  2726. Begin
  2727. bSig := bSig or $20000000;
  2728. End;
  2729. shift32RightJamming( bSig, expDiff, bSig );
  2730. zExp := aExp;
  2731. End
  2732. else
  2733. If ( expDiff < 0 ) then
  2734. Begin
  2735. if ( bExp = $FF ) then
  2736. Begin
  2737. if ( bSig<>0 ) then
  2738. Begin
  2739. addFloat32Sigs := propagateFloat32NaN( a, b );
  2740. exit;
  2741. end;
  2742. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  2743. exit;
  2744. End;
  2745. if ( aExp = 0 ) then
  2746. Begin
  2747. Inc(expDiff);
  2748. End
  2749. else
  2750. Begin
  2751. aSig := aSig OR $20000000;
  2752. End;
  2753. shift32RightJamming( aSig, - expDiff, aSig );
  2754. zExp := bExp;
  2755. End
  2756. else
  2757. Begin
  2758. if ( aExp = $FF ) then
  2759. Begin
  2760. if ( aSig OR bSig )<> 0 then
  2761. Begin
  2762. addFloat32Sigs := propagateFloat32NaN( a, b );
  2763. exit;
  2764. end;
  2765. addFloat32Sigs := a;
  2766. exit;
  2767. End;
  2768. if ( aExp = 0 ) then
  2769. Begin
  2770. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  2771. exit;
  2772. end;
  2773. zSig := $40000000 + aSig + bSig;
  2774. zExp := aExp;
  2775. goto roundAndPack;
  2776. End;
  2777. aSig := aSig OR $20000000;
  2778. zSig := ( aSig + bSig ) shl 1;
  2779. Dec(zExp);
  2780. if ( sbits32 (zSig) < 0 ) then
  2781. Begin
  2782. zSig := aSig + bSig;
  2783. Inc(zExp);
  2784. End;
  2785. roundAndPack:
  2786. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  2787. End;
  2788. {*
  2789. -------------------------------------------------------------------------------
  2790. Returns the result of subtracting the absolute values of the single-
  2791. precision floating-point values `a' and `b'. If `zSign' is 1, the
  2792. difference is negated before being returned. `zSign' is ignored if the
  2793. result is a NaN. The subtraction is performed according to the IEC/IEEE
  2794. Standard for Binary Floating-Point Arithmetic.
  2795. -------------------------------------------------------------------------------
  2796. *}
  2797. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  2798. Var
  2799. aExp, bExp, zExp: int16;
  2800. aSig, bSig, zSig: bits32;
  2801. expDiff : int16;
  2802. label aExpBigger;
  2803. label bExpBigger;
  2804. label aBigger;
  2805. label bBigger;
  2806. label normalizeRoundAndPack;
  2807. Begin
  2808. aSig := extractFloat32Frac( a );
  2809. aExp := extractFloat32Exp( a );
  2810. bSig := extractFloat32Frac( b );
  2811. bExp := extractFloat32Exp( b );
  2812. expDiff := aExp - bExp;
  2813. aSig := aSig shl 7;
  2814. bSig := bSig shl 7;
  2815. if ( 0 < expDiff ) then goto aExpBigger;
  2816. if ( expDiff < 0 ) then goto bExpBigger;
  2817. if ( aExp = $FF ) then
  2818. Begin
  2819. if ( aSig OR bSig )<> 0 then
  2820. Begin
  2821. subFloat32Sigs := propagateFloat32NaN( a, b );
  2822. exit;
  2823. End;
  2824. float_raise( float_flag_invalid );
  2825. subFloat32Sigs := float32_default_nan;
  2826. exit;
  2827. End;
  2828. if ( aExp = 0 ) then
  2829. Begin
  2830. aExp := 1;
  2831. bExp := 1;
  2832. End;
  2833. if ( bSig < aSig ) Then goto aBigger;
  2834. if ( aSig < bSig ) Then goto bBigger;
  2835. subFloat32Sigs := packFloat32( flag(float_rounding_mode = float_round_down), 0, 0 );
  2836. exit;
  2837. bExpBigger:
  2838. if ( bExp = $FF ) then
  2839. Begin
  2840. if ( bSig<>0 ) then
  2841. Begin
  2842. subFloat32Sigs := propagateFloat32NaN( a, b );
  2843. exit;
  2844. End;
  2845. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  2846. exit;
  2847. End;
  2848. if ( aExp = 0 ) then
  2849. Begin
  2850. Inc(expDiff);
  2851. End
  2852. else
  2853. Begin
  2854. aSig := aSig OR $40000000;
  2855. End;
  2856. shift32RightJamming( aSig, - expDiff, aSig );
  2857. bSig := bSig OR $40000000;
  2858. bBigger:
  2859. zSig := bSig - aSig;
  2860. zExp := bExp;
  2861. zSign := zSign xor 1;
  2862. goto normalizeRoundAndPack;
  2863. aExpBigger:
  2864. if ( aExp = $FF ) then
  2865. Begin
  2866. if ( aSig <> 0) then
  2867. Begin
  2868. subFloat32Sigs := propagateFloat32NaN( a, b );
  2869. exit;
  2870. End;
  2871. subFloat32Sigs := a;
  2872. exit;
  2873. End;
  2874. if ( bExp = 0 ) then
  2875. Begin
  2876. Dec(expDiff);
  2877. End
  2878. else
  2879. Begin
  2880. bSig := bSig OR $40000000;
  2881. End;
  2882. shift32RightJamming( bSig, expDiff, bSig );
  2883. aSig := aSig OR $40000000;
  2884. aBigger:
  2885. zSig := aSig - bSig;
  2886. zExp := aExp;
  2887. normalizeRoundAndPack:
  2888. Dec(zExp);
  2889. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  2890. End;
  2891. {*
  2892. -------------------------------------------------------------------------------
  2893. Returns the result of adding the single-precision floating-point values `a'
  2894. and `b'. The operation is performed according to the IEC/IEEE Standard for
  2895. Binary Floating-Point Arithmetic.
  2896. -------------------------------------------------------------------------------
  2897. *}
  2898. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  2899. Var
  2900. aSign, bSign: Flag;
  2901. Begin
  2902. aSign := extractFloat32Sign( a.float32 );
  2903. bSign := extractFloat32Sign( b.float32 );
  2904. if ( aSign = bSign ) then
  2905. Begin
  2906. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  2907. End
  2908. else
  2909. Begin
  2910. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  2911. End;
  2912. End;
  2913. {*
  2914. -------------------------------------------------------------------------------
  2915. Returns the result of subtracting the single-precision floating-point values
  2916. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2917. for Binary Floating-Point Arithmetic.
  2918. -------------------------------------------------------------------------------
  2919. *}
  2920. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  2921. Var
  2922. aSign, bSign: flag;
  2923. Begin
  2924. aSign := extractFloat32Sign( a.float32 );
  2925. bSign := extractFloat32Sign( b.float32 );
  2926. if ( aSign = bSign ) then
  2927. Begin
  2928. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  2929. End
  2930. else
  2931. Begin
  2932. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  2933. End;
  2934. End;
  2935. {*
  2936. -------------------------------------------------------------------------------
  2937. Returns the result of multiplying the single-precision floating-point values
  2938. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2939. for Binary Floating-Point Arithmetic.
  2940. -------------------------------------------------------------------------------
  2941. *}
  2942. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  2943. Var
  2944. aSign, bSign, zSign: flag;
  2945. aExp, bExp, zExp : int16;
  2946. aSig, bSig, zSig0, zSig1: bits32;
  2947. Begin
  2948. aSig := extractFloat32Frac( a.float32 );
  2949. aExp := extractFloat32Exp( a.float32 );
  2950. aSign := extractFloat32Sign( a.float32 );
  2951. bSig := extractFloat32Frac( b.float32 );
  2952. bExp := extractFloat32Exp( b.float32 );
  2953. bSign := extractFloat32Sign( b.float32 );
  2954. zSign := aSign xor bSign;
  2955. if ( aExp = $FF ) then
  2956. Begin
  2957. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  2958. Begin
  2959. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  2960. End;
  2961. if ( ( bExp OR bSig ) = 0 ) then
  2962. Begin
  2963. float_raise( float_flag_invalid );
  2964. float32_mul.float32 := float32_default_nan;
  2965. exit;
  2966. End;
  2967. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  2968. exit;
  2969. End;
  2970. if ( bExp = $FF ) then
  2971. Begin
  2972. if ( bSig <> 0 ) then
  2973. Begin
  2974. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  2975. exit;
  2976. End;
  2977. if ( ( aExp OR aSig ) = 0 ) then
  2978. Begin
  2979. float_raise( float_flag_invalid );
  2980. float32_mul.float32 := float32_default_nan;
  2981. exit;
  2982. End;
  2983. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  2984. exit;
  2985. End;
  2986. if ( aExp = 0 ) then
  2987. Begin
  2988. if ( aSig = 0 ) then
  2989. Begin
  2990. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  2991. exit;
  2992. End;
  2993. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2994. End;
  2995. if ( bExp = 0 ) then
  2996. Begin
  2997. if ( bSig = 0 ) then
  2998. Begin
  2999. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3000. exit;
  3001. End;
  3002. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3003. End;
  3004. zExp := aExp + bExp - $7F;
  3005. aSig := ( aSig OR $00800000 ) shl 7;
  3006. bSig := ( bSig OR $00800000 ) shl 8;
  3007. mul32To64( aSig, bSig, zSig0, zSig1 );
  3008. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3009. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3010. Begin
  3011. zSig0 := zSig0 shl 1;
  3012. Dec(zExp);
  3013. End;
  3014. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3015. End;
  3016. {*
  3017. -------------------------------------------------------------------------------
  3018. Returns the result of dividing the single-precision floating-point value `a'
  3019. by the corresponding value `b'. The operation is performed according to the
  3020. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3021. -------------------------------------------------------------------------------
  3022. *}
  3023. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3024. Var
  3025. aSign, bSign, zSign: flag;
  3026. aExp, bExp, zExp: int16;
  3027. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3028. Begin
  3029. aSig := extractFloat32Frac( a.float32 );
  3030. aExp := extractFloat32Exp( a.float32 );
  3031. aSign := extractFloat32Sign( a.float32 );
  3032. bSig := extractFloat32Frac( b.float32 );
  3033. bExp := extractFloat32Exp( b.float32 );
  3034. bSign := extractFloat32Sign( b.float32 );
  3035. zSign := aSign xor bSign;
  3036. if ( aExp = $FF ) then
  3037. Begin
  3038. if ( aSig <> 0 ) then
  3039. Begin
  3040. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3041. exit;
  3042. End;
  3043. if ( bExp = $FF ) then
  3044. Begin
  3045. if ( bSig <> 0) then
  3046. Begin
  3047. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3048. End;
  3049. float_raise( float_flag_invalid );
  3050. float32_div.float32 := float32_default_nan;
  3051. exit;
  3052. End;
  3053. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3054. exit;
  3055. End;
  3056. if ( bExp = $FF ) then
  3057. Begin
  3058. if ( bSig <> 0) then
  3059. Begin
  3060. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3061. exit;
  3062. End;
  3063. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3064. exit;
  3065. End;
  3066. if ( bExp = 0 ) Then
  3067. Begin
  3068. if ( bSig = 0 ) Then
  3069. Begin
  3070. if ( ( aExp OR aSig ) = 0 ) then
  3071. Begin
  3072. float_raise( float_flag_invalid );
  3073. float32_div.float32 := float32_default_nan;
  3074. exit;
  3075. End;
  3076. float_raise( float_flag_divbyzero );
  3077. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3078. exit;
  3079. End;
  3080. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3081. End;
  3082. if ( aExp = 0 ) Then
  3083. Begin
  3084. if ( aSig = 0 ) Then
  3085. Begin
  3086. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3087. exit;
  3088. End;
  3089. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3090. End;
  3091. zExp := aExp - bExp + $7D;
  3092. aSig := ( aSig OR $00800000 ) shl 7;
  3093. bSig := ( bSig OR $00800000 ) shl 8;
  3094. if ( bSig <= ( aSig + aSig ) ) then
  3095. Begin
  3096. aSig := aSig shr 1;
  3097. Inc(zExp);
  3098. End;
  3099. zSig := estimateDiv64To32( aSig, 0, bSig );
  3100. if ( ( zSig and $3F ) <= 2 ) then
  3101. Begin
  3102. mul32To64( bSig, zSig, term0, term1 );
  3103. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3104. while ( sbits32 (rem0) < 0 ) do
  3105. Begin
  3106. Dec(zSig);
  3107. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3108. End;
  3109. zSig := zSig or bits32( rem1 <> 0 );
  3110. End;
  3111. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3112. End;
  3113. {*
  3114. -------------------------------------------------------------------------------
  3115. Returns the remainder of the single-precision floating-point value `a'
  3116. with respect to the corresponding value `b'. The operation is performed
  3117. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3118. -------------------------------------------------------------------------------
  3119. *}
  3120. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3121. Var
  3122. aSign, bSign, zSign: flag;
  3123. aExp, bExp, expDiff: int16;
  3124. aSig, bSig, q, allZero, alternateASig: bits32;
  3125. sigMean: sbits32;
  3126. Begin
  3127. aSig := extractFloat32Frac( a.float32 );
  3128. aExp := extractFloat32Exp( a.float32 );
  3129. aSign := extractFloat32Sign( a.float32 );
  3130. bSig := extractFloat32Frac( b.float32 );
  3131. bExp := extractFloat32Exp( b.float32 );
  3132. bSign := extractFloat32Sign( b.float32 );
  3133. if ( aExp = $FF ) then
  3134. Begin
  3135. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3136. Begin
  3137. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3138. exit;
  3139. End;
  3140. float_raise( float_flag_invalid );
  3141. float32_rem.float32 := float32_default_nan;
  3142. exit;
  3143. End;
  3144. if ( bExp = $FF ) then
  3145. Begin
  3146. if ( bSig <> 0 ) then
  3147. Begin
  3148. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3149. exit;
  3150. End;
  3151. float32_rem := a;
  3152. exit;
  3153. End;
  3154. if ( bExp = 0 ) then
  3155. Begin
  3156. if ( bSig = 0 ) then
  3157. Begin
  3158. float_raise( float_flag_invalid );
  3159. float32_rem.float32 := float32_default_nan;
  3160. exit;
  3161. End;
  3162. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3163. End;
  3164. if ( aExp = 0 ) then
  3165. Begin
  3166. if ( aSig = 0 ) then
  3167. Begin
  3168. float32_rem := a;
  3169. exit;
  3170. End;
  3171. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3172. End;
  3173. expDiff := aExp - bExp;
  3174. aSig := ( aSig OR $00800000 ) shl 8;
  3175. bSig := ( bSig OR $00800000 ) shl 8;
  3176. if ( expDiff < 0 ) then
  3177. Begin
  3178. if ( expDiff < -1 ) then
  3179. Begin
  3180. float32_rem := a;
  3181. exit;
  3182. End;
  3183. aSig := aSig shr 1;
  3184. End;
  3185. q := bits32( bSig <= aSig );
  3186. if ( q <> 0) then
  3187. aSig := aSig - bSig;
  3188. expDiff := expDiff - 32;
  3189. while ( 0 < expDiff ) do
  3190. Begin
  3191. q := estimateDiv64To32( aSig, 0, bSig );
  3192. if (2 < q) then
  3193. q := q - 2
  3194. else
  3195. q := 0;
  3196. aSig := - ( ( bSig shr 2 ) * q );
  3197. expDiff := expDiff - 30;
  3198. End;
  3199. expDiff := expDiff + 32;
  3200. if ( 0 < expDiff ) then
  3201. Begin
  3202. q := estimateDiv64To32( aSig, 0, bSig );
  3203. if (2 < q) then
  3204. q := q - 2
  3205. else
  3206. q := 0;
  3207. q := q shr (32 - expDiff);
  3208. bSig := bSig shr 2;
  3209. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3210. End
  3211. else
  3212. Begin
  3213. aSig := aSig shr 2;
  3214. bSig := bSig shr 2;
  3215. End;
  3216. Repeat
  3217. alternateASig := aSig;
  3218. Inc(q);
  3219. aSig := aSig - bSig;
  3220. Until not ( 0 <= sbits32 (aSig) );
  3221. sigMean := aSig + alternateASig;
  3222. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3223. Begin
  3224. aSig := alternateASig;
  3225. End;
  3226. zSign := flag( sbits32 (aSig) < 0 );
  3227. if ( zSign<>0 ) then
  3228. aSig := - aSig;
  3229. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3230. End;
  3231. {*
  3232. -------------------------------------------------------------------------------
  3233. Returns the square root of the single-precision floating-point value `a'.
  3234. The operation is performed according to the IEC/IEEE Standard for Binary
  3235. Floating-Point Arithmetic.
  3236. -------------------------------------------------------------------------------
  3237. *}
  3238. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3239. Var
  3240. aSign : flag;
  3241. aExp, zExp : int16;
  3242. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3243. label roundAndPack;
  3244. Begin
  3245. aSig := extractFloat32Frac( a.float32 );
  3246. aExp := extractFloat32Exp( a.float32 );
  3247. aSign := extractFloat32Sign( a.float32 );
  3248. if ( aExp = $FF ) then
  3249. Begin
  3250. if ( aSig <> 0) then
  3251. Begin
  3252. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3253. exit;
  3254. End;
  3255. if ( aSign = 0) then
  3256. Begin
  3257. float32_sqrt := a;
  3258. exit;
  3259. End;
  3260. float_raise( float_flag_invalid );
  3261. float32_sqrt.float32 := float32_default_nan;
  3262. exit;
  3263. End;
  3264. if ( aSign <> 0) then
  3265. Begin
  3266. if ( ( aExp OR aSig ) = 0 ) then
  3267. Begin
  3268. float32_sqrt := a;
  3269. exit;
  3270. End;
  3271. float_raise( float_flag_invalid );
  3272. float32_sqrt.float32 := float32_default_nan;
  3273. exit;
  3274. End;
  3275. if ( aExp = 0 ) then
  3276. Begin
  3277. if ( aSig = 0 ) then
  3278. Begin
  3279. float32_sqrt.float32 := 0;
  3280. exit;
  3281. End;
  3282. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3283. End;
  3284. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3285. aSig := ( aSig OR $00800000 ) shl 8;
  3286. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3287. if ( ( zSig and $7F ) <= 5 ) then
  3288. Begin
  3289. if ( zSig < 2 ) then
  3290. Begin
  3291. zSig := $7FFFFFFF;
  3292. goto roundAndPack;
  3293. End
  3294. else
  3295. Begin
  3296. aSig := aSig shr (aExp and 1);
  3297. mul32To64( zSig, zSig, term0, term1 );
  3298. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3299. while ( sbits32 (rem0) < 0 ) do
  3300. Begin
  3301. Dec(zSig);
  3302. shortShift64Left( 0, zSig, 1, term0, term1 );
  3303. term1 := term1 or 1;
  3304. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3305. End;
  3306. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3307. End;
  3308. End;
  3309. shift32RightJamming( zSig, 1, zSig );
  3310. roundAndPack:
  3311. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3312. End;
  3313. {*
  3314. -------------------------------------------------------------------------------
  3315. Returns 1 if the single-precision floating-point value `a' is equal to
  3316. the corresponding value `b', and 0 otherwise. The comparison is performed
  3317. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3318. -------------------------------------------------------------------------------
  3319. *}
  3320. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3321. Begin
  3322. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3323. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3324. ) then
  3325. Begin
  3326. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3327. Begin
  3328. float_raise( float_flag_invalid );
  3329. End;
  3330. float32_eq := 0;
  3331. exit;
  3332. End;
  3333. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3334. End;
  3335. {*
  3336. -------------------------------------------------------------------------------
  3337. Returns 1 if the single-precision floating-point value `a' is less than
  3338. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3339. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3340. Arithmetic.
  3341. -------------------------------------------------------------------------------
  3342. *}
  3343. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3344. var
  3345. aSign, bSign: flag;
  3346. Begin
  3347. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3348. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3349. ) then
  3350. Begin
  3351. float_raise( float_flag_invalid );
  3352. float32_le := 0;
  3353. exit;
  3354. End;
  3355. aSign := extractFloat32Sign( a.float32 );
  3356. bSign := extractFloat32Sign( b.float32 );
  3357. if ( aSign <> bSign ) then
  3358. Begin
  3359. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3360. exit;
  3361. End;
  3362. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3363. End;
  3364. {*
  3365. -------------------------------------------------------------------------------
  3366. Returns 1 if the single-precision floating-point value `a' is less than
  3367. the corresponding value `b', and 0 otherwise. The comparison is performed
  3368. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3369. -------------------------------------------------------------------------------
  3370. *}
  3371. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3372. var
  3373. aSign, bSign: flag;
  3374. Begin
  3375. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3376. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3377. ) then
  3378. Begin
  3379. float_raise( float_flag_invalid );
  3380. float32_lt :=0;
  3381. exit;
  3382. End;
  3383. aSign := extractFloat32Sign( a.float32 );
  3384. bSign := extractFloat32Sign( b.float32 );
  3385. if ( aSign <> bSign ) then
  3386. Begin
  3387. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3388. exit;
  3389. End;
  3390. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3391. End;
  3392. {*
  3393. -------------------------------------------------------------------------------
  3394. Returns 1 if the single-precision floating-point value `a' is equal to
  3395. the corresponding value `b', and 0 otherwise. The invalid exception is
  3396. raised if either operand is a NaN. Otherwise, the comparison is performed
  3397. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3398. -------------------------------------------------------------------------------
  3399. *}
  3400. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3401. Begin
  3402. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3403. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3404. ) then
  3405. Begin
  3406. float_raise( float_flag_invalid );
  3407. float32_eq_signaling := 0;
  3408. exit;
  3409. End;
  3410. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3411. End;
  3412. {*
  3413. -------------------------------------------------------------------------------
  3414. Returns 1 if the single-precision floating-point value `a' is less than or
  3415. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3416. cause an exception. Otherwise, the comparison is performed according to the
  3417. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3418. -------------------------------------------------------------------------------
  3419. *}
  3420. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3421. Var
  3422. aSign, bSign: flag;
  3423. aExp, bExp: int16;
  3424. Begin
  3425. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3426. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3427. ) then
  3428. Begin
  3429. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3430. Begin
  3431. float_raise( float_flag_invalid );
  3432. End;
  3433. float32_le_quiet := 0;
  3434. exit;
  3435. End;
  3436. aSign := extractFloat32Sign( a );
  3437. bSign := extractFloat32Sign( b );
  3438. if ( aSign <> bSign ) then
  3439. Begin
  3440. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3441. exit;
  3442. End;
  3443. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3444. End;
  3445. {*
  3446. -------------------------------------------------------------------------------
  3447. Returns 1 if the single-precision floating-point value `a' is less than
  3448. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3449. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3450. Standard for Binary Floating-Point Arithmetic.
  3451. -------------------------------------------------------------------------------
  3452. *}
  3453. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3454. Var
  3455. aSign, bSign: flag;
  3456. Begin
  3457. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3458. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3459. ) then
  3460. Begin
  3461. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3462. Begin
  3463. float_raise( float_flag_invalid );
  3464. End;
  3465. float32_lt_quiet := 0;
  3466. exit;
  3467. End;
  3468. aSign := extractFloat32Sign( a );
  3469. bSign := extractFloat32Sign( b );
  3470. if ( aSign <> bSign ) then
  3471. Begin
  3472. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3473. exit;
  3474. End;
  3475. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3476. End;
  3477. {*
  3478. -------------------------------------------------------------------------------
  3479. Returns the result of converting the double-precision floating-point value
  3480. `a' to the 32-bit two's complement integer format. The conversion is
  3481. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3482. Arithmetic---which means in particular that the conversion is rounded
  3483. according to the current rounding mode. If `a' is a NaN, the largest
  3484. positive integer is returned. Otherwise, if the conversion overflows, the
  3485. largest integer with the same sign as `a' is returned.
  3486. -------------------------------------------------------------------------------
  3487. *}
  3488. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3489. var
  3490. aSign: flag;
  3491. aExp, shiftCount: int16;
  3492. aSig0, aSig1, absZ, aSigExtra: bits32;
  3493. z: int32;
  3494. roundingMode: int8;
  3495. label invalid;
  3496. Begin
  3497. aSig1 := extractFloat64Frac1( a );
  3498. aSig0 := extractFloat64Frac0( a );
  3499. aExp := extractFloat64Exp( a );
  3500. aSign := extractFloat64Sign( a );
  3501. shiftCount := aExp - $413;
  3502. if ( 0 <= shiftCount ) then
  3503. Begin
  3504. if ( $41E < aExp ) then
  3505. Begin
  3506. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3507. aSign := 0;
  3508. goto invalid;
  3509. End;
  3510. shortShift64Left(
  3511. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3512. if ( $80000000 < absZ ) then
  3513. goto invalid;
  3514. End
  3515. else
  3516. Begin
  3517. aSig1 := flag( aSig1 <> 0 );
  3518. if ( aExp < $3FE ) then
  3519. Begin
  3520. aSigExtra := aExp OR aSig0 OR aSig1;
  3521. absZ := 0;
  3522. End
  3523. else
  3524. Begin
  3525. aSig0 := aSig0 OR $00100000;
  3526. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3527. absZ := aSig0 shr ( - shiftCount );
  3528. End;
  3529. End;
  3530. roundingMode := float_rounding_mode;
  3531. if ( roundingMode = float_round_nearest_even ) then
  3532. Begin
  3533. if ( sbits32(aSigExtra) < 0 ) then
  3534. Begin
  3535. Inc(absZ);
  3536. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3537. absZ := absZ and not 1;
  3538. End;
  3539. if aSign <> 0 then
  3540. z := - absZ
  3541. else
  3542. z := absZ;
  3543. End
  3544. else
  3545. Begin
  3546. aSigExtra := bits32( aSigExtra <> 0 );
  3547. if ( aSign <> 0) then
  3548. Begin
  3549. z := - ( absZ
  3550. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3551. End
  3552. else
  3553. Begin
  3554. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3555. End
  3556. End;
  3557. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3558. Begin
  3559. invalid:
  3560. float_raise( float_flag_invalid );
  3561. if (aSign <> 0 ) then
  3562. float64_to_int32 := sbits32 ($80000000)
  3563. else
  3564. float64_to_int32 := $7FFFFFFF;
  3565. exit;
  3566. End;
  3567. if ( aSigExtra <> 0) then
  3568. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3569. float64_to_int32 := z;
  3570. End;
  3571. {*
  3572. -------------------------------------------------------------------------------
  3573. Returns the result of converting the double-precision floating-point value
  3574. `a' to the 32-bit two's complement integer format. The conversion is
  3575. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3576. Arithmetic, except that the conversion is always rounded toward zero.
  3577. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3578. the conversion overflows, the largest integer with the same sign as `a' is
  3579. returned.
  3580. -------------------------------------------------------------------------------
  3581. *}
  3582. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3583. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3584. Var
  3585. aSign: flag;
  3586. aExp, shiftCount: int16;
  3587. aSig0, aSig1, absZ, aSigExtra: bits32;
  3588. z: int32;
  3589. label invalid;
  3590. Begin
  3591. aSig1 := extractFloat64Frac1( a );
  3592. aSig0 := extractFloat64Frac0( a );
  3593. aExp := extractFloat64Exp( a );
  3594. aSign := extractFloat64Sign( a );
  3595. shiftCount := aExp - $413;
  3596. if ( 0 <= shiftCount ) then
  3597. Begin
  3598. if ( $41E < aExp ) then
  3599. Begin
  3600. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3601. aSign := 0;
  3602. goto invalid;
  3603. End;
  3604. shortShift64Left(
  3605. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3606. End
  3607. else
  3608. Begin
  3609. if ( aExp < $3FF ) then
  3610. Begin
  3611. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3612. Begin
  3613. softfloat_exception_flags :=
  3614. softfloat_exception_flags or float_flag_inexact;
  3615. End;
  3616. float64_to_int32_round_to_zero := 0;
  3617. exit;
  3618. End;
  3619. aSig0 := aSig0 or $00100000;
  3620. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3621. absZ := aSig0 shr ( - shiftCount );
  3622. End;
  3623. if aSign <> 0 then
  3624. z := - absZ
  3625. else
  3626. z := absZ;
  3627. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3628. Begin
  3629. invalid:
  3630. float_raise( float_flag_invalid );
  3631. if (aSign <> 0) then
  3632. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3633. else
  3634. float64_to_int32_round_to_zero := $7FFFFFFF;
  3635. exit;
  3636. End;
  3637. if ( aSigExtra <> 0) then
  3638. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3639. float64_to_int32_round_to_zero := z;
  3640. End;
  3641. {*
  3642. -------------------------------------------------------------------------------
  3643. Returns the result of converting the double-precision floating-point value
  3644. `a' to the single-precision floating-point format. The conversion is
  3645. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3646. Arithmetic.
  3647. -------------------------------------------------------------------------------
  3648. *}
  3649. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3650. Var
  3651. aSign: flag;
  3652. aExp: int16;
  3653. aSig0, aSig1, zSig: bits32;
  3654. allZero: bits32;
  3655. tmp : CommonNanT;
  3656. Begin
  3657. aSig1 := extractFloat64Frac1( a );
  3658. aSig0 := extractFloat64Frac0( a );
  3659. aExp := extractFloat64Exp( a );
  3660. aSign := extractFloat64Sign( a );
  3661. if ( aExp = $7FF ) then
  3662. Begin
  3663. if ( aSig0 OR aSig1 ) <> 0 then
  3664. Begin
  3665. float64ToCommonNaN( a, tmp );
  3666. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3667. exit;
  3668. End;
  3669. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3670. exit;
  3671. End;
  3672. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3673. if ( aExp <> 0) then
  3674. zSig := zSig OR $40000000;
  3675. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3676. End;
  3677. {*
  3678. -------------------------------------------------------------------------------
  3679. Rounds the double-precision floating-point value `a' to an integer,
  3680. and returns the result as a double-precision floating-point value. The
  3681. operation is performed according to the IEC/IEEE Standard for Binary
  3682. Floating-Point Arithmetic.
  3683. -------------------------------------------------------------------------------
  3684. *}
  3685. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3686. Var
  3687. aSign: flag;
  3688. aExp: int16;
  3689. lastBitMask, roundBitsMask: bits32;
  3690. roundingMode: int8;
  3691. z: float64;
  3692. Begin
  3693. aExp := extractFloat64Exp( a );
  3694. if ( $413 <= aExp ) then
  3695. Begin
  3696. if ( $433 <= aExp ) then
  3697. Begin
  3698. if ( ( aExp = $7FF )
  3699. AND
  3700. (
  3701. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3702. ) <>0)
  3703. ) then
  3704. Begin
  3705. propagateFloat64NaN( a, a, result );
  3706. exit;
  3707. End;
  3708. result := a;
  3709. exit;
  3710. End;
  3711. lastBitMask := 1;
  3712. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3713. roundBitsMask := lastBitMask - 1;
  3714. z := a;
  3715. roundingMode := float_rounding_mode;
  3716. if ( roundingMode = float_round_nearest_even ) then
  3717. Begin
  3718. if ( lastBitMask <> 0) then
  3719. Begin
  3720. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3721. if ( ( z.low and roundBitsMask ) = 0 ) then
  3722. z.low := z.low and not lastBitMask;
  3723. End
  3724. else
  3725. Begin
  3726. if ( sbits32 (z.low) < 0 ) then
  3727. Begin
  3728. Inc(z.high);
  3729. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3730. z.high := z.high and not 1;
  3731. End;
  3732. End;
  3733. End
  3734. else if ( roundingMode <> float_round_to_zero ) then
  3735. Begin
  3736. if ( extractFloat64Sign( z )
  3737. xor flag( roundingMode = float_round_up ) )<> 0 then
  3738. Begin
  3739. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3740. End;
  3741. End;
  3742. z.low := z.low and not roundBitsMask;
  3743. End
  3744. else
  3745. Begin
  3746. if ( aExp <= $3FE ) then
  3747. Begin
  3748. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  3749. Begin
  3750. result := a;
  3751. exit;
  3752. End;
  3753. softfloat_exception_flags := softfloat_exception_flags or
  3754. float_flag_inexact;
  3755. aSign := extractFloat64Sign( a );
  3756. case ( float_rounding_mode ) of
  3757. float_round_nearest_even:
  3758. Begin
  3759. if ( ( aExp = $3FE )
  3760. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  3761. ) then
  3762. Begin
  3763. packFloat64( aSign, $3FF, 0, 0, result );
  3764. exit;
  3765. End;
  3766. End;
  3767. float_round_down:
  3768. Begin
  3769. if aSign<>0 then
  3770. packFloat64( 1, $3FF, 0, 0, result )
  3771. else
  3772. packFloat64( 0, 0, 0, 0, result );
  3773. exit;
  3774. End;
  3775. float_round_up:
  3776. Begin
  3777. if aSign <> 0 then
  3778. packFloat64( 1, 0, 0, 0, result )
  3779. else
  3780. packFloat64( 0, $3FF, 0, 0, result );
  3781. exit;
  3782. End;
  3783. end;
  3784. packFloat64( aSign, 0, 0, 0, result );
  3785. exit;
  3786. End;
  3787. lastBitMask := 1;
  3788. lastBitMask := lastBitMask shl ($413 - aExp);
  3789. roundBitsMask := lastBitMask - 1;
  3790. z.low := 0;
  3791. z.high := a.high;
  3792. roundingMode := float_rounding_mode;
  3793. if ( roundingMode = float_round_nearest_even ) then
  3794. Begin
  3795. z.high := z.high + lastBitMask shr 1;
  3796. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  3797. Begin
  3798. z.high := z.high and not lastBitMask;
  3799. End;
  3800. End
  3801. else if ( roundingMode <> float_round_to_zero ) then
  3802. Begin
  3803. if ( extractFloat64Sign( z )
  3804. xor flag( roundingMode = float_round_up ) )<> 0 then
  3805. Begin
  3806. z.high := z.high or bits32( a.low <> 0 );
  3807. z.high := z.high + roundBitsMask;
  3808. End;
  3809. End;
  3810. z.high := z.high and not roundBitsMask;
  3811. End;
  3812. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  3813. Begin
  3814. softfloat_exception_flags :=
  3815. softfloat_exception_flags or float_flag_inexact;
  3816. End;
  3817. result := z;
  3818. End;
  3819. {*
  3820. -------------------------------------------------------------------------------
  3821. Returns the result of adding the absolute values of the double-precision
  3822. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3823. before being returned. `zSign' is ignored if the result is a NaN.
  3824. The addition is performed according to the IEC/IEEE Standard for Binary
  3825. Floating-Point Arithmetic.
  3826. -------------------------------------------------------------------------------
  3827. *}
  3828. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  3829. Var
  3830. aExp, bExp, zExp: int16;
  3831. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  3832. expDiff: int16;
  3833. label shiftRight1;
  3834. label roundAndPack;
  3835. Begin
  3836. aSig1 := extractFloat64Frac1( a );
  3837. aSig0 := extractFloat64Frac0( a );
  3838. aExp := extractFloat64Exp( a );
  3839. bSig1 := extractFloat64Frac1( b );
  3840. bSig0 := extractFloat64Frac0( b );
  3841. bExp := extractFloat64Exp( b );
  3842. expDiff := aExp - bExp;
  3843. if ( 0 < expDiff ) then
  3844. Begin
  3845. if ( aExp = $7FF ) then
  3846. Begin
  3847. if ( aSig0 OR aSig1 ) <> 0 then
  3848. Begin
  3849. propagateFloat64NaN( a, b, out );
  3850. exit;
  3851. end;
  3852. out := a;
  3853. exit;
  3854. End;
  3855. if ( bExp = 0 ) then
  3856. Begin
  3857. Dec(expDiff);
  3858. End
  3859. else
  3860. Begin
  3861. bSig0 := bSig0 or $00100000;
  3862. End;
  3863. shift64ExtraRightJamming(
  3864. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  3865. zExp := aExp;
  3866. End
  3867. else if ( expDiff < 0 ) then
  3868. Begin
  3869. if ( bExp = $7FF ) then
  3870. Begin
  3871. if ( bSig0 OR bSig1 ) <> 0 then
  3872. Begin
  3873. propagateFloat64NaN( a, b, out );
  3874. exit;
  3875. End;
  3876. packFloat64( zSign, $7FF, 0, 0, out );
  3877. End;
  3878. if ( aExp = 0 ) then
  3879. Begin
  3880. Inc(expDiff);
  3881. End
  3882. else
  3883. Begin
  3884. aSig0 := aSig0 or $00100000;
  3885. End;
  3886. shift64ExtraRightJamming(
  3887. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  3888. zExp := bExp;
  3889. End
  3890. else
  3891. Begin
  3892. if ( aExp = $7FF ) then
  3893. Begin
  3894. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  3895. Begin
  3896. propagateFloat64NaN( a, b, out );
  3897. exit;
  3898. End;
  3899. out := a;
  3900. exit;
  3901. End;
  3902. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3903. if ( aExp = 0 ) then
  3904. Begin
  3905. packFloat64( zSign, 0, zSig0, zSig1, out );
  3906. exit;
  3907. End;
  3908. zSig2 := 0;
  3909. zSig0 := zSig0 or $00200000;
  3910. zExp := aExp;
  3911. goto shiftRight1;
  3912. End;
  3913. aSig0 := aSig0 or $00100000;
  3914. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3915. Dec(zExp);
  3916. if ( zSig0 < $00200000 ) then
  3917. goto roundAndPack;
  3918. Inc(zExp);
  3919. shiftRight1:
  3920. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  3921. roundAndPack:
  3922. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  3923. End;
  3924. {*
  3925. -------------------------------------------------------------------------------
  3926. Returns the result of subtracting the absolute values of the double-
  3927. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3928. difference is negated before being returned. `zSign' is ignored if the
  3929. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3930. Standard for Binary Floating-Point Arithmetic.
  3931. -------------------------------------------------------------------------------
  3932. *}
  3933. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  3934. Var
  3935. aExp, bExp, zExp: int16;
  3936. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  3937. expDiff: int16;
  3938. z: float64;
  3939. label aExpBigger;
  3940. label bExpBigger;
  3941. label aBigger;
  3942. label bBigger;
  3943. label normalizeRoundAndPack;
  3944. Begin
  3945. aSig1 := extractFloat64Frac1( a );
  3946. aSig0 := extractFloat64Frac0( a );
  3947. aExp := extractFloat64Exp( a );
  3948. bSig1 := extractFloat64Frac1( b );
  3949. bSig0 := extractFloat64Frac0( b );
  3950. bExp := extractFloat64Exp( b );
  3951. expDiff := aExp - bExp;
  3952. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  3953. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  3954. if ( 0 < expDiff ) then goto aExpBigger;
  3955. if ( expDiff < 0 ) then goto bExpBigger;
  3956. if ( aExp = $7FF ) then
  3957. Begin
  3958. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  3959. Begin
  3960. propagateFloat64NaN( a, b, out );
  3961. exit;
  3962. End;
  3963. float_raise( float_flag_invalid );
  3964. z.low := float64_default_nan_low;
  3965. z.high := float64_default_nan_high;
  3966. out := z;
  3967. exit;
  3968. End;
  3969. if ( aExp = 0 ) then
  3970. Begin
  3971. aExp := 1;
  3972. bExp := 1;
  3973. End;
  3974. if ( bSig0 < aSig0 ) then goto aBigger;
  3975. if ( aSig0 < bSig0 ) then goto bBigger;
  3976. if ( bSig1 < aSig1 ) then goto aBigger;
  3977. if ( aSig1 < bSig1 ) then goto bBigger;
  3978. packFloat64( flag(float_rounding_mode = float_round_down), 0, 0, 0 , out);
  3979. exit;
  3980. bExpBigger:
  3981. if ( bExp = $7FF ) then
  3982. Begin
  3983. if ( bSig0 OR bSig1 ) <> 0 then
  3984. Begin
  3985. propagateFloat64NaN( a, b, out );
  3986. exit;
  3987. End;
  3988. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  3989. exit;
  3990. End;
  3991. if ( aExp = 0 ) then
  3992. Begin
  3993. Inc(expDiff);
  3994. End
  3995. else
  3996. Begin
  3997. aSig0 := aSig0 or $40000000;
  3998. End;
  3999. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4000. bSig0 := bSig0 or $40000000;
  4001. bBigger:
  4002. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4003. zExp := bExp;
  4004. zSign := zSign xor 1;
  4005. goto normalizeRoundAndPack;
  4006. aExpBigger:
  4007. if ( aExp = $7FF ) then
  4008. Begin
  4009. if ( aSig0 OR aSig1 ) <> 0 then
  4010. Begin
  4011. propagateFloat64NaN( a, b, out );
  4012. exit;
  4013. End;
  4014. out := a;
  4015. exit;
  4016. End;
  4017. if ( bExp = 0 ) then
  4018. Begin
  4019. Dec(expDiff);
  4020. End
  4021. else
  4022. Begin
  4023. bSig0 := bSig0 or $40000000;
  4024. End;
  4025. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4026. aSig0 := aSig0 or $40000000;
  4027. aBigger:
  4028. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4029. zExp := aExp;
  4030. normalizeRoundAndPack:
  4031. Dec(zExp);
  4032. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4033. End;
  4034. {*
  4035. -------------------------------------------------------------------------------
  4036. Returns the result of adding the double-precision floating-point values `a'
  4037. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4038. Binary Floating-Point Arithmetic.
  4039. -------------------------------------------------------------------------------
  4040. *}
  4041. Function float64_add( a: float64; b : float64) : Float64;
  4042. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4043. Var
  4044. aSign, bSign: flag;
  4045. Begin
  4046. aSign := extractFloat64Sign( a );
  4047. bSign := extractFloat64Sign( b );
  4048. if ( aSign = bSign ) then
  4049. Begin
  4050. addFloat64Sigs( a, b, aSign, result );
  4051. End
  4052. else
  4053. Begin
  4054. subFloat64Sigs( a, b, aSign, result );
  4055. End;
  4056. End;
  4057. {*
  4058. -------------------------------------------------------------------------------
  4059. Returns the result of subtracting the double-precision floating-point values
  4060. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4061. for Binary Floating-Point Arithmetic.
  4062. -------------------------------------------------------------------------------
  4063. *}
  4064. Function float64_sub(a: float64; b : float64) : Float64;
  4065. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4066. Var
  4067. aSign, bSign: flag;
  4068. Begin
  4069. aSign := extractFloat64Sign( a );
  4070. bSign := extractFloat64Sign( b );
  4071. if ( aSign = bSign ) then
  4072. Begin
  4073. subFloat64Sigs( a, b, aSign, result );
  4074. End
  4075. else
  4076. Begin
  4077. addFloat64Sigs( a, b, aSign, result );
  4078. End;
  4079. End;
  4080. {*
  4081. -------------------------------------------------------------------------------
  4082. Returns the result of multiplying the double-precision floating-point values
  4083. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4084. for Binary Floating-Point Arithmetic.
  4085. -------------------------------------------------------------------------------
  4086. *}
  4087. Function float64_mul( a: float64; b:float64) : Float64;
  4088. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4089. Var
  4090. aSign, bSign, zSign: flag;
  4091. aExp, bExp, zExp: int16;
  4092. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4093. z: float64;
  4094. label invalid;
  4095. Begin
  4096. aSig1 := extractFloat64Frac1( a );
  4097. aSig0 := extractFloat64Frac0( a );
  4098. aExp := extractFloat64Exp( a );
  4099. aSign := extractFloat64Sign( a );
  4100. bSig1 := extractFloat64Frac1( b );
  4101. bSig0 := extractFloat64Frac0( b );
  4102. bExp := extractFloat64Exp( b );
  4103. bSign := extractFloat64Sign( b );
  4104. zSign := aSign xor bSign;
  4105. if ( aExp = $7FF ) then
  4106. Begin
  4107. if ( (( aSig0 OR aSig1 ) <>0)
  4108. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4109. Begin
  4110. propagateFloat64NaN( a, b, result );
  4111. exit;
  4112. End;
  4113. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4114. packFloat64( zSign, $7FF, 0, 0, result );
  4115. exit;
  4116. End;
  4117. if ( bExp = $7FF ) then
  4118. Begin
  4119. if ( bSig0 OR bSig1 )<> 0 then
  4120. Begin
  4121. propagateFloat64NaN( a, b, result );
  4122. exit;
  4123. End;
  4124. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4125. Begin
  4126. invalid:
  4127. float_raise( float_flag_invalid );
  4128. z.low := float64_default_nan_low;
  4129. z.high := float64_default_nan_high;
  4130. result := z;
  4131. exit;
  4132. End;
  4133. packFloat64( zSign, $7FF, 0, 0, result );
  4134. exit;
  4135. End;
  4136. if ( aExp = 0 ) then
  4137. Begin
  4138. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4139. Begin
  4140. packFloat64( zSign, 0, 0, 0, result );
  4141. exit;
  4142. End;
  4143. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4144. End;
  4145. if ( bExp = 0 ) then
  4146. Begin
  4147. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4148. Begin
  4149. packFloat64( zSign, 0, 0, 0, result );
  4150. exit;
  4151. End;
  4152. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4153. End;
  4154. zExp := aExp + bExp - $400;
  4155. aSig0 := aSig0 or $00100000;
  4156. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4157. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4158. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4159. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4160. if ( $00200000 <= zSig0 ) then
  4161. Begin
  4162. shift64ExtraRightJamming(
  4163. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4164. Inc(zExp);
  4165. End;
  4166. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4167. End;
  4168. {*
  4169. -------------------------------------------------------------------------------
  4170. Returns the result of dividing the double-precision floating-point value `a'
  4171. by the corresponding value `b'. The operation is performed according to the
  4172. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4173. -------------------------------------------------------------------------------
  4174. *}
  4175. Function float64_div(a: float64; b : float64) : Float64;
  4176. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4177. Var
  4178. aSign, bSign, zSign: flag;
  4179. aExp, bExp, zExp: int16;
  4180. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4181. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4182. z: float64;
  4183. label invalid;
  4184. Begin
  4185. aSig1 := extractFloat64Frac1( a );
  4186. aSig0 := extractFloat64Frac0( a );
  4187. aExp := extractFloat64Exp( a );
  4188. aSign := extractFloat64Sign( a );
  4189. bSig1 := extractFloat64Frac1( b );
  4190. bSig0 := extractFloat64Frac0( b );
  4191. bExp := extractFloat64Exp( b );
  4192. bSign := extractFloat64Sign( b );
  4193. zSign := aSign xor bSign;
  4194. if ( aExp = $7FF ) then
  4195. Begin
  4196. if ( aSig0 OR aSig1 )<> 0 then
  4197. Begin
  4198. propagateFloat64NaN( a, b, result );
  4199. exit;
  4200. end;
  4201. if ( bExp = $7FF ) then
  4202. Begin
  4203. if ( bSig0 OR bSig1 )<>0 then
  4204. Begin
  4205. propagateFloat64NaN( a, b, result );
  4206. exit;
  4207. End;
  4208. goto invalid;
  4209. End;
  4210. packFloat64( zSign, $7FF, 0, 0, result );
  4211. exit;
  4212. End;
  4213. if ( bExp = $7FF ) then
  4214. Begin
  4215. if ( bSig0 OR bSig1 )<> 0 then
  4216. Begin
  4217. propagateFloat64NaN( a, b, result );
  4218. exit;
  4219. End;
  4220. packFloat64( zSign, 0, 0, 0, result );
  4221. exit;
  4222. End;
  4223. if ( bExp = 0 ) then
  4224. Begin
  4225. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4226. Begin
  4227. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4228. Begin
  4229. invalid:
  4230. float_raise( float_flag_invalid );
  4231. z.low := float64_default_nan_low;
  4232. z.high := float64_default_nan_high;
  4233. result := z;
  4234. exit;
  4235. End;
  4236. float_raise( float_flag_divbyzero );
  4237. packFloat64( zSign, $7FF, 0, 0, result );
  4238. exit;
  4239. End;
  4240. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4241. End;
  4242. if ( aExp = 0 ) then
  4243. Begin
  4244. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4245. Begin
  4246. packFloat64( zSign, 0, 0, 0, result );
  4247. exit;
  4248. End;
  4249. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4250. End;
  4251. zExp := aExp - bExp + $3FD;
  4252. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4253. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4254. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4255. Begin
  4256. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4257. Inc(zExp);
  4258. End;
  4259. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4260. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4261. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4262. while ( sbits32 (rem0) < 0 ) do
  4263. Begin
  4264. Dec(zSig0);
  4265. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4266. End;
  4267. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4268. if ( ( zSig1 and $3FF ) <= 4 ) then
  4269. Begin
  4270. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4271. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4272. while ( sbits32 (rem1) < 0 ) do
  4273. Begin
  4274. Dec(zSig1);
  4275. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4276. End;
  4277. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4278. End;
  4279. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4280. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4281. End;
  4282. {*
  4283. -------------------------------------------------------------------------------
  4284. Returns the remainder of the double-precision floating-point value `a'
  4285. with respect to the corresponding value `b'. The operation is performed
  4286. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4287. -------------------------------------------------------------------------------
  4288. *}
  4289. Function float64_rem(a: float64; b : float64) : float64;
  4290. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4291. Var
  4292. aSign, bSign, zSign: flag;
  4293. aExp, bExp, expDiff: int16;
  4294. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4295. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4296. sigMean0: sbits32;
  4297. z: float64;
  4298. label invalid;
  4299. Begin
  4300. aSig1 := extractFloat64Frac1( a );
  4301. aSig0 := extractFloat64Frac0( a );
  4302. aExp := extractFloat64Exp( a );
  4303. aSign := extractFloat64Sign( a );
  4304. bSig1 := extractFloat64Frac1( b );
  4305. bSig0 := extractFloat64Frac0( b );
  4306. bExp := extractFloat64Exp( b );
  4307. bSign := extractFloat64Sign( b );
  4308. if ( aExp = $7FF ) then
  4309. Begin
  4310. if ((( aSig0 OR aSig1 )<>0)
  4311. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4312. Begin
  4313. propagateFloat64NaN( a, b, result );
  4314. exit;
  4315. End;
  4316. goto invalid;
  4317. End;
  4318. if ( bExp = $7FF ) then
  4319. Begin
  4320. if ( bSig0 OR bSig1 ) <> 0 then
  4321. Begin
  4322. propagateFloat64NaN( a, b, result );
  4323. exit;
  4324. End;
  4325. result := a;
  4326. exit;
  4327. End;
  4328. if ( bExp = 0 ) then
  4329. Begin
  4330. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4331. Begin
  4332. invalid:
  4333. float_raise( float_flag_invalid );
  4334. z.low := float64_default_nan_low;
  4335. z.high := float64_default_nan_high;
  4336. result := z;
  4337. exit;
  4338. End;
  4339. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4340. End;
  4341. if ( aExp = 0 ) then
  4342. Begin
  4343. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4344. Begin
  4345. result := a;
  4346. exit;
  4347. End;
  4348. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4349. End;
  4350. expDiff := aExp - bExp;
  4351. if ( expDiff < -1 ) then
  4352. Begin
  4353. result := a;
  4354. exit;
  4355. End;
  4356. shortShift64Left(
  4357. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4358. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4359. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4360. if ( q )<>0 then
  4361. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4362. expDiff := expDiff - 32;
  4363. while ( 0 < expDiff ) do
  4364. Begin
  4365. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4366. if 4 < q then
  4367. q:= q - 4
  4368. else
  4369. q := 0;
  4370. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4371. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4372. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4373. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4374. expDiff := expDiff - 29;
  4375. End;
  4376. if ( -32 < expDiff ) then
  4377. Begin
  4378. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4379. if 4 < q then
  4380. q := q - 4
  4381. else
  4382. q := 0;
  4383. q := q shr (- expDiff);
  4384. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4385. expDiff := expDiff + 24;
  4386. if ( expDiff < 0 ) then
  4387. Begin
  4388. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4389. End
  4390. else
  4391. Begin
  4392. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4393. End;
  4394. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4395. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4396. End
  4397. else
  4398. Begin
  4399. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4400. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4401. End;
  4402. Repeat
  4403. alternateASig0 := aSig0;
  4404. alternateASig1 := aSig1;
  4405. Inc(q);
  4406. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4407. Until not ( 0 <= sbits32 (aSig0) );
  4408. add64(
  4409. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4410. if ( ( sigMean0 < 0 )
  4411. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4412. Begin
  4413. aSig0 := alternateASig0;
  4414. aSig1 := alternateASig1;
  4415. End;
  4416. zSign := flag( sbits32 (aSig0) < 0 );
  4417. if ( zSign <> 0 ) then
  4418. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4419. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4420. End;
  4421. {*
  4422. -------------------------------------------------------------------------------
  4423. Returns the square root of the double-precision floating-point value `a'.
  4424. The operation is performed according to the IEC/IEEE Standard for Binary
  4425. Floating-Point Arithmetic.
  4426. -------------------------------------------------------------------------------
  4427. *}
  4428. Procedure float64_sqrt( a: float64; var out: float64 );
  4429. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4430. Var
  4431. aSign: flag;
  4432. aExp, zExp: int16;
  4433. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4434. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4435. z: float64;
  4436. label invalid;
  4437. Begin
  4438. aSig1 := extractFloat64Frac1( a );
  4439. aSig0 := extractFloat64Frac0( a );
  4440. aExp := extractFloat64Exp( a );
  4441. aSign := extractFloat64Sign( a );
  4442. if ( aExp = $7FF ) then
  4443. Begin
  4444. if ( aSig0 OR aSig1 ) <> 0 then
  4445. Begin
  4446. propagateFloat64NaN( a, a, out );
  4447. exit;
  4448. End;
  4449. if ( aSign = 0) then
  4450. Begin
  4451. out := a;
  4452. exit;
  4453. End;
  4454. goto invalid;
  4455. End;
  4456. if ( aSign <> 0 ) then
  4457. Begin
  4458. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4459. Begin
  4460. out := a;
  4461. exit;
  4462. End;
  4463. invalid:
  4464. float_raise( float_flag_invalid );
  4465. z.low := float64_default_nan_low;
  4466. z.high := float64_default_nan_high;
  4467. out := z;
  4468. exit;
  4469. End;
  4470. if ( aExp = 0 ) then
  4471. Begin
  4472. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4473. Begin
  4474. packFloat64( 0, 0, 0, 0, out );
  4475. exit;
  4476. End;
  4477. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4478. End;
  4479. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4480. aSig0 := aSig0 or $00100000;
  4481. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4482. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4483. if ( zSig0 = 0 ) then
  4484. zSig0 := $7FFFFFFF;
  4485. doubleZSig0 := zSig0 + zSig0;
  4486. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4487. mul32To64( zSig0, zSig0, term0, term1 );
  4488. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4489. while ( sbits32 (rem0) < 0 ) do
  4490. Begin
  4491. Dec(zSig0);
  4492. doubleZSig0 := doubleZSig0 - 2;
  4493. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4494. End;
  4495. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4496. if ( ( zSig1 and $1FF ) <= 5 ) then
  4497. Begin
  4498. if ( zSig1 = 0 ) then
  4499. zSig1 := 1;
  4500. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4501. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4502. mul32To64( zSig1, zSig1, term2, term3 );
  4503. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4504. while ( sbits32 (rem1) < 0 ) do
  4505. Begin
  4506. Dec(zSig1);
  4507. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4508. term3 := term3 or 1;
  4509. term2 := term2 or doubleZSig0;
  4510. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4511. End;
  4512. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4513. End;
  4514. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4515. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4516. End;
  4517. {*
  4518. -------------------------------------------------------------------------------
  4519. Returns 1 if the double-precision floating-point value `a' is equal to
  4520. the corresponding value `b', and 0 otherwise. The comparison is performed
  4521. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4522. -------------------------------------------------------------------------------
  4523. *}
  4524. Function float64_eq(a: float64; b: float64): flag;
  4525. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4526. Begin
  4527. if
  4528. (
  4529. ( extractFloat64Exp( a ) = $7FF )
  4530. AND
  4531. (
  4532. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4533. )
  4534. )
  4535. OR (
  4536. ( extractFloat64Exp( b ) = $7FF )
  4537. AND (
  4538. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4539. )
  4540. )
  4541. ) then
  4542. Begin
  4543. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4544. float_raise( float_flag_invalid );
  4545. float64_eq := 0;
  4546. exit;
  4547. End;
  4548. float64_eq := flag(
  4549. ( a.low = b.low )
  4550. AND ( ( a.high = b.high )
  4551. OR ( ( a.low = 0 )
  4552. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4553. ));
  4554. End;
  4555. {*
  4556. -------------------------------------------------------------------------------
  4557. Returns 1 if the double-precision floating-point value `a' is less than
  4558. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4559. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4560. Arithmetic.
  4561. -------------------------------------------------------------------------------
  4562. *}
  4563. Function float64_le(a: float64;b: float64): flag;
  4564. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4565. Var
  4566. aSign, bSign: flag;
  4567. Begin
  4568. if
  4569. (
  4570. ( extractFloat64Exp( a ) = $7FF )
  4571. AND
  4572. (
  4573. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4574. )
  4575. )
  4576. OR (
  4577. ( extractFloat64Exp( b ) = $7FF )
  4578. AND (
  4579. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4580. )
  4581. )
  4582. ) then
  4583. Begin
  4584. float_raise( float_flag_invalid );
  4585. float64_le := 0;
  4586. exit;
  4587. End;
  4588. aSign := extractFloat64Sign( a );
  4589. bSign := extractFloat64Sign( b );
  4590. if ( aSign <> bSign ) then
  4591. Begin
  4592. float64_le := flag(
  4593. (aSign <> 0)
  4594. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4595. = 0 ));
  4596. exit;
  4597. End;
  4598. if aSign <> 0 then
  4599. float64_le := le64( b.high, b.low, a.high, a.low )
  4600. else
  4601. float64_le := le64( a.high, a.low, b.high, b.low );
  4602. End;
  4603. {*
  4604. -------------------------------------------------------------------------------
  4605. Returns 1 if the double-precision floating-point value `a' is less than
  4606. the corresponding value `b', and 0 otherwise. The comparison is performed
  4607. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4608. -------------------------------------------------------------------------------
  4609. *}
  4610. Function float64_lt(a: float64;b: float64): flag;
  4611. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4612. Var
  4613. aSign, bSign: flag;
  4614. Begin
  4615. if
  4616. (
  4617. ( extractFloat64Exp( a ) = $7FF )
  4618. AND
  4619. (
  4620. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4621. )
  4622. )
  4623. OR (
  4624. ( extractFloat64Exp( b ) = $7FF )
  4625. AND (
  4626. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4627. )
  4628. )
  4629. ) then
  4630. Begin
  4631. float_raise( float_flag_invalid );
  4632. float64_lt := 0;
  4633. exit;
  4634. End;
  4635. aSign := extractFloat64Sign( a );
  4636. bSign := extractFloat64Sign( b );
  4637. if ( aSign <> bSign ) then
  4638. Begin
  4639. float64_lt := flag(
  4640. (aSign <> 0)
  4641. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4642. <> 0 ));
  4643. exit;
  4644. End;
  4645. if aSign <> 0 then
  4646. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4647. else
  4648. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4649. End;
  4650. {*
  4651. -------------------------------------------------------------------------------
  4652. Returns 1 if the double-precision floating-point value `a' is equal to
  4653. the corresponding value `b', and 0 otherwise. The invalid exception is
  4654. raised if either operand is a NaN. Otherwise, the comparison is performed
  4655. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4656. -------------------------------------------------------------------------------
  4657. *}
  4658. Function float64_eq_signaling( a: float64; b: float64): flag;
  4659. Begin
  4660. if
  4661. (
  4662. ( extractFloat64Exp( a ) = $7FF )
  4663. AND
  4664. (
  4665. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4666. )
  4667. )
  4668. OR (
  4669. ( extractFloat64Exp( b ) = $7FF )
  4670. AND (
  4671. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4672. )
  4673. )
  4674. ) then
  4675. Begin
  4676. float_raise( float_flag_invalid );
  4677. float64_eq_signaling := 0;
  4678. exit;
  4679. End;
  4680. float64_eq_signaling := flag(
  4681. ( a.low = b.low )
  4682. AND ( ( a.high = b.high )
  4683. OR ( ( a.low = 0 )
  4684. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4685. ));
  4686. End;
  4687. {*
  4688. -------------------------------------------------------------------------------
  4689. Returns 1 if the double-precision floating-point value `a' is less than or
  4690. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4691. cause an exception. Otherwise, the comparison is performed according to the
  4692. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4693. -------------------------------------------------------------------------------
  4694. *}
  4695. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4696. Var
  4697. aSign, bSign : flag;
  4698. Begin
  4699. if
  4700. (
  4701. ( extractFloat64Exp( a ) = $7FF )
  4702. AND
  4703. (
  4704. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4705. )
  4706. )
  4707. OR (
  4708. ( extractFloat64Exp( b ) = $7FF )
  4709. AND (
  4710. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4711. )
  4712. )
  4713. ) then
  4714. Begin
  4715. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4716. float_raise( float_flag_invalid );
  4717. float64_le_quiet := 0;
  4718. exit;
  4719. End;
  4720. aSign := extractFloat64Sign( a );
  4721. bSign := extractFloat64Sign( b );
  4722. if ( aSign <> bSign ) then
  4723. Begin
  4724. float64_le_quiet := flag
  4725. ((aSign <> 0)
  4726. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4727. = 0 ));
  4728. exit;
  4729. End;
  4730. if aSign <> 0 then
  4731. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4732. else
  4733. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4734. End;
  4735. {*
  4736. -------------------------------------------------------------------------------
  4737. Returns 1 if the double-precision floating-point value `a' is less than
  4738. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4739. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4740. Standard for Binary Floating-Point Arithmetic.
  4741. -------------------------------------------------------------------------------
  4742. *}
  4743. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  4744. Var
  4745. aSign, bSign: flag;
  4746. Begin
  4747. if
  4748. (
  4749. ( extractFloat64Exp( a ) = $7FF )
  4750. AND
  4751. (
  4752. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4753. )
  4754. )
  4755. OR (
  4756. ( extractFloat64Exp( b ) = $7FF )
  4757. AND (
  4758. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4759. )
  4760. )
  4761. ) then
  4762. Begin
  4763. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4764. float_raise( float_flag_invalid );
  4765. float64_lt_quiet := 0;
  4766. exit;
  4767. End;
  4768. aSign := extractFloat64Sign( a );
  4769. bSign := extractFloat64Sign( b );
  4770. if ( aSign <> bSign ) then
  4771. Begin
  4772. float64_lt_quiet := flag(
  4773. (aSign<>0)
  4774. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4775. <> 0 ));
  4776. exit;
  4777. End;
  4778. If aSign <> 0 then
  4779. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  4780. else
  4781. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  4782. End;
  4783. {*----------------------------------------------------------------------------
  4784. | Returns the result of converting the 64-bit two's complement integer `a'
  4785. | to the single-precision floating-point format. The conversion is performed
  4786. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4787. *----------------------------------------------------------------------------*}
  4788. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  4789. var
  4790. zSign : flag;
  4791. absA : uint64;
  4792. shiftCount: int8;
  4793. zSig : bits32;
  4794. intval : int64rec;
  4795. Begin
  4796. if ( a = 0 ) then
  4797. begin
  4798. int64_to_float32.float32 := 0;
  4799. exit;
  4800. end;
  4801. if a < 0 then
  4802. zSign := flag(TRUE)
  4803. else
  4804. zSign := flag(FALSE);
  4805. if zSign<>0 then
  4806. absA := -a
  4807. else
  4808. absA := a;
  4809. shiftCount := countLeadingZeros64( absA ) - 40;
  4810. if ( 0 <= shiftCount ) then
  4811. begin
  4812. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  4813. end
  4814. else
  4815. begin
  4816. shiftCount := shiftCount + 7;
  4817. if ( shiftCount < 0 ) then
  4818. begin
  4819. intval.low := int64rec(AbsA).low;
  4820. intval.high := int64rec(AbsA).high;
  4821. shift64RightJamming( intval.low, intval.high, - shiftCount,
  4822. intval.low, intval.high);
  4823. int64rec(absA).low := intval.low;
  4824. int64rec(absA).high := intval.high;
  4825. end
  4826. else
  4827. absA := absA shl shiftCount;
  4828. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  4829. end;
  4830. End;
  4831. {*----------------------------------------------------------------------------
  4832. | Returns the result of converting the 64-bit two's complement integer `a'
  4833. | to the double-precision floating-point format. The conversion is performed
  4834. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4835. *----------------------------------------------------------------------------*}
  4836. function int64_to_float64( a: int64 ): float64;
  4837. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  4838. var
  4839. zSign : flag;
  4840. float_result : float64;
  4841. intval : int64rec;
  4842. AbsA : bits64;
  4843. shiftcount : int8;
  4844. zSig0, zSig1 : bits32;
  4845. Begin
  4846. if ( a = 0 ) then
  4847. Begin
  4848. packFloat64( 0, 0, 0, 0, float_result );
  4849. exit;
  4850. end;
  4851. zSign := flag( a < 0 );
  4852. if ZSign<>0 then
  4853. AbsA := -a
  4854. else
  4855. AbsA := a;
  4856. shiftCount := countLeadingZeros64( absA ) - 11;
  4857. if ( 0 <= shiftCount ) then
  4858. Begin
  4859. absA := absA shl shiftcount;
  4860. zSig0:=int64rec(absA).high;
  4861. zSig1:=int64rec(absA).low;
  4862. End
  4863. else
  4864. Begin
  4865. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  4866. End;
  4867. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  4868. int64_to_float64:= float_result;
  4869. End;
  4870. {*----------------------------------------------------------------------------
  4871. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  4872. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  4873. | Otherwise, returns 0.
  4874. *----------------------------------------------------------------------------*}
  4875. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  4876. begin
  4877. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  4878. end;
  4879. {*----------------------------------------------------------------------------
  4880. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  4881. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  4882. | any carry out is lost. The result is broken into two 64-bit pieces which
  4883. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  4884. *----------------------------------------------------------------------------*}
  4885. procedure add128(a0: bits64; a1: bits64; b0: bits64; b1: bits64; var z0Ptr: bits64; var z1Ptr : bits64);inline;
  4886. var
  4887. z1: bits64;
  4888. begin
  4889. z1 := a1 + b1;
  4890. z1Ptr := z1;
  4891. z0Ptr := a0 + b0 + ord( z1 < a1 );
  4892. end;
  4893. {*----------------------------------------------------------------------------
  4894. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  4895. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  4896. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  4897. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  4898. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  4899. | the most-significant bit of the extra result, and the other 63 bits of the
  4900. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  4901. | were all zero. This extra result is stored in the location pointed to by
  4902. | `z2Ptr'. The value of `count' can be arbitrarily large.
  4903. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  4904. | to form a fixed-point value with binary point between `a1' and `a2'. This
  4905. | fixed-point value is shifted right by the number of bits given in `count',
  4906. | and the integer part of the result is returned at the locations pointed to
  4907. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  4908. | corrupted as described above, and is returned at the location pointed to by
  4909. | `z2Ptr'.)
  4910. *----------------------------------------------------------------------------*}
  4911. procedure shift128ExtraRightJamming(
  4912. a0: bits64;
  4913. a1: bits64;
  4914. a2: bits64;
  4915. count: int16;
  4916. var z0Ptr: bits64;
  4917. var z1Ptr: bits64;
  4918. var z2Ptr: bits64);
  4919. var
  4920. z0, z1, z2: bits64;
  4921. negCount: int8;
  4922. begin
  4923. negCount := ( - count ) and 63;
  4924. if ( count = 0 ) then
  4925. begin
  4926. z2 := a2;
  4927. z1 := a1;
  4928. z0 := a0;
  4929. end
  4930. else begin
  4931. if ( count < 64 ) then
  4932. begin
  4933. z2 := a1 shr negCount;
  4934. z1 := ( a0 shl negCount ) or ( a1 shr count );
  4935. z0 := a0 shr count;
  4936. end
  4937. else begin
  4938. if ( count = 64 ) then
  4939. begin
  4940. z2 := a1;
  4941. z1 := a0;
  4942. end
  4943. else begin
  4944. a2 := a2 or a1;
  4945. if ( count < 128 ) then
  4946. begin
  4947. z2 := a0 shl negCount;
  4948. z1 := a0 shr ( count and 63 );
  4949. end
  4950. else begin
  4951. if ( count = 128 ) then
  4952. z2 := a0
  4953. else
  4954. z2 := ord( a0 <> 0 );
  4955. z1 := 0;
  4956. end;
  4957. end;
  4958. z0 := 0;
  4959. end;
  4960. z2 := z2 or ord( a2 <> 0 );
  4961. end;
  4962. z2Ptr := z2;
  4963. z1Ptr := z1;
  4964. z0Ptr := z0;
  4965. end;
  4966. {*----------------------------------------------------------------------------
  4967. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  4968. | _plus_ the number of bits given in `count'. The shifted result is at most
  4969. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  4970. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  4971. | shifted off is the most-significant bit of the extra result, and the other
  4972. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  4973. | bits shifted off were all zero. This extra result is stored in the location
  4974. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  4975. | (This routine makes more sense if `a0' and `a1' are considered to form
  4976. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  4977. | point value is shifted right by the number of bits given in `count', and
  4978. | the integer part of the result is returned at the location pointed to by
  4979. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  4980. | described above, and is returned at the location pointed to by `z1Ptr'.)
  4981. *----------------------------------------------------------------------------*}
  4982. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  4983. var
  4984. z0, z1: bits64;
  4985. negCount: int8;
  4986. begin
  4987. negCount := ( - count ) and 63;
  4988. if ( count = 0 ) then
  4989. begin
  4990. z1 := a1;
  4991. z0 := a0;
  4992. end
  4993. else if ( count < 64 ) then
  4994. begin
  4995. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  4996. z0 := a0 shr count;
  4997. end
  4998. else begin
  4999. if ( count = 64 ) then
  5000. begin
  5001. z1 := a0 or ord( a1 <> 0 );
  5002. end
  5003. else begin
  5004. z1 := ord( ( a0 or a1 ) <> 0 );
  5005. end;
  5006. z0 := 0;
  5007. end;
  5008. z1Ptr := z1;
  5009. z0Ptr := z0;
  5010. end;
  5011. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5012. {*----------------------------------------------------------------------------
  5013. | Returns the fraction bits of the extended double-precision floating-point
  5014. | value `a'.
  5015. *----------------------------------------------------------------------------*}
  5016. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5017. begin
  5018. result:=a.low;
  5019. end;
  5020. {*----------------------------------------------------------------------------
  5021. | Returns the exponent bits of the extended double-precision floating-point
  5022. | value `a'.
  5023. *----------------------------------------------------------------------------*}
  5024. function extractFloatx80Exp(a : floatx80): int32;inline;
  5025. begin
  5026. result:=a.high and $7FFF;
  5027. end;
  5028. {*----------------------------------------------------------------------------
  5029. | Returns the sign bit of the extended double-precision floating-point value
  5030. | `a'.
  5031. *----------------------------------------------------------------------------*}
  5032. function extractFloatx80Sign(a : floatx80): flag;inline;
  5033. begin
  5034. result:=a.high shr 15;
  5035. end;
  5036. {*----------------------------------------------------------------------------
  5037. | Normalizes the subnormal extended double-precision floating-point value
  5038. | represented by the denormalized significand `aSig'. The normalized exponent
  5039. | and significand are stored at the locations pointed to by `zExpPtr' and
  5040. | `zSigPtr', respectively.
  5041. *----------------------------------------------------------------------------*}
  5042. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5043. var
  5044. shiftCount: int8;
  5045. begin
  5046. shiftCount := countLeadingZeros64( aSig );
  5047. zSigPtr := aSig shl shiftCount;
  5048. zExpPtr := 1 - shiftCount;
  5049. end;
  5050. {*----------------------------------------------------------------------------
  5051. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5052. | extended double-precision floating-point value, returning the result.
  5053. *----------------------------------------------------------------------------*}
  5054. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5055. var
  5056. z: floatx80;
  5057. begin
  5058. z.low := zSig;
  5059. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5060. result:=z;
  5061. end;
  5062. {*----------------------------------------------------------------------------
  5063. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5064. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5065. | and returns the proper extended double-precision floating-point value
  5066. | corresponding to the abstract input. Ordinarily, the abstract value is
  5067. | rounded and packed into the extended double-precision format, with the
  5068. | inexact exception raised if the abstract input cannot be represented
  5069. | exactly. However, if the abstract value is too large, the overflow and
  5070. | inexact exceptions are raised and an infinity or maximal finite value is
  5071. | returned. If the abstract value is too small, the input value is rounded to
  5072. | a subnormal number, and the underflow and inexact exceptions are raised if
  5073. | the abstract input cannot be represented exactly as a subnormal extended
  5074. | double-precision floating-point number.
  5075. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5076. | number of bits as single or double precision, respectively. Otherwise, the
  5077. | result is rounded to the full precision of the extended double-precision
  5078. | format.
  5079. | The input significand must be normalized or smaller. If the input
  5080. | significand is not normalized, `zExp' must be 0; in that case, the result
  5081. | returned is a subnormal number, and it must not require rounding. The
  5082. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5083. | Floating-Point Arithmetic.
  5084. *----------------------------------------------------------------------------*}
  5085. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5086. var
  5087. roundingMode: int8;
  5088. roundNearestEven, increment, isTiny: flag;
  5089. roundIncrement, roundMask, roundBits: int64;
  5090. label
  5091. precision80;
  5092. begin
  5093. roundingMode := float_rounding_mode;
  5094. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5095. if ( roundingPrecision = 80 ) then
  5096. goto precision80;
  5097. if ( roundingPrecision = 64 ) then
  5098. begin
  5099. roundIncrement := int64( $0000000000000400 );
  5100. roundMask := int64( $00000000000007FF );
  5101. end
  5102. else if ( roundingPrecision = 32 ) then
  5103. begin
  5104. roundIncrement := int64( $0000008000000000 );
  5105. roundMask := int64( $000000FFFFFFFFFF );
  5106. end
  5107. else begin
  5108. goto precision80;
  5109. end;
  5110. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5111. if ( not (roundNearestEven<>0) ) then
  5112. begin
  5113. if ( roundingMode = float_round_to_zero ) then
  5114. begin
  5115. roundIncrement := 0;
  5116. end
  5117. else begin
  5118. roundIncrement := roundMask;
  5119. if ( zSign<>0 ) then
  5120. begin
  5121. if ( roundingMode = float_round_up ) then
  5122. roundIncrement := 0;
  5123. end
  5124. else begin
  5125. if ( roundingMode = float_round_down ) then
  5126. roundIncrement := 0;
  5127. end;
  5128. end;
  5129. end;
  5130. roundBits := zSig0 and roundMask;
  5131. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5132. if ( ( $7FFE < zExp )
  5133. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5134. ) begin
  5135. goto overflow;
  5136. end;
  5137. if ( zExp <= 0 ) begin
  5138. isTiny =
  5139. ( float_detect_tininess = float_tininess_before_rounding )
  5140. or ( zExp < 0 )
  5141. or ( zSig0 <= zSig0 + roundIncrement );
  5142. shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
  5143. zExp := 0;
  5144. roundBits := zSig0 and roundMask;
  5145. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5146. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5147. zSig0 += roundIncrement;
  5148. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5149. roundIncrement := roundMask + 1;
  5150. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5151. roundMask |= roundIncrement;
  5152. end;
  5153. zSig0 &= ~ roundMask;
  5154. result:=packFloatx80( zSign, zExp, zSig0 );
  5155. end;
  5156. end;
  5157. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5158. zSig0 += roundIncrement;
  5159. if ( zSig0 < roundIncrement ) begin
  5160. ++zExp;
  5161. zSig0 := LIT64( $8000000000000000 );
  5162. end;
  5163. roundIncrement := roundMask + 1;
  5164. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5165. roundMask |= roundIncrement;
  5166. end;
  5167. zSig0 &= ~ roundMask;
  5168. if ( zSig0 = 0 ) zExp := 0;
  5169. result:=packFloatx80( zSign, zExp, zSig0 );
  5170. precision80:
  5171. increment := ( (sbits64) zSig1 < 0 );
  5172. if ( ! roundNearestEven ) begin
  5173. if ( roundingMode = float_round_to_zero ) begin
  5174. increment := 0;
  5175. end;
  5176. else begin
  5177. if ( zSign ) begin
  5178. increment := ( roundingMode = float_round_down ) and zSig1;
  5179. end;
  5180. else begin
  5181. increment := ( roundingMode = float_round_up ) and zSig1;
  5182. end;
  5183. end;
  5184. end;
  5185. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5186. if ( ( $7FFE < zExp )
  5187. or ( ( zExp = $7FFE )
  5188. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5189. and increment
  5190. )
  5191. ) begin
  5192. roundMask := 0;
  5193. overflow:
  5194. float_raise( float_flag_overflow or float_flag_inexact );
  5195. if ( ( roundingMode = float_round_to_zero )
  5196. or ( zSign and ( roundingMode = float_round_up ) )
  5197. or ( ! zSign and ( roundingMode = float_round_down ) )
  5198. ) begin
  5199. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5200. end;
  5201. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5202. end;
  5203. if ( zExp <= 0 ) begin
  5204. isTiny =
  5205. ( float_detect_tininess = float_tininess_before_rounding )
  5206. or ( zExp < 0 )
  5207. or ! increment
  5208. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5209. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
  5210. zExp := 0;
  5211. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5212. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5213. if ( roundNearestEven ) begin
  5214. increment := ( (sbits64) zSig1 < 0 );
  5215. end;
  5216. else begin
  5217. if ( zSign ) begin
  5218. increment := ( roundingMode = float_round_down ) and zSig1;
  5219. end;
  5220. else begin
  5221. increment := ( roundingMode = float_round_up ) and zSig1;
  5222. end;
  5223. end;
  5224. if ( increment ) begin
  5225. ++zSig0;
  5226. zSig0 &=
  5227. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5228. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5229. end;
  5230. result:=packFloatx80( zSign, zExp, zSig0 );
  5231. end;
  5232. end;
  5233. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5234. if ( increment ) begin
  5235. ++zSig0;
  5236. if ( zSig0 = 0 ) begin
  5237. ++zExp;
  5238. zSig0 := LIT64( $8000000000000000 );
  5239. end;
  5240. else begin
  5241. zSig0 &= ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5242. end;
  5243. end;
  5244. else begin
  5245. if ( zSig0 = 0 ) zExp := 0;
  5246. end;
  5247. result:=packFloatx80( zSign, zExp, zSig0 );
  5248. end;
  5249. {*----------------------------------------------------------------------------
  5250. | Takes an abstract floating-point value having sign `zSign', exponent
  5251. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5252. | and returns the proper extended double-precision floating-point value
  5253. | corresponding to the abstract input. This routine is just like
  5254. | `roundAndPackFloatx80' except that the input significand does not have to be
  5255. | normalized.
  5256. *----------------------------------------------------------------------------*}
  5257. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5258. var
  5259. shiftCount: int8;
  5260. begin
  5261. if ( zSig0 = 0 ) begin
  5262. zSig0 := zSig1;
  5263. zSig1 := 0;
  5264. zExp -= 64;
  5265. end;
  5266. shiftCount := countLeadingZeros64( zSig0 );
  5267. shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  5268. zExp := eExp - shiftCount;
  5269. return
  5270. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5271. end;
  5272. {*----------------------------------------------------------------------------
  5273. | Returns the result of converting the extended double-precision floating-
  5274. | point value `a' to the 32-bit two's complement integer format. The
  5275. | conversion is performed according to the IEC/IEEE Standard for Binary
  5276. | Floating-Point Arithmetic---which means in particular that the conversion
  5277. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5278. | largest positive integer is returned. Otherwise, if the conversion
  5279. | overflows, the largest integer with the same sign as `a' is returned.
  5280. *----------------------------------------------------------------------------*}
  5281. function floatx80_to_int32(a: floatx80): int32;
  5282. var
  5283. aSign: flag;
  5284. aExp, shiftCount: int32;
  5285. aSig: bits64;
  5286. begin
  5287. aSig := extractFloatx80Frac( a );
  5288. aExp := extractFloatx80Exp( a );
  5289. aSign := extractFloatx80Sign( a );
  5290. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5291. shiftCount := $4037 - aExp;
  5292. if ( shiftCount <= 0 ) shiftCount := 1;
  5293. shift64RightJamming( aSig, shiftCount, &aSig );
  5294. result := roundAndPackInt32( aSign, aSig );
  5295. end;
  5296. {*----------------------------------------------------------------------------
  5297. | Returns the result of converting the extended double-precision floating-
  5298. | point value `a' to the 32-bit two's complement integer format. The
  5299. | conversion is performed according to the IEC/IEEE Standard for Binary
  5300. | Floating-Point Arithmetic, except that the conversion is always rounded
  5301. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5302. | Otherwise, if the conversion overflows, the largest integer with the same
  5303. | sign as `a' is returned.
  5304. *----------------------------------------------------------------------------*}
  5305. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5306. var
  5307. aSign: flag;
  5308. aExp, shiftCount: int32;
  5309. aSig, savedASig: bits64;
  5310. z: int32;
  5311. begin
  5312. aSig := extractFloatx80Frac( a );
  5313. aExp := extractFloatx80Exp( a );
  5314. aSign := extractFloatx80Sign( a );
  5315. if ( $401E < aExp ) begin
  5316. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5317. goto invalid;
  5318. end;
  5319. else if ( aExp < $3FFF ) begin
  5320. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5321. result := 0;
  5322. end;
  5323. shiftCount := $403E - aExp;
  5324. savedASig := aSig;
  5325. aSig >>= shiftCount;
  5326. z := aSig;
  5327. if ( aSign ) z := - z;
  5328. if ( ( z < 0 ) xor aSign ) begin
  5329. invalid:
  5330. float_raise( float_flag_invalid );
  5331. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5332. end;
  5333. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5334. softfloat_exception_flags or= float_flag_inexact;
  5335. end;
  5336. result := z;
  5337. end;
  5338. {*----------------------------------------------------------------------------
  5339. | Returns the result of converting the extended double-precision floating-
  5340. | point value `a' to the 64-bit two's complement integer format. The
  5341. | conversion is performed according to the IEC/IEEE Standard for Binary
  5342. | Floating-Point Arithmetic---which means in particular that the conversion
  5343. | is rounded according to the current rounding mode. If `a' is a NaN,
  5344. | the largest positive integer is returned. Otherwise, if the conversion
  5345. | overflows, the largest integer with the same sign as `a' is returned.
  5346. *----------------------------------------------------------------------------*}
  5347. function floatx80_to_int64(a: floatx80): int64;
  5348. var
  5349. aSign: flag;
  5350. aExp, shiftCount: int32;
  5351. aSig, aSigExtra: bits64;
  5352. begin
  5353. aSig := extractFloatx80Frac( a );
  5354. aExp := extractFloatx80Exp( a );
  5355. aSign := extractFloatx80Sign( a );
  5356. shiftCount := $403E - aExp;
  5357. if ( shiftCount <= 0 ) begin
  5358. if ( shiftCount ) begin
  5359. float_raise( float_flag_invalid );
  5360. if ( ! aSign
  5361. or ( ( aExp = $7FFF )
  5362. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5363. ) begin
  5364. result := LIT64( $7FFFFFFFFFFFFFFF );
  5365. end;
  5366. result := (sbits64) LIT64( $8000000000000000 );
  5367. end;
  5368. aSigExtra := 0;
  5369. end;
  5370. else begin
  5371. shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
  5372. end;
  5373. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5374. end;
  5375. {*----------------------------------------------------------------------------
  5376. | Returns the result of converting the extended double-precision floating-
  5377. | point value `a' to the 64-bit two's complement integer format. The
  5378. | conversion is performed according to the IEC/IEEE Standard for Binary
  5379. | Floating-Point Arithmetic, except that the conversion is always rounded
  5380. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5381. | Otherwise, if the conversion overflows, the largest integer with the same
  5382. | sign as `a' is returned.
  5383. *----------------------------------------------------------------------------*}
  5384. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5385. var
  5386. aSign: flag;
  5387. aExp, shiftCount: int32;
  5388. aSig: bits64;
  5389. z: int64;
  5390. begin
  5391. aSig := extractFloatx80Frac( a );
  5392. aExp := extractFloatx80Exp( a );
  5393. aSign := extractFloatx80Sign( a );
  5394. shiftCount := aExp - $403E;
  5395. if ( 0 <= shiftCount ) begin
  5396. aSig &= LIT64( $7FFFFFFFFFFFFFFF );
  5397. if ( ( a.high <> $C03E ) or aSig ) begin
  5398. float_raise( float_flag_invalid );
  5399. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5400. result := LIT64( $7FFFFFFFFFFFFFFF );
  5401. end;
  5402. end;
  5403. result := (sbits64) LIT64( $8000000000000000 );
  5404. end;
  5405. else if ( aExp < $3FFF ) begin
  5406. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5407. result := 0;
  5408. end;
  5409. z := aSig>>( - shiftCount );
  5410. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5411. softfloat_exception_flags or= float_flag_inexact;
  5412. end;
  5413. if ( aSign ) z := - z;
  5414. result := z;
  5415. end;
  5416. {*----------------------------------------------------------------------------
  5417. | Returns the result of converting the extended double-precision floating-
  5418. | point value `a' to the single-precision floating-point format. The
  5419. | conversion is performed according to the IEC/IEEE Standard for Binary
  5420. | Floating-Point Arithmetic.
  5421. *----------------------------------------------------------------------------*}
  5422. function floatx80_to_float32(a: floatx80): float32;
  5423. var
  5424. aSign: flag;
  5425. aExp: int32;
  5426. aSig: bits64;
  5427. begin
  5428. aSig := extractFloatx80Frac( a );
  5429. aExp := extractFloatx80Exp( a );
  5430. aSign := extractFloatx80Sign( a );
  5431. if ( aExp = $7FFF ) begin
  5432. if ( (bits64) ( aSig shl 1 ) ) begin
  5433. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5434. end;
  5435. result := packFloat32( aSign, $FF, 0 );
  5436. end;
  5437. shift64RightJamming( aSig, 33, &aSig );
  5438. if ( aExp or aSig ) aExp -= $3F81;
  5439. result := roundAndPackFloat32( aSign, aExp, aSig );
  5440. end;
  5441. {*----------------------------------------------------------------------------
  5442. | Returns the result of converting the extended double-precision floating-
  5443. | point value `a' to the double-precision floating-point format. The
  5444. | conversion is performed according to the IEC/IEEE Standard for Binary
  5445. | Floating-Point Arithmetic.
  5446. *----------------------------------------------------------------------------*}
  5447. function floatx80_to_float64(a: floatx80): float64;
  5448. var
  5449. aSign: flag;
  5450. aExp: int32;
  5451. aSig, zSig: bits64;
  5452. begin
  5453. aSig := extractFloatx80Frac( a );
  5454. aExp := extractFloatx80Exp( a );
  5455. aSign := extractFloatx80Sign( a );
  5456. if ( aExp = $7FFF ) begin
  5457. if ( (bits64) ( aSig shl 1 ) ) begin
  5458. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5459. end;
  5460. result := packFloat64( aSign, $7FF, 0 );
  5461. end;
  5462. shift64RightJamming( aSig, 1, &zSig );
  5463. if ( aExp or aSig ) aExp -= $3C01;
  5464. result := roundAndPackFloat64( aSign, aExp, zSig );
  5465. end;
  5466. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5467. {*----------------------------------------------------------------------------
  5468. | Returns the result of converting the extended double-precision floating-
  5469. | point value `a' to the quadruple-precision floating-point format. The
  5470. | conversion is performed according to the IEC/IEEE Standard for Binary
  5471. | Floating-Point Arithmetic.
  5472. *----------------------------------------------------------------------------*}
  5473. function floatx80_to_float128(a: floatx80): float128;
  5474. var
  5475. aSign: flag;
  5476. aExp: int16;
  5477. aSig, zSig0, zSig1: bits64;
  5478. begin
  5479. aSig := extractFloatx80Frac( a );
  5480. aExp := extractFloatx80Exp( a );
  5481. aSign := extractFloatx80Sign( a );
  5482. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5483. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5484. end;
  5485. shift128Right( aSig shl 1, 0, 16, &zSig0, &zSig1 );
  5486. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5487. end;
  5488. {$endif FPC_SOFTFLOAT_FLOAT128}
  5489. {*----------------------------------------------------------------------------
  5490. | Rounds the extended double-precision floating-point value `a' to an integer,
  5491. | and Returns the result as an extended quadruple-precision floating-point
  5492. | value. The operation is performed according to the IEC/IEEE Standard for
  5493. | Binary Floating-Point Arithmetic.
  5494. *----------------------------------------------------------------------------*}
  5495. function floatx80_round_to_int(a: floatx80): floatx80;
  5496. var
  5497. aSign: flag;
  5498. aExp: int32;
  5499. lastBitMask, roundBitsMask: bits64;
  5500. roundingMode: int8;
  5501. z: floatx80;
  5502. begin
  5503. aExp := extractFloatx80Exp( a );
  5504. if ( $403E <= aExp ) begin
  5505. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5506. result := propagateFloatx80NaN( a, a );
  5507. end;
  5508. result := a;
  5509. end;
  5510. if ( aExp < $3FFF ) begin
  5511. if ( ( aExp = 0 )
  5512. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5513. result := a;
  5514. end;
  5515. softfloat_exception_flags or= float_flag_inexact;
  5516. aSign := extractFloatx80Sign( a );
  5517. switch ( float_rounding_mode ) begin
  5518. case float_round_nearest_even:
  5519. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5520. ) begin
  5521. result :=
  5522. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5523. end;
  5524. break;
  5525. case float_round_down:
  5526. result :=
  5527. aSign ?
  5528. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5529. : packFloatx80( 0, 0, 0 );
  5530. case float_round_up:
  5531. result :=
  5532. aSign ? packFloatx80( 1, 0, 0 )
  5533. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5534. end;
  5535. result := packFloatx80( aSign, 0, 0 );
  5536. end;
  5537. lastBitMask := 1;
  5538. lastBitMask shl = $403E - aExp;
  5539. roundBitsMask := lastBitMask - 1;
  5540. z := a;
  5541. roundingMode := float_rounding_mode;
  5542. if ( roundingMode = float_round_nearest_even ) begin
  5543. z.low += lastBitMask>>1;
  5544. if ( ( z.low and roundBitsMask ) = 0 ) z.low &= ~ lastBitMask;
  5545. end;
  5546. else if ( roundingMode <> float_round_to_zero ) begin
  5547. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5548. z.low += roundBitsMask;
  5549. end;
  5550. end;
  5551. z.low &= ~ roundBitsMask;
  5552. if ( z.low = 0 ) begin
  5553. ++z.high;
  5554. z.low := LIT64( $8000000000000000 );
  5555. end;
  5556. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5557. result := z;
  5558. end;
  5559. {*----------------------------------------------------------------------------
  5560. | Returns the result of adding the absolute values of the extended double-
  5561. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5562. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5563. | The addition is performed according to the IEC/IEEE Standard for Binary
  5564. | Floating-Point Arithmetic.
  5565. *----------------------------------------------------------------------------*}
  5566. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5567. var
  5568. aExp, bExp, zExp: int32;
  5569. aSig, bSig, zSig0, zSig1: bits64;
  5570. expDiff: int32;
  5571. begin
  5572. aSig := extractFloatx80Frac( a );
  5573. aExp := extractFloatx80Exp( a );
  5574. bSig := extractFloatx80Frac( b );
  5575. bExp := extractFloatx80Exp( b );
  5576. expDiff := aExp - bExp;
  5577. if ( 0 < expDiff ) begin
  5578. if ( aExp = $7FFF ) begin
  5579. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5580. result := a;
  5581. end;
  5582. if ( bExp = 0 ) --expDiff;
  5583. shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  5584. zExp := aExp;
  5585. end;
  5586. else if ( expDiff < 0 ) begin
  5587. if ( bExp = $7FFF ) begin
  5588. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5589. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5590. end;
  5591. if ( aExp = 0 ) ++expDiff;
  5592. shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  5593. zExp := bExp;
  5594. end;
  5595. else begin
  5596. if ( aExp = $7FFF ) begin
  5597. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5598. result := propagateFloatx80NaN( a, b );
  5599. end;
  5600. result := a;
  5601. end;
  5602. zSig1 := 0;
  5603. zSig0 := aSig + bSig;
  5604. if ( aExp = 0 ) begin
  5605. normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
  5606. goto roundAndPack;
  5607. end;
  5608. zExp := aExp;
  5609. goto shiftRight1;
  5610. end;
  5611. zSig0 := aSig + bSig;
  5612. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5613. shiftRight1:
  5614. shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
  5615. zSig0 or= LIT64( $8000000000000000 );
  5616. ++zExp;
  5617. roundAndPack:
  5618. result :=
  5619. roundAndPackFloatx80(
  5620. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5621. end;
  5622. {*----------------------------------------------------------------------------
  5623. | Returns the result of subtracting the absolute values of the extended
  5624. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5625. | difference is negated before being returned. `zSign' is ignored if the
  5626. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5627. | Standard for Binary Floating-Point Arithmetic.
  5628. *----------------------------------------------------------------------------*}
  5629. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5630. var
  5631. aExp, bExp, zExp: int32;
  5632. aSig, bSig, zSig0, zSig1: bits64;
  5633. expDiff: int32;
  5634. z: floatx80;
  5635. begin
  5636. aSig := extractFloatx80Frac( a );
  5637. aExp := extractFloatx80Exp( a );
  5638. bSig := extractFloatx80Frac( b );
  5639. bExp := extractFloatx80Exp( b );
  5640. expDiff := aExp - bExp;
  5641. if ( 0 < expDiff ) goto aExpBigger;
  5642. if ( expDiff < 0 ) goto bExpBigger;
  5643. if ( aExp = $7FFF ) begin
  5644. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5645. result := propagateFloatx80NaN( a, b );
  5646. end;
  5647. float_raise( float_flag_invalid );
  5648. z.low := floatx80_default_nan_low;
  5649. z.high := floatx80_default_nan_high;
  5650. result := z;
  5651. end;
  5652. if ( aExp = 0 ) begin
  5653. aExp := 1;
  5654. bExp := 1;
  5655. end;
  5656. zSig1 := 0;
  5657. if ( bSig < aSig ) goto aBigger;
  5658. if ( aSig < bSig ) goto bBigger;
  5659. result := packFloatx80( float_rounding_mode = float_round_down, 0, 0 );
  5660. bExpBigger:
  5661. if ( bExp = $7FFF ) begin
  5662. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5663. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5664. end;
  5665. if ( aExp = 0 ) ++expDiff;
  5666. shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  5667. bBigger:
  5668. sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
  5669. zExp := bExp;
  5670. zSign xor = 1;
  5671. goto normalizeRoundAndPack;
  5672. aExpBigger:
  5673. if ( aExp = $7FFF ) begin
  5674. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5675. result := a;
  5676. end;
  5677. if ( bExp = 0 ) --expDiff;
  5678. shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  5679. aBigger:
  5680. sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
  5681. zExp := aExp;
  5682. normalizeRoundAndPack:
  5683. result :=
  5684. normalizeRoundAndPackFloatx80(
  5685. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5686. end;
  5687. {*----------------------------------------------------------------------------
  5688. | Returns the result of adding the extended double-precision floating-point
  5689. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  5690. | Standard for Binary Floating-Point Arithmetic.
  5691. *----------------------------------------------------------------------------*}
  5692. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  5693. var
  5694. aSign, bSign: flag;
  5695. begin
  5696. aSign := extractFloatx80Sign( a );
  5697. bSign := extractFloatx80Sign( b );
  5698. if ( aSign = bSign ) begin
  5699. result := addFloatx80Sigs( a, b, aSign );
  5700. end;
  5701. else begin
  5702. result := subFloatx80Sigs( a, b, aSign );
  5703. end;
  5704. end;
  5705. {*----------------------------------------------------------------------------
  5706. | Returns the result of subtracting the extended double-precision floating-
  5707. | point values `a' and `b'. The operation is performed according to the
  5708. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5709. *----------------------------------------------------------------------------*}
  5710. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  5711. var
  5712. aSign, bSign: flag;
  5713. begin
  5714. aSign := extractFloatx80Sign( a );
  5715. bSign := extractFloatx80Sign( b );
  5716. if ( aSign = bSign ) begin
  5717. result := subFloatx80Sigs( a, b, aSign );
  5718. end;
  5719. else begin
  5720. result := addFloatx80Sigs( a, b, aSign );
  5721. end;
  5722. end;
  5723. {*----------------------------------------------------------------------------
  5724. | Returns the result of multiplying the extended double-precision floating-
  5725. | point values `a' and `b'. The operation is performed according to the
  5726. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5727. *----------------------------------------------------------------------------*}
  5728. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  5729. var
  5730. aSign, bSign, zSign: flag;
  5731. aExp, bExp, zExp: int32;
  5732. aSig, bSig, zSig0, zSig1: bits64;
  5733. z: floatx80;
  5734. begin
  5735. aSig := extractFloatx80Frac( a );
  5736. aExp := extractFloatx80Exp( a );
  5737. aSign := extractFloatx80Sign( a );
  5738. bSig := extractFloatx80Frac( b );
  5739. bExp := extractFloatx80Exp( b );
  5740. bSign := extractFloatx80Sign( b );
  5741. zSign := aSign xor bSign;
  5742. if ( aExp = $7FFF ) begin
  5743. if ( (bits64) ( aSig shl 1 )
  5744. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  5745. result := propagateFloatx80NaN( a, b );
  5746. end;
  5747. if ( ( bExp or bSig ) = 0 ) goto invalid;
  5748. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5749. end;
  5750. if ( bExp = $7FFF ) begin
  5751. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5752. if ( ( aExp or aSig ) = 0 ) begin
  5753. invalid:
  5754. float_raise( float_flag_invalid );
  5755. z.low := floatx80_default_nan_low;
  5756. z.high := floatx80_default_nan_high;
  5757. result := z;
  5758. end;
  5759. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5760. end;
  5761. if ( aExp = 0 ) begin
  5762. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5763. normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  5764. end;
  5765. if ( bExp = 0 ) begin
  5766. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5767. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5768. end;
  5769. zExp := aExp + bExp - $3FFE;
  5770. mul64To128( aSig, bSig, &zSig0, &zSig1 );
  5771. if ( 0 < (sbits64) zSig0 ) begin
  5772. shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
  5773. --zExp;
  5774. end;
  5775. result :=
  5776. roundAndPackFloatx80(
  5777. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5778. end;
  5779. {*----------------------------------------------------------------------------
  5780. | Returns the result of dividing the extended double-precision floating-point
  5781. | value `a' by the corresponding value `b'. The operation is performed
  5782. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5783. *----------------------------------------------------------------------------*}
  5784. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  5785. var
  5786. aSign, bSign, zSign: flag;
  5787. aExp, bExp, zExp: int32;
  5788. aSig, bSig, zSig0, zSig1: bits64;
  5789. rem0, rem1, rem2, term0, term1, term2: bits64;
  5790. z: floatx80;
  5791. begin
  5792. aSig := extractFloatx80Frac( a );
  5793. aExp := extractFloatx80Exp( a );
  5794. aSign := extractFloatx80Sign( a );
  5795. bSig := extractFloatx80Frac( b );
  5796. bExp := extractFloatx80Exp( b );
  5797. bSign := extractFloatx80Sign( b );
  5798. zSign := aSign xor bSign;
  5799. if ( aExp = $7FFF ) begin
  5800. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5801. if ( bExp = $7FFF ) begin
  5802. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5803. goto invalid;
  5804. end;
  5805. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5806. end;
  5807. if ( bExp = $7FFF ) begin
  5808. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5809. result := packFloatx80( zSign, 0, 0 );
  5810. end;
  5811. if ( bExp = 0 ) begin
  5812. if ( bSig = 0 ) begin
  5813. if ( ( aExp or aSig ) = 0 ) begin
  5814. invalid:
  5815. float_raise( float_flag_invalid );
  5816. z.low := floatx80_default_nan_low;
  5817. z.high := floatx80_default_nan_high;
  5818. result := z;
  5819. end;
  5820. float_raise( float_flag_divbyzero );
  5821. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5822. end;
  5823. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5824. end;
  5825. if ( aExp = 0 ) begin
  5826. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5827. normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  5828. end;
  5829. zExp := aExp - bExp + $3FFE;
  5830. rem1 := 0;
  5831. if ( bSig <= aSig ) begin
  5832. shift128Right( aSig, 0, 1, &aSig, &rem1 );
  5833. ++zExp;
  5834. end;
  5835. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  5836. mul64To128( bSig, zSig0, &term0, &term1 );
  5837. sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
  5838. while ( (sbits64) rem0 < 0 ) begin
  5839. --zSig0;
  5840. add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
  5841. end;
  5842. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  5843. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  5844. mul64To128( bSig, zSig1, &term1, &term2 );
  5845. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  5846. while ( (sbits64) rem1 < 0 ) begin
  5847. --zSig1;
  5848. add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
  5849. end;
  5850. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  5851. end;
  5852. result :=
  5853. roundAndPackFloatx80(
  5854. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5855. end;
  5856. {*----------------------------------------------------------------------------
  5857. | Returns the remainder of the extended double-precision floating-point value
  5858. | `a' with respect to the corresponding value `b'. The operation is performed
  5859. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5860. *----------------------------------------------------------------------------*}
  5861. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  5862. var
  5863. aSign, bSign, zSign: flag;
  5864. aExp, bExp, expDiff: int32;
  5865. aSig0, aSig1, bSig: bits64;
  5866. q, term0, term1, alternateASig0, alternateASig1: bits64;
  5867. z: floatx80;
  5868. begin
  5869. aSig0 := extractFloatx80Frac( a );
  5870. aExp := extractFloatx80Exp( a );
  5871. aSign := extractFloatx80Sign( a );
  5872. bSig := extractFloatx80Frac( b );
  5873. bExp := extractFloatx80Exp( b );
  5874. bSign := extractFloatx80Sign( b );
  5875. if ( aExp = $7FFF ) begin
  5876. if ( (bits64) ( aSig0 shl 1 )
  5877. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  5878. result := propagateFloatx80NaN( a, b );
  5879. end;
  5880. goto invalid;
  5881. end;
  5882. if ( bExp = $7FFF ) begin
  5883. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5884. result := a;
  5885. end;
  5886. if ( bExp = 0 ) begin
  5887. if ( bSig = 0 ) begin
  5888. invalid:
  5889. float_raise( float_flag_invalid );
  5890. z.low := floatx80_default_nan_low;
  5891. z.high := floatx80_default_nan_high;
  5892. result := z;
  5893. end;
  5894. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5895. end;
  5896. if ( aExp = 0 ) begin
  5897. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  5898. normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  5899. end;
  5900. bSig or= LIT64( $8000000000000000 );
  5901. zSign := aSign;
  5902. expDiff := aExp - bExp;
  5903. aSig1 := 0;
  5904. if ( expDiff < 0 ) begin
  5905. if ( expDiff < -1 ) result := a;
  5906. shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
  5907. expDiff := 0;
  5908. end;
  5909. q := ( bSig <= aSig0 );
  5910. if ( q ) aSig0 -= bSig;
  5911. expDiff -= 64;
  5912. while ( 0 < expDiff ) begin
  5913. q := estimateDiv128To64( aSig0, aSig1, bSig );
  5914. q := ( 2 < q ) ? q - 2 : 0;
  5915. mul64To128( bSig, q, &term0, &term1 );
  5916. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5917. shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
  5918. expDiff -= 62;
  5919. end;
  5920. expDiff += 64;
  5921. if ( 0 < expDiff ) begin
  5922. q := estimateDiv128To64( aSig0, aSig1, bSig );
  5923. q := ( 2 < q ) ? q - 2 : 0;
  5924. q >>= 64 - expDiff;
  5925. mul64To128( bSig, q shl ( 64 - expDiff ), &term0, &term1 );
  5926. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5927. shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
  5928. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  5929. ++q;
  5930. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5931. end;
  5932. end;
  5933. else begin
  5934. term1 := 0;
  5935. term0 := bSig;
  5936. end;
  5937. sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
  5938. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  5939. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  5940. and ( q and 1 ) )
  5941. ) begin
  5942. aSig0 := alternateASig0;
  5943. aSig1 := alternateASig1;
  5944. zSign := ! zSign;
  5945. end;
  5946. result :=
  5947. normalizeRoundAndPackFloatx80(
  5948. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  5949. end;
  5950. {*----------------------------------------------------------------------------
  5951. | Returns the square root of the extended double-precision floating-point
  5952. | value `a'. The operation is performed according to the IEC/IEEE Standard
  5953. | for Binary Floating-Point Arithmetic.
  5954. *----------------------------------------------------------------------------*}
  5955. function floatx80_sqrt(a: floatx80): floatx80;
  5956. var
  5957. aSign: flag;
  5958. aExp, zExp: int32;
  5959. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  5960. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  5961. z: floatx80;
  5962. label
  5963. invalid;
  5964. begin
  5965. aSig0 := extractFloatx80Frac( a );
  5966. aExp := extractFloatx80Exp( a );
  5967. aSign := extractFloatx80Sign( a );
  5968. if ( aExp = $7FFF ) begin
  5969. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  5970. if ( ! aSign ) result := a;
  5971. goto invalid;
  5972. end;
  5973. if ( aSign ) begin
  5974. if ( ( aExp or aSig0 ) = 0 ) result := a;
  5975. invalid:
  5976. float_raise( float_flag_invalid );
  5977. z.low := floatx80_default_nan_low;
  5978. z.high := floatx80_default_nan_high;
  5979. result := z;
  5980. end;
  5981. if ( aExp = 0 ) begin
  5982. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  5983. normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  5984. end;
  5985. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  5986. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  5987. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), &aSig0, &aSig1 );
  5988. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  5989. doubleZSig0 := zSig0 shl 1;
  5990. mul64To128( zSig0, zSig0, &term0, &term1 );
  5991. sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  5992. while ( (sbits64) rem0 < 0 ) begin
  5993. --zSig0;
  5994. doubleZSig0 -= 2;
  5995. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, &rem0, &rem1 );
  5996. end;
  5997. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  5998. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  5999. if ( zSig1 = 0 ) zSig1 := 1;
  6000. mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  6001. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  6002. mul64To128( zSig1, zSig1, &term2, &term3 );
  6003. sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  6004. while ( (sbits64) rem1 < 0 ) begin
  6005. --zSig1;
  6006. shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  6007. term3 or= 1;
  6008. term2 or= doubleZSig0;
  6009. add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  6010. end;
  6011. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6012. end;
  6013. shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
  6014. zSig0 or= doubleZSig0;
  6015. result :=
  6016. roundAndPackFloatx80(
  6017. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6018. end;
  6019. {*----------------------------------------------------------------------------
  6020. | Returns 1 if the extended double-precision floating-point value `a' is
  6021. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6022. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6023. | Arithmetic.
  6024. *----------------------------------------------------------------------------*}
  6025. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6026. begin
  6027. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6028. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6029. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6030. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6031. ) begin
  6032. if ( floatx80_is_signaling_nan( a )
  6033. or floatx80_is_signaling_nan( b ) ) begin
  6034. float_raise( float_flag_invalid );
  6035. end;
  6036. result := 0;
  6037. end;
  6038. result :=
  6039. ( a.low = b.low )
  6040. and ( ( a.high = b.high )
  6041. or ( ( a.low = 0 )
  6042. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6043. );
  6044. end;
  6045. {*----------------------------------------------------------------------------
  6046. | Returns 1 if the extended double-precision floating-point value `a' is
  6047. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6048. | comparison is performed according to the IEC/IEEE Standard for Binary
  6049. | Floating-Point Arithmetic.
  6050. *----------------------------------------------------------------------------*}
  6051. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6052. var
  6053. aSign, bSign: flag;
  6054. begin
  6055. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6056. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6057. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6058. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6059. ) begin
  6060. float_raise( float_flag_invalid );
  6061. result := 0;
  6062. end;
  6063. aSign := extractFloatx80Sign( a );
  6064. bSign := extractFloatx80Sign( b );
  6065. if ( aSign <> bSign ) begin
  6066. result :=
  6067. aSign
  6068. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6069. = 0 );
  6070. end;
  6071. result :=
  6072. aSign ? le128( b.high, b.low, a.high, a.low )
  6073. : le128( a.high, a.low, b.high, b.low );
  6074. end;
  6075. {*----------------------------------------------------------------------------
  6076. | Returns 1 if the extended double-precision floating-point value `a' is
  6077. | less than the corresponding value `b', and 0 otherwise. The comparison
  6078. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6079. | Arithmetic.
  6080. *----------------------------------------------------------------------------*}
  6081. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6082. var
  6083. aSign, bSign: flag;
  6084. begin
  6085. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6086. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6087. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6088. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6089. ) begin
  6090. float_raise( float_flag_invalid );
  6091. result := 0;
  6092. end;
  6093. aSign := extractFloatx80Sign( a );
  6094. bSign := extractFloatx80Sign( b );
  6095. if ( aSign <> bSign ) begin
  6096. result :=
  6097. aSign
  6098. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6099. <> 0 );
  6100. end;
  6101. result :=
  6102. aSign ? lt128( b.high, b.low, a.high, a.low )
  6103. : lt128( a.high, a.low, b.high, b.low );
  6104. end;
  6105. {*----------------------------------------------------------------------------
  6106. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6107. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6108. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6109. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6110. *----------------------------------------------------------------------------*}
  6111. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6112. begin
  6113. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6114. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6115. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6116. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6117. ) begin
  6118. float_raise( float_flag_invalid );
  6119. result := 0;
  6120. end;
  6121. result :=
  6122. ( a.low = b.low )
  6123. and ( ( a.high = b.high )
  6124. or ( ( a.low = 0 )
  6125. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6126. );
  6127. end;
  6128. {*----------------------------------------------------------------------------
  6129. | Returns 1 if the extended double-precision floating-point value `a' is less
  6130. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6131. | do not cause an exception. Otherwise, the comparison is performed according
  6132. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6133. *----------------------------------------------------------------------------*}
  6134. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6135. var
  6136. aSign, bSign: flag;
  6137. begin
  6138. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6139. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6140. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6141. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6142. ) begin
  6143. if ( floatx80_is_signaling_nan( a )
  6144. or floatx80_is_signaling_nan( b ) ) begin
  6145. float_raise( float_flag_invalid );
  6146. end;
  6147. result := 0;
  6148. end;
  6149. aSign := extractFloatx80Sign( a );
  6150. bSign := extractFloatx80Sign( b );
  6151. if ( aSign <> bSign ) begin
  6152. result :=
  6153. aSign
  6154. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6155. = 0 );
  6156. end;
  6157. result :=
  6158. aSign ? le128( b.high, b.low, a.high, a.low )
  6159. : le128( a.high, a.low, b.high, b.low );
  6160. end;
  6161. {*----------------------------------------------------------------------------
  6162. | Returns 1 if the extended double-precision floating-point value `a' is less
  6163. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6164. | an exception. Otherwise, the comparison is performed according to the
  6165. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6166. *----------------------------------------------------------------------------*}
  6167. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6168. var
  6169. aSign, bSign: flag;
  6170. begin
  6171. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6172. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6173. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6174. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6175. ) begin
  6176. if ( floatx80_is_signaling_nan( a )
  6177. or floatx80_is_signaling_nan( b ) ) begin
  6178. float_raise( float_flag_invalid );
  6179. end;
  6180. result := 0;
  6181. end;
  6182. aSign := extractFloatx80Sign( a );
  6183. bSign := extractFloatx80Sign( b );
  6184. if ( aSign <> bSign ) begin
  6185. result :=
  6186. aSign
  6187. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6188. <> 0 );
  6189. end;
  6190. result :=
  6191. aSign ? lt128( b.high, b.low, a.high, a.low )
  6192. : lt128( a.high, a.low, b.high, b.low );
  6193. end;
  6194. {$endif FPC_SOFTFLOAT_FLOATX80}
  6195. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6196. {*----------------------------------------------------------------------------
  6197. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6198. | floating-point value `a'.
  6199. *----------------------------------------------------------------------------*}
  6200. function extractFloat128Frac1(a : float128): bits64;
  6201. begin
  6202. result:=a.low;
  6203. end;
  6204. {*----------------------------------------------------------------------------
  6205. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6206. | floating-point value `a'.
  6207. *----------------------------------------------------------------------------*}
  6208. function extractFloat128Frac0(a : float128): bits64;
  6209. begin
  6210. result:=a.high and int64($0000FFFFFFFFFFFF);
  6211. end;
  6212. {*----------------------------------------------------------------------------
  6213. | Returns the exponent bits of the quadruple-precision floating-point value
  6214. | `a'.
  6215. *----------------------------------------------------------------------------*}
  6216. function extractFloat128Exp(a : float128): int32;
  6217. begin
  6218. result:=( a.high shr 48 ) and $7FFF;
  6219. end;
  6220. {*----------------------------------------------------------------------------
  6221. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6222. *----------------------------------------------------------------------------*}
  6223. function extractFloat128Sign(a : float128): flag;
  6224. begin
  6225. result:=a.high shr 63;
  6226. end;
  6227. {*----------------------------------------------------------------------------
  6228. | Normalizes the subnormal quadruple-precision floating-point value
  6229. | represented by the denormalized significand formed by the concatenation of
  6230. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6231. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6232. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6233. | least significant 64 bits of the normalized significand are stored at the
  6234. | location pointed to by `zSig1Ptr'.
  6235. *----------------------------------------------------------------------------*}
  6236. procedure normalizeFloat128Subnormal(
  6237. aSig0: bits64;
  6238. aSig1: bits64;
  6239. var zExpPtr: int32;
  6240. var zSig0Ptr: bits64;
  6241. var zSig1Ptr: bits64);
  6242. var
  6243. shiftCount: int8;
  6244. begin
  6245. if ( aSig0 = 0 ) then
  6246. begin
  6247. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6248. if ( shiftCount < 0 ) then
  6249. begin
  6250. zSig0Ptr := aSig1 shr ( - shiftCount );
  6251. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6252. end
  6253. else begin
  6254. zSig0Ptr := aSig1 shl shiftCount;
  6255. zSig1Ptr := 0;
  6256. end;
  6257. zExpPtr := - shiftCount - 63;
  6258. end
  6259. else begin
  6260. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6261. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6262. zExpPtr := 1 - shiftCount;
  6263. end;
  6264. end;
  6265. {*----------------------------------------------------------------------------
  6266. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6267. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6268. | floating-point value, returning the result. After being shifted into the
  6269. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6270. | added together to form the most significant 32 bits of the result. This
  6271. | means that any integer portion of `zSig0' will be added into the exponent.
  6272. | Since a properly normalized significand will have an integer portion equal
  6273. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6274. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6275. | significand.
  6276. *----------------------------------------------------------------------------*}
  6277. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6278. var
  6279. z: float128;
  6280. begin
  6281. z.low := zSig1;
  6282. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6283. result:=z;
  6284. end;
  6285. {*----------------------------------------------------------------------------
  6286. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6287. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6288. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6289. | corresponding to the abstract input. Ordinarily, the abstract value is
  6290. | simply rounded and packed into the quadruple-precision format, with the
  6291. | inexact exception raised if the abstract input cannot be represented
  6292. | exactly. However, if the abstract value is too large, the overflow and
  6293. | inexact exceptions are raised and an infinity or maximal finite value is
  6294. | returned. If the abstract value is too small, the input value is rounded to
  6295. | a subnormal number, and the underflow and inexact exceptions are raised if
  6296. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6297. | precision floating-point number.
  6298. | The input significand must be normalized or smaller. If the input
  6299. | significand is not normalized, `zExp' must be 0; in that case, the result
  6300. | returned is a subnormal number, and it must not require rounding. In the
  6301. | usual case that the input significand is normalized, `zExp' must be 1 less
  6302. | than the ``true'' floating-point exponent. The handling of underflow and
  6303. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6304. *----------------------------------------------------------------------------*}
  6305. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6306. var
  6307. roundingMode: int8;
  6308. roundNearestEven, increment, isTiny: flag;
  6309. begin
  6310. roundingMode := float_rounding_mode;
  6311. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6312. increment := ord( sbits64(zSig2) < 0 );
  6313. if ( roundNearestEven=0 ) then
  6314. begin
  6315. if ( roundingMode = float_round_to_zero ) then
  6316. begin
  6317. increment := 0;
  6318. end
  6319. else begin
  6320. if ( zSign<>0 ) then
  6321. begin
  6322. increment := ord( roundingMode = float_round_down ) and zSig2;
  6323. end
  6324. else begin
  6325. increment := ord( roundingMode = float_round_up ) and zSig2;
  6326. end;
  6327. end;
  6328. end;
  6329. if ( $7FFD <= bits32(zExp) ) then
  6330. begin
  6331. if ( ord( $7FFD < zExp )
  6332. or ( ord( zExp = $7FFD )
  6333. and eq128(
  6334. int64( $0001FFFFFFFFFFFF ),
  6335. int64( $FFFFFFFFFFFFFFFF ),
  6336. zSig0,
  6337. zSig1
  6338. )
  6339. and increment
  6340. )
  6341. )<>0 then
  6342. begin
  6343. float_raise( float_flag_overflow or float_flag_inexact );
  6344. if ( ord( roundingMode = float_round_to_zero )
  6345. or ( zSign and ord( roundingMode = float_round_up ) )
  6346. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6347. )<>0 then
  6348. begin
  6349. result :=
  6350. packFloat128(
  6351. zSign,
  6352. $7FFE,
  6353. int64( $0000FFFFFFFFFFFF ),
  6354. int64( $FFFFFFFFFFFFFFFF )
  6355. );
  6356. end;
  6357. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6358. end;
  6359. if ( zExp < 0 ) then
  6360. begin
  6361. isTiny :=
  6362. ord(( float_detect_tininess = float_tininess_before_rounding )
  6363. or ( zExp < -1 )
  6364. or not( increment<>0 )
  6365. or boolean(lt128(
  6366. zSig0,
  6367. zSig1,
  6368. int64( $0001FFFFFFFFFFFF ),
  6369. int64( $FFFFFFFFFFFFFFFF )
  6370. )));
  6371. shift128ExtraRightJamming(
  6372. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6373. zExp := 0;
  6374. if ( isTiny and zSig2 )<>0 then
  6375. float_raise( float_flag_underflow );
  6376. if ( roundNearestEven<>0 ) then
  6377. begin
  6378. increment := ord( sbits64(zSig2) < 0 );
  6379. end
  6380. else begin
  6381. if ( zSign<>0 ) then
  6382. begin
  6383. increment := ord( roundingMode = float_round_down ) and zSig2;
  6384. end
  6385. else begin
  6386. increment := ord( roundingMode = float_round_up ) and zSig2;
  6387. end;
  6388. end;
  6389. end;
  6390. end;
  6391. if ( zSig2<>0 ) then
  6392. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6393. if ( increment<>0 ) then
  6394. begin
  6395. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6396. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6397. end
  6398. else begin
  6399. if ( ( zSig0 or zSig1 ) = 0 ) then
  6400. zExp := 0;
  6401. end;
  6402. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6403. end;
  6404. {*----------------------------------------------------------------------------
  6405. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6406. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6407. | returns the proper quadruple-precision floating-point value corresponding
  6408. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6409. | except that the input significand has fewer bits and does not have to be
  6410. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6411. | point exponent.
  6412. *----------------------------------------------------------------------------*}
  6413. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6414. var
  6415. shiftCount: int8;
  6416. zSig2: bits64;
  6417. begin
  6418. if ( zSig0 = 0 ) then
  6419. begin
  6420. zSig0 := zSig1;
  6421. zSig1 := 0;
  6422. dec(zExp, 64);
  6423. end;
  6424. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6425. if ( 0 <= shiftCount ) then
  6426. begin
  6427. zSig2 := 0;
  6428. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6429. end
  6430. else begin
  6431. shift128ExtraRightJamming(
  6432. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6433. end;
  6434. dec(zExp, shiftCount);
  6435. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6436. end;
  6437. {*----------------------------------------------------------------------------
  6438. | Returns the result of converting the quadruple-precision floating-point
  6439. | value `a' to the 32-bit two's complement integer format. The conversion
  6440. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6441. | Arithmetic---which means in particular that the conversion is rounded
  6442. | according to the current rounding mode. If `a' is a NaN, the largest
  6443. | positive integer is returned. Otherwise, if the conversion overflows, the
  6444. | largest integer with the same sign as `a' is returned.
  6445. *----------------------------------------------------------------------------*}
  6446. function float128_to_int32(a: float128): int32;
  6447. var
  6448. aSign: flag;
  6449. aExp, shiftCount: int32;
  6450. aSig0, aSig1: bits64;
  6451. begin
  6452. aSig1 := extractFloat128Frac1( a );
  6453. aSig0 := extractFloat128Frac0( a );
  6454. aExp := extractFloat128Exp( a );
  6455. aSign := extractFloat128Sign( a );
  6456. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6457. aSign := 0;
  6458. if ( aExp<>0 ) then
  6459. aSig0 := aSig0 or int64( $0001000000000000 );
  6460. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6461. shiftCount := $4028 - aExp;
  6462. if ( 0 < shiftCount ) then
  6463. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6464. result := roundAndPackInt32( aSign, aSig0 );
  6465. end;
  6466. {*----------------------------------------------------------------------------
  6467. | Returns the result of converting the quadruple-precision floating-point
  6468. | value `a' to the 32-bit two's complement integer format. The conversion
  6469. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6470. | Arithmetic, except that the conversion is always rounded toward zero. If
  6471. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6472. | conversion overflows, the largest integer with the same sign as `a' is
  6473. | returned.
  6474. *----------------------------------------------------------------------------*}
  6475. function float128_to_int32_round_to_zero(a: float128): int32;
  6476. var
  6477. aSign: flag;
  6478. aExp, shiftCount: int32;
  6479. aSig0, aSig1, savedASig: bits64;
  6480. z: int32;
  6481. label
  6482. invalid;
  6483. begin
  6484. aSig1 := extractFloat128Frac1( a );
  6485. aSig0 := extractFloat128Frac0( a );
  6486. aExp := extractFloat128Exp( a );
  6487. aSign := extractFloat128Sign( a );
  6488. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6489. if ( $401E < aExp ) then
  6490. begin
  6491. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6492. aSign := 0;
  6493. goto invalid;
  6494. end
  6495. else if ( aExp < $3FFF ) then
  6496. begin
  6497. if ( aExp or aSig0 )<>0 then
  6498. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6499. result := 0;
  6500. exit;
  6501. end;
  6502. aSig0 := aSig0 or int64( $0001000000000000 );
  6503. shiftCount := $402F - aExp;
  6504. savedASig := aSig0;
  6505. aSig0 := aSig0 shr shiftCount;
  6506. z := aSig0;
  6507. if ( aSign )<>0 then
  6508. z := - z;
  6509. if ( ord( z < 0 ) xor aSign )<>0 then
  6510. begin
  6511. invalid:
  6512. float_raise( float_flag_invalid );
  6513. if aSign<>0 then
  6514. result:=$80000000
  6515. else
  6516. result:=$7FFFFFFF;
  6517. exit;
  6518. end;
  6519. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6520. begin
  6521. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6522. end;
  6523. result := z;
  6524. end;
  6525. {*----------------------------------------------------------------------------
  6526. | Returns the result of converting the quadruple-precision floating-point
  6527. | value `a' to the 64-bit two's complement integer format. The conversion
  6528. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6529. | Arithmetic---which means in particular that the conversion is rounded
  6530. | according to the current rounding mode. If `a' is a NaN, the largest
  6531. | positive integer is returned. Otherwise, if the conversion overflows, the
  6532. | largest integer with the same sign as `a' is returned.
  6533. *----------------------------------------------------------------------------*}
  6534. function float128_to_int64(a: float128): int64;
  6535. var
  6536. aSign: flag;
  6537. aExp, shiftCount: int32;
  6538. aSig0, aSig1: bits64;
  6539. begin
  6540. aSig1 := extractFloat128Frac1( a );
  6541. aSig0 := extractFloat128Frac0( a );
  6542. aExp := extractFloat128Exp( a );
  6543. aSign := extractFloat128Sign( a );
  6544. if ( aExp<>0 ) then
  6545. aSig0 := aSig0 or int64( $0001000000000000 );
  6546. shiftCount := $402F - aExp;
  6547. if ( shiftCount <= 0 ) then
  6548. begin
  6549. if ( $403E < aExp ) then
  6550. begin
  6551. float_raise( float_flag_invalid );
  6552. if ( (aSign=0)
  6553. or ( ( aExp = $7FFF )
  6554. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6555. )
  6556. ) then
  6557. begin
  6558. result := int64( $7FFFFFFFFFFFFFFF );
  6559. end;
  6560. result := int64( $8000000000000000 );
  6561. end;
  6562. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6563. end
  6564. else begin
  6565. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6566. end;
  6567. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6568. end;
  6569. {*----------------------------------------------------------------------------
  6570. | Returns the result of converting the quadruple-precision floating-point
  6571. | value `a' to the 64-bit two's complement integer format. The conversion
  6572. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6573. | Arithmetic, except that the conversion is always rounded toward zero.
  6574. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6575. | the conversion overflows, the largest integer with the same sign as `a' is
  6576. | returned.
  6577. *----------------------------------------------------------------------------*}
  6578. function float128_to_int64_round_to_zero(a: float128): int64;
  6579. var
  6580. aSign: flag;
  6581. aExp, shiftCount: int32;
  6582. aSig0, aSig1: bits64;
  6583. z: int64;
  6584. begin
  6585. aSig1 := extractFloat128Frac1( a );
  6586. aSig0 := extractFloat128Frac0( a );
  6587. aExp := extractFloat128Exp( a );
  6588. aSign := extractFloat128Sign( a );
  6589. if ( aExp<>0 ) then
  6590. aSig0 := aSig0 or int64( $0001000000000000 );
  6591. shiftCount := aExp - $402F;
  6592. if ( 0 < shiftCount ) then
  6593. begin
  6594. if ( $403E <= aExp ) then
  6595. begin
  6596. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6597. if ( ( a.high = int64( $C03E000000000000 ) )
  6598. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6599. begin
  6600. if ( aSig1<>0 ) then
  6601. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6602. end
  6603. else begin
  6604. float_raise( float_flag_invalid );
  6605. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6606. begin
  6607. result := int64( $7FFFFFFFFFFFFFFF );
  6608. exit;
  6609. end;
  6610. end;
  6611. result := int64( $8000000000000000 );
  6612. exit;
  6613. end;
  6614. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6615. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6616. begin
  6617. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6618. end;
  6619. end
  6620. else begin
  6621. if ( aExp < $3FFF ) then
  6622. begin
  6623. if ( aExp or aSig0 or aSig1 )<>0 then
  6624. begin
  6625. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6626. end;
  6627. result := 0;
  6628. exit;
  6629. end;
  6630. z := aSig0 shr ( - shiftCount );
  6631. if ( (aSig1<>0)
  6632. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6633. begin
  6634. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6635. end;
  6636. end;
  6637. if ( aSign<>0 ) then
  6638. z := - z;
  6639. result := z;
  6640. end;
  6641. {*----------------------------------------------------------------------------
  6642. | Returns the result of converting the quadruple-precision floating-point
  6643. | value `a' to the single-precision floating-point format. The conversion
  6644. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6645. | Arithmetic.
  6646. *----------------------------------------------------------------------------*}
  6647. function float128_to_float32(a: float128): float32;
  6648. var
  6649. aSign: flag;
  6650. aExp: int32;
  6651. aSig0, aSig1: bits64;
  6652. zSig: bits32;
  6653. begin
  6654. aSig1 := extractFloat128Frac1( a );
  6655. aSig0 := extractFloat128Frac0( a );
  6656. aExp := extractFloat128Exp( a );
  6657. aSign := extractFloat128Sign( a );
  6658. if ( aExp = $7FFF ) then
  6659. begin
  6660. if ( aSig0 or aSig1 )<>0 then
  6661. begin
  6662. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6663. exit;
  6664. end;
  6665. result := packFloat32( aSign, $FF, 0 );
  6666. exit;
  6667. end;
  6668. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6669. shift64RightJamming( aSig0, 18, aSig0 );
  6670. zSig := aSig0;
  6671. if ( aExp or zSig )<>0 then
  6672. begin
  6673. zSig := zSig or $40000000;
  6674. dec(aExp,$3F81);
  6675. end;
  6676. result := roundAndPackFloat32( aSign, aExp, zSig );
  6677. end;
  6678. {*----------------------------------------------------------------------------
  6679. | Returns the result of converting the quadruple-precision floating-point
  6680. | value `a' to the double-precision floating-point format. The conversion
  6681. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6682. | Arithmetic.
  6683. *----------------------------------------------------------------------------*}
  6684. function float128_to_float64(a: float128): float64;
  6685. var
  6686. aSign: flag;
  6687. aExp: int32;
  6688. aSig0, aSig1: bits64;
  6689. begin
  6690. aSig1 := extractFloat128Frac1( a );
  6691. aSig0 := extractFloat128Frac0( a );
  6692. aExp := extractFloat128Exp( a );
  6693. aSign := extractFloat128Sign( a );
  6694. if ( aExp = $7FFF ) then
  6695. begin
  6696. if ( aSig0 or aSig1 )<>0 then
  6697. begin
  6698. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  6699. exit;
  6700. end;
  6701. result:=packFloat64( aSign, $7FF, 0);
  6702. exit;
  6703. end;
  6704. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  6705. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6706. if ( aExp or aSig0 )<>0 then
  6707. begin
  6708. aSig0 := aSig0 or int64( $4000000000000000 );
  6709. dec(aExp,$3C01);
  6710. end;
  6711. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  6712. end;
  6713. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  6714. {*----------------------------------------------------------------------------
  6715. | Returns the result of converting the quadruple-precision floating-point
  6716. | value `a' to the extended double-precision floating-point format. The
  6717. | conversion is performed according to the IEC/IEEE Standard for Binary
  6718. | Floating-Point Arithmetic.
  6719. *----------------------------------------------------------------------------*}
  6720. function float128_to_floatx80(a: float128): floatx80;
  6721. var
  6722. aSign: flag;
  6723. aExp: int32;
  6724. aSig0, aSig1: bits64;
  6725. begin
  6726. aSig1 := extractFloat128Frac1( a );
  6727. aSig0 := extractFloat128Frac0( a );
  6728. aExp := extractFloat128Exp( a );
  6729. aSign := extractFloat128Sign( a );
  6730. if ( aExp = $7FFF ) begin
  6731. if ( aSig0 or aSig1 ) begin
  6732. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  6733. end;
  6734. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  6735. end;
  6736. if ( aExp = 0 ) begin
  6737. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloatx80( aSign, 0, 0 );
  6738. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  6739. end;
  6740. else begin
  6741. aSig0 or= int64( $0001000000000000 );
  6742. end;
  6743. shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
  6744. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  6745. end;
  6746. {$endif FPC_SOFTFLOAT_FLOATX80}
  6747. {*----------------------------------------------------------------------------
  6748. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  6749. | Returns the result as a quadruple-precision floating-point value. The
  6750. | operation is performed according to the IEC/IEEE Standard for Binary
  6751. | Floating-Point Arithmetic.
  6752. *----------------------------------------------------------------------------*}
  6753. function float128_round_to_int(a: float128): float128;
  6754. var
  6755. aSign: flag;
  6756. aExp: int32;
  6757. lastBitMask, roundBitsMask: bits64;
  6758. roundingMode: int8;
  6759. z: float128;
  6760. begin
  6761. aExp := extractFloat128Exp( a );
  6762. if ( $402F <= aExp ) then
  6763. begin
  6764. if ( $406F <= aExp ) then
  6765. begin
  6766. if ( ( aExp = $7FFF )
  6767. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  6768. ) then
  6769. begin
  6770. result := propagateFloat128NaN( a, a );
  6771. exit;
  6772. end;
  6773. result := a;
  6774. exit;
  6775. end;
  6776. lastBitMask := 1;
  6777. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  6778. roundBitsMask := lastBitMask - 1;
  6779. z := a;
  6780. roundingMode := float_rounding_mode;
  6781. if ( roundingMode = float_round_nearest_even ) then
  6782. begin
  6783. if ( lastBitMask )<>0 then
  6784. begin
  6785. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  6786. if ( ( z.low and roundBitsMask ) = 0 ) then
  6787. z.low := z.low and not(lastBitMask);
  6788. end
  6789. else begin
  6790. if ( sbits64(z.low) < 0 ) then
  6791. begin
  6792. inc(z.high);
  6793. if ( bits64( z.low shl 1 ) = 0 ) then
  6794. z.high := z.high and not(1);
  6795. end;
  6796. end;
  6797. end
  6798. else if ( roundingMode <> float_round_to_zero ) then
  6799. begin
  6800. if ( extractFloat128Sign( z )
  6801. xor ord( roundingMode = float_round_up ) )<>0 then
  6802. begin
  6803. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  6804. end;
  6805. end;
  6806. z.low := z.low and not(roundBitsMask);
  6807. end
  6808. else begin
  6809. if ( aExp < $3FFF ) then
  6810. begin
  6811. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  6812. begin
  6813. result := a;
  6814. exit;
  6815. end;
  6816. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6817. aSign := extractFloat128Sign( a );
  6818. case float_rounding_mode of
  6819. float_round_nearest_even:
  6820. if ( ( aExp = $3FFE )
  6821. and ( extractFloat128Frac0( a )
  6822. or extractFloat128Frac1( a ) )
  6823. ) begin
  6824. begin
  6825. result := packFloat128( aSign, $3FFF, 0, 0 );
  6826. exit;
  6827. end;
  6828. end;
  6829. float_round_down:
  6830. begin
  6831. result :=
  6832. aSign ? packFloat128( 1, $3FFF, 0, 0 )
  6833. : packFloat128( 0, 0, 0, 0 );
  6834. end;
  6835. float_round_up:
  6836. begin
  6837. result :=
  6838. aSign ? packFloat128( 1, 0, 0, 0 )
  6839. : packFloat128( 0, $3FFF, 0, 0 );
  6840. exit;
  6841. end;
  6842. end;
  6843. result := packFloat128( aSign, 0, 0, 0 );
  6844. exit;
  6845. end;
  6846. lastBitMask := 1;
  6847. lastBitMask shl = $402F - aExp;
  6848. roundBitsMask := lastBitMask - 1;
  6849. z.low := 0;
  6850. z.high := a.high;
  6851. roundingMode := float_rounding_mode;
  6852. if ( roundingMode = float_round_nearest_even ) begin
  6853. z.high += lastBitMask>>1;
  6854. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) begin
  6855. z.high &= ~ lastBitMask;
  6856. end;
  6857. end;
  6858. else if ( roundingMode <> float_round_to_zero ) begin
  6859. if ( extractFloat128Sign( z )
  6860. xor ( roundingMode = float_round_up ) ) begin
  6861. z.high or= ( a.low <> 0 );
  6862. z.high += roundBitsMask;
  6863. end;
  6864. end;
  6865. z.high &= ~ roundBitsMask;
  6866. end;
  6867. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) begin
  6868. softfloat_exception_flags or= float_flag_inexact;
  6869. end;
  6870. result := z;
  6871. end;
  6872. {*----------------------------------------------------------------------------
  6873. | Returns the result of adding the absolute values of the quadruple-precision
  6874. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  6875. | before being returned. `zSign' is ignored if the result is a NaN.
  6876. | The addition is performed according to the IEC/IEEE Standard for Binary
  6877. | Floating-Point Arithmetic.
  6878. *----------------------------------------------------------------------------*}
  6879. function addFloat128Sigs( float128 a, float128 b, flag zSign ): float128;
  6880. var
  6881. aExp, bExp, zExp: int32;
  6882. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  6883. expDiff: int32;
  6884. begin
  6885. aSig1 := extractFloat128Frac1( a );
  6886. aSig0 := extractFloat128Frac0( a );
  6887. aExp := extractFloat128Exp( a );
  6888. bSig1 := extractFloat128Frac1( b );
  6889. bSig0 := extractFloat128Frac0( b );
  6890. bExp := extractFloat128Exp( b );
  6891. expDiff := aExp - bExp;
  6892. if ( 0 < expDiff ) begin
  6893. if ( aExp = $7FFF ) begin
  6894. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  6895. result := a;
  6896. end;
  6897. if ( bExp = 0 ) begin
  6898. --expDiff;
  6899. end;
  6900. else begin
  6901. bSig0 or= int64( $0001000000000000 );
  6902. end;
  6903. shift128ExtraRightJamming(
  6904. bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
  6905. zExp := aExp;
  6906. end;
  6907. else if ( expDiff < 0 ) begin
  6908. if ( bExp = $7FFF ) begin
  6909. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  6910. result := packFloat128( zSign, $7FFF, 0, 0 );
  6911. end;
  6912. if ( aExp = 0 ) begin
  6913. ++expDiff;
  6914. end;
  6915. else begin
  6916. aSig0 or= int64( $0001000000000000 );
  6917. end;
  6918. shift128ExtraRightJamming(
  6919. aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
  6920. zExp := bExp;
  6921. end;
  6922. else begin
  6923. if ( aExp = $7FFF ) begin
  6924. if ( aSig0 or aSig1 or bSig0 or bSig1 ) begin
  6925. result := propagateFloat128NaN( a, b );
  6926. end;
  6927. result := a;
  6928. end;
  6929. add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  6930. if ( aExp = 0 ) result := packFloat128( zSign, 0, zSig0, zSig1 );
  6931. zSig2 := 0;
  6932. zSig0 or= int64( $0002000000000000 );
  6933. zExp := aExp;
  6934. goto shiftRight1;
  6935. end;
  6936. aSig0 or= int64( $0001000000000000 );
  6937. add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  6938. --zExp;
  6939. if ( zSig0 < int64( $0002000000000000 ) ) goto roundAndPack;
  6940. ++zExp;
  6941. shiftRight1:
  6942. shift128ExtraRightJamming(
  6943. zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  6944. roundAndPack:
  6945. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6946. end;
  6947. {*----------------------------------------------------------------------------
  6948. | Returns the result of subtracting the absolute values of the quadruple-
  6949. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  6950. | difference is negated before being returned. `zSign' is ignored if the
  6951. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6952. | Standard for Binary Floating-Point Arithmetic.
  6953. *----------------------------------------------------------------------------*}
  6954. function subFloat128Sigs( float128 a, float128 b, flag zSign ): float128;
  6955. var
  6956. aExp, bExp, zExp: int32;
  6957. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  6958. expDiff: int32;
  6959. z: float128;
  6960. begin
  6961. aSig1 := extractFloat128Frac1( a );
  6962. aSig0 := extractFloat128Frac0( a );
  6963. aExp := extractFloat128Exp( a );
  6964. bSig1 := extractFloat128Frac1( b );
  6965. bSig0 := extractFloat128Frac0( b );
  6966. bExp := extractFloat128Exp( b );
  6967. expDiff := aExp - bExp;
  6968. shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
  6969. shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
  6970. if ( 0 < expDiff ) goto aExpBigger;
  6971. if ( expDiff < 0 ) goto bExpBigger;
  6972. if ( aExp = $7FFF ) begin
  6973. if ( aSig0 or aSig1 or bSig0 or bSig1 ) begin
  6974. result := propagateFloat128NaN( a, b );
  6975. end;
  6976. float_raise( float_flag_invalid );
  6977. z.low := float128_default_nan_low;
  6978. z.high := float128_default_nan_high;
  6979. result := z;
  6980. end;
  6981. if ( aExp = 0 ) begin
  6982. aExp := 1;
  6983. bExp := 1;
  6984. end;
  6985. if ( bSig0 < aSig0 ) goto aBigger;
  6986. if ( aSig0 < bSig0 ) goto bBigger;
  6987. if ( bSig1 < aSig1 ) goto aBigger;
  6988. if ( aSig1 < bSig1 ) goto bBigger;
  6989. result := packFloat128( float_rounding_mode = float_round_down, 0, 0, 0 );
  6990. bExpBigger:
  6991. if ( bExp = $7FFF ) begin
  6992. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  6993. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  6994. end;
  6995. if ( aExp = 0 ) begin
  6996. ++expDiff;
  6997. end;
  6998. else begin
  6999. aSig0 or= int64( $4000000000000000 );
  7000. end;
  7001. shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  7002. bSig0 or= int64( $4000000000000000 );
  7003. bBigger:
  7004. sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
  7005. zExp := bExp;
  7006. zSign xor = 1;
  7007. goto normalizeRoundAndPack;
  7008. aExpBigger:
  7009. if ( aExp = $7FFF ) begin
  7010. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  7011. result := a;
  7012. end;
  7013. if ( bExp = 0 ) begin
  7014. --expDiff;
  7015. end;
  7016. else begin
  7017. bSig0 or= int64( $4000000000000000 );
  7018. end;
  7019. shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
  7020. aSig0 or= int64( $4000000000000000 );
  7021. aBigger:
  7022. sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  7023. zExp := aExp;
  7024. normalizeRoundAndPack:
  7025. --zExp;
  7026. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7027. end;
  7028. {*----------------------------------------------------------------------------
  7029. | Returns the result of adding the quadruple-precision floating-point values
  7030. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7031. | for Binary Floating-Point Arithmetic.
  7032. *----------------------------------------------------------------------------*}
  7033. function float128_add(a: float128; b: float128): float128;
  7034. var
  7035. aSign, bSign: flag;
  7036. begin
  7037. aSign := extractFloat128Sign( a );
  7038. bSign := extractFloat128Sign( b );
  7039. if ( aSign = bSign ) begin
  7040. result := addFloat128Sigs( a, b, aSign );
  7041. end;
  7042. else begin
  7043. result := subFloat128Sigs( a, b, aSign );
  7044. end;
  7045. end;
  7046. {*----------------------------------------------------------------------------
  7047. | Returns the result of subtracting the quadruple-precision floating-point
  7048. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7049. | Standard for Binary Floating-Point Arithmetic.
  7050. *----------------------------------------------------------------------------*}
  7051. function float128_sub(a: float128; b: float128): float128;
  7052. var
  7053. aSign, bSign: flag;
  7054. begin
  7055. aSign := extractFloat128Sign( a );
  7056. bSign := extractFloat128Sign( b );
  7057. if ( aSign = bSign ) begin
  7058. result := subFloat128Sigs( a, b, aSign );
  7059. end;
  7060. else begin
  7061. result := addFloat128Sigs( a, b, aSign );
  7062. end;
  7063. end;
  7064. {*----------------------------------------------------------------------------
  7065. | Returns the result of multiplying the quadruple-precision floating-point
  7066. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7067. | Standard for Binary Floating-Point Arithmetic.
  7068. *----------------------------------------------------------------------------*}
  7069. function float128_mul(a: float128; b: float128): float128;
  7070. var
  7071. aSign, bSign, zSign: flag;
  7072. aExp, bExp, zExp: int32;
  7073. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7074. z: float128;
  7075. begin
  7076. aSig1 := extractFloat128Frac1( a );
  7077. aSig0 := extractFloat128Frac0( a );
  7078. aExp := extractFloat128Exp( a );
  7079. aSign := extractFloat128Sign( a );
  7080. bSig1 := extractFloat128Frac1( b );
  7081. bSig0 := extractFloat128Frac0( b );
  7082. bExp := extractFloat128Exp( b );
  7083. bSign := extractFloat128Sign( b );
  7084. zSign := aSign xor bSign;
  7085. if ( aExp = $7FFF ) begin
  7086. if ( ( aSig0 or aSig1 )
  7087. or ( ( bExp = $7FFF ) and ( bSig0 or bSig1 ) ) ) begin
  7088. result := propagateFloat128NaN( a, b );
  7089. end;
  7090. if ( ( bExp or bSig0 or bSig1 ) = 0 ) goto invalid;
  7091. result := packFloat128( zSign, $7FFF, 0, 0 );
  7092. end;
  7093. if ( bExp = $7FFF ) begin
  7094. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7095. if ( ( aExp or aSig0 or aSig1 ) = 0 ) begin
  7096. invalid:
  7097. float_raise( float_flag_invalid );
  7098. z.low := float128_default_nan_low;
  7099. z.high := float128_default_nan_high;
  7100. result := z;
  7101. end;
  7102. result := packFloat128( zSign, $7FFF, 0, 0 );
  7103. end;
  7104. if ( aExp = 0 ) begin
  7105. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7106. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7107. end;
  7108. if ( bExp = 0 ) begin
  7109. if ( ( bSig0 or bSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7110. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7111. end;
  7112. zExp := aExp + bExp - $4000;
  7113. aSig0 or= int64( $0001000000000000 );
  7114. shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
  7115. mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
  7116. add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
  7117. zSig2 or= ( zSig3 <> 0 );
  7118. if ( int64( $0002000000000000 ) <= zSig0 ) begin
  7119. shift128ExtraRightJamming(
  7120. zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  7121. ++zExp;
  7122. end;
  7123. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7124. end;
  7125. {*----------------------------------------------------------------------------
  7126. | Returns the result of dividing the quadruple-precision floating-point value
  7127. | `a' by the corresponding value `b'. The operation is performed according to
  7128. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7129. *----------------------------------------------------------------------------*}
  7130. function float128_div(a: float128; b: float128): float128;
  7131. var
  7132. aSign, bSign, zSign: flag;
  7133. aExp, bExp, zExp: int32;
  7134. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7135. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7136. z: float128;
  7137. begin
  7138. aSig1 := extractFloat128Frac1( a );
  7139. aSig0 := extractFloat128Frac0( a );
  7140. aExp := extractFloat128Exp( a );
  7141. aSign := extractFloat128Sign( a );
  7142. bSig1 := extractFloat128Frac1( b );
  7143. bSig0 := extractFloat128Frac0( b );
  7144. bExp := extractFloat128Exp( b );
  7145. bSign := extractFloat128Sign( b );
  7146. zSign := aSign xor bSign;
  7147. if ( aExp = $7FFF ) begin
  7148. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  7149. if ( bExp = $7FFF ) begin
  7150. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7151. goto invalid;
  7152. end;
  7153. result := packFloat128( zSign, $7FFF, 0, 0 );
  7154. end;
  7155. if ( bExp = $7FFF ) begin
  7156. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7157. result := packFloat128( zSign, 0, 0, 0 );
  7158. end;
  7159. if ( bExp = 0 ) begin
  7160. if ( ( bSig0 or bSig1 ) = 0 ) begin
  7161. if ( ( aExp or aSig0 or aSig1 ) = 0 ) begin
  7162. invalid:
  7163. float_raise( float_flag_invalid );
  7164. z.low := float128_default_nan_low;
  7165. z.high := float128_default_nan_high;
  7166. result := z;
  7167. end;
  7168. float_raise( float_flag_divbyzero );
  7169. result := packFloat128( zSign, $7FFF, 0, 0 );
  7170. end;
  7171. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7172. end;
  7173. if ( aExp = 0 ) begin
  7174. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7175. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7176. end;
  7177. zExp := aExp - bExp + $3FFD;
  7178. shortShift128Left(
  7179. aSig0 or int64( $0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
  7180. shortShift128Left(
  7181. bSig0 or int64( $0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  7182. if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) begin
  7183. shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
  7184. ++zExp;
  7185. end;
  7186. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7187. mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
  7188. sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
  7189. while ( (sbits64) rem0 < 0 ) begin
  7190. --zSig0;
  7191. add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
  7192. end;
  7193. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7194. if ( ( zSig1 and $3FFF ) <= 4 ) begin
  7195. mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
  7196. sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
  7197. while ( (sbits64) rem1 < 0 ) begin
  7198. --zSig1;
  7199. add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
  7200. end;
  7201. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  7202. end;
  7203. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
  7204. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7205. end;
  7206. {*----------------------------------------------------------------------------
  7207. | Returns the remainder of the quadruple-precision floating-point value `a'
  7208. | with respect to the corresponding value `b'. The operation is performed
  7209. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7210. *----------------------------------------------------------------------------*}
  7211. function float128_rem(a: float128; b: float128): float128;
  7212. var
  7213. aSign, bSign, zSign: flag;
  7214. aExp, bExp, expDiff: int32;
  7215. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7216. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7217. sigMean0: sbits64;
  7218. z: float128;
  7219. begin
  7220. aSig1 := extractFloat128Frac1( a );
  7221. aSig0 := extractFloat128Frac0( a );
  7222. aExp := extractFloat128Exp( a );
  7223. aSign := extractFloat128Sign( a );
  7224. bSig1 := extractFloat128Frac1( b );
  7225. bSig0 := extractFloat128Frac0( b );
  7226. bExp := extractFloat128Exp( b );
  7227. bSign := extractFloat128Sign( b );
  7228. if ( aExp = $7FFF ) begin
  7229. if ( ( aSig0 or aSig1 )
  7230. or ( ( bExp = $7FFF ) and ( bSig0 or bSig1 ) ) ) begin
  7231. result := propagateFloat128NaN( a, b );
  7232. end;
  7233. goto invalid;
  7234. end;
  7235. if ( bExp = $7FFF ) begin
  7236. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7237. result := a;
  7238. end;
  7239. if ( bExp = 0 ) begin
  7240. if ( ( bSig0 or bSig1 ) = 0 ) begin
  7241. invalid:
  7242. float_raise( float_flag_invalid );
  7243. z.low := float128_default_nan_low;
  7244. z.high := float128_default_nan_high;
  7245. result := z;
  7246. end;
  7247. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7248. end;
  7249. if ( aExp = 0 ) begin
  7250. if ( ( aSig0 or aSig1 ) = 0 ) result := a;
  7251. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7252. end;
  7253. expDiff := aExp - bExp;
  7254. if ( expDiff < -1 ) result := a;
  7255. shortShift128Left(
  7256. aSig0 or int64( $0001000000000000 ),
  7257. aSig1,
  7258. 15 - ( expDiff < 0 ),
  7259. &aSig0,
  7260. &aSig1
  7261. );
  7262. shortShift128Left(
  7263. bSig0 or int64( $0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  7264. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7265. if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  7266. expDiff -= 64;
  7267. while ( 0 < expDiff ) begin
  7268. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7269. q := ( 4 < q ) ? q - 4 : 0;
  7270. mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  7271. shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
  7272. shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
  7273. sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
  7274. expDiff -= 61;
  7275. end;
  7276. if ( -64 < expDiff ) begin
  7277. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7278. q := ( 4 < q ) ? q - 4 : 0;
  7279. q >>= - expDiff;
  7280. shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  7281. expDiff += 52;
  7282. if ( expDiff < 0 ) begin
  7283. shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  7284. end;
  7285. else begin
  7286. shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
  7287. end;
  7288. mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  7289. sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
  7290. end;
  7291. else begin
  7292. shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
  7293. shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  7294. end;
  7295. do begin
  7296. alternateASig0 := aSig0;
  7297. alternateASig1 := aSig1;
  7298. ++q;
  7299. sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  7300. end; while ( 0 <= (sbits64) aSig0 );
  7301. add128(
  7302. aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
  7303. if ( ( sigMean0 < 0 )
  7304. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and ( q and 1 ) ) ) begin
  7305. aSig0 := alternateASig0;
  7306. aSig1 := alternateASig1;
  7307. end;
  7308. zSign := ( (sbits64) aSig0 < 0 );
  7309. if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
  7310. result :=
  7311. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7312. end;
  7313. {*----------------------------------------------------------------------------
  7314. | Returns the square root of the quadruple-precision floating-point value `a'.
  7315. | The operation is performed according to the IEC/IEEE Standard for Binary
  7316. | Floating-Point Arithmetic.
  7317. *----------------------------------------------------------------------------*}
  7318. function float128_sqrt(a: float128): float128;
  7319. var
  7320. aSign: flag;
  7321. aExp, zExp: int32;
  7322. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7323. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7324. z: float128;
  7325. label
  7326. invalid;
  7327. begin
  7328. aSig1 := extractFloat128Frac1( a );
  7329. aSig0 := extractFloat128Frac0( a );
  7330. aExp := extractFloat128Exp( a );
  7331. aSign := extractFloat128Sign( a );
  7332. if ( aExp = $7FFF ) begin
  7333. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, a );
  7334. if ( ! aSign ) result := a;
  7335. goto invalid;
  7336. end;
  7337. if ( aSign ) begin
  7338. if ( ( aExp or aSig0 or aSig1 ) = 0 ) result := a;
  7339. invalid:
  7340. float_raise( float_flag_invalid );
  7341. z.low := float128_default_nan_low;
  7342. z.high := float128_default_nan_high;
  7343. result := z;
  7344. end;
  7345. if ( aExp = 0 ) begin
  7346. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( 0, 0, 0, 0 );
  7347. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7348. end;
  7349. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7350. aSig0 := aSig0 or int64( $0001000000000000 );
  7351. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7352. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), &aSig0, &aSig1 );
  7353. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7354. doubleZSig0 := zSig0 shl 1;
  7355. mul64To128( zSig0, zSig0, &term0, &term1 );
  7356. sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  7357. while ( (sbits64) rem0 < 0 ) begin
  7358. --zSig0;
  7359. doubleZSig0 -= 2;
  7360. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, &rem0, &rem1 );
  7361. end;
  7362. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7363. if ( ( zSig1 and $1FFF ) <= 5 ) begin
  7364. if ( zSig1 = 0 ) zSig1 := 1;
  7365. mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  7366. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  7367. mul64To128( zSig1, zSig1, &term2, &term3 );
  7368. sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  7369. while ( (sbits64) rem1 < 0 ) begin
  7370. --zSig1;
  7371. shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  7372. term3 or= 1;
  7373. term2 or= doubleZSig0;
  7374. add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  7375. end;
  7376. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  7377. end;
  7378. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
  7379. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7380. end;
  7381. {*----------------------------------------------------------------------------
  7382. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7383. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7384. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7385. *----------------------------------------------------------------------------*}
  7386. function float128_eq(a: float128; b: float128): flag;
  7387. begin
  7388. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7389. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7390. or ( ( extractFloat128Exp( b ) = $7FFF )
  7391. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7392. ) begin
  7393. if ( float128_is_signaling_nan( a )
  7394. or float128_is_signaling_nan( b ) ) begin
  7395. float_raise( float_flag_invalid );
  7396. end;
  7397. result := 0;
  7398. end;
  7399. result :=
  7400. ( a.low = b.low )
  7401. and ( ( a.high = b.high )
  7402. or ( ( a.low = 0 )
  7403. and ( (bits64) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7404. );
  7405. end;
  7406. {*----------------------------------------------------------------------------
  7407. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7408. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7409. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7410. | Arithmetic.
  7411. *----------------------------------------------------------------------------*}
  7412. function float128_le(a: float128; b: float128): flag;
  7413. var
  7414. aSign, bSign: flag;
  7415. begin
  7416. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7417. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7418. or ( ( extractFloat128Exp( b ) = $7FFF )
  7419. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7420. ) begin
  7421. float_raise( float_flag_invalid );
  7422. result := 0;
  7423. end;
  7424. aSign := extractFloat128Sign( a );
  7425. bSign := extractFloat128Sign( b );
  7426. if ( aSign <> bSign ) begin
  7427. result :=
  7428. aSign
  7429. or ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7430. = 0 );
  7431. end;
  7432. result :=
  7433. aSign ? le128( b.high, b.low, a.high, a.low )
  7434. : le128( a.high, a.low, b.high, b.low );
  7435. end;
  7436. {*----------------------------------------------------------------------------
  7437. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7438. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7439. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7440. *----------------------------------------------------------------------------*}
  7441. function float128_lt(a: float128; b: float128): flag;
  7442. var
  7443. aSign, bSign: flag;
  7444. begin
  7445. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7446. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7447. or ( ( extractFloat128Exp( b ) = $7FFF )
  7448. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7449. ) begin
  7450. float_raise( float_flag_invalid );
  7451. result := 0;
  7452. end;
  7453. aSign := extractFloat128Sign( a );
  7454. bSign := extractFloat128Sign( b );
  7455. if ( aSign <> bSign ) begin
  7456. result :=
  7457. aSign
  7458. and ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7459. <> 0 );
  7460. end;
  7461. result :=
  7462. aSign ? lt128( b.high, b.low, a.high, a.low )
  7463. : lt128( a.high, a.low, b.high, b.low );
  7464. end;
  7465. {*----------------------------------------------------------------------------
  7466. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7467. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7468. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7469. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7470. *----------------------------------------------------------------------------*}
  7471. function float128_eq_signaling(a: float128; b: float128): flag;
  7472. begin
  7473. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7474. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7475. or ( ( extractFloat128Exp( b ) = $7FFF )
  7476. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7477. ) begin
  7478. float_raise( float_flag_invalid );
  7479. result := 0;
  7480. end;
  7481. result :=
  7482. ( a.low = b.low )
  7483. and ( ( a.high = b.high )
  7484. or ( ( a.low = 0 )
  7485. and ( (bits64) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7486. );
  7487. end;
  7488. {*----------------------------------------------------------------------------
  7489. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7490. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7491. | cause an exception. Otherwise, the comparison is performed according to the
  7492. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7493. *----------------------------------------------------------------------------*}
  7494. function float128_le_quiet(a: float128; b: float128): flag;
  7495. var
  7496. aSign, bSign: flag;
  7497. begin
  7498. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7499. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7500. or ( ( extractFloat128Exp( b ) = $7FFF )
  7501. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7502. ) begin
  7503. if ( float128_is_signaling_nan( a )
  7504. or float128_is_signaling_nan( b ) ) begin
  7505. float_raise( float_flag_invalid );
  7506. end;
  7507. result := 0;
  7508. end;
  7509. aSign := extractFloat128Sign( a );
  7510. bSign := extractFloat128Sign( b );
  7511. if ( aSign <> bSign ) begin
  7512. result :=
  7513. aSign
  7514. or ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7515. = 0 );
  7516. end;
  7517. result :=
  7518. aSign ? le128( b.high, b.low, a.high, a.low )
  7519. : le128( a.high, a.low, b.high, b.low );
  7520. end;
  7521. {*----------------------------------------------------------------------------
  7522. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7523. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7524. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7525. | Standard for Binary Floating-Point Arithmetic.
  7526. *----------------------------------------------------------------------------*}
  7527. function float128_lt_quiet(a: float128; b: float128): flag;
  7528. var
  7529. aSign, bSign: flag;
  7530. begin
  7531. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7532. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7533. or ( ( extractFloat128Exp( b ) = $7FFF )
  7534. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7535. ) begin
  7536. if ( float128_is_signaling_nan( a )
  7537. or float128_is_signaling_nan( b ) ) begin
  7538. float_raise( float_flag_invalid );
  7539. end;
  7540. result := 0;
  7541. end;
  7542. aSign := extractFloat128Sign( a );
  7543. bSign := extractFloat128Sign( b );
  7544. if ( aSign <> bSign ) begin
  7545. result :=
  7546. aSign
  7547. and ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7548. <> 0 );
  7549. end;
  7550. result :=
  7551. aSign ? lt128( b.high, b.low, a.high, a.low )
  7552. : lt128( a.high, a.low, b.high, b.low );
  7553. end;
  7554. {$endif FPC_SOFTFLOAT_FLOAT128}
  7555. {$endif not(defined(fpc_softfpu_interface))}
  7556. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  7557. end.
  7558. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}