softfpu.pp 278 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. { we use here a record in the function header because
  79. the record allows bitwise conversion to single }
  80. float32rec = record
  81. float32 : float32;
  82. end;
  83. flag = byte;
  84. uint8 = byte;
  85. int8 = shortint;
  86. uint16 = word;
  87. int16 = smallint;
  88. uint32 = longword;
  89. int32 = longint;
  90. bits8 = byte;
  91. sbits8 = shortint;
  92. bits16 = word;
  93. sbits16 = smallint;
  94. sbits32 = longint;
  95. bits32 = longword;
  96. {$ifndef fpc}
  97. qword = int64;
  98. {$endif}
  99. { now part of the system unit
  100. uint64 = qword;
  101. }
  102. bits64 = qword;
  103. sbits64 = int64;
  104. {$ifdef ENDIAN_LITTLE}
  105. float64 = packed record
  106. low: bits32;
  107. high: bits32;
  108. end;
  109. int64rec = packed record
  110. low: bits32;
  111. high: bits32;
  112. end;
  113. floatx80 = packed record
  114. low : qword;
  115. high : word;
  116. end;
  117. float128 = packed record
  118. low : qword;
  119. high : qword;
  120. end;
  121. {$else}
  122. float64 = packed record
  123. high,low : bits32;
  124. end;
  125. int64rec = packed record
  126. high,low : bits32;
  127. end;
  128. floatx80 = packed record
  129. high : word;
  130. low : qword;
  131. end;
  132. float128 = packed record
  133. high : qword;
  134. low : qword;
  135. end;
  136. {$endif}
  137. {*
  138. -------------------------------------------------------------------------------
  139. Returns 1 if the double-precision floating-point value `a' is less than
  140. the corresponding value `b', and 0 otherwise. The comparison is performed
  141. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  142. -------------------------------------------------------------------------------
  143. *}
  144. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  145. {*
  146. -------------------------------------------------------------------------------
  147. Returns 1 if the double-precision floating-point value `a' is less than
  148. or equal to the corresponding value `b', and 0 otherwise. The comparison
  149. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  150. Arithmetic.
  151. -------------------------------------------------------------------------------
  152. *}
  153. Function float64_le(a: float64;b: float64): flag; compilerproc;
  154. {*
  155. -------------------------------------------------------------------------------
  156. Returns 1 if the double-precision floating-point value `a' is equal to
  157. the corresponding value `b', and 0 otherwise. The comparison is performed
  158. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  159. -------------------------------------------------------------------------------
  160. *}
  161. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  162. {*
  163. -------------------------------------------------------------------------------
  164. Returns the square root of the double-precision floating-point value `a'.
  165. The operation is performed according to the IEC/IEEE Standard for Binary
  166. Floating-Point Arithmetic.
  167. -------------------------------------------------------------------------------
  168. *}
  169. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  170. {*
  171. -------------------------------------------------------------------------------
  172. Returns the remainder of the double-precision floating-point value `a'
  173. with respect to the corresponding value `b'. The operation is performed
  174. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  175. -------------------------------------------------------------------------------
  176. *}
  177. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  178. {*
  179. -------------------------------------------------------------------------------
  180. Returns the result of dividing the double-precision floating-point value `a'
  181. by the corresponding value `b'. The operation is performed according to the
  182. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  183. -------------------------------------------------------------------------------
  184. *}
  185. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  186. {*
  187. -------------------------------------------------------------------------------
  188. Returns the result of multiplying the double-precision floating-point values
  189. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  190. for Binary Floating-Point Arithmetic.
  191. -------------------------------------------------------------------------------
  192. *}
  193. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  194. {*
  195. -------------------------------------------------------------------------------
  196. Returns the result of subtracting the double-precision floating-point values
  197. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  198. for Binary Floating-Point Arithmetic.
  199. -------------------------------------------------------------------------------
  200. *}
  201. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  202. {*
  203. -------------------------------------------------------------------------------
  204. Returns the result of adding the double-precision floating-point values `a'
  205. and `b'. The operation is performed according to the IEC/IEEE Standard for
  206. Binary Floating-Point Arithmetic.
  207. -------------------------------------------------------------------------------
  208. *}
  209. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  210. {*
  211. -------------------------------------------------------------------------------
  212. Rounds the double-precision floating-point value `a' to an integer,
  213. and returns the result as a double-precision floating-point value. The
  214. operation is performed according to the IEC/IEEE Standard for Binary
  215. Floating-Point Arithmetic.
  216. -------------------------------------------------------------------------------
  217. *}
  218. Function float64_round_to_int(a: float64) : float64; compilerproc;
  219. {*
  220. -------------------------------------------------------------------------------
  221. Returns the result of converting the double-precision floating-point value
  222. `a' to the single-precision floating-point format. The conversion is
  223. performed according to the IEC/IEEE Standard for Binary Floating-Point
  224. Arithmetic.
  225. -------------------------------------------------------------------------------
  226. *}
  227. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  228. {*
  229. -------------------------------------------------------------------------------
  230. Returns the result of converting the double-precision floating-point value
  231. `a' to the 32-bit two's complement integer format. The conversion is
  232. performed according to the IEC/IEEE Standard for Binary Floating-Point
  233. Arithmetic, except that the conversion is always rounded toward zero.
  234. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  235. the conversion overflows, the largest integer with the same sign as `a' is
  236. returned.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the 32-bit two's complement integer format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic---which means in particular that the conversion is rounded
  246. according to the current rounding mode. If `a' is a NaN, the largest
  247. positive integer is returned. Otherwise, if the conversion overflows, the
  248. largest integer with the same sign as `a' is returned.
  249. -------------------------------------------------------------------------------
  250. *}
  251. Function float64_to_int32(a: float64): int32; compilerproc;
  252. {*
  253. -------------------------------------------------------------------------------
  254. Returns 1 if the single-precision floating-point value `a' is less than
  255. the corresponding value `b', and 0 otherwise. The comparison is performed
  256. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  257. -------------------------------------------------------------------------------
  258. *}
  259. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  260. {*
  261. -------------------------------------------------------------------------------
  262. Returns 1 if the single-precision floating-point value `a' is less than
  263. or equal to the corresponding value `b', and 0 otherwise. The comparison
  264. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  265. Arithmetic.
  266. -------------------------------------------------------------------------------
  267. *}
  268. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  269. {*
  270. -------------------------------------------------------------------------------
  271. Returns 1 if the single-precision floating-point value `a' is equal to
  272. the corresponding value `b', and 0 otherwise. The comparison is performed
  273. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  274. -------------------------------------------------------------------------------
  275. *}
  276. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  277. {*
  278. -------------------------------------------------------------------------------
  279. Returns the square root of the single-precision floating-point value `a'.
  280. The operation is performed according to the IEC/IEEE Standard for Binary
  281. Floating-Point Arithmetic.
  282. -------------------------------------------------------------------------------
  283. *}
  284. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  285. {*
  286. -------------------------------------------------------------------------------
  287. Returns the remainder of the single-precision floating-point value `a'
  288. with respect to the corresponding value `b'. The operation is performed
  289. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  290. -------------------------------------------------------------------------------
  291. *}
  292. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  293. {*
  294. -------------------------------------------------------------------------------
  295. Returns the result of dividing the single-precision floating-point value `a'
  296. by the corresponding value `b'. The operation is performed according to the
  297. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  298. -------------------------------------------------------------------------------
  299. *}
  300. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  301. {*
  302. -------------------------------------------------------------------------------
  303. Returns the result of multiplying the single-precision floating-point values
  304. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  305. for Binary Floating-Point Arithmetic.
  306. -------------------------------------------------------------------------------
  307. *}
  308. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  309. {*
  310. -------------------------------------------------------------------------------
  311. Returns the result of subtracting the single-precision floating-point values
  312. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  313. for Binary Floating-Point Arithmetic.
  314. -------------------------------------------------------------------------------
  315. *}
  316. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  317. {*
  318. -------------------------------------------------------------------------------
  319. Returns the result of adding the single-precision floating-point values `a'
  320. and `b'. The operation is performed according to the IEC/IEEE Standard for
  321. Binary Floating-Point Arithmetic.
  322. -------------------------------------------------------------------------------
  323. *}
  324. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  325. {*
  326. -------------------------------------------------------------------------------
  327. Rounds the single-precision floating-point value `a' to an integer,
  328. and returns the result as a single-precision floating-point value. The
  329. operation is performed according to the IEC/IEEE Standard for Binary
  330. Floating-Point Arithmetic.
  331. -------------------------------------------------------------------------------
  332. *}
  333. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  334. {*
  335. -------------------------------------------------------------------------------
  336. Returns the result of converting the single-precision floating-point value
  337. `a' to the double-precision floating-point format. The conversion is
  338. performed according to the IEC/IEEE Standard for Binary Floating-Point
  339. Arithmetic.
  340. -------------------------------------------------------------------------------
  341. *}
  342. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  343. {*
  344. -------------------------------------------------------------------------------
  345. Returns the result of converting the single-precision floating-point value
  346. `a' to the 32-bit two's complement integer format. The conversion is
  347. performed according to the IEC/IEEE Standard for Binary Floating-Point
  348. Arithmetic, except that the conversion is always rounded toward zero.
  349. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  350. the conversion overflows, the largest integer with the same sign as `a' is
  351. returned.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the 32-bit two's complement integer format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic---which means in particular that the conversion is rounded
  361. according to the current rounding mode. If `a' is a NaN, the largest
  362. positive integer is returned. Otherwise, if the conversion overflows, the
  363. largest integer with the same sign as `a' is returned.
  364. -------------------------------------------------------------------------------
  365. *}
  366. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  367. {*
  368. -------------------------------------------------------------------------------
  369. Returns the result of converting the 32-bit two's complement integer `a' to
  370. the double-precision floating-point format. The conversion is performed
  371. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  372. -------------------------------------------------------------------------------
  373. *}
  374. Function int32_to_float64( a: int32) : float64; compilerproc;
  375. {*
  376. -------------------------------------------------------------------------------
  377. Returns the result of converting the 32-bit two's complement integer `a' to
  378. the single-precision floating-point format. The conversion is performed
  379. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  380. -------------------------------------------------------------------------------
  381. *}
  382. Function int32_to_float32( a: int32): float32rec; compilerproc;
  383. {*----------------------------------------------------------------------------
  384. | Returns the result of converting the 64-bit two's complement integer `a'
  385. | to the double-precision floating-point format. The conversion is performed
  386. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. *----------------------------------------------------------------------------*}
  388. Function int64_to_float64( a: int64 ): float64; compilerproc;
  389. {*----------------------------------------------------------------------------
  390. | Returns the result of converting the 64-bit two's complement integer `a'
  391. | to the single-precision floating-point format. The conversion is performed
  392. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. *----------------------------------------------------------------------------*}
  394. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  395. CONST
  396. {-------------------------------------------------------------------------------
  397. Software IEC/IEEE floating-point underflow tininess-detection mode.
  398. -------------------------------------------------------------------------------
  399. *}
  400. float_tininess_after_rounding = 0;
  401. float_tininess_before_rounding = 1;
  402. {*
  403. -------------------------------------------------------------------------------
  404. Software IEC/IEEE floating-point rounding mode.
  405. -------------------------------------------------------------------------------
  406. *}
  407. {
  408. Round to nearest.
  409. This is the default mode. It should be used unless there is a specific
  410. need for one of the others. In this mode results are rounded to the
  411. nearest representable value. If the result is midway between two
  412. representable values, the even representable is chosen. Even here
  413. means the lowest-order bit is zero. This rounding mode prevents
  414. statistical bias and guarantees numeric stability: round-off errors
  415. in a lengthy calculation will remain smaller than half of FLT_EPSILON.
  416. Round toward plus Infinity.
  417. All results are rounded to the smallest representable value which is
  418. greater than the result.
  419. Round toward minus Infinity.
  420. All results are rounded to the largest representable value which is
  421. less than the result.
  422. Round toward zero.
  423. All results are rounded to the largest representable value whose
  424. magnitude is less than that of the result. In other words, if the
  425. result is negative it is rounded up; if it is positive, it is
  426. rounded down.
  427. }
  428. float_round_nearest_even = 0;
  429. float_round_down = 1;
  430. float_round_up = 2;
  431. float_round_to_zero = 3;
  432. {*
  433. -------------------------------------------------------------------------------
  434. Floating-point rounding mode and exception flags.
  435. -------------------------------------------------------------------------------
  436. *}
  437. const
  438. float_rounding_mode : Byte = float_round_nearest_even;
  439. {*
  440. -------------------------------------------------------------------------------
  441. Underflow tininess-detection mode, statically initialized to default value.
  442. (The declaration in `softfloat.h' must match the `int8' type here.)
  443. -------------------------------------------------------------------------------
  444. *}
  445. const float_detect_tininess: int8 = float_tininess_after_rounding;
  446. {$endif not(defined(fpc_softfpu_implementation))}
  447. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  448. implementation
  449. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  450. {$if not(defined(fpc_softfpu_interface))}
  451. (*****************************************************************************)
  452. (*----------------------------------------------------------------------------*)
  453. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  454. (* division and square root approximations. (Can be specialized to target if *)
  455. (* desired.) *)
  456. (* ---------------------------------------------------------------------------*)
  457. (*****************************************************************************)
  458. {*----------------------------------------------------------------------------
  459. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  460. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  461. | input. If `zSign' is 1, the input is negated before being converted to an
  462. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  463. | is simply rounded to an integer, with the inexact exception raised if the
  464. | input cannot be represented exactly as an integer. However, if the fixed-
  465. | point input is too large, the invalid exception is raised and the largest
  466. | positive or negative integer is returned.
  467. *----------------------------------------------------------------------------*}
  468. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  469. var
  470. roundingMode: int8;
  471. roundNearestEven: flag;
  472. roundIncrement, roundBits: int8;
  473. z: int32;
  474. begin
  475. roundingMode := float_rounding_mode;
  476. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  477. roundIncrement := $40;
  478. if ( roundNearestEven=0 ) then
  479. begin
  480. if ( roundingMode = float_round_to_zero ) then
  481. begin
  482. roundIncrement := 0;
  483. end
  484. else begin
  485. roundIncrement := $7F;
  486. if ( zSign<>0 ) then
  487. begin
  488. if ( roundingMode = float_round_up ) then
  489. roundIncrement := 0;
  490. end
  491. else begin
  492. if ( roundingMode = float_round_down ) then
  493. roundIncrement := 0;
  494. end;
  495. end;
  496. end;
  497. roundBits := absZ and $7F;
  498. absZ := ( absZ + roundIncrement ) shr 7;
  499. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  500. z := absZ;
  501. if ( zSign<>0 ) then
  502. z := - z;
  503. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  504. begin
  505. float_raise( float_flag_invalid );
  506. if zSign<>0 then
  507. result:=sbits32($80000000)
  508. else
  509. result:=$7FFFFFFF;
  510. exit;
  511. end;
  512. if ( roundBits<>0 ) then
  513. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  514. result:=z;
  515. end;
  516. {*----------------------------------------------------------------------------
  517. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  518. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  519. | and returns the properly rounded 64-bit integer corresponding to the input.
  520. | If `zSign' is 1, the input is negated before being converted to an integer.
  521. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  522. | the inexact exception raised if the input cannot be represented exactly as
  523. | an integer. However, if the fixed-point input is too large, the invalid
  524. | exception is raised and the largest positive or negative integer is
  525. | returned.
  526. *----------------------------------------------------------------------------*}
  527. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  528. var
  529. roundingMode: int8;
  530. roundNearestEven, increment: flag;
  531. z: int64;
  532. label
  533. overflow;
  534. begin
  535. roundingMode := float_rounding_mode;
  536. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  537. increment := ord( sbits64(absZ1) < 0 );
  538. if ( roundNearestEven=0 ) then
  539. begin
  540. if ( roundingMode = float_round_to_zero ) then
  541. begin
  542. increment := 0;
  543. end
  544. else begin
  545. if ( zSign<>0 ) then
  546. begin
  547. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  548. end
  549. else begin
  550. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  551. end;
  552. end;
  553. end;
  554. if ( increment<>0 ) then
  555. begin
  556. inc(absZ0);
  557. if ( absZ0 = 0 ) then
  558. goto overflow;
  559. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  560. end;
  561. z := absZ0;
  562. if ( zSign<>0 ) then
  563. z := - z;
  564. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  565. begin
  566. overflow:
  567. float_raise( float_flag_invalid );
  568. if zSign<>0 then
  569. result:=int64($8000000000000000)
  570. else
  571. result:=int64($7FFFFFFFFFFFFFFF);
  572. end;
  573. if ( absZ1<>0 ) then
  574. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  575. result:=z;
  576. end;
  577. {*
  578. -------------------------------------------------------------------------------
  579. Shifts `a' right by the number of bits given in `count'. If any nonzero
  580. bits are shifted off, they are ``jammed'' into the least significant bit of
  581. the result by setting the least significant bit to 1. The value of `count'
  582. can be arbitrarily large; in particular, if `count' is greater than 32, the
  583. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  584. The result is stored in the location pointed to by `zPtr'.
  585. -------------------------------------------------------------------------------
  586. *}
  587. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  588. var
  589. z: Bits32;
  590. Begin
  591. if ( count = 0 ) then
  592. z := a
  593. else
  594. if ( count < 32 ) then
  595. Begin
  596. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  597. End
  598. else
  599. Begin
  600. z := bits32( a <> 0 );
  601. End;
  602. zPtr := z;
  603. End;
  604. {*----------------------------------------------------------------------------
  605. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  606. | number of bits given in `count'. Any bits shifted off are lost. The value
  607. | of `count' can be arbitrarily large; in particular, if `count' is greater
  608. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  609. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  610. *----------------------------------------------------------------------------*}
  611. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  612. var
  613. z0, z1: bits64;
  614. negCount: int8;
  615. begin
  616. negCount := ( - count ) and 63;
  617. if ( count = 0 ) then
  618. begin
  619. z1 := a1;
  620. z0 := a0;
  621. end
  622. else if ( count < 64 ) then
  623. begin
  624. z1 := ( a0 shl negCount ) or ( a1 shr count );
  625. z0 := a0 shr count;
  626. end
  627. else
  628. begin
  629. if ( count shl 64 )<>0 then
  630. z1 := a0 shr ( count and 63 )
  631. else
  632. z1 := 0;
  633. z0 := 0;
  634. end;
  635. z1Ptr := z1;
  636. z0Ptr := z0;
  637. end;
  638. {*
  639. -------------------------------------------------------------------------------
  640. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  641. number of bits given in `count'. Any bits shifted off are lost. The value
  642. of `count' can be arbitrarily large; in particular, if `count' is greater
  643. than 64, the result will be 0. The result is broken into two 32-bit pieces
  644. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  645. -------------------------------------------------------------------------------
  646. *}
  647. Procedure
  648. shift64Right(
  649. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  650. Var
  651. z0, z1: bits32;
  652. negCount : int8;
  653. Begin
  654. negCount := ( - count ) AND 31;
  655. if ( count = 0 ) then
  656. Begin
  657. z1 := a1;
  658. z0 := a0;
  659. End
  660. else if ( count < 32 ) then
  661. Begin
  662. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  663. z0 := a0 shr count;
  664. End
  665. else
  666. Begin
  667. if (count < 64) then
  668. z1 := ( a0 shr ( count AND 31 ) )
  669. else
  670. z1 := 0;
  671. z0 := 0;
  672. End;
  673. z1Ptr := z1;
  674. z0Ptr := z0;
  675. End;
  676. {*
  677. -------------------------------------------------------------------------------
  678. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  679. number of bits given in `count'. If any nonzero bits are shifted off, they
  680. are ``jammed'' into the least significant bit of the result by setting the
  681. least significant bit to 1. The value of `count' can be arbitrarily large;
  682. in particular, if `count' is greater than 64, the result will be either 0
  683. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  684. nonzero. The result is broken into two 32-bit pieces which are stored at
  685. the locations pointed to by `z0Ptr' and `z1Ptr'.
  686. -------------------------------------------------------------------------------
  687. *}
  688. Procedure
  689. shift64RightJamming(
  690. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  691. VAR
  692. z0, z1 : bits32;
  693. negCount : int8;
  694. Begin
  695. negCount := ( - count ) AND 31;
  696. if ( count = 0 ) then
  697. Begin
  698. z1 := a1;
  699. z0 := a0;
  700. End
  701. else
  702. if ( count < 32 ) then
  703. Begin
  704. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  705. z0 := a0 shr count;
  706. End
  707. else
  708. Begin
  709. if ( count = 32 ) then
  710. Begin
  711. z1 := a0 OR bits32( a1 <> 0 );
  712. End
  713. else
  714. if ( count < 64 ) Then
  715. Begin
  716. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  717. End
  718. else
  719. Begin
  720. z1 := bits32( ( a0 OR a1 ) <> 0 );
  721. End;
  722. z0 := 0;
  723. End;
  724. z1Ptr := z1;
  725. z0Ptr := z0;
  726. End;
  727. {*----------------------------------------------------------------------------
  728. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  729. | bits are shifted off, they are ``jammed'' into the least significant bit of
  730. | the result by setting the least significant bit to 1. The value of `count'
  731. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  732. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  733. | The result is stored in the location pointed to by `zPtr'.
  734. *----------------------------------------------------------------------------*}
  735. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  736. var
  737. z: bits64;
  738. begin
  739. if ( count = 0 ) then
  740. begin
  741. z := a;
  742. end
  743. else if ( count < 64 ) then
  744. begin
  745. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  746. end
  747. else
  748. begin
  749. z := ord( a <> 0 );
  750. end;
  751. zPtr := z;
  752. end;
  753. {*
  754. -------------------------------------------------------------------------------
  755. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  756. by 32 _plus_ the number of bits given in `count'. The shifted result is
  757. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  758. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  759. off form a third 32-bit result as follows: The _last_ bit shifted off is
  760. the most-significant bit of the extra result, and the other 31 bits of the
  761. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  762. were all zero. This extra result is stored in the location pointed to by
  763. `z2Ptr'. The value of `count' can be arbitrarily large.
  764. (This routine makes more sense if `a0', `a1', and `a2' are considered
  765. to form a fixed-point value with binary point between `a1' and `a2'. This
  766. fixed-point value is shifted right by the number of bits given in `count',
  767. and the integer part of the result is returned at the locations pointed to
  768. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  769. corrupted as described above, and is returned at the location pointed to by
  770. `z2Ptr'.)
  771. -------------------------------------------------------------------------------
  772. }
  773. Procedure
  774. shift64ExtraRightJamming(
  775. a0: bits32;
  776. a1: bits32;
  777. a2: bits32;
  778. count: int16;
  779. VAR z0Ptr: bits32;
  780. VAR z1Ptr: bits32;
  781. VAR z2Ptr: bits32
  782. );
  783. Var
  784. z0, z1, z2: bits32;
  785. negCount : int8;
  786. Begin
  787. negCount := ( - count ) AND 31;
  788. if ( count = 0 ) then
  789. Begin
  790. z2 := a2;
  791. z1 := a1;
  792. z0 := a0;
  793. End
  794. else
  795. Begin
  796. if ( count < 32 ) Then
  797. Begin
  798. z2 := a1 shl negCount;
  799. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  800. z0 := a0 shr count;
  801. End
  802. else
  803. Begin
  804. if ( count = 32 ) then
  805. Begin
  806. z2 := a1;
  807. z1 := a0;
  808. End
  809. else
  810. Begin
  811. a2 := a2 or a1;
  812. if ( count < 64 ) then
  813. Begin
  814. z2 := a0 shl negCount;
  815. z1 := a0 shr ( count AND 31 );
  816. End
  817. else
  818. Begin
  819. if count = 64 then
  820. z2 := a0
  821. else
  822. z2 := bits32(a0 <> 0);
  823. z1 := 0;
  824. End;
  825. End;
  826. z0 := 0;
  827. End;
  828. z2 := z2 or bits32( a2 <> 0 );
  829. End;
  830. z2Ptr := z2;
  831. z1Ptr := z1;
  832. z0Ptr := z0;
  833. End;
  834. {*
  835. -------------------------------------------------------------------------------
  836. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  837. number of bits given in `count'. Any bits shifted off are lost. The value
  838. of `count' must be less than 32. The result is broken into two 32-bit
  839. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  840. -------------------------------------------------------------------------------
  841. *}
  842. Procedure
  843. shortShift64Left(
  844. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  845. Begin
  846. z1Ptr := a1 shl count;
  847. if count = 0 then
  848. z0Ptr := a0
  849. else
  850. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  851. End;
  852. {*
  853. -------------------------------------------------------------------------------
  854. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  855. by the number of bits given in `count'. Any bits shifted off are lost.
  856. The value of `count' must be less than 32. The result is broken into three
  857. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  858. `z1Ptr', and `z2Ptr'.
  859. -------------------------------------------------------------------------------
  860. *}
  861. Procedure
  862. shortShift96Left(
  863. a0: bits32;
  864. a1: bits32;
  865. a2: bits32;
  866. count: int16;
  867. VAR z0Ptr: bits32;
  868. VAR z1Ptr: bits32;
  869. VAR z2Ptr: bits32
  870. );
  871. Var
  872. z0, z1, z2: bits32;
  873. negCount: int8;
  874. Begin
  875. z2 := a2 shl count;
  876. z1 := a1 shl count;
  877. z0 := a0 shl count;
  878. if ( 0 < count ) then
  879. Begin
  880. negCount := ( ( - count ) AND 31 );
  881. z1 := z1 or (a2 shr negCount);
  882. z0 := z0 or (a1 shr negCount);
  883. End;
  884. z2Ptr := z2;
  885. z1Ptr := z1;
  886. z0Ptr := z0;
  887. End;
  888. {*----------------------------------------------------------------------------
  889. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  890. | number of bits given in `count'. Any bits shifted off are lost. The value
  891. | of `count' must be less than 64. The result is broken into two 64-bit
  892. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  893. *----------------------------------------------------------------------------*}
  894. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);inline;
  895. begin
  896. z1Ptr := a1 shl count;
  897. if count=0 then
  898. z0Ptr:=a0
  899. else
  900. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  901. end;
  902. {*
  903. -------------------------------------------------------------------------------
  904. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  905. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  906. any carry out is lost. The result is broken into two 32-bit pieces which
  907. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  908. -------------------------------------------------------------------------------
  909. *}
  910. Procedure
  911. add64(
  912. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  913. Var
  914. z1: bits32;
  915. Begin
  916. z1 := a1 + b1;
  917. z1Ptr := z1;
  918. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  919. End;
  920. {*
  921. -------------------------------------------------------------------------------
  922. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  923. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  924. modulo 2^96, so any carry out is lost. The result is broken into three
  925. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  926. `z1Ptr', and `z2Ptr'.
  927. -------------------------------------------------------------------------------
  928. *}
  929. Procedure
  930. add96(
  931. a0: bits32;
  932. a1: bits32;
  933. a2: bits32;
  934. b0: bits32;
  935. b1: bits32;
  936. b2: bits32;
  937. VAR z0Ptr: bits32;
  938. VAR z1Ptr: bits32;
  939. VAR z2Ptr: bits32
  940. );
  941. var
  942. z0, z1, z2: bits32;
  943. carry0, carry1: int8;
  944. Begin
  945. z2 := a2 + b2;
  946. carry1 := int8( z2 < a2 );
  947. z1 := a1 + b1;
  948. carry0 := int8( z1 < a1 );
  949. z0 := a0 + b0;
  950. z1 := z1 + carry1;
  951. z0 := z0 + bits32( z1 < carry1 );
  952. z0 := z0 + carry0;
  953. z2Ptr := z2;
  954. z1Ptr := z1;
  955. z0Ptr := z0;
  956. End;
  957. {*
  958. -------------------------------------------------------------------------------
  959. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  960. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  961. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  962. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  963. `z1Ptr'.
  964. -------------------------------------------------------------------------------
  965. *}
  966. Procedure
  967. sub64(
  968. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  969. Begin
  970. z1Ptr := a1 - b1;
  971. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  972. End;
  973. {*
  974. -------------------------------------------------------------------------------
  975. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  976. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  977. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  978. into three 32-bit pieces which are stored at the locations pointed to by
  979. `z0Ptr', `z1Ptr', and `z2Ptr'.
  980. -------------------------------------------------------------------------------
  981. *}
  982. Procedure
  983. sub96(
  984. a0:bits32;
  985. a1:bits32;
  986. a2:bits32;
  987. b0:bits32;
  988. b1:bits32;
  989. b2:bits32;
  990. VAR z0Ptr:bits32;
  991. VAR z1Ptr:bits32;
  992. VAR z2Ptr:bits32
  993. );
  994. Var
  995. z0, z1, z2: bits32;
  996. borrow0, borrow1: int8;
  997. Begin
  998. z2 := a2 - b2;
  999. borrow1 := int8( a2 < b2 );
  1000. z1 := a1 - b1;
  1001. borrow0 := int8( a1 < b1 );
  1002. z0 := a0 - b0;
  1003. z0 := z0 - bits32( z1 < borrow1 );
  1004. z1 := z1 - borrow1;
  1005. z0 := z0 -borrow0;
  1006. z2Ptr := z2;
  1007. z1Ptr := z1;
  1008. z0Ptr := z0;
  1009. End;
  1010. {*
  1011. -------------------------------------------------------------------------------
  1012. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1013. into two 32-bit pieces which are stored at the locations pointed to by
  1014. `z0Ptr' and `z1Ptr'.
  1015. -------------------------------------------------------------------------------
  1016. *}
  1017. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1018. :bits32 );
  1019. Var
  1020. aHigh, aLow, bHigh, bLow: bits16;
  1021. z0, zMiddleA, zMiddleB, z1: bits32;
  1022. Begin
  1023. aLow := a and $ffff;
  1024. aHigh := a shr 16;
  1025. bLow := b and $ffff;
  1026. bHigh := b shr 16;
  1027. z1 := ( bits32( aLow) ) * bLow;
  1028. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1029. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1030. z0 := ( bits32 (aHigh) ) * bHigh;
  1031. zMiddleA := zMiddleA + zMiddleB;
  1032. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1033. zMiddleA := zmiddleA shl 16;
  1034. z1 := z1 + zMiddleA;
  1035. z0 := z0 + bits32( z1 < zMiddleA );
  1036. z1Ptr := z1;
  1037. z0Ptr := z0;
  1038. End;
  1039. {*
  1040. -------------------------------------------------------------------------------
  1041. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1042. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1043. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1044. `z2Ptr'.
  1045. -------------------------------------------------------------------------------
  1046. *}
  1047. Procedure
  1048. mul64By32To96(
  1049. a0:bits32;
  1050. a1:bits32;
  1051. b:bits32;
  1052. VAR z0Ptr:bits32;
  1053. VAR z1Ptr:bits32;
  1054. VAR z2Ptr:bits32
  1055. );
  1056. Var
  1057. z0, z1, z2, more1: bits32;
  1058. Begin
  1059. mul32To64( a1, b, z1, z2 );
  1060. mul32To64( a0, b, z0, more1 );
  1061. add64( z0, more1, 0, z1, z0, z1 );
  1062. z2Ptr := z2;
  1063. z1Ptr := z1;
  1064. z0Ptr := z0;
  1065. End;
  1066. {*
  1067. -------------------------------------------------------------------------------
  1068. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1069. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1070. product. The product is broken into four 32-bit pieces which are stored at
  1071. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1072. -------------------------------------------------------------------------------
  1073. *}
  1074. Procedure
  1075. mul64To128(
  1076. a0:bits32;
  1077. a1:bits32;
  1078. b0:bits32;
  1079. b1:bits32;
  1080. VAR z0Ptr:bits32;
  1081. VAR z1Ptr:bits32;
  1082. VAR z2Ptr:bits32;
  1083. VAR z3Ptr:bits32
  1084. );
  1085. Var
  1086. z0, z1, z2, z3: bits32;
  1087. more1, more2: bits32;
  1088. Begin
  1089. mul32To64( a1, b1, z2, z3 );
  1090. mul32To64( a1, b0, z1, more2 );
  1091. add64( z1, more2, 0, z2, z1, z2 );
  1092. mul32To64( a0, b0, z0, more1 );
  1093. add64( z0, more1, 0, z1, z0, z1 );
  1094. mul32To64( a0, b1, more1, more2 );
  1095. add64( more1, more2, 0, z2, more1, z2 );
  1096. add64( z0, z1, 0, more1, z0, z1 );
  1097. z3Ptr := z3;
  1098. z2Ptr := z2;
  1099. z1Ptr := z1;
  1100. z0Ptr := z0;
  1101. End;
  1102. {*
  1103. -------------------------------------------------------------------------------
  1104. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1105. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1106. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1107. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1108. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1109. unsigned integer is returned.
  1110. -------------------------------------------------------------------------------
  1111. *}
  1112. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1113. Var
  1114. b0, b1: bits32;
  1115. rem0, rem1, term0, term1: bits32;
  1116. z: bits32;
  1117. Begin
  1118. if ( b <= a0 ) then
  1119. Begin
  1120. estimateDiv64To32 := $FFFFFFFF;
  1121. exit;
  1122. End;
  1123. b0 := b shr 16;
  1124. if ( b0 shl 16 <= a0 ) then
  1125. z:= $FFFF0000
  1126. else
  1127. z:= ( a0 div b0 ) shl 16;
  1128. mul32To64( b, z, term0, term1 );
  1129. sub64( a0, a1, term0, term1, rem0, rem1 );
  1130. while ( ( sbits32 (rem0) ) < 0 ) do
  1131. Begin
  1132. z := z - $10000;
  1133. b1 := b shl 16;
  1134. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1135. End;
  1136. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1137. if ( b0 shl 16 <= rem0 ) then
  1138. z := z or $FFFF
  1139. else
  1140. z := z or (rem0 div b0);
  1141. estimateDiv64To32 := z;
  1142. End;
  1143. {*
  1144. -------------------------------------------------------------------------------
  1145. Returns an approximation to the square root of the 32-bit significand given
  1146. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1147. `aExp' (the least significant bit) is 1, the integer returned approximates
  1148. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1149. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1150. case, the approximation returned lies strictly within +/-2 of the exact
  1151. value.
  1152. -------------------------------------------------------------------------------
  1153. *}
  1154. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1155. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1156. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1157. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1158. );
  1159. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1160. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1161. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1162. );
  1163. Var
  1164. index: int8;
  1165. z: bits32;
  1166. Begin
  1167. index := ( a shr 27 ) AND 15;
  1168. if ( aExp AND 1 ) <> 0 then
  1169. Begin
  1170. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1171. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1172. a := a shr 1;
  1173. End
  1174. else
  1175. Begin
  1176. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1177. z := a div z + z;
  1178. if ( $20000 <= z ) then
  1179. z := $FFFF8000
  1180. else
  1181. z := ( z shl 15 );
  1182. if ( z <= a ) then
  1183. Begin
  1184. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1185. exit;
  1186. End;
  1187. End;
  1188. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1189. End;
  1190. {*
  1191. -------------------------------------------------------------------------------
  1192. Returns the number of leading 0 bits before the most-significant 1 bit of
  1193. `a'. If `a' is zero, 32 is returned.
  1194. -------------------------------------------------------------------------------
  1195. *}
  1196. Function countLeadingZeros32( a:bits32 ): int8;
  1197. const countLeadingZerosHigh:array[0..255] of int8 = (
  1198. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1199. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1200. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1201. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1202. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1203. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1204. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1205. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1206. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1207. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1208. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1209. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1210. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1211. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1212. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1213. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1214. );
  1215. Var
  1216. shiftCount: int8;
  1217. Begin
  1218. shiftCount := 0;
  1219. if ( a < $10000 ) then
  1220. Begin
  1221. shiftCount := shiftcount + 16;
  1222. a := a shl 16;
  1223. End;
  1224. if ( a < $1000000 ) then
  1225. Begin
  1226. shiftCount := shiftcount + 8;
  1227. a := a shl 8;
  1228. end;
  1229. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1230. countLeadingZeros32:= shiftCount;
  1231. End;
  1232. {*----------------------------------------------------------------------------
  1233. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1234. | `a'. If `a' is zero, 64 is returned.
  1235. *----------------------------------------------------------------------------*}
  1236. function countLeadingZeros64( a : bits64): int8;
  1237. var
  1238. shiftcount : int8;
  1239. Begin
  1240. shiftCount := 0;
  1241. if ( a < (bits64(1) shl 32 )) then
  1242. shiftCount := shiftcount + 32
  1243. else
  1244. a := a shr 32;
  1245. shiftCount := shiftCount + countLeadingZeros32( a );
  1246. countLeadingZeros64:= shiftCount;
  1247. End;
  1248. {*
  1249. -------------------------------------------------------------------------------
  1250. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1251. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1252. returns 0.
  1253. -------------------------------------------------------------------------------
  1254. *}
  1255. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1256. Begin
  1257. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1258. End;
  1259. {*
  1260. -------------------------------------------------------------------------------
  1261. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1262. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1263. Otherwise, returns 0.
  1264. -------------------------------------------------------------------------------
  1265. *}
  1266. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1267. Begin
  1268. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1269. End;
  1270. {*
  1271. -------------------------------------------------------------------------------
  1272. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1273. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1274. returns 0.
  1275. -------------------------------------------------------------------------------
  1276. *}
  1277. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1278. Begin
  1279. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1280. End;
  1281. {*
  1282. -------------------------------------------------------------------------------
  1283. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1284. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1285. returns 0.
  1286. -------------------------------------------------------------------------------
  1287. *}
  1288. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1289. Begin
  1290. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1291. End;
  1292. (*****************************************************************************)
  1293. (* End Low-Level arithmetic *)
  1294. (*****************************************************************************)
  1295. {*
  1296. -------------------------------------------------------------------------------
  1297. Functions and definitions to determine: (1) whether tininess for underflow
  1298. is detected before or after rounding by default, (2) what (if anything)
  1299. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1300. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1301. are propagated from function inputs to output. These details are ENDIAN
  1302. specific
  1303. -------------------------------------------------------------------------------
  1304. *}
  1305. {$IFDEF ENDIAN_LITTLE}
  1306. {*
  1307. -------------------------------------------------------------------------------
  1308. Internal canonical NaN format.
  1309. -------------------------------------------------------------------------------
  1310. *}
  1311. TYPE
  1312. commonNaNT = packed record
  1313. sign: flag;
  1314. high, low : bits32;
  1315. end;
  1316. {*
  1317. -------------------------------------------------------------------------------
  1318. The pattern for a default generated single-precision NaN.
  1319. -------------------------------------------------------------------------------
  1320. *}
  1321. const float32_default_nan = $FFC00000;
  1322. {*
  1323. -------------------------------------------------------------------------------
  1324. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1325. otherwise returns 0.
  1326. -------------------------------------------------------------------------------
  1327. *}
  1328. Function float32_is_nan( a : float32 ): flag;
  1329. Begin
  1330. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1331. End;
  1332. {*
  1333. -------------------------------------------------------------------------------
  1334. Returns 1 if the single-precision floating-point value `a' is a signaling
  1335. NaN; otherwise returns 0.
  1336. -------------------------------------------------------------------------------
  1337. *}
  1338. Function float32_is_signaling_nan( a : float32 ): flag;
  1339. Begin
  1340. float32_is_signaling_nan := flag
  1341. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1342. End;
  1343. {*
  1344. -------------------------------------------------------------------------------
  1345. Returns the result of converting the single-precision floating-point NaN
  1346. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1347. exception is raised.
  1348. -------------------------------------------------------------------------------
  1349. *}
  1350. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1351. var
  1352. z : commonNaNT ;
  1353. Begin
  1354. if ( float32_is_signaling_nan( a ) <> 0) then
  1355. float_raise( float_flag_invalid );
  1356. z.sign := a shr 31;
  1357. z.low := 0;
  1358. z.high := a shl 9;
  1359. c := z;
  1360. End;
  1361. {*
  1362. -------------------------------------------------------------------------------
  1363. Returns the result of converting the canonical NaN `a' to the single-
  1364. precision floating-point format.
  1365. -------------------------------------------------------------------------------
  1366. *}
  1367. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1368. Begin
  1369. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1370. End;
  1371. {*
  1372. -------------------------------------------------------------------------------
  1373. Takes two single-precision floating-point values `a' and `b', one of which
  1374. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1375. signaling NaN, the invalid exception is raised.
  1376. -------------------------------------------------------------------------------
  1377. *}
  1378. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1379. Var
  1380. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1381. label returnLargerSignificand;
  1382. Begin
  1383. aIsNaN := float32_is_nan( a );
  1384. aIsSignalingNaN := float32_is_signaling_nan( a );
  1385. bIsNaN := float32_is_nan( b );
  1386. bIsSignalingNaN := float32_is_signaling_nan( b );
  1387. a := a or $00400000;
  1388. b := b or $00400000;
  1389. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1390. float_raise( float_flag_invalid );
  1391. if ( aIsSignalingNaN )<> 0 then
  1392. Begin
  1393. if ( bIsSignalingNaN ) <> 0 then
  1394. goto returnLargerSignificand;
  1395. if bIsNan <> 0 then
  1396. propagateFloat32NaN := b
  1397. else
  1398. propagateFloat32NaN := a;
  1399. exit;
  1400. End
  1401. else if ( aIsNaN <> 0) then
  1402. Begin
  1403. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1404. Begin
  1405. propagateFloat32NaN := a;
  1406. exit;
  1407. End;
  1408. returnLargerSignificand:
  1409. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1410. Begin
  1411. propagateFloat32NaN := b;
  1412. exit;
  1413. End;
  1414. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1415. Begin
  1416. propagateFloat32NaN := a;
  1417. End;
  1418. if a < b then
  1419. propagateFloat32NaN := a
  1420. else
  1421. propagateFloat32NaN := b;
  1422. exit;
  1423. End
  1424. else
  1425. Begin
  1426. propagateFloat32NaN := b;
  1427. exit;
  1428. End;
  1429. End;
  1430. {*
  1431. -------------------------------------------------------------------------------
  1432. The pattern for a default generated double-precision NaN. The `high' and
  1433. `low' values hold the most- and least-significant bits, respectively.
  1434. -------------------------------------------------------------------------------
  1435. *}
  1436. const
  1437. float64_default_nan_high = $FFF80000;
  1438. float64_default_nan_low = $00000000;
  1439. {*
  1440. -------------------------------------------------------------------------------
  1441. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1442. otherwise returns 0.
  1443. -------------------------------------------------------------------------------
  1444. *}
  1445. Function float64_is_nan( a : float64 ) : flag;
  1446. Begin
  1447. float64_is_nan :=
  1448. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1449. and ( a.low or ( a.high and $000FFFFF ) );
  1450. End;
  1451. {*
  1452. -------------------------------------------------------------------------------
  1453. Returns 1 if the double-precision floating-point value `a' is a signaling
  1454. NaN; otherwise returns 0.
  1455. -------------------------------------------------------------------------------
  1456. *}
  1457. Function float64_is_signaling_nan( a : float64 ): flag;
  1458. Begin
  1459. float64_is_signaling_nan :=
  1460. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1461. and ( a.low or ( a.high and $0007FFFF ) );
  1462. End;
  1463. {*
  1464. -------------------------------------------------------------------------------
  1465. Returns the result of converting the double-precision floating-point NaN
  1466. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1467. exception is raised.
  1468. -------------------------------------------------------------------------------
  1469. *}
  1470. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1471. Var
  1472. z : commonNaNT;
  1473. Begin
  1474. if ( float64_is_signaling_nan( a )<>0 ) then
  1475. float_raise( float_flag_invalid );
  1476. z.sign := a.high shr 31;
  1477. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1478. c := z;
  1479. End;
  1480. {*
  1481. -------------------------------------------------------------------------------
  1482. Returns the result of converting the canonical NaN `a' to the double-
  1483. precision floating-point format.
  1484. -------------------------------------------------------------------------------
  1485. *}
  1486. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1487. Var
  1488. z: float64;
  1489. Begin
  1490. shift64Right( a.high, a.low, 12, z.high, z.low );
  1491. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1492. c := z;
  1493. End;
  1494. {*
  1495. -------------------------------------------------------------------------------
  1496. Takes two double-precision floating-point values `a' and `b', one of which
  1497. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1498. signaling NaN, the invalid exception is raised.
  1499. -------------------------------------------------------------------------------
  1500. *}
  1501. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1502. Var
  1503. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1504. label returnLargerSignificand;
  1505. Begin
  1506. aIsNaN := float64_is_nan( a );
  1507. aIsSignalingNaN := float64_is_signaling_nan( a );
  1508. bIsNaN := float64_is_nan( b );
  1509. bIsSignalingNaN := float64_is_signaling_nan( b );
  1510. a.high := a.high or $00080000;
  1511. b.high := b.high or $00080000;
  1512. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1513. float_raise( float_flag_invalid );
  1514. if ( aIsSignalingNaN )<>0 then
  1515. Begin
  1516. if ( bIsSignalingNaN )<>0 then
  1517. goto returnLargerSignificand;
  1518. if bIsNan <> 0 then
  1519. c := b
  1520. else
  1521. c := a;
  1522. exit;
  1523. End
  1524. else if ( aIsNaN )<> 0 then
  1525. Begin
  1526. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1527. Begin
  1528. c := a;
  1529. exit;
  1530. End;
  1531. returnLargerSignificand:
  1532. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1533. Begin
  1534. c := b;
  1535. exit;
  1536. End;
  1537. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1538. Begin
  1539. c := a;
  1540. exit;
  1541. End;
  1542. if a.high < b.high then
  1543. c := a
  1544. else
  1545. c := b;
  1546. exit;
  1547. End
  1548. else
  1549. Begin
  1550. c := b;
  1551. exit;
  1552. End;
  1553. End;
  1554. {*----------------------------------------------------------------------------
  1555. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1556. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1557. | returns 0.
  1558. *----------------------------------------------------------------------------*}
  1559. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1560. begin
  1561. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1562. end;
  1563. {*----------------------------------------------------------------------------
  1564. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1565. | otherwise returns 0.
  1566. *----------------------------------------------------------------------------*}
  1567. function float128_is_nan( a : float128): flag;
  1568. begin
  1569. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1570. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1571. end;
  1572. {*----------------------------------------------------------------------------
  1573. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1574. | signaling NaN; otherwise returns 0.
  1575. *----------------------------------------------------------------------------*}
  1576. function float128_is_signaling_nan( a : float128): flag;
  1577. begin
  1578. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1579. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1580. end;
  1581. {*----------------------------------------------------------------------------
  1582. | Returns the result of converting the quadruple-precision floating-point NaN
  1583. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1584. | exception is raised.
  1585. *----------------------------------------------------------------------------*}
  1586. function float128ToCommonNaN( a : float128): commonNaNT;
  1587. var
  1588. z: commonNaNT;
  1589. qhigh,qlow : qword;
  1590. begin
  1591. if ( float128_is_signaling_nan( a )<>0) then
  1592. float_raise( float_flag_invalid );
  1593. z.sign := a.high shr 63;
  1594. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1595. z.high:=qhigh shr 32;
  1596. z.low:=qhigh and $ffffffff;
  1597. result:=z;
  1598. end;
  1599. {*----------------------------------------------------------------------------
  1600. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1601. | precision floating-point format.
  1602. *----------------------------------------------------------------------------*}
  1603. function commonNaNToFloat128( a : commonNaNT): float128;
  1604. var
  1605. z: float128;
  1606. begin
  1607. shift128Right( a.high, a.low, 16, z.high, z.low );
  1608. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1609. result:=z;
  1610. end;
  1611. {*----------------------------------------------------------------------------
  1612. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1613. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1614. | `b' is a signaling NaN, the invalid exception is raised.
  1615. *----------------------------------------------------------------------------*}
  1616. function propagateFloat128NaN( a: float128; b : float128): float128;
  1617. var
  1618. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1619. label
  1620. returnLargerSignificand;
  1621. begin
  1622. aIsNaN := float128_is_nan( a );
  1623. aIsSignalingNaN := float128_is_signaling_nan( a );
  1624. bIsNaN := float128_is_nan( b );
  1625. bIsSignalingNaN := float128_is_signaling_nan( b );
  1626. a.high := a.high or int64( $0000800000000000 );
  1627. b.high := b.high or int64( $0000800000000000 );
  1628. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1629. float_raise( float_flag_invalid );
  1630. if ( aIsSignalingNaN )<>0 then
  1631. begin
  1632. if ( bIsSignalingNaN )<>0 then
  1633. goto returnLargerSignificand;
  1634. if bIsNaN<>0 then
  1635. result := b
  1636. else
  1637. result := a;
  1638. exit;
  1639. end
  1640. else if ( aIsNaN )<>0 then
  1641. begin
  1642. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1643. begin
  1644. result := a;
  1645. exit;
  1646. end;
  1647. returnLargerSignificand:
  1648. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1649. begin
  1650. result := b;
  1651. exit;
  1652. end;
  1653. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1654. begin
  1655. result := a;
  1656. exit
  1657. end;
  1658. if ( a.high < b.high ) then
  1659. result := a
  1660. else
  1661. result := b;
  1662. exit;
  1663. end
  1664. else
  1665. result:=b;
  1666. end;
  1667. {$ELSE}
  1668. { Big endian code }
  1669. (*----------------------------------------------------------------------------
  1670. | Internal canonical NaN format.
  1671. *----------------------------------------------------------------------------*)
  1672. type
  1673. commonNANT = packed record
  1674. sign : flag;
  1675. high, low : bits32;
  1676. end;
  1677. (*----------------------------------------------------------------------------
  1678. | The pattern for a default generated single-precision NaN.
  1679. *----------------------------------------------------------------------------*)
  1680. const float32_default_nan = $7FFFFFFF;
  1681. (*----------------------------------------------------------------------------
  1682. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1683. | otherwise returns 0.
  1684. *----------------------------------------------------------------------------*)
  1685. function float32_is_nan(a: float32): flag;
  1686. begin
  1687. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1688. end;
  1689. (*----------------------------------------------------------------------------
  1690. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1691. | NaN; otherwise returns 0.
  1692. *----------------------------------------------------------------------------*)
  1693. function float32_is_signaling_nan(a: float32):flag;
  1694. begin
  1695. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1696. end;
  1697. (*----------------------------------------------------------------------------
  1698. | Returns the result of converting the single-precision floating-point NaN
  1699. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1700. | exception is raised.
  1701. *----------------------------------------------------------------------------*)
  1702. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1703. var
  1704. z: commonNANT;
  1705. begin
  1706. if float32_is_signaling_nan(a)<>0 then
  1707. float_raise(float_flag_invalid);
  1708. z.sign := a shr 31;
  1709. z.low := 0;
  1710. z.high := a shl 9;
  1711. c:=z;
  1712. end;
  1713. (*----------------------------------------------------------------------------
  1714. | Returns the result of converting the canonical NaN `a' to the single-
  1715. | precision floating-point format.
  1716. *----------------------------------------------------------------------------*)
  1717. function CommonNanToFloat32(a : CommonNaNT): float32;
  1718. begin
  1719. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1720. end;
  1721. (*----------------------------------------------------------------------------
  1722. | Takes two single-precision floating-point values `a' and `b', one of which
  1723. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1724. | signaling NaN, the invalid exception is raised.
  1725. *----------------------------------------------------------------------------*)
  1726. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1727. var
  1728. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1729. begin
  1730. aIsNaN := float32_is_nan( a );
  1731. aIsSignalingNaN := float32_is_signaling_nan( a );
  1732. bIsNaN := float32_is_nan( b );
  1733. bIsSignalingNaN := float32_is_signaling_nan( b );
  1734. a := a or $00400000;
  1735. b := b or $00400000;
  1736. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1737. float_raise( float_flag_invalid );
  1738. if bIsSignalingNaN<>0 then
  1739. propagateFloat32Nan := b
  1740. else if aIsSignalingNan<>0 then
  1741. propagateFloat32Nan := a
  1742. else if bIsNan<>0 then
  1743. propagateFloat32Nan := b
  1744. else
  1745. propagateFloat32Nan := a;
  1746. end;
  1747. (*----------------------------------------------------------------------------
  1748. | The pattern for a default generated double-precision NaN. The `high' and
  1749. | `low' values hold the most- and least-significant bits, respectively.
  1750. *----------------------------------------------------------------------------*)
  1751. const
  1752. float64_default_nan_high = $7FFFFFFF;
  1753. float64_default_nan_low = $FFFFFFFF;
  1754. (*----------------------------------------------------------------------------
  1755. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  1756. | otherwise returns 0.
  1757. *----------------------------------------------------------------------------*)
  1758. function float64_is_nan(a: float64): flag;
  1759. begin
  1760. float64_is_nan := flag (
  1761. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1762. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  1763. end;
  1764. (*----------------------------------------------------------------------------
  1765. | Returns 1 if the double-precision floating-point value `a' is a signaling
  1766. | NaN; otherwise returns 0.
  1767. *----------------------------------------------------------------------------*)
  1768. function float64_is_signaling_nan( a:float64): flag;
  1769. begin
  1770. float64_is_signaling_nan := flag(
  1771. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1772. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  1773. end;
  1774. (*----------------------------------------------------------------------------
  1775. | Returns the result of converting the double-precision floating-point NaN
  1776. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1777. | exception is raised.
  1778. *----------------------------------------------------------------------------*)
  1779. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1780. var
  1781. z : commonNaNT;
  1782. begin
  1783. if ( float64_is_signaling_nan( a )<>0 ) then
  1784. float_raise( float_flag_invalid );
  1785. z.sign := a.high shr 31;
  1786. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1787. c:=z;
  1788. end;
  1789. (*----------------------------------------------------------------------------
  1790. | Returns the result of converting the canonical NaN `a' to the double-
  1791. | precision floating-point format.
  1792. *----------------------------------------------------------------------------*)
  1793. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1794. var
  1795. z: float64;
  1796. begin
  1797. shift64Right( a.high, a.low, 12, z.high, z.low );
  1798. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1799. c:=z;
  1800. end;
  1801. (*----------------------------------------------------------------------------
  1802. | Takes two double-precision floating-point values `a' and `b', one of which
  1803. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1804. | signaling NaN, the invalid exception is raised.
  1805. *----------------------------------------------------------------------------*)
  1806. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1807. var
  1808. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  1809. begin
  1810. aIsNaN := float64_is_nan( a );
  1811. aIsSignalingNaN := float64_is_signaling_nan( a );
  1812. bIsNaN := float64_is_nan( b );
  1813. bIsSignalingNaN := float64_is_signaling_nan( b );
  1814. a.high := a.high or $00080000;
  1815. b.high := b.high or $00080000;
  1816. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  1817. float_raise( float_flag_invalid );
  1818. if bIsSignalingNaN<>0 then
  1819. c := b
  1820. else if aIsSignalingNan<>0 then
  1821. c := a
  1822. else if bIsNan<>0 then
  1823. c := b
  1824. else
  1825. c := a;
  1826. end;
  1827. {$ENDIF}
  1828. (****************************************************************************)
  1829. (* END ENDIAN SPECIFIC CODE *)
  1830. (****************************************************************************)
  1831. {*
  1832. -------------------------------------------------------------------------------
  1833. Returns the fraction bits of the single-precision floating-point value `a'.
  1834. -------------------------------------------------------------------------------
  1835. *}
  1836. Function ExtractFloat32Frac(a : Float32) : Bits32;
  1837. Begin
  1838. ExtractFloat32Frac := A AND $007FFFFF;
  1839. End;
  1840. {*
  1841. -------------------------------------------------------------------------------
  1842. Returns the exponent bits of the single-precision floating-point value `a'.
  1843. -------------------------------------------------------------------------------
  1844. *}
  1845. Function extractFloat32Exp( a: float32 ): Int16;
  1846. Begin
  1847. extractFloat32Exp := (a shr 23) AND $FF;
  1848. End;
  1849. {*
  1850. -------------------------------------------------------------------------------
  1851. Returns the sign bit of the single-precision floating-point value `a'.
  1852. -------------------------------------------------------------------------------
  1853. *}
  1854. Function extractFloat32Sign( a: float32 ): Flag;
  1855. Begin
  1856. extractFloat32Sign := a shr 31;
  1857. End;
  1858. {*
  1859. -------------------------------------------------------------------------------
  1860. Normalizes the subnormal single-precision floating-point value represented
  1861. by the denormalized significand `aSig'. The normalized exponent and
  1862. significand are stored at the locations pointed to by `zExpPtr' and
  1863. `zSigPtr', respectively.
  1864. -------------------------------------------------------------------------------
  1865. *}
  1866. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  1867. Var
  1868. ShiftCount : BYTE;
  1869. Begin
  1870. shiftCount := countLeadingZeros32( aSig ) - 8;
  1871. zSigPtr := aSig shl shiftCount;
  1872. zExpPtr := 1 - shiftCount;
  1873. End;
  1874. {*
  1875. -------------------------------------------------------------------------------
  1876. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  1877. single-precision floating-point value, returning the result. After being
  1878. shifted into the proper positions, the three fields are simply added
  1879. together to form the result. This means that any integer portion of `zSig'
  1880. will be added into the exponent. Since a properly normalized significand
  1881. will have an integer portion equal to 1, the `zExp' input should be 1 less
  1882. than the desired result exponent whenever `zSig' is a complete, normalized
  1883. significand.
  1884. -------------------------------------------------------------------------------
  1885. *}
  1886. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  1887. Begin
  1888. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  1889. + zSig;
  1890. End;
  1891. {*
  1892. -------------------------------------------------------------------------------
  1893. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1894. and significand `zSig', and returns the proper single-precision floating-
  1895. point value corresponding to the abstract input. Ordinarily, the abstract
  1896. value is simply rounded and packed into the single-precision format, with
  1897. the inexact exception raised if the abstract input cannot be represented
  1898. exactly. However, if the abstract value is too large, the overflow and
  1899. inexact exceptions are raised and an infinity or maximal finite value is
  1900. returned. If the abstract value is too small, the input value is rounded to
  1901. a subnormal number, and the underflow and inexact exceptions are raised if
  1902. the abstract input cannot be represented exactly as a subnormal single-
  1903. precision floating-point number.
  1904. The input significand `zSig' has its binary point between bits 30
  1905. and 29, which is 7 bits to the left of the usual location. This shifted
  1906. significand must be normalized or smaller. If `zSig' is not normalized,
  1907. `zExp' must be 0; in that case, the result returned is a subnormal number,
  1908. and it must not require rounding. In the usual case that `zSig' is
  1909. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  1910. The handling of underflow and overflow follows the IEC/IEEE Standard for
  1911. Binary Floating-Point Arithmetic.
  1912. -------------------------------------------------------------------------------
  1913. *}
  1914. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  1915. Var
  1916. roundingMode : BYTE;
  1917. roundNearestEven : Flag;
  1918. roundIncrement, roundBits : BYTE;
  1919. IsTiny : Flag;
  1920. Begin
  1921. roundingMode := float_rounding_mode;
  1922. if (roundingMode = float_round_nearest_even) then
  1923. Begin
  1924. roundNearestEven := Flag(TRUE);
  1925. end
  1926. else
  1927. roundNearestEven := Flag(FALSE);
  1928. roundIncrement := $40;
  1929. if ( Boolean(roundNearestEven) = FALSE) then
  1930. Begin
  1931. if ( roundingMode = float_round_to_zero ) Then
  1932. Begin
  1933. roundIncrement := 0;
  1934. End
  1935. else
  1936. Begin
  1937. roundIncrement := $7F;
  1938. if ( zSign <> 0 ) then
  1939. Begin
  1940. if roundingMode = float_round_up then roundIncrement := 0;
  1941. End
  1942. else
  1943. Begin
  1944. if roundingMode = float_round_down then roundIncrement := 0;
  1945. End;
  1946. End
  1947. End;
  1948. roundBits := zSig AND $7F;
  1949. if ($FD <= bits16 (zExp) ) then
  1950. Begin
  1951. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  1952. Begin
  1953. float_raise( float_flag_overflow OR float_flag_inexact );
  1954. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  1955. exit;
  1956. End;
  1957. if ( zExp < 0 ) then
  1958. Begin
  1959. isTiny :=
  1960. flag(( float_detect_tininess = float_tininess_before_rounding )
  1961. OR ( zExp < -1 )
  1962. OR ( (zSig + roundIncrement) < $80000000 ));
  1963. shift32RightJamming( zSig, - zExp, zSig );
  1964. zExp := 0;
  1965. roundBits := zSig AND $7F;
  1966. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  1967. float_raise( float_flag_underflow );
  1968. End;
  1969. End;
  1970. if ( roundBits )<> 0 then
  1971. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  1972. zSig := ( zSig + roundIncrement ) shr 7;
  1973. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  1974. if ( zSig = 0 ) then zExp := 0;
  1975. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  1976. exit;
  1977. End;
  1978. {*
  1979. -------------------------------------------------------------------------------
  1980. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1981. and significand `zSig', and returns the proper single-precision floating-
  1982. point value corresponding to the abstract input. This routine is just like
  1983. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  1984. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  1985. floating-point exponent.
  1986. -------------------------------------------------------------------------------
  1987. *}
  1988. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  1989. Var
  1990. ShiftCount : int8;
  1991. Begin
  1992. shiftCount := countLeadingZeros32( zSig ) - 1;
  1993. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  1994. End;
  1995. {*
  1996. -------------------------------------------------------------------------------
  1997. Returns the most-significant 20 fraction bits of the double-precision
  1998. floating-point value `a'.
  1999. -------------------------------------------------------------------------------
  2000. *}
  2001. Function extractFloat64Frac0(a: float64): bits32;
  2002. Begin
  2003. extractFloat64Frac0 := a.high and $000FFFFF;
  2004. End;
  2005. {*
  2006. -------------------------------------------------------------------------------
  2007. Returns the least-significant 32 fraction bits of the double-precision
  2008. floating-point value `a'.
  2009. -------------------------------------------------------------------------------
  2010. *}
  2011. Function extractFloat64Frac1(a: float64): bits32;
  2012. Begin
  2013. extractFloat64Frac1 := a.low;
  2014. End;
  2015. {*
  2016. -------------------------------------------------------------------------------
  2017. Returns the exponent bits of the double-precision floating-point value `a'.
  2018. -------------------------------------------------------------------------------
  2019. *}
  2020. Function extractFloat64Exp(a: float64): int16;
  2021. Begin
  2022. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2023. End;
  2024. {*
  2025. -------------------------------------------------------------------------------
  2026. Returns the sign bit of the double-precision floating-point value `a'.
  2027. -------------------------------------------------------------------------------
  2028. *}
  2029. Function extractFloat64Sign(a: float64) : flag;
  2030. Begin
  2031. extractFloat64Sign := a.high shr 31;
  2032. End;
  2033. {*
  2034. -------------------------------------------------------------------------------
  2035. Normalizes the subnormal double-precision floating-point value represented
  2036. by the denormalized significand formed by the concatenation of `aSig0' and
  2037. `aSig1'. The normalized exponent is stored at the location pointed to by
  2038. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2039. stored at the location pointed to by `zSig0Ptr', and the least significant
  2040. 32 bits of the normalized significand are stored at the location pointed to
  2041. by `zSig1Ptr'.
  2042. -------------------------------------------------------------------------------
  2043. *}
  2044. Procedure normalizeFloat64Subnormal(
  2045. aSig0: bits32;
  2046. aSig1: bits32;
  2047. VAR zExpPtr : Int16;
  2048. VAR zSig0Ptr : Bits32;
  2049. VAR zSig1Ptr : Bits32
  2050. );
  2051. Var
  2052. ShiftCount : Int8;
  2053. Begin
  2054. if ( aSig0 = 0 ) then
  2055. Begin
  2056. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2057. if ( shiftCount < 0 ) then
  2058. Begin
  2059. zSig0Ptr := aSig1 shr ( - shiftCount );
  2060. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2061. End
  2062. else
  2063. Begin
  2064. zSig0Ptr := aSig1 shl shiftCount;
  2065. zSig1Ptr := 0;
  2066. End;
  2067. zExpPtr := - shiftCount - 31;
  2068. End
  2069. else
  2070. Begin
  2071. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2072. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2073. zExpPtr := 1 - shiftCount;
  2074. End;
  2075. End;
  2076. {*
  2077. -------------------------------------------------------------------------------
  2078. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2079. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2080. point value, returning the result. After being shifted into the proper
  2081. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2082. together to form the most significant 32 bits of the result. This means
  2083. that any integer portion of `zSig0' will be added into the exponent. Since
  2084. a properly normalized significand will have an integer portion equal to 1,
  2085. the `zExp' input should be 1 less than the desired result exponent whenever
  2086. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2087. -------------------------------------------------------------------------------
  2088. *}
  2089. Procedure
  2090. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2091. var
  2092. z: Float64;
  2093. Begin
  2094. z.low := zSig1;
  2095. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2096. c := z;
  2097. End;
  2098. {*----------------------------------------------------------------------------
  2099. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2100. | double-precision floating-point value, returning the result. After being
  2101. | shifted into the proper positions, the three fields are simply added
  2102. | together to form the result. This means that any integer portion of `zSig'
  2103. | will be added into the exponent. Since a properly normalized significand
  2104. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2105. | than the desired result exponent whenever `zSig' is a complete, normalized
  2106. | significand.
  2107. *----------------------------------------------------------------------------*}
  2108. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2109. begin
  2110. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2111. end;
  2112. {*
  2113. -------------------------------------------------------------------------------
  2114. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2115. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2116. and `zSig2', and returns the proper double-precision floating-point value
  2117. corresponding to the abstract input. Ordinarily, the abstract value is
  2118. simply rounded and packed into the double-precision format, with the inexact
  2119. exception raised if the abstract input cannot be represented exactly.
  2120. However, if the abstract value is too large, the overflow and inexact
  2121. exceptions are raised and an infinity or maximal finite value is returned.
  2122. If the abstract value is too small, the input value is rounded to a
  2123. subnormal number, and the underflow and inexact exceptions are raised if the
  2124. abstract input cannot be represented exactly as a subnormal double-precision
  2125. floating-point number.
  2126. The input significand must be normalized or smaller. If the input
  2127. significand is not normalized, `zExp' must be 0; in that case, the result
  2128. returned is a subnormal number, and it must not require rounding. In the
  2129. usual case that the input significand is normalized, `zExp' must be 1 less
  2130. than the ``true'' floating-point exponent. The handling of underflow and
  2131. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2132. -------------------------------------------------------------------------------
  2133. *}
  2134. Procedure
  2135. roundAndPackFloat64(
  2136. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2137. Var
  2138. roundingMode : Int8;
  2139. roundNearestEven, increment, isTiny : Flag;
  2140. Begin
  2141. roundingMode := float_rounding_mode;
  2142. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2143. increment := flag( sbits32 (zSig2) < 0 );
  2144. if ( roundNearestEven = flag(FALSE) ) then
  2145. Begin
  2146. if ( roundingMode = float_round_to_zero ) then
  2147. increment := 0
  2148. else
  2149. Begin
  2150. if ( zSign )<> 0 then
  2151. Begin
  2152. increment := flag( roundingMode = float_round_down ) and zSig2;
  2153. End
  2154. else
  2155. Begin
  2156. increment := flag( roundingMode = float_round_up ) and zSig2;
  2157. End
  2158. End
  2159. End;
  2160. if ( $7FD <= bits16 (zExp) ) then
  2161. Begin
  2162. if (( $7FD < zExp )
  2163. or (( zExp = $7FD )
  2164. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2165. and (increment<>0)
  2166. )
  2167. ) then
  2168. Begin
  2169. float_raise( float_flag_overflow OR float_flag_inexact );
  2170. if (( roundingMode = float_round_to_zero )
  2171. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2172. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2173. ) then
  2174. Begin
  2175. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2176. exit;
  2177. End;
  2178. packFloat64( zSign, $7FF, 0, 0, c );
  2179. exit;
  2180. End;
  2181. if ( zExp < 0 ) then
  2182. Begin
  2183. isTiny :=
  2184. flag( float_detect_tininess = float_tininess_before_rounding )
  2185. or flag( zExp < -1 )
  2186. or flag(increment = 0)
  2187. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2188. shift64ExtraRightJamming(
  2189. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2190. zExp := 0;
  2191. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2192. if ( roundNearestEven )<>0 then
  2193. Begin
  2194. increment := flag( sbits32 (zSig2) < 0 );
  2195. End
  2196. else
  2197. Begin
  2198. if ( zSign )<>0 then
  2199. Begin
  2200. increment := flag( roundingMode = float_round_down ) and zSig2;
  2201. End
  2202. else
  2203. Begin
  2204. increment := flag( roundingMode = float_round_up ) and zSig2;
  2205. End
  2206. End;
  2207. End;
  2208. End;
  2209. if ( zSig2 )<>0 then
  2210. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2211. if ( increment )<>0 then
  2212. Begin
  2213. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2214. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2215. End
  2216. else
  2217. Begin
  2218. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2219. End;
  2220. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2221. End;
  2222. {*----------------------------------------------------------------------------
  2223. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2224. | and significand `zSig', and returns the proper double-precision floating-
  2225. | point value corresponding to the abstract input. Ordinarily, the abstract
  2226. | value is simply rounded and packed into the double-precision format, with
  2227. | the inexact exception raised if the abstract input cannot be represented
  2228. | exactly. However, if the abstract value is too large, the overflow and
  2229. | inexact exceptions are raised and an infinity or maximal finite value is
  2230. | returned. If the abstract value is too small, the input value is rounded
  2231. | to a subnormal number, and the underflow and inexact exceptions are raised
  2232. | if the abstract input cannot be represented exactly as a subnormal double-
  2233. | precision floating-point number.
  2234. | The input significand `zSig' has its binary point between bits 62
  2235. | and 61, which is 10 bits to the left of the usual location. This shifted
  2236. | significand must be normalized or smaller. If `zSig' is not normalized,
  2237. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2238. | and it must not require rounding. In the usual case that `zSig' is
  2239. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2240. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2241. | Binary Floating-Point Arithmetic.
  2242. *----------------------------------------------------------------------------*}
  2243. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2244. var
  2245. roundingMode: int8;
  2246. roundNearestEven: flag;
  2247. roundIncrement, roundBits: int16;
  2248. isTiny: flag;
  2249. begin
  2250. roundingMode := float_rounding_mode;
  2251. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2252. roundIncrement := $200;
  2253. if ( roundNearestEven=0 ) then
  2254. begin
  2255. if ( roundingMode = float_round_to_zero ) then
  2256. begin
  2257. roundIncrement := 0;
  2258. end
  2259. else begin
  2260. roundIncrement := $3FF;
  2261. if ( zSign<>0 ) then
  2262. begin
  2263. if ( roundingMode = float_round_up ) then
  2264. roundIncrement := 0;
  2265. end
  2266. else begin
  2267. if ( roundingMode = float_round_down ) then
  2268. roundIncrement := 0;
  2269. end
  2270. end
  2271. end;
  2272. roundBits := zSig and $3FF;
  2273. if ( $7FD <= bits16(zExp) ) then
  2274. begin
  2275. if ( ( $7FD < zExp )
  2276. or ( ( zExp = $7FD )
  2277. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2278. ) then
  2279. begin
  2280. float_raise( float_flag_overflow or float_flag_inexact );
  2281. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2282. exit;
  2283. end;
  2284. if ( zExp < 0 ) then
  2285. begin
  2286. isTiny := ord(
  2287. ( float_detect_tininess = float_tininess_before_rounding )
  2288. or ( zExp < -1 )
  2289. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2290. shift64RightJamming( zSig, - zExp, zSig );
  2291. zExp := 0;
  2292. roundBits := zSig and $3FF;
  2293. if ( isTiny and roundBits )<>0 then
  2294. float_raise( float_flag_underflow );
  2295. end
  2296. end;
  2297. if ( roundBits<>0 ) then
  2298. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2299. zSig := ( zSig + roundIncrement ) shr 10;
  2300. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2301. if ( zSig = 0 ) then
  2302. zExp := 0;
  2303. result:=packFloat64( zSign, zExp, zSig );
  2304. end;
  2305. {*
  2306. -------------------------------------------------------------------------------
  2307. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2308. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2309. returns the proper double-precision floating-point value corresponding
  2310. to the abstract input. This routine is just like `roundAndPackFloat64'
  2311. except that the input significand has fewer bits and does not have to be
  2312. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2313. point exponent.
  2314. -------------------------------------------------------------------------------
  2315. *}
  2316. Procedure
  2317. normalizeRoundAndPackFloat64(
  2318. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2319. Var
  2320. shiftCount : int8;
  2321. zSig2 : bits32;
  2322. Begin
  2323. if ( zSig0 = 0 ) then
  2324. Begin
  2325. zSig0 := zSig1;
  2326. zSig1 := 0;
  2327. zExp := zExp -32;
  2328. End;
  2329. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2330. if ( 0 <= shiftCount ) then
  2331. Begin
  2332. zSig2 := 0;
  2333. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2334. End
  2335. else
  2336. Begin
  2337. shift64ExtraRightJamming
  2338. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2339. End;
  2340. zExp := zExp - shiftCount;
  2341. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2342. End;
  2343. {*
  2344. -------------------------------------------------------------------------------
  2345. Returns the result of converting the 32-bit two's complement integer `a' to
  2346. the single-precision floating-point format. The conversion is performed
  2347. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2348. -------------------------------------------------------------------------------
  2349. *}
  2350. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2351. Var
  2352. zSign : Flag;
  2353. Begin
  2354. if ( a = 0 ) then
  2355. Begin
  2356. int32_to_float32.float32 := 0;
  2357. exit;
  2358. End;
  2359. if ( a = sbits32 ($80000000) ) then
  2360. Begin
  2361. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2362. exit;
  2363. end;
  2364. zSign := flag( a < 0 );
  2365. If zSign<>0 then
  2366. a := -a;
  2367. int32_to_float32.float32:=
  2368. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2369. End;
  2370. {*
  2371. -------------------------------------------------------------------------------
  2372. Returns the result of converting the 32-bit two's complement integer `a' to
  2373. the double-precision floating-point format. The conversion is performed
  2374. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2375. -------------------------------------------------------------------------------
  2376. *}
  2377. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2378. var
  2379. zSign : flag;
  2380. absA : bits32;
  2381. shiftCount : int8;
  2382. zSig0, zSig1 : bits32;
  2383. Begin
  2384. if ( a = 0 ) then
  2385. Begin
  2386. packFloat64( 0, 0, 0, 0, result );
  2387. exit;
  2388. end;
  2389. zSign := flag( a < 0 );
  2390. if ZSign<>0 then
  2391. AbsA := -a
  2392. else
  2393. AbsA := a;
  2394. shiftCount := countLeadingZeros32( absA ) - 11;
  2395. if ( 0 <= shiftCount ) then
  2396. Begin
  2397. zSig0 := absA shl shiftCount;
  2398. zSig1 := 0;
  2399. End
  2400. else
  2401. Begin
  2402. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2403. End;
  2404. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2405. End;
  2406. {*
  2407. -------------------------------------------------------------------------------
  2408. Returns the result of converting the single-precision floating-point value
  2409. `a' to the 32-bit two's complement integer format. The conversion is
  2410. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2411. Arithmetic---which means in particular that the conversion is rounded
  2412. according to the current rounding mode. If `a' is a NaN, the largest
  2413. positive integer is returned. Otherwise, if the conversion overflows, the
  2414. largest integer with the same sign as `a' is returned.
  2415. -------------------------------------------------------------------------------
  2416. *}
  2417. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2418. Var
  2419. aSign: flag;
  2420. aExp, shiftCount: int16;
  2421. aSig, aSigExtra: bits32;
  2422. z: int32;
  2423. roundingMode: int8;
  2424. Begin
  2425. aSig := extractFloat32Frac( a.float32 );
  2426. aExp := extractFloat32Exp( a.float32 );
  2427. aSign := extractFloat32Sign( a.float32 );
  2428. shiftCount := aExp - $96;
  2429. if ( 0 <= shiftCount ) then
  2430. Begin
  2431. if ( $9E <= aExp ) then
  2432. Begin
  2433. if ( a.float32 <> $CF000000 ) then
  2434. Begin
  2435. float_raise( float_flag_invalid );
  2436. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2437. Begin
  2438. float32_to_int32 := $7FFFFFFF;
  2439. exit;
  2440. End;
  2441. End;
  2442. float32_to_int32 := sbits32 ($80000000);
  2443. exit;
  2444. End;
  2445. z := ( aSig or $00800000 ) shl shiftCount;
  2446. if ( aSign<>0 ) then z := - z;
  2447. End
  2448. else
  2449. Begin
  2450. if ( aExp < $7E ) then
  2451. Begin
  2452. aSigExtra := aExp OR aSig;
  2453. z := 0;
  2454. End
  2455. else
  2456. Begin
  2457. aSig := aSig OR $00800000;
  2458. aSigExtra := aSig shl ( shiftCount and 31 );
  2459. z := aSig shr ( - shiftCount );
  2460. End;
  2461. if ( aSigExtra<>0 ) then
  2462. softfloat_exception_flags := softfloat_exception_flags
  2463. or float_flag_inexact;
  2464. roundingMode := float_rounding_mode;
  2465. if ( roundingMode = float_round_nearest_even ) then
  2466. Begin
  2467. if ( sbits32 (aSigExtra) < 0 ) then
  2468. Begin
  2469. Inc(z);
  2470. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2471. z := z and not 1;
  2472. End;
  2473. if ( aSign<>0 ) then
  2474. z := - z;
  2475. End
  2476. else
  2477. Begin
  2478. aSigExtra := flag( aSigExtra <> 0 );
  2479. if ( aSign<>0 ) then
  2480. Begin
  2481. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2482. z := - z;
  2483. End
  2484. else
  2485. Begin
  2486. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2487. End
  2488. End;
  2489. End;
  2490. float32_to_int32 := z;
  2491. End;
  2492. {*
  2493. -------------------------------------------------------------------------------
  2494. Returns the result of converting the single-precision floating-point value
  2495. `a' to the 32-bit two's complement integer format. The conversion is
  2496. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2497. Arithmetic, except that the conversion is always rounded toward zero.
  2498. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2499. the conversion overflows, the largest integer with the same sign as `a' is
  2500. returned.
  2501. -------------------------------------------------------------------------------
  2502. *}
  2503. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2504. Var
  2505. aSign : flag;
  2506. aExp, shiftCount : int16;
  2507. aSig : bits32;
  2508. z : int32;
  2509. Begin
  2510. aSig := extractFloat32Frac( a.float32 );
  2511. aExp := extractFloat32Exp( a.float32 );
  2512. aSign := extractFloat32Sign( a.float32 );
  2513. shiftCount := aExp - $9E;
  2514. if ( 0 <= shiftCount ) then
  2515. Begin
  2516. if ( a.float32 <> $CF000000 ) then
  2517. Begin
  2518. float_raise( float_flag_invalid );
  2519. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2520. Begin
  2521. float32_to_int32_round_to_zero := $7FFFFFFF;
  2522. exit;
  2523. end;
  2524. End;
  2525. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2526. exit;
  2527. End
  2528. else
  2529. if ( aExp <= $7E ) then
  2530. Begin
  2531. if ( aExp or aSig )<>0 then
  2532. softfloat_exception_flags :=
  2533. softfloat_exception_flags or float_flag_inexact;
  2534. float32_to_int32_round_to_zero := 0;
  2535. exit;
  2536. End;
  2537. aSig := ( aSig or $00800000 ) shl 8;
  2538. z := aSig shr ( - shiftCount );
  2539. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2540. Begin
  2541. softfloat_exception_flags :=
  2542. softfloat_exception_flags or float_flag_inexact;
  2543. End;
  2544. if ( aSign<>0 ) then z := - z;
  2545. float32_to_int32_round_to_zero := z;
  2546. End;
  2547. {*
  2548. -------------------------------------------------------------------------------
  2549. Returns the result of converting the single-precision floating-point value
  2550. `a' to the double-precision floating-point format. The conversion is
  2551. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2552. Arithmetic.
  2553. -------------------------------------------------------------------------------
  2554. *}
  2555. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2556. Var
  2557. aSign : flag;
  2558. aExp : int16;
  2559. aSig, zSig0, zSig1: bits32;
  2560. tmp : CommonNanT;
  2561. Begin
  2562. aSig := extractFloat32Frac( a.float32 );
  2563. aExp := extractFloat32Exp( a.float32 );
  2564. aSign := extractFloat32Sign( a.float32 );
  2565. if ( aExp = $FF ) then
  2566. Begin
  2567. if ( aSig<>0 ) then
  2568. Begin
  2569. float32ToCommonNaN(a.float32, tmp);
  2570. commonNaNToFloat64(tmp , result);
  2571. exit;
  2572. End;
  2573. packFloat64( aSign, $7FF, 0, 0, result);
  2574. exit;
  2575. End;
  2576. if ( aExp = 0 ) then
  2577. Begin
  2578. if ( aSig = 0 ) then
  2579. Begin
  2580. packFloat64( aSign, 0, 0, 0, result );
  2581. exit;
  2582. end;
  2583. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2584. Dec(aExp);
  2585. End;
  2586. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2587. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2588. End;
  2589. {*
  2590. -------------------------------------------------------------------------------
  2591. Rounds the single-precision floating-point value `a' to an integer,
  2592. and returns the result as a single-precision floating-point value. The
  2593. operation is performed according to the IEC/IEEE Standard for Binary
  2594. Floating-Point Arithmetic.
  2595. -------------------------------------------------------------------------------
  2596. *}
  2597. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2598. Var
  2599. aSign: flag;
  2600. aExp: int16;
  2601. lastBitMask, roundBitsMask: bits32;
  2602. roundingMode: int8;
  2603. z: float32;
  2604. Begin
  2605. aExp := extractFloat32Exp( a.float32 );
  2606. if ( $96 <= aExp ) then
  2607. Begin
  2608. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2609. Begin
  2610. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2611. exit;
  2612. End;
  2613. float32_round_to_int:=a;
  2614. exit;
  2615. End;
  2616. if ( aExp <= $7E ) then
  2617. Begin
  2618. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2619. Begin
  2620. float32_round_to_int:=a;
  2621. exit;
  2622. end;
  2623. softfloat_exception_flags
  2624. := softfloat_exception_flags OR float_flag_inexact;
  2625. aSign := extractFloat32Sign( a.float32 );
  2626. case ( float_rounding_mode ) of
  2627. float_round_nearest_even:
  2628. Begin
  2629. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2630. Begin
  2631. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2632. exit;
  2633. End;
  2634. End;
  2635. float_round_down:
  2636. Begin
  2637. if aSign <> 0 then
  2638. float32_round_to_int.float32 := $BF800000
  2639. else
  2640. float32_round_to_int.float32 := 0;
  2641. exit;
  2642. End;
  2643. float_round_up:
  2644. Begin
  2645. if aSign <> 0 then
  2646. float32_round_to_int.float32 := $80000000
  2647. else
  2648. float32_round_to_int.float32 := $3F800000;
  2649. exit;
  2650. End;
  2651. end;
  2652. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2653. End;
  2654. lastBitMask := 1;
  2655. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2656. lastBitMask := lastBitMask shl ($96 - aExp);
  2657. roundBitsMask := lastBitMask - 1;
  2658. z := a.float32;
  2659. roundingMode := float_rounding_mode;
  2660. if ( roundingMode = float_round_nearest_even ) then
  2661. Begin
  2662. z := z + (lastBitMask shr 1);
  2663. if ( ( z and roundBitsMask ) = 0 ) then
  2664. z := z and not lastBitMask;
  2665. End
  2666. else if ( roundingMode <> float_round_to_zero ) then
  2667. Begin
  2668. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2669. Begin
  2670. z := z + roundBitsMask;
  2671. End;
  2672. End;
  2673. z := z and not roundBitsMask;
  2674. if ( z <> a.float32 ) then
  2675. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2676. float32_round_to_int.float32 := z;
  2677. End;
  2678. {*
  2679. -------------------------------------------------------------------------------
  2680. Returns the result of adding the absolute values of the single-precision
  2681. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2682. before being returned. `zSign' is ignored if the result is a NaN.
  2683. The addition is performed according to the IEC/IEEE Standard for Binary
  2684. Floating-Point Arithmetic.
  2685. -------------------------------------------------------------------------------
  2686. *}
  2687. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2688. Var
  2689. aExp, bExp, zExp: int16;
  2690. aSig, bSig, zSig: bits32;
  2691. expDiff: int16;
  2692. label roundAndPack;
  2693. Begin
  2694. aSig:=extractFloat32Frac( a );
  2695. aExp:=extractFloat32Exp( a );
  2696. bSig:=extractFloat32Frac( b );
  2697. bExp := extractFloat32Exp( b );
  2698. expDiff := aExp - bExp;
  2699. aSig := aSig shl 6;
  2700. bSig := bSig shl 6;
  2701. if ( 0 < expDiff ) then
  2702. Begin
  2703. if ( aExp = $FF ) then
  2704. Begin
  2705. if ( aSig <> 0) then
  2706. Begin
  2707. addFloat32Sigs := propagateFloat32NaN( a, b );
  2708. exit;
  2709. End;
  2710. addFloat32Sigs := a;
  2711. exit;
  2712. End;
  2713. if ( bExp = 0 ) then
  2714. Begin
  2715. Dec(expDiff);
  2716. End
  2717. else
  2718. Begin
  2719. bSig := bSig or $20000000;
  2720. End;
  2721. shift32RightJamming( bSig, expDiff, bSig );
  2722. zExp := aExp;
  2723. End
  2724. else
  2725. If ( expDiff < 0 ) then
  2726. Begin
  2727. if ( bExp = $FF ) then
  2728. Begin
  2729. if ( bSig<>0 ) then
  2730. Begin
  2731. addFloat32Sigs := propagateFloat32NaN( a, b );
  2732. exit;
  2733. end;
  2734. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  2735. exit;
  2736. End;
  2737. if ( aExp = 0 ) then
  2738. Begin
  2739. Inc(expDiff);
  2740. End
  2741. else
  2742. Begin
  2743. aSig := aSig OR $20000000;
  2744. End;
  2745. shift32RightJamming( aSig, - expDiff, aSig );
  2746. zExp := bExp;
  2747. End
  2748. else
  2749. Begin
  2750. if ( aExp = $FF ) then
  2751. Begin
  2752. if ( aSig OR bSig )<> 0 then
  2753. Begin
  2754. addFloat32Sigs := propagateFloat32NaN( a, b );
  2755. exit;
  2756. end;
  2757. addFloat32Sigs := a;
  2758. exit;
  2759. End;
  2760. if ( aExp = 0 ) then
  2761. Begin
  2762. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  2763. exit;
  2764. end;
  2765. zSig := $40000000 + aSig + bSig;
  2766. zExp := aExp;
  2767. goto roundAndPack;
  2768. End;
  2769. aSig := aSig OR $20000000;
  2770. zSig := ( aSig + bSig ) shl 1;
  2771. Dec(zExp);
  2772. if ( sbits32 (zSig) < 0 ) then
  2773. Begin
  2774. zSig := aSig + bSig;
  2775. Inc(zExp);
  2776. End;
  2777. roundAndPack:
  2778. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  2779. End;
  2780. {*
  2781. -------------------------------------------------------------------------------
  2782. Returns the result of subtracting the absolute values of the single-
  2783. precision floating-point values `a' and `b'. If `zSign' is 1, the
  2784. difference is negated before being returned. `zSign' is ignored if the
  2785. result is a NaN. The subtraction is performed according to the IEC/IEEE
  2786. Standard for Binary Floating-Point Arithmetic.
  2787. -------------------------------------------------------------------------------
  2788. *}
  2789. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  2790. Var
  2791. aExp, bExp, zExp: int16;
  2792. aSig, bSig, zSig: bits32;
  2793. expDiff : int16;
  2794. label aExpBigger;
  2795. label bExpBigger;
  2796. label aBigger;
  2797. label bBigger;
  2798. label normalizeRoundAndPack;
  2799. Begin
  2800. aSig := extractFloat32Frac( a );
  2801. aExp := extractFloat32Exp( a );
  2802. bSig := extractFloat32Frac( b );
  2803. bExp := extractFloat32Exp( b );
  2804. expDiff := aExp - bExp;
  2805. aSig := aSig shl 7;
  2806. bSig := bSig shl 7;
  2807. if ( 0 < expDiff ) then goto aExpBigger;
  2808. if ( expDiff < 0 ) then goto bExpBigger;
  2809. if ( aExp = $FF ) then
  2810. Begin
  2811. if ( aSig OR bSig )<> 0 then
  2812. Begin
  2813. subFloat32Sigs := propagateFloat32NaN( a, b );
  2814. exit;
  2815. End;
  2816. float_raise( float_flag_invalid );
  2817. subFloat32Sigs := float32_default_nan;
  2818. exit;
  2819. End;
  2820. if ( aExp = 0 ) then
  2821. Begin
  2822. aExp := 1;
  2823. bExp := 1;
  2824. End;
  2825. if ( bSig < aSig ) Then goto aBigger;
  2826. if ( aSig < bSig ) Then goto bBigger;
  2827. subFloat32Sigs := packFloat32( flag(float_rounding_mode = float_round_down), 0, 0 );
  2828. exit;
  2829. bExpBigger:
  2830. if ( bExp = $FF ) then
  2831. Begin
  2832. if ( bSig<>0 ) then
  2833. Begin
  2834. subFloat32Sigs := propagateFloat32NaN( a, b );
  2835. exit;
  2836. End;
  2837. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  2838. exit;
  2839. End;
  2840. if ( aExp = 0 ) then
  2841. Begin
  2842. Inc(expDiff);
  2843. End
  2844. else
  2845. Begin
  2846. aSig := aSig OR $40000000;
  2847. End;
  2848. shift32RightJamming( aSig, - expDiff, aSig );
  2849. bSig := bSig OR $40000000;
  2850. bBigger:
  2851. zSig := bSig - aSig;
  2852. zExp := bExp;
  2853. zSign := zSign xor 1;
  2854. goto normalizeRoundAndPack;
  2855. aExpBigger:
  2856. if ( aExp = $FF ) then
  2857. Begin
  2858. if ( aSig <> 0) then
  2859. Begin
  2860. subFloat32Sigs := propagateFloat32NaN( a, b );
  2861. exit;
  2862. End;
  2863. subFloat32Sigs := a;
  2864. exit;
  2865. End;
  2866. if ( bExp = 0 ) then
  2867. Begin
  2868. Dec(expDiff);
  2869. End
  2870. else
  2871. Begin
  2872. bSig := bSig OR $40000000;
  2873. End;
  2874. shift32RightJamming( bSig, expDiff, bSig );
  2875. aSig := aSig OR $40000000;
  2876. aBigger:
  2877. zSig := aSig - bSig;
  2878. zExp := aExp;
  2879. normalizeRoundAndPack:
  2880. Dec(zExp);
  2881. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  2882. End;
  2883. {*
  2884. -------------------------------------------------------------------------------
  2885. Returns the result of adding the single-precision floating-point values `a'
  2886. and `b'. The operation is performed according to the IEC/IEEE Standard for
  2887. Binary Floating-Point Arithmetic.
  2888. -------------------------------------------------------------------------------
  2889. *}
  2890. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  2891. Var
  2892. aSign, bSign: Flag;
  2893. Begin
  2894. aSign := extractFloat32Sign( a.float32 );
  2895. bSign := extractFloat32Sign( b.float32 );
  2896. if ( aSign = bSign ) then
  2897. Begin
  2898. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  2899. End
  2900. else
  2901. Begin
  2902. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  2903. End;
  2904. End;
  2905. {*
  2906. -------------------------------------------------------------------------------
  2907. Returns the result of subtracting the single-precision floating-point values
  2908. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2909. for Binary Floating-Point Arithmetic.
  2910. -------------------------------------------------------------------------------
  2911. *}
  2912. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  2913. Var
  2914. aSign, bSign: flag;
  2915. Begin
  2916. aSign := extractFloat32Sign( a.float32 );
  2917. bSign := extractFloat32Sign( b.float32 );
  2918. if ( aSign = bSign ) then
  2919. Begin
  2920. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  2921. End
  2922. else
  2923. Begin
  2924. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  2925. End;
  2926. End;
  2927. {*
  2928. -------------------------------------------------------------------------------
  2929. Returns the result of multiplying the single-precision floating-point values
  2930. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2931. for Binary Floating-Point Arithmetic.
  2932. -------------------------------------------------------------------------------
  2933. *}
  2934. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  2935. Var
  2936. aSign, bSign, zSign: flag;
  2937. aExp, bExp, zExp : int16;
  2938. aSig, bSig, zSig0, zSig1: bits32;
  2939. Begin
  2940. aSig := extractFloat32Frac( a.float32 );
  2941. aExp := extractFloat32Exp( a.float32 );
  2942. aSign := extractFloat32Sign( a.float32 );
  2943. bSig := extractFloat32Frac( b.float32 );
  2944. bExp := extractFloat32Exp( b.float32 );
  2945. bSign := extractFloat32Sign( b.float32 );
  2946. zSign := aSign xor bSign;
  2947. if ( aExp = $FF ) then
  2948. Begin
  2949. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  2950. Begin
  2951. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  2952. End;
  2953. if ( ( bExp OR bSig ) = 0 ) then
  2954. Begin
  2955. float_raise( float_flag_invalid );
  2956. float32_mul.float32 := float32_default_nan;
  2957. exit;
  2958. End;
  2959. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  2960. exit;
  2961. End;
  2962. if ( bExp = $FF ) then
  2963. Begin
  2964. if ( bSig <> 0 ) then
  2965. Begin
  2966. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  2967. exit;
  2968. End;
  2969. if ( ( aExp OR aSig ) = 0 ) then
  2970. Begin
  2971. float_raise( float_flag_invalid );
  2972. float32_mul.float32 := float32_default_nan;
  2973. exit;
  2974. End;
  2975. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  2976. exit;
  2977. End;
  2978. if ( aExp = 0 ) then
  2979. Begin
  2980. if ( aSig = 0 ) then
  2981. Begin
  2982. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  2983. exit;
  2984. End;
  2985. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2986. End;
  2987. if ( bExp = 0 ) then
  2988. Begin
  2989. if ( bSig = 0 ) then
  2990. Begin
  2991. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  2992. exit;
  2993. End;
  2994. normalizeFloat32Subnormal( bSig, bExp, bSig );
  2995. End;
  2996. zExp := aExp + bExp - $7F;
  2997. aSig := ( aSig OR $00800000 ) shl 7;
  2998. bSig := ( bSig OR $00800000 ) shl 8;
  2999. mul32To64( aSig, bSig, zSig0, zSig1 );
  3000. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3001. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3002. Begin
  3003. zSig0 := zSig0 shl 1;
  3004. Dec(zExp);
  3005. End;
  3006. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3007. End;
  3008. {*
  3009. -------------------------------------------------------------------------------
  3010. Returns the result of dividing the single-precision floating-point value `a'
  3011. by the corresponding value `b'. The operation is performed according to the
  3012. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3013. -------------------------------------------------------------------------------
  3014. *}
  3015. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3016. Var
  3017. aSign, bSign, zSign: flag;
  3018. aExp, bExp, zExp: int16;
  3019. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3020. Begin
  3021. aSig := extractFloat32Frac( a.float32 );
  3022. aExp := extractFloat32Exp( a.float32 );
  3023. aSign := extractFloat32Sign( a.float32 );
  3024. bSig := extractFloat32Frac( b.float32 );
  3025. bExp := extractFloat32Exp( b.float32 );
  3026. bSign := extractFloat32Sign( b.float32 );
  3027. zSign := aSign xor bSign;
  3028. if ( aExp = $FF ) then
  3029. Begin
  3030. if ( aSig <> 0 ) then
  3031. Begin
  3032. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3033. exit;
  3034. End;
  3035. if ( bExp = $FF ) then
  3036. Begin
  3037. if ( bSig <> 0) then
  3038. Begin
  3039. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3040. End;
  3041. float_raise( float_flag_invalid );
  3042. float32_div.float32 := float32_default_nan;
  3043. exit;
  3044. End;
  3045. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3046. exit;
  3047. End;
  3048. if ( bExp = $FF ) then
  3049. Begin
  3050. if ( bSig <> 0) then
  3051. Begin
  3052. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3053. exit;
  3054. End;
  3055. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3056. exit;
  3057. End;
  3058. if ( bExp = 0 ) Then
  3059. Begin
  3060. if ( bSig = 0 ) Then
  3061. Begin
  3062. if ( ( aExp OR aSig ) = 0 ) then
  3063. Begin
  3064. float_raise( float_flag_invalid );
  3065. float32_div.float32 := float32_default_nan;
  3066. exit;
  3067. End;
  3068. float_raise( float_flag_divbyzero );
  3069. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3070. exit;
  3071. End;
  3072. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3073. End;
  3074. if ( aExp = 0 ) Then
  3075. Begin
  3076. if ( aSig = 0 ) Then
  3077. Begin
  3078. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3079. exit;
  3080. End;
  3081. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3082. End;
  3083. zExp := aExp - bExp + $7D;
  3084. aSig := ( aSig OR $00800000 ) shl 7;
  3085. bSig := ( bSig OR $00800000 ) shl 8;
  3086. if ( bSig <= ( aSig + aSig ) ) then
  3087. Begin
  3088. aSig := aSig shr 1;
  3089. Inc(zExp);
  3090. End;
  3091. zSig := estimateDiv64To32( aSig, 0, bSig );
  3092. if ( ( zSig and $3F ) <= 2 ) then
  3093. Begin
  3094. mul32To64( bSig, zSig, term0, term1 );
  3095. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3096. while ( sbits32 (rem0) < 0 ) do
  3097. Begin
  3098. Dec(zSig);
  3099. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3100. End;
  3101. zSig := zSig or bits32( rem1 <> 0 );
  3102. End;
  3103. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3104. End;
  3105. {*
  3106. -------------------------------------------------------------------------------
  3107. Returns the remainder of the single-precision floating-point value `a'
  3108. with respect to the corresponding value `b'. The operation is performed
  3109. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3110. -------------------------------------------------------------------------------
  3111. *}
  3112. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3113. Var
  3114. aSign, bSign, zSign: flag;
  3115. aExp, bExp, expDiff: int16;
  3116. aSig, bSig, q, allZero, alternateASig: bits32;
  3117. sigMean: sbits32;
  3118. Begin
  3119. aSig := extractFloat32Frac( a.float32 );
  3120. aExp := extractFloat32Exp( a.float32 );
  3121. aSign := extractFloat32Sign( a.float32 );
  3122. bSig := extractFloat32Frac( b.float32 );
  3123. bExp := extractFloat32Exp( b.float32 );
  3124. bSign := extractFloat32Sign( b.float32 );
  3125. if ( aExp = $FF ) then
  3126. Begin
  3127. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3128. Begin
  3129. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3130. exit;
  3131. End;
  3132. float_raise( float_flag_invalid );
  3133. float32_rem.float32 := float32_default_nan;
  3134. exit;
  3135. End;
  3136. if ( bExp = $FF ) then
  3137. Begin
  3138. if ( bSig <> 0 ) then
  3139. Begin
  3140. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3141. exit;
  3142. End;
  3143. float32_rem := a;
  3144. exit;
  3145. End;
  3146. if ( bExp = 0 ) then
  3147. Begin
  3148. if ( bSig = 0 ) then
  3149. Begin
  3150. float_raise( float_flag_invalid );
  3151. float32_rem.float32 := float32_default_nan;
  3152. exit;
  3153. End;
  3154. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3155. End;
  3156. if ( aExp = 0 ) then
  3157. Begin
  3158. if ( aSig = 0 ) then
  3159. Begin
  3160. float32_rem := a;
  3161. exit;
  3162. End;
  3163. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3164. End;
  3165. expDiff := aExp - bExp;
  3166. aSig := ( aSig OR $00800000 ) shl 8;
  3167. bSig := ( bSig OR $00800000 ) shl 8;
  3168. if ( expDiff < 0 ) then
  3169. Begin
  3170. if ( expDiff < -1 ) then
  3171. Begin
  3172. float32_rem := a;
  3173. exit;
  3174. End;
  3175. aSig := aSig shr 1;
  3176. End;
  3177. q := bits32( bSig <= aSig );
  3178. if ( q <> 0) then
  3179. aSig := aSig - bSig;
  3180. expDiff := expDiff - 32;
  3181. while ( 0 < expDiff ) do
  3182. Begin
  3183. q := estimateDiv64To32( aSig, 0, bSig );
  3184. if (2 < q) then
  3185. q := q - 2
  3186. else
  3187. q := 0;
  3188. aSig := - ( ( bSig shr 2 ) * q );
  3189. expDiff := expDiff - 30;
  3190. End;
  3191. expDiff := expDiff + 32;
  3192. if ( 0 < expDiff ) then
  3193. Begin
  3194. q := estimateDiv64To32( aSig, 0, bSig );
  3195. if (2 < q) then
  3196. q := q - 2
  3197. else
  3198. q := 0;
  3199. q := q shr (32 - expDiff);
  3200. bSig := bSig shr 2;
  3201. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3202. End
  3203. else
  3204. Begin
  3205. aSig := aSig shr 2;
  3206. bSig := bSig shr 2;
  3207. End;
  3208. Repeat
  3209. alternateASig := aSig;
  3210. Inc(q);
  3211. aSig := aSig - bSig;
  3212. Until not ( 0 <= sbits32 (aSig) );
  3213. sigMean := aSig + alternateASig;
  3214. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3215. Begin
  3216. aSig := alternateASig;
  3217. End;
  3218. zSign := flag( sbits32 (aSig) < 0 );
  3219. if ( zSign<>0 ) then
  3220. aSig := - aSig;
  3221. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3222. End;
  3223. {*
  3224. -------------------------------------------------------------------------------
  3225. Returns the square root of the single-precision floating-point value `a'.
  3226. The operation is performed according to the IEC/IEEE Standard for Binary
  3227. Floating-Point Arithmetic.
  3228. -------------------------------------------------------------------------------
  3229. *}
  3230. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3231. Var
  3232. aSign : flag;
  3233. aExp, zExp : int16;
  3234. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3235. label roundAndPack;
  3236. Begin
  3237. aSig := extractFloat32Frac( a.float32 );
  3238. aExp := extractFloat32Exp( a.float32 );
  3239. aSign := extractFloat32Sign( a.float32 );
  3240. if ( aExp = $FF ) then
  3241. Begin
  3242. if ( aSig <> 0) then
  3243. Begin
  3244. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3245. exit;
  3246. End;
  3247. if ( aSign = 0) then
  3248. Begin
  3249. float32_sqrt := a;
  3250. exit;
  3251. End;
  3252. float_raise( float_flag_invalid );
  3253. float32_sqrt.float32 := float32_default_nan;
  3254. exit;
  3255. End;
  3256. if ( aSign <> 0) then
  3257. Begin
  3258. if ( ( aExp OR aSig ) = 0 ) then
  3259. Begin
  3260. float32_sqrt := a;
  3261. exit;
  3262. End;
  3263. float_raise( float_flag_invalid );
  3264. float32_sqrt.float32 := float32_default_nan;
  3265. exit;
  3266. End;
  3267. if ( aExp = 0 ) then
  3268. Begin
  3269. if ( aSig = 0 ) then
  3270. Begin
  3271. float32_sqrt.float32 := 0;
  3272. exit;
  3273. End;
  3274. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3275. End;
  3276. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3277. aSig := ( aSig OR $00800000 ) shl 8;
  3278. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3279. if ( ( zSig and $7F ) <= 5 ) then
  3280. Begin
  3281. if ( zSig < 2 ) then
  3282. Begin
  3283. zSig := $7FFFFFFF;
  3284. goto roundAndPack;
  3285. End
  3286. else
  3287. Begin
  3288. aSig := aSig shr (aExp and 1);
  3289. mul32To64( zSig, zSig, term0, term1 );
  3290. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3291. while ( sbits32 (rem0) < 0 ) do
  3292. Begin
  3293. Dec(zSig);
  3294. shortShift64Left( 0, zSig, 1, term0, term1 );
  3295. term1 := term1 or 1;
  3296. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3297. End;
  3298. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3299. End;
  3300. End;
  3301. shift32RightJamming( zSig, 1, zSig );
  3302. roundAndPack:
  3303. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3304. End;
  3305. {*
  3306. -------------------------------------------------------------------------------
  3307. Returns 1 if the single-precision floating-point value `a' is equal to
  3308. the corresponding value `b', and 0 otherwise. The comparison is performed
  3309. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3310. -------------------------------------------------------------------------------
  3311. *}
  3312. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3313. Begin
  3314. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3315. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3316. ) then
  3317. Begin
  3318. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3319. Begin
  3320. float_raise( float_flag_invalid );
  3321. End;
  3322. float32_eq := 0;
  3323. exit;
  3324. End;
  3325. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3326. End;
  3327. {*
  3328. -------------------------------------------------------------------------------
  3329. Returns 1 if the single-precision floating-point value `a' is less than
  3330. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3331. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3332. Arithmetic.
  3333. -------------------------------------------------------------------------------
  3334. *}
  3335. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3336. var
  3337. aSign, bSign: flag;
  3338. Begin
  3339. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3340. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3341. ) then
  3342. Begin
  3343. float_raise( float_flag_invalid );
  3344. float32_le := 0;
  3345. exit;
  3346. End;
  3347. aSign := extractFloat32Sign( a.float32 );
  3348. bSign := extractFloat32Sign( b.float32 );
  3349. if ( aSign <> bSign ) then
  3350. Begin
  3351. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3352. exit;
  3353. End;
  3354. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3355. End;
  3356. {*
  3357. -------------------------------------------------------------------------------
  3358. Returns 1 if the single-precision floating-point value `a' is less than
  3359. the corresponding value `b', and 0 otherwise. The comparison is performed
  3360. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3361. -------------------------------------------------------------------------------
  3362. *}
  3363. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3364. var
  3365. aSign, bSign: flag;
  3366. Begin
  3367. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3368. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3369. ) then
  3370. Begin
  3371. float_raise( float_flag_invalid );
  3372. float32_lt :=0;
  3373. exit;
  3374. End;
  3375. aSign := extractFloat32Sign( a.float32 );
  3376. bSign := extractFloat32Sign( b.float32 );
  3377. if ( aSign <> bSign ) then
  3378. Begin
  3379. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3380. exit;
  3381. End;
  3382. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3383. End;
  3384. {*
  3385. -------------------------------------------------------------------------------
  3386. Returns 1 if the single-precision floating-point value `a' is equal to
  3387. the corresponding value `b', and 0 otherwise. The invalid exception is
  3388. raised if either operand is a NaN. Otherwise, the comparison is performed
  3389. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3390. -------------------------------------------------------------------------------
  3391. *}
  3392. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3393. Begin
  3394. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3395. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3396. ) then
  3397. Begin
  3398. float_raise( float_flag_invalid );
  3399. float32_eq_signaling := 0;
  3400. exit;
  3401. End;
  3402. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3403. End;
  3404. {*
  3405. -------------------------------------------------------------------------------
  3406. Returns 1 if the single-precision floating-point value `a' is less than or
  3407. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3408. cause an exception. Otherwise, the comparison is performed according to the
  3409. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3410. -------------------------------------------------------------------------------
  3411. *}
  3412. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3413. Var
  3414. aSign, bSign: flag;
  3415. aExp, bExp: int16;
  3416. Begin
  3417. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3418. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3419. ) then
  3420. Begin
  3421. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3422. Begin
  3423. float_raise( float_flag_invalid );
  3424. End;
  3425. float32_le_quiet := 0;
  3426. exit;
  3427. End;
  3428. aSign := extractFloat32Sign( a );
  3429. bSign := extractFloat32Sign( b );
  3430. if ( aSign <> bSign ) then
  3431. Begin
  3432. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3433. exit;
  3434. End;
  3435. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3436. End;
  3437. {*
  3438. -------------------------------------------------------------------------------
  3439. Returns 1 if the single-precision floating-point value `a' is less than
  3440. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3441. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3442. Standard for Binary Floating-Point Arithmetic.
  3443. -------------------------------------------------------------------------------
  3444. *}
  3445. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3446. Var
  3447. aSign, bSign: flag;
  3448. Begin
  3449. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3450. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3451. ) then
  3452. Begin
  3453. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3454. Begin
  3455. float_raise( float_flag_invalid );
  3456. End;
  3457. float32_lt_quiet := 0;
  3458. exit;
  3459. End;
  3460. aSign := extractFloat32Sign( a );
  3461. bSign := extractFloat32Sign( b );
  3462. if ( aSign <> bSign ) then
  3463. Begin
  3464. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3465. exit;
  3466. End;
  3467. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3468. End;
  3469. {*
  3470. -------------------------------------------------------------------------------
  3471. Returns the result of converting the double-precision floating-point value
  3472. `a' to the 32-bit two's complement integer format. The conversion is
  3473. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3474. Arithmetic---which means in particular that the conversion is rounded
  3475. according to the current rounding mode. If `a' is a NaN, the largest
  3476. positive integer is returned. Otherwise, if the conversion overflows, the
  3477. largest integer with the same sign as `a' is returned.
  3478. -------------------------------------------------------------------------------
  3479. *}
  3480. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3481. var
  3482. aSign: flag;
  3483. aExp, shiftCount: int16;
  3484. aSig0, aSig1, absZ, aSigExtra: bits32;
  3485. z: int32;
  3486. roundingMode: int8;
  3487. label invalid;
  3488. Begin
  3489. aSig1 := extractFloat64Frac1( a );
  3490. aSig0 := extractFloat64Frac0( a );
  3491. aExp := extractFloat64Exp( a );
  3492. aSign := extractFloat64Sign( a );
  3493. shiftCount := aExp - $413;
  3494. if ( 0 <= shiftCount ) then
  3495. Begin
  3496. if ( $41E < aExp ) then
  3497. Begin
  3498. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3499. aSign := 0;
  3500. goto invalid;
  3501. End;
  3502. shortShift64Left(
  3503. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3504. if ( $80000000 < absZ ) then
  3505. goto invalid;
  3506. End
  3507. else
  3508. Begin
  3509. aSig1 := flag( aSig1 <> 0 );
  3510. if ( aExp < $3FE ) then
  3511. Begin
  3512. aSigExtra := aExp OR aSig0 OR aSig1;
  3513. absZ := 0;
  3514. End
  3515. else
  3516. Begin
  3517. aSig0 := aSig0 OR $00100000;
  3518. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3519. absZ := aSig0 shr ( - shiftCount );
  3520. End;
  3521. End;
  3522. roundingMode := float_rounding_mode;
  3523. if ( roundingMode = float_round_nearest_even ) then
  3524. Begin
  3525. if ( sbits32(aSigExtra) < 0 ) then
  3526. Begin
  3527. Inc(absZ);
  3528. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3529. absZ := absZ and not 1;
  3530. End;
  3531. if aSign <> 0 then
  3532. z := - absZ
  3533. else
  3534. z := absZ;
  3535. End
  3536. else
  3537. Begin
  3538. aSigExtra := bits32( aSigExtra <> 0 );
  3539. if ( aSign <> 0) then
  3540. Begin
  3541. z := - ( absZ
  3542. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3543. End
  3544. else
  3545. Begin
  3546. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3547. End
  3548. End;
  3549. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3550. Begin
  3551. invalid:
  3552. float_raise( float_flag_invalid );
  3553. if (aSign <> 0 ) then
  3554. float64_to_int32 := sbits32 ($80000000)
  3555. else
  3556. float64_to_int32 := $7FFFFFFF;
  3557. exit;
  3558. End;
  3559. if ( aSigExtra <> 0) then
  3560. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3561. float64_to_int32 := z;
  3562. End;
  3563. {*
  3564. -------------------------------------------------------------------------------
  3565. Returns the result of converting the double-precision floating-point value
  3566. `a' to the 32-bit two's complement integer format. The conversion is
  3567. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3568. Arithmetic, except that the conversion is always rounded toward zero.
  3569. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3570. the conversion overflows, the largest integer with the same sign as `a' is
  3571. returned.
  3572. -------------------------------------------------------------------------------
  3573. *}
  3574. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3575. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3576. Var
  3577. aSign: flag;
  3578. aExp, shiftCount: int16;
  3579. aSig0, aSig1, absZ, aSigExtra: bits32;
  3580. z: int32;
  3581. label invalid;
  3582. Begin
  3583. aSig1 := extractFloat64Frac1( a );
  3584. aSig0 := extractFloat64Frac0( a );
  3585. aExp := extractFloat64Exp( a );
  3586. aSign := extractFloat64Sign( a );
  3587. shiftCount := aExp - $413;
  3588. if ( 0 <= shiftCount ) then
  3589. Begin
  3590. if ( $41E < aExp ) then
  3591. Begin
  3592. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3593. aSign := 0;
  3594. goto invalid;
  3595. End;
  3596. shortShift64Left(
  3597. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3598. End
  3599. else
  3600. Begin
  3601. if ( aExp < $3FF ) then
  3602. Begin
  3603. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3604. Begin
  3605. softfloat_exception_flags :=
  3606. softfloat_exception_flags or float_flag_inexact;
  3607. End;
  3608. float64_to_int32_round_to_zero := 0;
  3609. exit;
  3610. End;
  3611. aSig0 := aSig0 or $00100000;
  3612. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3613. absZ := aSig0 shr ( - shiftCount );
  3614. End;
  3615. if aSign <> 0 then
  3616. z := - absZ
  3617. else
  3618. z := absZ;
  3619. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3620. Begin
  3621. invalid:
  3622. float_raise( float_flag_invalid );
  3623. if (aSign <> 0) then
  3624. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3625. else
  3626. float64_to_int32_round_to_zero := $7FFFFFFF;
  3627. exit;
  3628. End;
  3629. if ( aSigExtra <> 0) then
  3630. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3631. float64_to_int32_round_to_zero := z;
  3632. End;
  3633. {*
  3634. -------------------------------------------------------------------------------
  3635. Returns the result of converting the double-precision floating-point value
  3636. `a' to the single-precision floating-point format. The conversion is
  3637. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3638. Arithmetic.
  3639. -------------------------------------------------------------------------------
  3640. *}
  3641. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3642. Var
  3643. aSign: flag;
  3644. aExp: int16;
  3645. aSig0, aSig1, zSig: bits32;
  3646. allZero: bits32;
  3647. tmp : CommonNanT;
  3648. Begin
  3649. aSig1 := extractFloat64Frac1( a );
  3650. aSig0 := extractFloat64Frac0( a );
  3651. aExp := extractFloat64Exp( a );
  3652. aSign := extractFloat64Sign( a );
  3653. if ( aExp = $7FF ) then
  3654. Begin
  3655. if ( aSig0 OR aSig1 ) <> 0 then
  3656. Begin
  3657. float64ToCommonNaN( a, tmp );
  3658. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3659. exit;
  3660. End;
  3661. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3662. exit;
  3663. End;
  3664. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3665. if ( aExp <> 0) then
  3666. zSig := zSig OR $40000000;
  3667. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3668. End;
  3669. {*
  3670. -------------------------------------------------------------------------------
  3671. Rounds the double-precision floating-point value `a' to an integer,
  3672. and returns the result as a double-precision floating-point value. The
  3673. operation is performed according to the IEC/IEEE Standard for Binary
  3674. Floating-Point Arithmetic.
  3675. -------------------------------------------------------------------------------
  3676. *}
  3677. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3678. Var
  3679. aSign: flag;
  3680. aExp: int16;
  3681. lastBitMask, roundBitsMask: bits32;
  3682. roundingMode: int8;
  3683. z: float64;
  3684. Begin
  3685. aExp := extractFloat64Exp( a );
  3686. if ( $413 <= aExp ) then
  3687. Begin
  3688. if ( $433 <= aExp ) then
  3689. Begin
  3690. if ( ( aExp = $7FF )
  3691. AND
  3692. (
  3693. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3694. ) <>0)
  3695. ) then
  3696. Begin
  3697. propagateFloat64NaN( a, a, result );
  3698. exit;
  3699. End;
  3700. result := a;
  3701. exit;
  3702. End;
  3703. lastBitMask := 1;
  3704. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3705. roundBitsMask := lastBitMask - 1;
  3706. z := a;
  3707. roundingMode := float_rounding_mode;
  3708. if ( roundingMode = float_round_nearest_even ) then
  3709. Begin
  3710. if ( lastBitMask <> 0) then
  3711. Begin
  3712. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3713. if ( ( z.low and roundBitsMask ) = 0 ) then
  3714. z.low := z.low and not lastBitMask;
  3715. End
  3716. else
  3717. Begin
  3718. if ( sbits32 (z.low) < 0 ) then
  3719. Begin
  3720. Inc(z.high);
  3721. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3722. z.high := z.high and not 1;
  3723. End;
  3724. End;
  3725. End
  3726. else if ( roundingMode <> float_round_to_zero ) then
  3727. Begin
  3728. if ( extractFloat64Sign( z )
  3729. xor flag( roundingMode = float_round_up ) )<> 0 then
  3730. Begin
  3731. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3732. End;
  3733. End;
  3734. z.low := z.low and not roundBitsMask;
  3735. End
  3736. else
  3737. Begin
  3738. if ( aExp <= $3FE ) then
  3739. Begin
  3740. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  3741. Begin
  3742. result := a;
  3743. exit;
  3744. End;
  3745. softfloat_exception_flags := softfloat_exception_flags or
  3746. float_flag_inexact;
  3747. aSign := extractFloat64Sign( a );
  3748. case ( float_rounding_mode ) of
  3749. float_round_nearest_even:
  3750. Begin
  3751. if ( ( aExp = $3FE )
  3752. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  3753. ) then
  3754. Begin
  3755. packFloat64( aSign, $3FF, 0, 0, result );
  3756. exit;
  3757. End;
  3758. End;
  3759. float_round_down:
  3760. Begin
  3761. if aSign<>0 then
  3762. packFloat64( 1, $3FF, 0, 0, result )
  3763. else
  3764. packFloat64( 0, 0, 0, 0, result );
  3765. exit;
  3766. End;
  3767. float_round_up:
  3768. Begin
  3769. if aSign <> 0 then
  3770. packFloat64( 1, 0, 0, 0, result )
  3771. else
  3772. packFloat64( 0, $3FF, 0, 0, result );
  3773. exit;
  3774. End;
  3775. end;
  3776. packFloat64( aSign, 0, 0, 0, result );
  3777. exit;
  3778. End;
  3779. lastBitMask := 1;
  3780. lastBitMask := lastBitMask shl ($413 - aExp);
  3781. roundBitsMask := lastBitMask - 1;
  3782. z.low := 0;
  3783. z.high := a.high;
  3784. roundingMode := float_rounding_mode;
  3785. if ( roundingMode = float_round_nearest_even ) then
  3786. Begin
  3787. z.high := z.high + lastBitMask shr 1;
  3788. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  3789. Begin
  3790. z.high := z.high and not lastBitMask;
  3791. End;
  3792. End
  3793. else if ( roundingMode <> float_round_to_zero ) then
  3794. Begin
  3795. if ( extractFloat64Sign( z )
  3796. xor flag( roundingMode = float_round_up ) )<> 0 then
  3797. Begin
  3798. z.high := z.high or bits32( a.low <> 0 );
  3799. z.high := z.high + roundBitsMask;
  3800. End;
  3801. End;
  3802. z.high := z.high and not roundBitsMask;
  3803. End;
  3804. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  3805. Begin
  3806. softfloat_exception_flags :=
  3807. softfloat_exception_flags or float_flag_inexact;
  3808. End;
  3809. result := z;
  3810. End;
  3811. {*
  3812. -------------------------------------------------------------------------------
  3813. Returns the result of adding the absolute values of the double-precision
  3814. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3815. before being returned. `zSign' is ignored if the result is a NaN.
  3816. The addition is performed according to the IEC/IEEE Standard for Binary
  3817. Floating-Point Arithmetic.
  3818. -------------------------------------------------------------------------------
  3819. *}
  3820. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  3821. Var
  3822. aExp, bExp, zExp: int16;
  3823. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  3824. expDiff: int16;
  3825. label shiftRight1;
  3826. label roundAndPack;
  3827. Begin
  3828. aSig1 := extractFloat64Frac1( a );
  3829. aSig0 := extractFloat64Frac0( a );
  3830. aExp := extractFloat64Exp( a );
  3831. bSig1 := extractFloat64Frac1( b );
  3832. bSig0 := extractFloat64Frac0( b );
  3833. bExp := extractFloat64Exp( b );
  3834. expDiff := aExp - bExp;
  3835. if ( 0 < expDiff ) then
  3836. Begin
  3837. if ( aExp = $7FF ) then
  3838. Begin
  3839. if ( aSig0 OR aSig1 ) <> 0 then
  3840. Begin
  3841. propagateFloat64NaN( a, b, out );
  3842. exit;
  3843. end;
  3844. out := a;
  3845. exit;
  3846. End;
  3847. if ( bExp = 0 ) then
  3848. Begin
  3849. Dec(expDiff);
  3850. End
  3851. else
  3852. Begin
  3853. bSig0 := bSig0 or $00100000;
  3854. End;
  3855. shift64ExtraRightJamming(
  3856. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  3857. zExp := aExp;
  3858. End
  3859. else if ( expDiff < 0 ) then
  3860. Begin
  3861. if ( bExp = $7FF ) then
  3862. Begin
  3863. if ( bSig0 OR bSig1 ) <> 0 then
  3864. Begin
  3865. propagateFloat64NaN( a, b, out );
  3866. exit;
  3867. End;
  3868. packFloat64( zSign, $7FF, 0, 0, out );
  3869. End;
  3870. if ( aExp = 0 ) then
  3871. Begin
  3872. Inc(expDiff);
  3873. End
  3874. else
  3875. Begin
  3876. aSig0 := aSig0 or $00100000;
  3877. End;
  3878. shift64ExtraRightJamming(
  3879. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  3880. zExp := bExp;
  3881. End
  3882. else
  3883. Begin
  3884. if ( aExp = $7FF ) then
  3885. Begin
  3886. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  3887. Begin
  3888. propagateFloat64NaN( a, b, out );
  3889. exit;
  3890. End;
  3891. out := a;
  3892. exit;
  3893. End;
  3894. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3895. if ( aExp = 0 ) then
  3896. Begin
  3897. packFloat64( zSign, 0, zSig0, zSig1, out );
  3898. exit;
  3899. End;
  3900. zSig2 := 0;
  3901. zSig0 := zSig0 or $00200000;
  3902. zExp := aExp;
  3903. goto shiftRight1;
  3904. End;
  3905. aSig0 := aSig0 or $00100000;
  3906. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3907. Dec(zExp);
  3908. if ( zSig0 < $00200000 ) then
  3909. goto roundAndPack;
  3910. Inc(zExp);
  3911. shiftRight1:
  3912. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  3913. roundAndPack:
  3914. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  3915. End;
  3916. {*
  3917. -------------------------------------------------------------------------------
  3918. Returns the result of subtracting the absolute values of the double-
  3919. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3920. difference is negated before being returned. `zSign' is ignored if the
  3921. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3922. Standard for Binary Floating-Point Arithmetic.
  3923. -------------------------------------------------------------------------------
  3924. *}
  3925. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  3926. Var
  3927. aExp, bExp, zExp: int16;
  3928. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  3929. expDiff: int16;
  3930. z: float64;
  3931. label aExpBigger;
  3932. label bExpBigger;
  3933. label aBigger;
  3934. label bBigger;
  3935. label normalizeRoundAndPack;
  3936. Begin
  3937. aSig1 := extractFloat64Frac1( a );
  3938. aSig0 := extractFloat64Frac0( a );
  3939. aExp := extractFloat64Exp( a );
  3940. bSig1 := extractFloat64Frac1( b );
  3941. bSig0 := extractFloat64Frac0( b );
  3942. bExp := extractFloat64Exp( b );
  3943. expDiff := aExp - bExp;
  3944. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  3945. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  3946. if ( 0 < expDiff ) then goto aExpBigger;
  3947. if ( expDiff < 0 ) then goto bExpBigger;
  3948. if ( aExp = $7FF ) then
  3949. Begin
  3950. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  3951. Begin
  3952. propagateFloat64NaN( a, b, out );
  3953. exit;
  3954. End;
  3955. float_raise( float_flag_invalid );
  3956. z.low := float64_default_nan_low;
  3957. z.high := float64_default_nan_high;
  3958. out := z;
  3959. exit;
  3960. End;
  3961. if ( aExp = 0 ) then
  3962. Begin
  3963. aExp := 1;
  3964. bExp := 1;
  3965. End;
  3966. if ( bSig0 < aSig0 ) then goto aBigger;
  3967. if ( aSig0 < bSig0 ) then goto bBigger;
  3968. if ( bSig1 < aSig1 ) then goto aBigger;
  3969. if ( aSig1 < bSig1 ) then goto bBigger;
  3970. packFloat64( flag(float_rounding_mode = float_round_down), 0, 0, 0 , out);
  3971. exit;
  3972. bExpBigger:
  3973. if ( bExp = $7FF ) then
  3974. Begin
  3975. if ( bSig0 OR bSig1 ) <> 0 then
  3976. Begin
  3977. propagateFloat64NaN( a, b, out );
  3978. exit;
  3979. End;
  3980. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  3981. exit;
  3982. End;
  3983. if ( aExp = 0 ) then
  3984. Begin
  3985. Inc(expDiff);
  3986. End
  3987. else
  3988. Begin
  3989. aSig0 := aSig0 or $40000000;
  3990. End;
  3991. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  3992. bSig0 := bSig0 or $40000000;
  3993. bBigger:
  3994. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  3995. zExp := bExp;
  3996. zSign := zSign xor 1;
  3997. goto normalizeRoundAndPack;
  3998. aExpBigger:
  3999. if ( aExp = $7FF ) then
  4000. Begin
  4001. if ( aSig0 OR aSig1 ) <> 0 then
  4002. Begin
  4003. propagateFloat64NaN( a, b, out );
  4004. exit;
  4005. End;
  4006. out := a;
  4007. exit;
  4008. End;
  4009. if ( bExp = 0 ) then
  4010. Begin
  4011. Dec(expDiff);
  4012. End
  4013. else
  4014. Begin
  4015. bSig0 := bSig0 or $40000000;
  4016. End;
  4017. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4018. aSig0 := aSig0 or $40000000;
  4019. aBigger:
  4020. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4021. zExp := aExp;
  4022. normalizeRoundAndPack:
  4023. Dec(zExp);
  4024. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4025. End;
  4026. {*
  4027. -------------------------------------------------------------------------------
  4028. Returns the result of adding the double-precision floating-point values `a'
  4029. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4030. Binary Floating-Point Arithmetic.
  4031. -------------------------------------------------------------------------------
  4032. *}
  4033. Function float64_add( a: float64; b : float64) : Float64;
  4034. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4035. Var
  4036. aSign, bSign: flag;
  4037. Begin
  4038. aSign := extractFloat64Sign( a );
  4039. bSign := extractFloat64Sign( b );
  4040. if ( aSign = bSign ) then
  4041. Begin
  4042. addFloat64Sigs( a, b, aSign, result );
  4043. End
  4044. else
  4045. Begin
  4046. subFloat64Sigs( a, b, aSign, result );
  4047. End;
  4048. End;
  4049. {*
  4050. -------------------------------------------------------------------------------
  4051. Returns the result of subtracting the double-precision floating-point values
  4052. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4053. for Binary Floating-Point Arithmetic.
  4054. -------------------------------------------------------------------------------
  4055. *}
  4056. Function float64_sub(a: float64; b : float64) : Float64;
  4057. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4058. Var
  4059. aSign, bSign: flag;
  4060. Begin
  4061. aSign := extractFloat64Sign( a );
  4062. bSign := extractFloat64Sign( b );
  4063. if ( aSign = bSign ) then
  4064. Begin
  4065. subFloat64Sigs( a, b, aSign, result );
  4066. End
  4067. else
  4068. Begin
  4069. addFloat64Sigs( a, b, aSign, result );
  4070. End;
  4071. End;
  4072. {*
  4073. -------------------------------------------------------------------------------
  4074. Returns the result of multiplying the double-precision floating-point values
  4075. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4076. for Binary Floating-Point Arithmetic.
  4077. -------------------------------------------------------------------------------
  4078. *}
  4079. Function float64_mul( a: float64; b:float64) : Float64;
  4080. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4081. Var
  4082. aSign, bSign, zSign: flag;
  4083. aExp, bExp, zExp: int16;
  4084. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4085. z: float64;
  4086. label invalid;
  4087. Begin
  4088. aSig1 := extractFloat64Frac1( a );
  4089. aSig0 := extractFloat64Frac0( a );
  4090. aExp := extractFloat64Exp( a );
  4091. aSign := extractFloat64Sign( a );
  4092. bSig1 := extractFloat64Frac1( b );
  4093. bSig0 := extractFloat64Frac0( b );
  4094. bExp := extractFloat64Exp( b );
  4095. bSign := extractFloat64Sign( b );
  4096. zSign := aSign xor bSign;
  4097. if ( aExp = $7FF ) then
  4098. Begin
  4099. if ( (( aSig0 OR aSig1 ) <>0)
  4100. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4101. Begin
  4102. propagateFloat64NaN( a, b, result );
  4103. exit;
  4104. End;
  4105. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4106. packFloat64( zSign, $7FF, 0, 0, result );
  4107. exit;
  4108. End;
  4109. if ( bExp = $7FF ) then
  4110. Begin
  4111. if ( bSig0 OR bSig1 )<> 0 then
  4112. Begin
  4113. propagateFloat64NaN( a, b, result );
  4114. exit;
  4115. End;
  4116. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4117. Begin
  4118. invalid:
  4119. float_raise( float_flag_invalid );
  4120. z.low := float64_default_nan_low;
  4121. z.high := float64_default_nan_high;
  4122. result := z;
  4123. exit;
  4124. End;
  4125. packFloat64( zSign, $7FF, 0, 0, result );
  4126. exit;
  4127. End;
  4128. if ( aExp = 0 ) then
  4129. Begin
  4130. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4131. Begin
  4132. packFloat64( zSign, 0, 0, 0, result );
  4133. exit;
  4134. End;
  4135. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4136. End;
  4137. if ( bExp = 0 ) then
  4138. Begin
  4139. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4140. Begin
  4141. packFloat64( zSign, 0, 0, 0, result );
  4142. exit;
  4143. End;
  4144. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4145. End;
  4146. zExp := aExp + bExp - $400;
  4147. aSig0 := aSig0 or $00100000;
  4148. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4149. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4150. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4151. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4152. if ( $00200000 <= zSig0 ) then
  4153. Begin
  4154. shift64ExtraRightJamming(
  4155. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4156. Inc(zExp);
  4157. End;
  4158. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4159. End;
  4160. {*
  4161. -------------------------------------------------------------------------------
  4162. Returns the result of dividing the double-precision floating-point value `a'
  4163. by the corresponding value `b'. The operation is performed according to the
  4164. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4165. -------------------------------------------------------------------------------
  4166. *}
  4167. Function float64_div(a: float64; b : float64) : Float64;
  4168. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4169. Var
  4170. aSign, bSign, zSign: flag;
  4171. aExp, bExp, zExp: int16;
  4172. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4173. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4174. z: float64;
  4175. label invalid;
  4176. Begin
  4177. aSig1 := extractFloat64Frac1( a );
  4178. aSig0 := extractFloat64Frac0( a );
  4179. aExp := extractFloat64Exp( a );
  4180. aSign := extractFloat64Sign( a );
  4181. bSig1 := extractFloat64Frac1( b );
  4182. bSig0 := extractFloat64Frac0( b );
  4183. bExp := extractFloat64Exp( b );
  4184. bSign := extractFloat64Sign( b );
  4185. zSign := aSign xor bSign;
  4186. if ( aExp = $7FF ) then
  4187. Begin
  4188. if ( aSig0 OR aSig1 )<> 0 then
  4189. Begin
  4190. propagateFloat64NaN( a, b, result );
  4191. exit;
  4192. end;
  4193. if ( bExp = $7FF ) then
  4194. Begin
  4195. if ( bSig0 OR bSig1 )<>0 then
  4196. Begin
  4197. propagateFloat64NaN( a, b, result );
  4198. exit;
  4199. End;
  4200. goto invalid;
  4201. End;
  4202. packFloat64( zSign, $7FF, 0, 0, result );
  4203. exit;
  4204. End;
  4205. if ( bExp = $7FF ) then
  4206. Begin
  4207. if ( bSig0 OR bSig1 )<> 0 then
  4208. Begin
  4209. propagateFloat64NaN( a, b, result );
  4210. exit;
  4211. End;
  4212. packFloat64( zSign, 0, 0, 0, result );
  4213. exit;
  4214. End;
  4215. if ( bExp = 0 ) then
  4216. Begin
  4217. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4218. Begin
  4219. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4220. Begin
  4221. invalid:
  4222. float_raise( float_flag_invalid );
  4223. z.low := float64_default_nan_low;
  4224. z.high := float64_default_nan_high;
  4225. result := z;
  4226. exit;
  4227. End;
  4228. float_raise( float_flag_divbyzero );
  4229. packFloat64( zSign, $7FF, 0, 0, result );
  4230. exit;
  4231. End;
  4232. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4233. End;
  4234. if ( aExp = 0 ) then
  4235. Begin
  4236. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4237. Begin
  4238. packFloat64( zSign, 0, 0, 0, result );
  4239. exit;
  4240. End;
  4241. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4242. End;
  4243. zExp := aExp - bExp + $3FD;
  4244. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4245. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4246. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4247. Begin
  4248. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4249. Inc(zExp);
  4250. End;
  4251. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4252. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4253. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4254. while ( sbits32 (rem0) < 0 ) do
  4255. Begin
  4256. Dec(zSig0);
  4257. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4258. End;
  4259. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4260. if ( ( zSig1 and $3FF ) <= 4 ) then
  4261. Begin
  4262. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4263. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4264. while ( sbits32 (rem1) < 0 ) do
  4265. Begin
  4266. Dec(zSig1);
  4267. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4268. End;
  4269. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4270. End;
  4271. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4272. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4273. End;
  4274. {*
  4275. -------------------------------------------------------------------------------
  4276. Returns the remainder of the double-precision floating-point value `a'
  4277. with respect to the corresponding value `b'. The operation is performed
  4278. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4279. -------------------------------------------------------------------------------
  4280. *}
  4281. Function float64_rem(a: float64; b : float64) : float64;
  4282. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4283. Var
  4284. aSign, bSign, zSign: flag;
  4285. aExp, bExp, expDiff: int16;
  4286. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4287. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4288. sigMean0: sbits32;
  4289. z: float64;
  4290. label invalid;
  4291. Begin
  4292. aSig1 := extractFloat64Frac1( a );
  4293. aSig0 := extractFloat64Frac0( a );
  4294. aExp := extractFloat64Exp( a );
  4295. aSign := extractFloat64Sign( a );
  4296. bSig1 := extractFloat64Frac1( b );
  4297. bSig0 := extractFloat64Frac0( b );
  4298. bExp := extractFloat64Exp( b );
  4299. bSign := extractFloat64Sign( b );
  4300. if ( aExp = $7FF ) then
  4301. Begin
  4302. if ((( aSig0 OR aSig1 )<>0)
  4303. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4304. Begin
  4305. propagateFloat64NaN( a, b, result );
  4306. exit;
  4307. End;
  4308. goto invalid;
  4309. End;
  4310. if ( bExp = $7FF ) then
  4311. Begin
  4312. if ( bSig0 OR bSig1 ) <> 0 then
  4313. Begin
  4314. propagateFloat64NaN( a, b, result );
  4315. exit;
  4316. End;
  4317. result := a;
  4318. exit;
  4319. End;
  4320. if ( bExp = 0 ) then
  4321. Begin
  4322. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4323. Begin
  4324. invalid:
  4325. float_raise( float_flag_invalid );
  4326. z.low := float64_default_nan_low;
  4327. z.high := float64_default_nan_high;
  4328. result := z;
  4329. exit;
  4330. End;
  4331. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4332. End;
  4333. if ( aExp = 0 ) then
  4334. Begin
  4335. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4336. Begin
  4337. result := a;
  4338. exit;
  4339. End;
  4340. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4341. End;
  4342. expDiff := aExp - bExp;
  4343. if ( expDiff < -1 ) then
  4344. Begin
  4345. result := a;
  4346. exit;
  4347. End;
  4348. shortShift64Left(
  4349. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4350. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4351. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4352. if ( q )<>0 then
  4353. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4354. expDiff := expDiff - 32;
  4355. while ( 0 < expDiff ) do
  4356. Begin
  4357. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4358. if 4 < q then
  4359. q:= q - 4
  4360. else
  4361. q := 0;
  4362. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4363. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4364. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4365. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4366. expDiff := expDiff - 29;
  4367. End;
  4368. if ( -32 < expDiff ) then
  4369. Begin
  4370. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4371. if 4 < q then
  4372. q := q - 4
  4373. else
  4374. q := 0;
  4375. q := q shr (- expDiff);
  4376. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4377. expDiff := expDiff + 24;
  4378. if ( expDiff < 0 ) then
  4379. Begin
  4380. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4381. End
  4382. else
  4383. Begin
  4384. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4385. End;
  4386. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4387. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4388. End
  4389. else
  4390. Begin
  4391. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4392. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4393. End;
  4394. Repeat
  4395. alternateASig0 := aSig0;
  4396. alternateASig1 := aSig1;
  4397. Inc(q);
  4398. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4399. Until not ( 0 <= sbits32 (aSig0) );
  4400. add64(
  4401. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4402. if ( ( sigMean0 < 0 )
  4403. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4404. Begin
  4405. aSig0 := alternateASig0;
  4406. aSig1 := alternateASig1;
  4407. End;
  4408. zSign := flag( sbits32 (aSig0) < 0 );
  4409. if ( zSign <> 0 ) then
  4410. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4411. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4412. End;
  4413. {*
  4414. -------------------------------------------------------------------------------
  4415. Returns the square root of the double-precision floating-point value `a'.
  4416. The operation is performed according to the IEC/IEEE Standard for Binary
  4417. Floating-Point Arithmetic.
  4418. -------------------------------------------------------------------------------
  4419. *}
  4420. Procedure float64_sqrt( a: float64; var out: float64 );
  4421. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4422. Var
  4423. aSign: flag;
  4424. aExp, zExp: int16;
  4425. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4426. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4427. z: float64;
  4428. label invalid;
  4429. Begin
  4430. aSig1 := extractFloat64Frac1( a );
  4431. aSig0 := extractFloat64Frac0( a );
  4432. aExp := extractFloat64Exp( a );
  4433. aSign := extractFloat64Sign( a );
  4434. if ( aExp = $7FF ) then
  4435. Begin
  4436. if ( aSig0 OR aSig1 ) <> 0 then
  4437. Begin
  4438. propagateFloat64NaN( a, a, out );
  4439. exit;
  4440. End;
  4441. if ( aSign = 0) then
  4442. Begin
  4443. out := a;
  4444. exit;
  4445. End;
  4446. goto invalid;
  4447. End;
  4448. if ( aSign <> 0 ) then
  4449. Begin
  4450. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4451. Begin
  4452. out := a;
  4453. exit;
  4454. End;
  4455. invalid:
  4456. float_raise( float_flag_invalid );
  4457. z.low := float64_default_nan_low;
  4458. z.high := float64_default_nan_high;
  4459. out := z;
  4460. exit;
  4461. End;
  4462. if ( aExp = 0 ) then
  4463. Begin
  4464. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4465. Begin
  4466. packFloat64( 0, 0, 0, 0, out );
  4467. exit;
  4468. End;
  4469. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4470. End;
  4471. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4472. aSig0 := aSig0 or $00100000;
  4473. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4474. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4475. if ( zSig0 = 0 ) then
  4476. zSig0 := $7FFFFFFF;
  4477. doubleZSig0 := zSig0 + zSig0;
  4478. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4479. mul32To64( zSig0, zSig0, term0, term1 );
  4480. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4481. while ( sbits32 (rem0) < 0 ) do
  4482. Begin
  4483. Dec(zSig0);
  4484. doubleZSig0 := doubleZSig0 - 2;
  4485. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4486. End;
  4487. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4488. if ( ( zSig1 and $1FF ) <= 5 ) then
  4489. Begin
  4490. if ( zSig1 = 0 ) then
  4491. zSig1 := 1;
  4492. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4493. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4494. mul32To64( zSig1, zSig1, term2, term3 );
  4495. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4496. while ( sbits32 (rem1) < 0 ) do
  4497. Begin
  4498. Dec(zSig1);
  4499. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4500. term3 := term3 or 1;
  4501. term2 := term2 or doubleZSig0;
  4502. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4503. End;
  4504. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4505. End;
  4506. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4507. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4508. End;
  4509. {*
  4510. -------------------------------------------------------------------------------
  4511. Returns 1 if the double-precision floating-point value `a' is equal to
  4512. the corresponding value `b', and 0 otherwise. The comparison is performed
  4513. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4514. -------------------------------------------------------------------------------
  4515. *}
  4516. Function float64_eq(a: float64; b: float64): flag;
  4517. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4518. Begin
  4519. if
  4520. (
  4521. ( extractFloat64Exp( a ) = $7FF )
  4522. AND
  4523. (
  4524. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4525. )
  4526. )
  4527. OR (
  4528. ( extractFloat64Exp( b ) = $7FF )
  4529. AND (
  4530. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4531. )
  4532. )
  4533. ) then
  4534. Begin
  4535. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4536. float_raise( float_flag_invalid );
  4537. float64_eq := 0;
  4538. exit;
  4539. End;
  4540. float64_eq := flag(
  4541. ( a.low = b.low )
  4542. AND ( ( a.high = b.high )
  4543. OR ( ( a.low = 0 )
  4544. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4545. ));
  4546. End;
  4547. {*
  4548. -------------------------------------------------------------------------------
  4549. Returns 1 if the double-precision floating-point value `a' is less than
  4550. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4551. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4552. Arithmetic.
  4553. -------------------------------------------------------------------------------
  4554. *}
  4555. Function float64_le(a: float64;b: float64): flag;
  4556. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4557. Var
  4558. aSign, bSign: flag;
  4559. Begin
  4560. if
  4561. (
  4562. ( extractFloat64Exp( a ) = $7FF )
  4563. AND
  4564. (
  4565. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4566. )
  4567. )
  4568. OR (
  4569. ( extractFloat64Exp( b ) = $7FF )
  4570. AND (
  4571. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4572. )
  4573. )
  4574. ) then
  4575. Begin
  4576. float_raise( float_flag_invalid );
  4577. float64_le := 0;
  4578. exit;
  4579. End;
  4580. aSign := extractFloat64Sign( a );
  4581. bSign := extractFloat64Sign( b );
  4582. if ( aSign <> bSign ) then
  4583. Begin
  4584. float64_le := flag(
  4585. (aSign <> 0)
  4586. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4587. = 0 ));
  4588. exit;
  4589. End;
  4590. if aSign <> 0 then
  4591. float64_le := le64( b.high, b.low, a.high, a.low )
  4592. else
  4593. float64_le := le64( a.high, a.low, b.high, b.low );
  4594. End;
  4595. {*
  4596. -------------------------------------------------------------------------------
  4597. Returns 1 if the double-precision floating-point value `a' is less than
  4598. the corresponding value `b', and 0 otherwise. The comparison is performed
  4599. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4600. -------------------------------------------------------------------------------
  4601. *}
  4602. Function float64_lt(a: float64;b: float64): flag;
  4603. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4604. Var
  4605. aSign, bSign: flag;
  4606. Begin
  4607. if
  4608. (
  4609. ( extractFloat64Exp( a ) = $7FF )
  4610. AND
  4611. (
  4612. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4613. )
  4614. )
  4615. OR (
  4616. ( extractFloat64Exp( b ) = $7FF )
  4617. AND (
  4618. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4619. )
  4620. )
  4621. ) then
  4622. Begin
  4623. float_raise( float_flag_invalid );
  4624. float64_lt := 0;
  4625. exit;
  4626. End;
  4627. aSign := extractFloat64Sign( a );
  4628. bSign := extractFloat64Sign( b );
  4629. if ( aSign <> bSign ) then
  4630. Begin
  4631. float64_lt := flag(
  4632. (aSign <> 0)
  4633. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4634. <> 0 ));
  4635. exit;
  4636. End;
  4637. if aSign <> 0 then
  4638. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4639. else
  4640. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4641. End;
  4642. {*
  4643. -------------------------------------------------------------------------------
  4644. Returns 1 if the double-precision floating-point value `a' is equal to
  4645. the corresponding value `b', and 0 otherwise. The invalid exception is
  4646. raised if either operand is a NaN. Otherwise, the comparison is performed
  4647. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4648. -------------------------------------------------------------------------------
  4649. *}
  4650. Function float64_eq_signaling( a: float64; b: float64): flag;
  4651. Begin
  4652. if
  4653. (
  4654. ( extractFloat64Exp( a ) = $7FF )
  4655. AND
  4656. (
  4657. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4658. )
  4659. )
  4660. OR (
  4661. ( extractFloat64Exp( b ) = $7FF )
  4662. AND (
  4663. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4664. )
  4665. )
  4666. ) then
  4667. Begin
  4668. float_raise( float_flag_invalid );
  4669. float64_eq_signaling := 0;
  4670. exit;
  4671. End;
  4672. float64_eq_signaling := flag(
  4673. ( a.low = b.low )
  4674. AND ( ( a.high = b.high )
  4675. OR ( ( a.low = 0 )
  4676. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4677. ));
  4678. End;
  4679. {*
  4680. -------------------------------------------------------------------------------
  4681. Returns 1 if the double-precision floating-point value `a' is less than or
  4682. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4683. cause an exception. Otherwise, the comparison is performed according to the
  4684. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4685. -------------------------------------------------------------------------------
  4686. *}
  4687. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4688. Var
  4689. aSign, bSign : flag;
  4690. Begin
  4691. if
  4692. (
  4693. ( extractFloat64Exp( a ) = $7FF )
  4694. AND
  4695. (
  4696. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4697. )
  4698. )
  4699. OR (
  4700. ( extractFloat64Exp( b ) = $7FF )
  4701. AND (
  4702. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4703. )
  4704. )
  4705. ) then
  4706. Begin
  4707. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4708. float_raise( float_flag_invalid );
  4709. float64_le_quiet := 0;
  4710. exit;
  4711. End;
  4712. aSign := extractFloat64Sign( a );
  4713. bSign := extractFloat64Sign( b );
  4714. if ( aSign <> bSign ) then
  4715. Begin
  4716. float64_le_quiet := flag
  4717. ((aSign <> 0)
  4718. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4719. = 0 ));
  4720. exit;
  4721. End;
  4722. if aSign <> 0 then
  4723. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4724. else
  4725. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4726. End;
  4727. {*
  4728. -------------------------------------------------------------------------------
  4729. Returns 1 if the double-precision floating-point value `a' is less than
  4730. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4731. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4732. Standard for Binary Floating-Point Arithmetic.
  4733. -------------------------------------------------------------------------------
  4734. *}
  4735. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  4736. Var
  4737. aSign, bSign: flag;
  4738. Begin
  4739. if
  4740. (
  4741. ( extractFloat64Exp( a ) = $7FF )
  4742. AND
  4743. (
  4744. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4745. )
  4746. )
  4747. OR (
  4748. ( extractFloat64Exp( b ) = $7FF )
  4749. AND (
  4750. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4751. )
  4752. )
  4753. ) then
  4754. Begin
  4755. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4756. float_raise( float_flag_invalid );
  4757. float64_lt_quiet := 0;
  4758. exit;
  4759. End;
  4760. aSign := extractFloat64Sign( a );
  4761. bSign := extractFloat64Sign( b );
  4762. if ( aSign <> bSign ) then
  4763. Begin
  4764. float64_lt_quiet := flag(
  4765. (aSign<>0)
  4766. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4767. <> 0 ));
  4768. exit;
  4769. End;
  4770. If aSign <> 0 then
  4771. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  4772. else
  4773. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  4774. End;
  4775. {*----------------------------------------------------------------------------
  4776. | Returns the result of converting the 64-bit two's complement integer `a'
  4777. | to the single-precision floating-point format. The conversion is performed
  4778. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4779. *----------------------------------------------------------------------------*}
  4780. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  4781. var
  4782. zSign : flag;
  4783. absA : uint64;
  4784. shiftCount: int8;
  4785. zSig : bits32;
  4786. intval : int64rec;
  4787. Begin
  4788. if ( a = 0 ) then
  4789. begin
  4790. int64_to_float32.float32 := 0;
  4791. exit;
  4792. end;
  4793. if a < 0 then
  4794. zSign := flag(TRUE)
  4795. else
  4796. zSign := flag(FALSE);
  4797. if zSign<>0 then
  4798. absA := -a
  4799. else
  4800. absA := a;
  4801. shiftCount := countLeadingZeros64( absA ) - 40;
  4802. if ( 0 <= shiftCount ) then
  4803. begin
  4804. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  4805. end
  4806. else
  4807. begin
  4808. shiftCount := shiftCount + 7;
  4809. if ( shiftCount < 0 ) then
  4810. begin
  4811. intval.low := int64rec(AbsA).low;
  4812. intval.high := int64rec(AbsA).high;
  4813. shift64RightJamming( intval.low, intval.high, - shiftCount,
  4814. intval.low, intval.high);
  4815. int64rec(absA).low := intval.low;
  4816. int64rec(absA).high := intval.high;
  4817. end
  4818. else
  4819. absA := absA shl shiftCount;
  4820. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  4821. end;
  4822. End;
  4823. {*----------------------------------------------------------------------------
  4824. | Returns the result of converting the 64-bit two's complement integer `a'
  4825. | to the double-precision floating-point format. The conversion is performed
  4826. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4827. *----------------------------------------------------------------------------*}
  4828. function int64_to_float64( a: int64 ): float64;
  4829. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  4830. var
  4831. zSign : flag;
  4832. float_result : float64;
  4833. intval : int64rec;
  4834. AbsA : bits64;
  4835. shiftcount : int8;
  4836. zSig0, zSig1 : bits32;
  4837. Begin
  4838. if ( a = 0 ) then
  4839. Begin
  4840. packFloat64( 0, 0, 0, 0, result );
  4841. exit;
  4842. end;
  4843. zSign := flag( a < 0 );
  4844. if ZSign<>0 then
  4845. AbsA := -a
  4846. else
  4847. AbsA := a;
  4848. shiftCount := countLeadingZeros64( absA ) - 11;
  4849. if ( 0 <= shiftCount ) then
  4850. Begin
  4851. absA := absA shl shiftcount;
  4852. zSig0:=int64rec(absA).high;
  4853. zSig1:=int64rec(absA).low;
  4854. End
  4855. else
  4856. Begin
  4857. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  4858. End;
  4859. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  4860. int64_to_float64:= float_result;
  4861. End;
  4862. {*----------------------------------------------------------------------------
  4863. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  4864. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  4865. | Otherwise, returns 0.
  4866. *----------------------------------------------------------------------------*}
  4867. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  4868. begin
  4869. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  4870. end;
  4871. {*----------------------------------------------------------------------------
  4872. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  4873. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  4874. | any carry out is lost. The result is broken into two 64-bit pieces which
  4875. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  4876. *----------------------------------------------------------------------------*}
  4877. procedure add128(a0: bits64; a1: bits64; b0: bits64; b1: bits64; var z0Ptr: bits64; var z1Ptr : bits64);inline;
  4878. var
  4879. z1: bits64;
  4880. begin
  4881. z1 := a1 + b1;
  4882. z1Ptr := z1;
  4883. z0Ptr := a0 + b0 + ord( z1 < a1 );
  4884. end;
  4885. {*----------------------------------------------------------------------------
  4886. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  4887. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  4888. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  4889. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  4890. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  4891. | the most-significant bit of the extra result, and the other 63 bits of the
  4892. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  4893. | were all zero. This extra result is stored in the location pointed to by
  4894. | `z2Ptr'. The value of `count' can be arbitrarily large.
  4895. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  4896. | to form a fixed-point value with binary point between `a1' and `a2'. This
  4897. | fixed-point value is shifted right by the number of bits given in `count',
  4898. | and the integer part of the result is returned at the locations pointed to
  4899. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  4900. | corrupted as described above, and is returned at the location pointed to by
  4901. | `z2Ptr'.)
  4902. *----------------------------------------------------------------------------*}
  4903. procedure shift128ExtraRightJamming(
  4904. a0: bits64;
  4905. a1: bits64;
  4906. a2: bits64;
  4907. count: int16;
  4908. var z0Ptr: bits64;
  4909. var z1Ptr: bits64;
  4910. var z2Ptr: bits64);
  4911. var
  4912. z0, z1, z2: bits64;
  4913. negCount: int8;
  4914. begin
  4915. negCount := ( - count ) and 63;
  4916. if ( count = 0 ) then
  4917. begin
  4918. z2 := a2;
  4919. z1 := a1;
  4920. z0 := a0;
  4921. end
  4922. else begin
  4923. if ( count < 64 ) then
  4924. begin
  4925. z2 := a1 shr negCount;
  4926. z1 := ( a0 shl negCount ) or ( a1 shr count );
  4927. z0 := a0 shr count;
  4928. end
  4929. else begin
  4930. if ( count = 64 ) then
  4931. begin
  4932. z2 := a1;
  4933. z1 := a0;
  4934. end
  4935. else begin
  4936. a2 := a2 or a1;
  4937. if ( count < 128 ) then
  4938. begin
  4939. z2 := a0 shl negCount;
  4940. z1 := a0 shr ( count and 63 );
  4941. end
  4942. else begin
  4943. if ( count = 128 ) then
  4944. z2 := a0
  4945. else
  4946. z2 := ord( a0 <> 0 );
  4947. z1 := 0;
  4948. end;
  4949. end;
  4950. z0 := 0;
  4951. end;
  4952. z2 := z2 or ord( a2 <> 0 );
  4953. end;
  4954. z2Ptr := z2;
  4955. z1Ptr := z1;
  4956. z0Ptr := z0;
  4957. end;
  4958. {*----------------------------------------------------------------------------
  4959. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  4960. | _plus_ the number of bits given in `count'. The shifted result is at most
  4961. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  4962. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  4963. | shifted off is the most-significant bit of the extra result, and the other
  4964. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  4965. | bits shifted off were all zero. This extra result is stored in the location
  4966. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  4967. | (This routine makes more sense if `a0' and `a1' are considered to form
  4968. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  4969. | point value is shifted right by the number of bits given in `count', and
  4970. | the integer part of the result is returned at the location pointed to by
  4971. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  4972. | described above, and is returned at the location pointed to by `z1Ptr'.)
  4973. *----------------------------------------------------------------------------*}
  4974. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  4975. var
  4976. z0, z1: bits64;
  4977. negCount: int8;
  4978. begin
  4979. negCount := ( - count ) and 63;
  4980. if ( count = 0 ) then
  4981. begin
  4982. z1 := a1;
  4983. z0 := a0;
  4984. end
  4985. else if ( count < 64 ) then
  4986. begin
  4987. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  4988. z0 := a0 shr count;
  4989. end
  4990. else begin
  4991. if ( count = 64 ) then
  4992. begin
  4993. z1 := a0 or ord( a1 <> 0 );
  4994. end
  4995. else begin
  4996. z1 := ord( ( a0 or a1 ) <> 0 );
  4997. end;
  4998. z0 := 0;
  4999. end;
  5000. z1Ptr := z1;
  5001. z0Ptr := z0;
  5002. end;
  5003. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5004. {*----------------------------------------------------------------------------
  5005. | Returns the fraction bits of the extended double-precision floating-point
  5006. | value `a'.
  5007. *----------------------------------------------------------------------------*}
  5008. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5009. begin
  5010. result:=a.low;
  5011. end;
  5012. {*----------------------------------------------------------------------------
  5013. | Returns the exponent bits of the extended double-precision floating-point
  5014. | value `a'.
  5015. *----------------------------------------------------------------------------*}
  5016. function extractFloatx80Exp(a : floatx80): int32;inline;
  5017. begin
  5018. result:=a.high and $7FFF;
  5019. end;
  5020. {*----------------------------------------------------------------------------
  5021. | Returns the sign bit of the extended double-precision floating-point value
  5022. | `a'.
  5023. *----------------------------------------------------------------------------*}
  5024. function extractFloatx80Sign(a : floatx80): flag;inline;
  5025. begin
  5026. result:=a.high shr 15;
  5027. end;
  5028. {*----------------------------------------------------------------------------
  5029. | Normalizes the subnormal extended double-precision floating-point value
  5030. | represented by the denormalized significand `aSig'. The normalized exponent
  5031. | and significand are stored at the locations pointed to by `zExpPtr' and
  5032. | `zSigPtr', respectively.
  5033. *----------------------------------------------------------------------------*}
  5034. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5035. var
  5036. shiftCount: int8;
  5037. begin
  5038. shiftCount := countLeadingZeros64( aSig );
  5039. zSigPtr := aSig shl shiftCount;
  5040. zExpPtr := 1 - shiftCount;
  5041. end;
  5042. {*----------------------------------------------------------------------------
  5043. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5044. | extended double-precision floating-point value, returning the result.
  5045. *----------------------------------------------------------------------------*}
  5046. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5047. var
  5048. z: floatx80;
  5049. begin
  5050. z.low := zSig;
  5051. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5052. result:=z;
  5053. end;
  5054. {*----------------------------------------------------------------------------
  5055. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5056. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5057. | and returns the proper extended double-precision floating-point value
  5058. | corresponding to the abstract input. Ordinarily, the abstract value is
  5059. | rounded and packed into the extended double-precision format, with the
  5060. | inexact exception raised if the abstract input cannot be represented
  5061. | exactly. However, if the abstract value is too large, the overflow and
  5062. | inexact exceptions are raised and an infinity or maximal finite value is
  5063. | returned. If the abstract value is too small, the input value is rounded to
  5064. | a subnormal number, and the underflow and inexact exceptions are raised if
  5065. | the abstract input cannot be represented exactly as a subnormal extended
  5066. | double-precision floating-point number.
  5067. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5068. | number of bits as single or double precision, respectively. Otherwise, the
  5069. | result is rounded to the full precision of the extended double-precision
  5070. | format.
  5071. | The input significand must be normalized or smaller. If the input
  5072. | significand is not normalized, `zExp' must be 0; in that case, the result
  5073. | returned is a subnormal number, and it must not require rounding. The
  5074. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5075. | Floating-Point Arithmetic.
  5076. *----------------------------------------------------------------------------*}
  5077. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5078. var
  5079. roundingMode: int8;
  5080. roundNearestEven, increment, isTiny: flag;
  5081. roundIncrement, roundMask, roundBits: int64;
  5082. label
  5083. precision80;
  5084. begin
  5085. roundingMode := float_rounding_mode;
  5086. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5087. if ( roundingPrecision = 80 ) then
  5088. goto precision80;
  5089. if ( roundingPrecision = 64 ) then
  5090. begin
  5091. roundIncrement := int64( $0000000000000400 );
  5092. roundMask := int64( $00000000000007FF );
  5093. end
  5094. else if ( roundingPrecision = 32 ) then
  5095. begin
  5096. roundIncrement := int64( $0000008000000000 );
  5097. roundMask := int64( $000000FFFFFFFFFF );
  5098. end
  5099. else begin
  5100. goto precision80;
  5101. end;
  5102. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5103. if ( not (roundNearestEven<>0) ) then
  5104. begin
  5105. if ( roundingMode = float_round_to_zero ) then
  5106. begin
  5107. roundIncrement := 0;
  5108. end
  5109. else begin
  5110. roundIncrement := roundMask;
  5111. if ( zSign<>0 ) then
  5112. begin
  5113. if ( roundingMode = float_round_up ) then
  5114. roundIncrement := 0;
  5115. end
  5116. else begin
  5117. if ( roundingMode = float_round_down ) then
  5118. roundIncrement := 0;
  5119. end;
  5120. end;
  5121. end;
  5122. roundBits := zSig0 and roundMask;
  5123. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5124. if ( ( $7FFE < zExp )
  5125. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5126. ) begin
  5127. goto overflow;
  5128. end;
  5129. if ( zExp <= 0 ) begin
  5130. isTiny =
  5131. ( float_detect_tininess = float_tininess_before_rounding )
  5132. or ( zExp < 0 )
  5133. or ( zSig0 <= zSig0 + roundIncrement );
  5134. shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
  5135. zExp := 0;
  5136. roundBits := zSig0 and roundMask;
  5137. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5138. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5139. zSig0 += roundIncrement;
  5140. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5141. roundIncrement := roundMask + 1;
  5142. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5143. roundMask |= roundIncrement;
  5144. end;
  5145. zSig0 &= ~ roundMask;
  5146. result:=packFloatx80( zSign, zExp, zSig0 );
  5147. end;
  5148. end;
  5149. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5150. zSig0 += roundIncrement;
  5151. if ( zSig0 < roundIncrement ) begin
  5152. ++zExp;
  5153. zSig0 := LIT64( $8000000000000000 );
  5154. end;
  5155. roundIncrement := roundMask + 1;
  5156. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5157. roundMask |= roundIncrement;
  5158. end;
  5159. zSig0 &= ~ roundMask;
  5160. if ( zSig0 = 0 ) zExp := 0;
  5161. result:=packFloatx80( zSign, zExp, zSig0 );
  5162. precision80:
  5163. increment := ( (sbits64) zSig1 < 0 );
  5164. if ( ! roundNearestEven ) begin
  5165. if ( roundingMode = float_round_to_zero ) begin
  5166. increment := 0;
  5167. end;
  5168. else begin
  5169. if ( zSign ) begin
  5170. increment := ( roundingMode = float_round_down ) and zSig1;
  5171. end;
  5172. else begin
  5173. increment := ( roundingMode = float_round_up ) and zSig1;
  5174. end;
  5175. end;
  5176. end;
  5177. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5178. if ( ( $7FFE < zExp )
  5179. or ( ( zExp = $7FFE )
  5180. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5181. and increment
  5182. )
  5183. ) begin
  5184. roundMask := 0;
  5185. overflow:
  5186. float_raise( float_flag_overflow or float_flag_inexact );
  5187. if ( ( roundingMode = float_round_to_zero )
  5188. or ( zSign and ( roundingMode = float_round_up ) )
  5189. or ( ! zSign and ( roundingMode = float_round_down ) )
  5190. ) begin
  5191. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5192. end;
  5193. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5194. end;
  5195. if ( zExp <= 0 ) begin
  5196. isTiny =
  5197. ( float_detect_tininess = float_tininess_before_rounding )
  5198. or ( zExp < 0 )
  5199. or ! increment
  5200. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5201. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
  5202. zExp := 0;
  5203. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5204. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5205. if ( roundNearestEven ) begin
  5206. increment := ( (sbits64) zSig1 < 0 );
  5207. end;
  5208. else begin
  5209. if ( zSign ) begin
  5210. increment := ( roundingMode = float_round_down ) and zSig1;
  5211. end;
  5212. else begin
  5213. increment := ( roundingMode = float_round_up ) and zSig1;
  5214. end;
  5215. end;
  5216. if ( increment ) begin
  5217. ++zSig0;
  5218. zSig0 &=
  5219. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5220. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5221. end;
  5222. result:=packFloatx80( zSign, zExp, zSig0 );
  5223. end;
  5224. end;
  5225. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5226. if ( increment ) begin
  5227. ++zSig0;
  5228. if ( zSig0 = 0 ) begin
  5229. ++zExp;
  5230. zSig0 := LIT64( $8000000000000000 );
  5231. end;
  5232. else begin
  5233. zSig0 &= ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5234. end;
  5235. end;
  5236. else begin
  5237. if ( zSig0 = 0 ) zExp := 0;
  5238. end;
  5239. result:=packFloatx80( zSign, zExp, zSig0 );
  5240. end;
  5241. {*----------------------------------------------------------------------------
  5242. | Takes an abstract floating-point value having sign `zSign', exponent
  5243. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5244. | and returns the proper extended double-precision floating-point value
  5245. | corresponding to the abstract input. This routine is just like
  5246. | `roundAndPackFloatx80' except that the input significand does not have to be
  5247. | normalized.
  5248. *----------------------------------------------------------------------------*}
  5249. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5250. var
  5251. shiftCount: int8;
  5252. begin
  5253. if ( zSig0 = 0 ) begin
  5254. zSig0 := zSig1;
  5255. zSig1 := 0;
  5256. zExp -= 64;
  5257. end;
  5258. shiftCount := countLeadingZeros64( zSig0 );
  5259. shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  5260. zExp := eExp - shiftCount;
  5261. return
  5262. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5263. end;
  5264. {*----------------------------------------------------------------------------
  5265. | Returns the result of converting the extended double-precision floating-
  5266. | point value `a' to the 32-bit two's complement integer format. The
  5267. | conversion is performed according to the IEC/IEEE Standard for Binary
  5268. | Floating-Point Arithmetic---which means in particular that the conversion
  5269. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5270. | largest positive integer is returned. Otherwise, if the conversion
  5271. | overflows, the largest integer with the same sign as `a' is returned.
  5272. *----------------------------------------------------------------------------*}
  5273. function floatx80_to_int32(a: floatx80): int32;
  5274. var
  5275. aSign: flag;
  5276. aExp, shiftCount: int32;
  5277. aSig: bits64;
  5278. begin
  5279. aSig := extractFloatx80Frac( a );
  5280. aExp := extractFloatx80Exp( a );
  5281. aSign := extractFloatx80Sign( a );
  5282. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5283. shiftCount := $4037 - aExp;
  5284. if ( shiftCount <= 0 ) shiftCount := 1;
  5285. shift64RightJamming( aSig, shiftCount, &aSig );
  5286. result := roundAndPackInt32( aSign, aSig );
  5287. end;
  5288. {*----------------------------------------------------------------------------
  5289. | Returns the result of converting the extended double-precision floating-
  5290. | point value `a' to the 32-bit two's complement integer format. The
  5291. | conversion is performed according to the IEC/IEEE Standard for Binary
  5292. | Floating-Point Arithmetic, except that the conversion is always rounded
  5293. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5294. | Otherwise, if the conversion overflows, the largest integer with the same
  5295. | sign as `a' is returned.
  5296. *----------------------------------------------------------------------------*}
  5297. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5298. var
  5299. aSign: flag;
  5300. aExp, shiftCount: int32;
  5301. aSig, savedASig: bits64;
  5302. z: int32;
  5303. begin
  5304. aSig := extractFloatx80Frac( a );
  5305. aExp := extractFloatx80Exp( a );
  5306. aSign := extractFloatx80Sign( a );
  5307. if ( $401E < aExp ) begin
  5308. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5309. goto invalid;
  5310. end;
  5311. else if ( aExp < $3FFF ) begin
  5312. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5313. result := 0;
  5314. end;
  5315. shiftCount := $403E - aExp;
  5316. savedASig := aSig;
  5317. aSig >>= shiftCount;
  5318. z := aSig;
  5319. if ( aSign ) z := - z;
  5320. if ( ( z < 0 ) xor aSign ) begin
  5321. invalid:
  5322. float_raise( float_flag_invalid );
  5323. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5324. end;
  5325. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5326. softfloat_exception_flags or= float_flag_inexact;
  5327. end;
  5328. result := z;
  5329. end;
  5330. {*----------------------------------------------------------------------------
  5331. | Returns the result of converting the extended double-precision floating-
  5332. | point value `a' to the 64-bit two's complement integer format. The
  5333. | conversion is performed according to the IEC/IEEE Standard for Binary
  5334. | Floating-Point Arithmetic---which means in particular that the conversion
  5335. | is rounded according to the current rounding mode. If `a' is a NaN,
  5336. | the largest positive integer is returned. Otherwise, if the conversion
  5337. | overflows, the largest integer with the same sign as `a' is returned.
  5338. *----------------------------------------------------------------------------*}
  5339. function floatx80_to_int64(a: floatx80): int64;
  5340. var
  5341. aSign: flag;
  5342. aExp, shiftCount: int32;
  5343. aSig, aSigExtra: bits64;
  5344. begin
  5345. aSig := extractFloatx80Frac( a );
  5346. aExp := extractFloatx80Exp( a );
  5347. aSign := extractFloatx80Sign( a );
  5348. shiftCount := $403E - aExp;
  5349. if ( shiftCount <= 0 ) begin
  5350. if ( shiftCount ) begin
  5351. float_raise( float_flag_invalid );
  5352. if ( ! aSign
  5353. or ( ( aExp = $7FFF )
  5354. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5355. ) begin
  5356. result := LIT64( $7FFFFFFFFFFFFFFF );
  5357. end;
  5358. result := (sbits64) LIT64( $8000000000000000 );
  5359. end;
  5360. aSigExtra := 0;
  5361. end;
  5362. else begin
  5363. shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
  5364. end;
  5365. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5366. end;
  5367. {*----------------------------------------------------------------------------
  5368. | Returns the result of converting the extended double-precision floating-
  5369. | point value `a' to the 64-bit two's complement integer format. The
  5370. | conversion is performed according to the IEC/IEEE Standard for Binary
  5371. | Floating-Point Arithmetic, except that the conversion is always rounded
  5372. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5373. | Otherwise, if the conversion overflows, the largest integer with the same
  5374. | sign as `a' is returned.
  5375. *----------------------------------------------------------------------------*}
  5376. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5377. var
  5378. aSign: flag;
  5379. aExp, shiftCount: int32;
  5380. aSig: bits64;
  5381. z: int64;
  5382. begin
  5383. aSig := extractFloatx80Frac( a );
  5384. aExp := extractFloatx80Exp( a );
  5385. aSign := extractFloatx80Sign( a );
  5386. shiftCount := aExp - $403E;
  5387. if ( 0 <= shiftCount ) begin
  5388. aSig &= LIT64( $7FFFFFFFFFFFFFFF );
  5389. if ( ( a.high <> $C03E ) or aSig ) begin
  5390. float_raise( float_flag_invalid );
  5391. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5392. result := LIT64( $7FFFFFFFFFFFFFFF );
  5393. end;
  5394. end;
  5395. result := (sbits64) LIT64( $8000000000000000 );
  5396. end;
  5397. else if ( aExp < $3FFF ) begin
  5398. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5399. result := 0;
  5400. end;
  5401. z := aSig>>( - shiftCount );
  5402. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5403. softfloat_exception_flags or= float_flag_inexact;
  5404. end;
  5405. if ( aSign ) z := - z;
  5406. result := z;
  5407. end;
  5408. {*----------------------------------------------------------------------------
  5409. | Returns the result of converting the extended double-precision floating-
  5410. | point value `a' to the single-precision floating-point format. The
  5411. | conversion is performed according to the IEC/IEEE Standard for Binary
  5412. | Floating-Point Arithmetic.
  5413. *----------------------------------------------------------------------------*}
  5414. function floatx80_to_float32(a: floatx80): float32;
  5415. var
  5416. aSign: flag;
  5417. aExp: int32;
  5418. aSig: bits64;
  5419. begin
  5420. aSig := extractFloatx80Frac( a );
  5421. aExp := extractFloatx80Exp( a );
  5422. aSign := extractFloatx80Sign( a );
  5423. if ( aExp = $7FFF ) begin
  5424. if ( (bits64) ( aSig shl 1 ) ) begin
  5425. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5426. end;
  5427. result := packFloat32( aSign, $FF, 0 );
  5428. end;
  5429. shift64RightJamming( aSig, 33, &aSig );
  5430. if ( aExp or aSig ) aExp -= $3F81;
  5431. result := roundAndPackFloat32( aSign, aExp, aSig );
  5432. end;
  5433. {*----------------------------------------------------------------------------
  5434. | Returns the result of converting the extended double-precision floating-
  5435. | point value `a' to the double-precision floating-point format. The
  5436. | conversion is performed according to the IEC/IEEE Standard for Binary
  5437. | Floating-Point Arithmetic.
  5438. *----------------------------------------------------------------------------*}
  5439. function floatx80_to_float64(a: floatx80): float64;
  5440. var
  5441. aSign: flag;
  5442. aExp: int32;
  5443. aSig, zSig: bits64;
  5444. begin
  5445. aSig := extractFloatx80Frac( a );
  5446. aExp := extractFloatx80Exp( a );
  5447. aSign := extractFloatx80Sign( a );
  5448. if ( aExp = $7FFF ) begin
  5449. if ( (bits64) ( aSig shl 1 ) ) begin
  5450. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5451. end;
  5452. result := packFloat64( aSign, $7FF, 0 );
  5453. end;
  5454. shift64RightJamming( aSig, 1, &zSig );
  5455. if ( aExp or aSig ) aExp -= $3C01;
  5456. result := roundAndPackFloat64( aSign, aExp, zSig );
  5457. end;
  5458. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5459. {*----------------------------------------------------------------------------
  5460. | Returns the result of converting the extended double-precision floating-
  5461. | point value `a' to the quadruple-precision floating-point format. The
  5462. | conversion is performed according to the IEC/IEEE Standard for Binary
  5463. | Floating-Point Arithmetic.
  5464. *----------------------------------------------------------------------------*}
  5465. function floatx80_to_float128(a: floatx80): float128;
  5466. var
  5467. aSign: flag;
  5468. aExp: int16;
  5469. aSig, zSig0, zSig1: bits64;
  5470. begin
  5471. aSig := extractFloatx80Frac( a );
  5472. aExp := extractFloatx80Exp( a );
  5473. aSign := extractFloatx80Sign( a );
  5474. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5475. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5476. end;
  5477. shift128Right( aSig shl 1, 0, 16, &zSig0, &zSig1 );
  5478. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5479. end;
  5480. {$endif FPC_SOFTFLOAT_FLOAT128}
  5481. {*----------------------------------------------------------------------------
  5482. | Rounds the extended double-precision floating-point value `a' to an integer,
  5483. | and Returns the result as an extended quadruple-precision floating-point
  5484. | value. The operation is performed according to the IEC/IEEE Standard for
  5485. | Binary Floating-Point Arithmetic.
  5486. *----------------------------------------------------------------------------*}
  5487. function floatx80_round_to_int(a: floatx80): floatx80;
  5488. var
  5489. aSign: flag;
  5490. aExp: int32;
  5491. lastBitMask, roundBitsMask: bits64;
  5492. roundingMode: int8;
  5493. z: floatx80;
  5494. begin
  5495. aExp := extractFloatx80Exp( a );
  5496. if ( $403E <= aExp ) begin
  5497. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5498. result := propagateFloatx80NaN( a, a );
  5499. end;
  5500. result := a;
  5501. end;
  5502. if ( aExp < $3FFF ) begin
  5503. if ( ( aExp = 0 )
  5504. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5505. result := a;
  5506. end;
  5507. softfloat_exception_flags or= float_flag_inexact;
  5508. aSign := extractFloatx80Sign( a );
  5509. switch ( float_rounding_mode ) begin
  5510. case float_round_nearest_even:
  5511. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5512. ) begin
  5513. result :=
  5514. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5515. end;
  5516. break;
  5517. case float_round_down:
  5518. result :=
  5519. aSign ?
  5520. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5521. : packFloatx80( 0, 0, 0 );
  5522. case float_round_up:
  5523. result :=
  5524. aSign ? packFloatx80( 1, 0, 0 )
  5525. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5526. end;
  5527. result := packFloatx80( aSign, 0, 0 );
  5528. end;
  5529. lastBitMask := 1;
  5530. lastBitMask shl = $403E - aExp;
  5531. roundBitsMask := lastBitMask - 1;
  5532. z := a;
  5533. roundingMode := float_rounding_mode;
  5534. if ( roundingMode = float_round_nearest_even ) begin
  5535. z.low += lastBitMask>>1;
  5536. if ( ( z.low and roundBitsMask ) = 0 ) z.low &= ~ lastBitMask;
  5537. end;
  5538. else if ( roundingMode <> float_round_to_zero ) begin
  5539. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5540. z.low += roundBitsMask;
  5541. end;
  5542. end;
  5543. z.low &= ~ roundBitsMask;
  5544. if ( z.low = 0 ) begin
  5545. ++z.high;
  5546. z.low := LIT64( $8000000000000000 );
  5547. end;
  5548. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5549. result := z;
  5550. end;
  5551. {*----------------------------------------------------------------------------
  5552. | Returns the result of adding the absolute values of the extended double-
  5553. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5554. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5555. | The addition is performed according to the IEC/IEEE Standard for Binary
  5556. | Floating-Point Arithmetic.
  5557. *----------------------------------------------------------------------------*}
  5558. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5559. var
  5560. aExp, bExp, zExp: int32;
  5561. aSig, bSig, zSig0, zSig1: bits64;
  5562. expDiff: int32;
  5563. begin
  5564. aSig := extractFloatx80Frac( a );
  5565. aExp := extractFloatx80Exp( a );
  5566. bSig := extractFloatx80Frac( b );
  5567. bExp := extractFloatx80Exp( b );
  5568. expDiff := aExp - bExp;
  5569. if ( 0 < expDiff ) begin
  5570. if ( aExp = $7FFF ) begin
  5571. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5572. result := a;
  5573. end;
  5574. if ( bExp = 0 ) --expDiff;
  5575. shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  5576. zExp := aExp;
  5577. end;
  5578. else if ( expDiff < 0 ) begin
  5579. if ( bExp = $7FFF ) begin
  5580. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5581. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5582. end;
  5583. if ( aExp = 0 ) ++expDiff;
  5584. shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  5585. zExp := bExp;
  5586. end;
  5587. else begin
  5588. if ( aExp = $7FFF ) begin
  5589. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5590. result := propagateFloatx80NaN( a, b );
  5591. end;
  5592. result := a;
  5593. end;
  5594. zSig1 := 0;
  5595. zSig0 := aSig + bSig;
  5596. if ( aExp = 0 ) begin
  5597. normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
  5598. goto roundAndPack;
  5599. end;
  5600. zExp := aExp;
  5601. goto shiftRight1;
  5602. end;
  5603. zSig0 := aSig + bSig;
  5604. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5605. shiftRight1:
  5606. shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
  5607. zSig0 or= LIT64( $8000000000000000 );
  5608. ++zExp;
  5609. roundAndPack:
  5610. result :=
  5611. roundAndPackFloatx80(
  5612. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5613. end;
  5614. {*----------------------------------------------------------------------------
  5615. | Returns the result of subtracting the absolute values of the extended
  5616. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5617. | difference is negated before being returned. `zSign' is ignored if the
  5618. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5619. | Standard for Binary Floating-Point Arithmetic.
  5620. *----------------------------------------------------------------------------*}
  5621. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5622. var
  5623. aExp, bExp, zExp: int32;
  5624. aSig, bSig, zSig0, zSig1: bits64;
  5625. expDiff: int32;
  5626. z: floatx80;
  5627. begin
  5628. aSig := extractFloatx80Frac( a );
  5629. aExp := extractFloatx80Exp( a );
  5630. bSig := extractFloatx80Frac( b );
  5631. bExp := extractFloatx80Exp( b );
  5632. expDiff := aExp - bExp;
  5633. if ( 0 < expDiff ) goto aExpBigger;
  5634. if ( expDiff < 0 ) goto bExpBigger;
  5635. if ( aExp = $7FFF ) begin
  5636. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5637. result := propagateFloatx80NaN( a, b );
  5638. end;
  5639. float_raise( float_flag_invalid );
  5640. z.low := floatx80_default_nan_low;
  5641. z.high := floatx80_default_nan_high;
  5642. result := z;
  5643. end;
  5644. if ( aExp = 0 ) begin
  5645. aExp := 1;
  5646. bExp := 1;
  5647. end;
  5648. zSig1 := 0;
  5649. if ( bSig < aSig ) goto aBigger;
  5650. if ( aSig < bSig ) goto bBigger;
  5651. result := packFloatx80( float_rounding_mode = float_round_down, 0, 0 );
  5652. bExpBigger:
  5653. if ( bExp = $7FFF ) begin
  5654. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5655. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5656. end;
  5657. if ( aExp = 0 ) ++expDiff;
  5658. shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  5659. bBigger:
  5660. sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
  5661. zExp := bExp;
  5662. zSign xor = 1;
  5663. goto normalizeRoundAndPack;
  5664. aExpBigger:
  5665. if ( aExp = $7FFF ) begin
  5666. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5667. result := a;
  5668. end;
  5669. if ( bExp = 0 ) --expDiff;
  5670. shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  5671. aBigger:
  5672. sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
  5673. zExp := aExp;
  5674. normalizeRoundAndPack:
  5675. result :=
  5676. normalizeRoundAndPackFloatx80(
  5677. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5678. end;
  5679. {*----------------------------------------------------------------------------
  5680. | Returns the result of adding the extended double-precision floating-point
  5681. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  5682. | Standard for Binary Floating-Point Arithmetic.
  5683. *----------------------------------------------------------------------------*}
  5684. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  5685. var
  5686. aSign, bSign: flag;
  5687. begin
  5688. aSign := extractFloatx80Sign( a );
  5689. bSign := extractFloatx80Sign( b );
  5690. if ( aSign = bSign ) begin
  5691. result := addFloatx80Sigs( a, b, aSign );
  5692. end;
  5693. else begin
  5694. result := subFloatx80Sigs( a, b, aSign );
  5695. end;
  5696. end;
  5697. {*----------------------------------------------------------------------------
  5698. | Returns the result of subtracting the extended double-precision floating-
  5699. | point values `a' and `b'. The operation is performed according to the
  5700. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5701. *----------------------------------------------------------------------------*}
  5702. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  5703. var
  5704. aSign, bSign: flag;
  5705. begin
  5706. aSign := extractFloatx80Sign( a );
  5707. bSign := extractFloatx80Sign( b );
  5708. if ( aSign = bSign ) begin
  5709. result := subFloatx80Sigs( a, b, aSign );
  5710. end;
  5711. else begin
  5712. result := addFloatx80Sigs( a, b, aSign );
  5713. end;
  5714. end;
  5715. {*----------------------------------------------------------------------------
  5716. | Returns the result of multiplying the extended double-precision floating-
  5717. | point values `a' and `b'. The operation is performed according to the
  5718. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5719. *----------------------------------------------------------------------------*}
  5720. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  5721. var
  5722. aSign, bSign, zSign: flag;
  5723. aExp, bExp, zExp: int32;
  5724. aSig, bSig, zSig0, zSig1: bits64;
  5725. z: floatx80;
  5726. begin
  5727. aSig := extractFloatx80Frac( a );
  5728. aExp := extractFloatx80Exp( a );
  5729. aSign := extractFloatx80Sign( a );
  5730. bSig := extractFloatx80Frac( b );
  5731. bExp := extractFloatx80Exp( b );
  5732. bSign := extractFloatx80Sign( b );
  5733. zSign := aSign xor bSign;
  5734. if ( aExp = $7FFF ) begin
  5735. if ( (bits64) ( aSig shl 1 )
  5736. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  5737. result := propagateFloatx80NaN( a, b );
  5738. end;
  5739. if ( ( bExp or bSig ) = 0 ) goto invalid;
  5740. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5741. end;
  5742. if ( bExp = $7FFF ) begin
  5743. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5744. if ( ( aExp or aSig ) = 0 ) begin
  5745. invalid:
  5746. float_raise( float_flag_invalid );
  5747. z.low := floatx80_default_nan_low;
  5748. z.high := floatx80_default_nan_high;
  5749. result := z;
  5750. end;
  5751. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5752. end;
  5753. if ( aExp = 0 ) begin
  5754. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5755. normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  5756. end;
  5757. if ( bExp = 0 ) begin
  5758. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5759. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5760. end;
  5761. zExp := aExp + bExp - $3FFE;
  5762. mul64To128( aSig, bSig, &zSig0, &zSig1 );
  5763. if ( 0 < (sbits64) zSig0 ) begin
  5764. shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
  5765. --zExp;
  5766. end;
  5767. result :=
  5768. roundAndPackFloatx80(
  5769. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5770. end;
  5771. {*----------------------------------------------------------------------------
  5772. | Returns the result of dividing the extended double-precision floating-point
  5773. | value `a' by the corresponding value `b'. The operation is performed
  5774. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5775. *----------------------------------------------------------------------------*}
  5776. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  5777. var
  5778. aSign, bSign, zSign: flag;
  5779. aExp, bExp, zExp: int32;
  5780. aSig, bSig, zSig0, zSig1: bits64;
  5781. rem0, rem1, rem2, term0, term1, term2: bits64;
  5782. z: floatx80;
  5783. begin
  5784. aSig := extractFloatx80Frac( a );
  5785. aExp := extractFloatx80Exp( a );
  5786. aSign := extractFloatx80Sign( a );
  5787. bSig := extractFloatx80Frac( b );
  5788. bExp := extractFloatx80Exp( b );
  5789. bSign := extractFloatx80Sign( b );
  5790. zSign := aSign xor bSign;
  5791. if ( aExp = $7FFF ) begin
  5792. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5793. if ( bExp = $7FFF ) begin
  5794. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5795. goto invalid;
  5796. end;
  5797. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5798. end;
  5799. if ( bExp = $7FFF ) begin
  5800. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5801. result := packFloatx80( zSign, 0, 0 );
  5802. end;
  5803. if ( bExp = 0 ) begin
  5804. if ( bSig = 0 ) begin
  5805. if ( ( aExp or aSig ) = 0 ) begin
  5806. invalid:
  5807. float_raise( float_flag_invalid );
  5808. z.low := floatx80_default_nan_low;
  5809. z.high := floatx80_default_nan_high;
  5810. result := z;
  5811. end;
  5812. float_raise( float_flag_divbyzero );
  5813. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5814. end;
  5815. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5816. end;
  5817. if ( aExp = 0 ) begin
  5818. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5819. normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  5820. end;
  5821. zExp := aExp - bExp + $3FFE;
  5822. rem1 := 0;
  5823. if ( bSig <= aSig ) begin
  5824. shift128Right( aSig, 0, 1, &aSig, &rem1 );
  5825. ++zExp;
  5826. end;
  5827. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  5828. mul64To128( bSig, zSig0, &term0, &term1 );
  5829. sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
  5830. while ( (sbits64) rem0 < 0 ) begin
  5831. --zSig0;
  5832. add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
  5833. end;
  5834. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  5835. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  5836. mul64To128( bSig, zSig1, &term1, &term2 );
  5837. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  5838. while ( (sbits64) rem1 < 0 ) begin
  5839. --zSig1;
  5840. add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
  5841. end;
  5842. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  5843. end;
  5844. result :=
  5845. roundAndPackFloatx80(
  5846. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5847. end;
  5848. {*----------------------------------------------------------------------------
  5849. | Returns the remainder of the extended double-precision floating-point value
  5850. | `a' with respect to the corresponding value `b'. The operation is performed
  5851. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5852. *----------------------------------------------------------------------------*}
  5853. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  5854. var
  5855. aSign, bSign, zSign: flag;
  5856. aExp, bExp, expDiff: int32;
  5857. aSig0, aSig1, bSig: bits64;
  5858. q, term0, term1, alternateASig0, alternateASig1: bits64;
  5859. z: floatx80;
  5860. begin
  5861. aSig0 := extractFloatx80Frac( a );
  5862. aExp := extractFloatx80Exp( a );
  5863. aSign := extractFloatx80Sign( a );
  5864. bSig := extractFloatx80Frac( b );
  5865. bExp := extractFloatx80Exp( b );
  5866. bSign := extractFloatx80Sign( b );
  5867. if ( aExp = $7FFF ) begin
  5868. if ( (bits64) ( aSig0 shl 1 )
  5869. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  5870. result := propagateFloatx80NaN( a, b );
  5871. end;
  5872. goto invalid;
  5873. end;
  5874. if ( bExp = $7FFF ) begin
  5875. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5876. result := a;
  5877. end;
  5878. if ( bExp = 0 ) begin
  5879. if ( bSig = 0 ) begin
  5880. invalid:
  5881. float_raise( float_flag_invalid );
  5882. z.low := floatx80_default_nan_low;
  5883. z.high := floatx80_default_nan_high;
  5884. result := z;
  5885. end;
  5886. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5887. end;
  5888. if ( aExp = 0 ) begin
  5889. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  5890. normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  5891. end;
  5892. bSig or= LIT64( $8000000000000000 );
  5893. zSign := aSign;
  5894. expDiff := aExp - bExp;
  5895. aSig1 := 0;
  5896. if ( expDiff < 0 ) begin
  5897. if ( expDiff < -1 ) result := a;
  5898. shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
  5899. expDiff := 0;
  5900. end;
  5901. q := ( bSig <= aSig0 );
  5902. if ( q ) aSig0 -= bSig;
  5903. expDiff -= 64;
  5904. while ( 0 < expDiff ) begin
  5905. q := estimateDiv128To64( aSig0, aSig1, bSig );
  5906. q := ( 2 < q ) ? q - 2 : 0;
  5907. mul64To128( bSig, q, &term0, &term1 );
  5908. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5909. shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
  5910. expDiff -= 62;
  5911. end;
  5912. expDiff += 64;
  5913. if ( 0 < expDiff ) begin
  5914. q := estimateDiv128To64( aSig0, aSig1, bSig );
  5915. q := ( 2 < q ) ? q - 2 : 0;
  5916. q >>= 64 - expDiff;
  5917. mul64To128( bSig, q shl ( 64 - expDiff ), &term0, &term1 );
  5918. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5919. shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
  5920. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  5921. ++q;
  5922. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5923. end;
  5924. end;
  5925. else begin
  5926. term1 := 0;
  5927. term0 := bSig;
  5928. end;
  5929. sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
  5930. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  5931. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  5932. and ( q and 1 ) )
  5933. ) begin
  5934. aSig0 := alternateASig0;
  5935. aSig1 := alternateASig1;
  5936. zSign := ! zSign;
  5937. end;
  5938. result :=
  5939. normalizeRoundAndPackFloatx80(
  5940. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  5941. end;
  5942. {*----------------------------------------------------------------------------
  5943. | Returns the square root of the extended double-precision floating-point
  5944. | value `a'. The operation is performed according to the IEC/IEEE Standard
  5945. | for Binary Floating-Point Arithmetic.
  5946. *----------------------------------------------------------------------------*}
  5947. function floatx80_sqrt(a: floatx80): floatx80;
  5948. var
  5949. aSign: flag;
  5950. aExp, zExp: int32;
  5951. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  5952. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  5953. z: floatx80;
  5954. label
  5955. invalid;
  5956. begin
  5957. aSig0 := extractFloatx80Frac( a );
  5958. aExp := extractFloatx80Exp( a );
  5959. aSign := extractFloatx80Sign( a );
  5960. if ( aExp = $7FFF ) begin
  5961. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  5962. if ( ! aSign ) result := a;
  5963. goto invalid;
  5964. end;
  5965. if ( aSign ) begin
  5966. if ( ( aExp or aSig0 ) = 0 ) result := a;
  5967. invalid:
  5968. float_raise( float_flag_invalid );
  5969. z.low := floatx80_default_nan_low;
  5970. z.high := floatx80_default_nan_high;
  5971. result := z;
  5972. end;
  5973. if ( aExp = 0 ) begin
  5974. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  5975. normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  5976. end;
  5977. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  5978. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  5979. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), &aSig0, &aSig1 );
  5980. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  5981. doubleZSig0 := zSig0 shl 1;
  5982. mul64To128( zSig0, zSig0, &term0, &term1 );
  5983. sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  5984. while ( (sbits64) rem0 < 0 ) begin
  5985. --zSig0;
  5986. doubleZSig0 -= 2;
  5987. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, &rem0, &rem1 );
  5988. end;
  5989. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  5990. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  5991. if ( zSig1 = 0 ) zSig1 := 1;
  5992. mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  5993. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  5994. mul64To128( zSig1, zSig1, &term2, &term3 );
  5995. sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  5996. while ( (sbits64) rem1 < 0 ) begin
  5997. --zSig1;
  5998. shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  5999. term3 or= 1;
  6000. term2 or= doubleZSig0;
  6001. add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  6002. end;
  6003. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6004. end;
  6005. shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
  6006. zSig0 or= doubleZSig0;
  6007. result :=
  6008. roundAndPackFloatx80(
  6009. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6010. end;
  6011. {*----------------------------------------------------------------------------
  6012. | Returns 1 if the extended double-precision floating-point value `a' is
  6013. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6014. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6015. | Arithmetic.
  6016. *----------------------------------------------------------------------------*}
  6017. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6018. begin
  6019. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6020. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6021. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6022. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6023. ) begin
  6024. if ( floatx80_is_signaling_nan( a )
  6025. or floatx80_is_signaling_nan( b ) ) begin
  6026. float_raise( float_flag_invalid );
  6027. end;
  6028. result := 0;
  6029. end;
  6030. result :=
  6031. ( a.low = b.low )
  6032. and ( ( a.high = b.high )
  6033. or ( ( a.low = 0 )
  6034. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6035. );
  6036. end;
  6037. {*----------------------------------------------------------------------------
  6038. | Returns 1 if the extended double-precision floating-point value `a' is
  6039. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6040. | comparison is performed according to the IEC/IEEE Standard for Binary
  6041. | Floating-Point Arithmetic.
  6042. *----------------------------------------------------------------------------*}
  6043. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6044. var
  6045. aSign, bSign: flag;
  6046. begin
  6047. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6048. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6049. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6050. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6051. ) begin
  6052. float_raise( float_flag_invalid );
  6053. result := 0;
  6054. end;
  6055. aSign := extractFloatx80Sign( a );
  6056. bSign := extractFloatx80Sign( b );
  6057. if ( aSign <> bSign ) begin
  6058. result :=
  6059. aSign
  6060. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6061. = 0 );
  6062. end;
  6063. result :=
  6064. aSign ? le128( b.high, b.low, a.high, a.low )
  6065. : le128( a.high, a.low, b.high, b.low );
  6066. end;
  6067. {*----------------------------------------------------------------------------
  6068. | Returns 1 if the extended double-precision floating-point value `a' is
  6069. | less than the corresponding value `b', and 0 otherwise. The comparison
  6070. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6071. | Arithmetic.
  6072. *----------------------------------------------------------------------------*}
  6073. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6074. var
  6075. aSign, bSign: flag;
  6076. begin
  6077. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6078. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6079. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6080. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6081. ) begin
  6082. float_raise( float_flag_invalid );
  6083. result := 0;
  6084. end;
  6085. aSign := extractFloatx80Sign( a );
  6086. bSign := extractFloatx80Sign( b );
  6087. if ( aSign <> bSign ) begin
  6088. result :=
  6089. aSign
  6090. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6091. <> 0 );
  6092. end;
  6093. result :=
  6094. aSign ? lt128( b.high, b.low, a.high, a.low )
  6095. : lt128( a.high, a.low, b.high, b.low );
  6096. end;
  6097. {*----------------------------------------------------------------------------
  6098. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6099. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6100. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6101. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6102. *----------------------------------------------------------------------------*}
  6103. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6104. begin
  6105. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6106. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6107. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6108. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6109. ) begin
  6110. float_raise( float_flag_invalid );
  6111. result := 0;
  6112. end;
  6113. result :=
  6114. ( a.low = b.low )
  6115. and ( ( a.high = b.high )
  6116. or ( ( a.low = 0 )
  6117. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6118. );
  6119. end;
  6120. {*----------------------------------------------------------------------------
  6121. | Returns 1 if the extended double-precision floating-point value `a' is less
  6122. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6123. | do not cause an exception. Otherwise, the comparison is performed according
  6124. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6125. *----------------------------------------------------------------------------*}
  6126. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6127. var
  6128. aSign, bSign: flag;
  6129. begin
  6130. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6131. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6132. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6133. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6134. ) begin
  6135. if ( floatx80_is_signaling_nan( a )
  6136. or floatx80_is_signaling_nan( b ) ) begin
  6137. float_raise( float_flag_invalid );
  6138. end;
  6139. result := 0;
  6140. end;
  6141. aSign := extractFloatx80Sign( a );
  6142. bSign := extractFloatx80Sign( b );
  6143. if ( aSign <> bSign ) begin
  6144. result :=
  6145. aSign
  6146. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6147. = 0 );
  6148. end;
  6149. result :=
  6150. aSign ? le128( b.high, b.low, a.high, a.low )
  6151. : le128( a.high, a.low, b.high, b.low );
  6152. end;
  6153. {*----------------------------------------------------------------------------
  6154. | Returns 1 if the extended double-precision floating-point value `a' is less
  6155. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6156. | an exception. Otherwise, the comparison is performed according to the
  6157. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6158. *----------------------------------------------------------------------------*}
  6159. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6160. var
  6161. aSign, bSign: flag;
  6162. begin
  6163. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6164. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6165. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6166. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6167. ) begin
  6168. if ( floatx80_is_signaling_nan( a )
  6169. or floatx80_is_signaling_nan( b ) ) begin
  6170. float_raise( float_flag_invalid );
  6171. end;
  6172. result := 0;
  6173. end;
  6174. aSign := extractFloatx80Sign( a );
  6175. bSign := extractFloatx80Sign( b );
  6176. if ( aSign <> bSign ) begin
  6177. result :=
  6178. aSign
  6179. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6180. <> 0 );
  6181. end;
  6182. result :=
  6183. aSign ? lt128( b.high, b.low, a.high, a.low )
  6184. : lt128( a.high, a.low, b.high, b.low );
  6185. end;
  6186. {$endif FPC_SOFTFLOAT_FLOATX80}
  6187. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6188. {*----------------------------------------------------------------------------
  6189. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6190. | floating-point value `a'.
  6191. *----------------------------------------------------------------------------*}
  6192. function extractFloat128Frac1(a : float128): bits64;
  6193. begin
  6194. result:=a.low;
  6195. end;
  6196. {*----------------------------------------------------------------------------
  6197. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6198. | floating-point value `a'.
  6199. *----------------------------------------------------------------------------*}
  6200. function extractFloat128Frac0(a : float128): bits64;
  6201. begin
  6202. result:=a.high and int64($0000FFFFFFFFFFFF);
  6203. end;
  6204. {*----------------------------------------------------------------------------
  6205. | Returns the exponent bits of the quadruple-precision floating-point value
  6206. | `a'.
  6207. *----------------------------------------------------------------------------*}
  6208. function extractFloat128Exp(a : float128): int32;
  6209. begin
  6210. result:=( a.high shr 48 ) and $7FFF;
  6211. end;
  6212. {*----------------------------------------------------------------------------
  6213. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6214. *----------------------------------------------------------------------------*}
  6215. function extractFloat128Sign(a : float128): flag;
  6216. begin
  6217. result:=a.high shr 63;
  6218. end;
  6219. {*----------------------------------------------------------------------------
  6220. | Normalizes the subnormal quadruple-precision floating-point value
  6221. | represented by the denormalized significand formed by the concatenation of
  6222. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6223. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6224. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6225. | least significant 64 bits of the normalized significand are stored at the
  6226. | location pointed to by `zSig1Ptr'.
  6227. *----------------------------------------------------------------------------*}
  6228. procedure normalizeFloat128Subnormal(
  6229. aSig0: bits64;
  6230. aSig1: bits64;
  6231. var zExpPtr: int32;
  6232. var zSig0Ptr: bits64;
  6233. var zSig1Ptr: bits64);
  6234. var
  6235. shiftCount: int8;
  6236. begin
  6237. if ( aSig0 = 0 ) then
  6238. begin
  6239. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6240. if ( shiftCount < 0 ) then
  6241. begin
  6242. zSig0Ptr := aSig1 shr ( - shiftCount );
  6243. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6244. end
  6245. else begin
  6246. zSig0Ptr := aSig1 shl shiftCount;
  6247. zSig1Ptr := 0;
  6248. end;
  6249. zExpPtr := - shiftCount - 63;
  6250. end
  6251. else begin
  6252. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6253. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6254. zExpPtr := 1 - shiftCount;
  6255. end;
  6256. end;
  6257. {*----------------------------------------------------------------------------
  6258. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6259. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6260. | floating-point value, returning the result. After being shifted into the
  6261. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6262. | added together to form the most significant 32 bits of the result. This
  6263. | means that any integer portion of `zSig0' will be added into the exponent.
  6264. | Since a properly normalized significand will have an integer portion equal
  6265. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6266. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6267. | significand.
  6268. *----------------------------------------------------------------------------*}
  6269. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6270. var
  6271. z: float128;
  6272. begin
  6273. z.low := zSig1;
  6274. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6275. result:=z;
  6276. end;
  6277. {*----------------------------------------------------------------------------
  6278. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6279. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6280. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6281. | corresponding to the abstract input. Ordinarily, the abstract value is
  6282. | simply rounded and packed into the quadruple-precision format, with the
  6283. | inexact exception raised if the abstract input cannot be represented
  6284. | exactly. However, if the abstract value is too large, the overflow and
  6285. | inexact exceptions are raised and an infinity or maximal finite value is
  6286. | returned. If the abstract value is too small, the input value is rounded to
  6287. | a subnormal number, and the underflow and inexact exceptions are raised if
  6288. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6289. | precision floating-point number.
  6290. | The input significand must be normalized or smaller. If the input
  6291. | significand is not normalized, `zExp' must be 0; in that case, the result
  6292. | returned is a subnormal number, and it must not require rounding. In the
  6293. | usual case that the input significand is normalized, `zExp' must be 1 less
  6294. | than the ``true'' floating-point exponent. The handling of underflow and
  6295. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6296. *----------------------------------------------------------------------------*}
  6297. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6298. var
  6299. roundingMode: int8;
  6300. roundNearestEven, increment, isTiny: flag;
  6301. begin
  6302. roundingMode := float_rounding_mode;
  6303. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6304. increment := ord( sbits64(zSig2) < 0 );
  6305. if ( roundNearestEven=0 ) then
  6306. begin
  6307. if ( roundingMode = float_round_to_zero ) then
  6308. begin
  6309. increment := 0;
  6310. end
  6311. else begin
  6312. if ( zSign<>0 ) then
  6313. begin
  6314. increment := ord( roundingMode = float_round_down ) and zSig2;
  6315. end
  6316. else begin
  6317. increment := ord( roundingMode = float_round_up ) and zSig2;
  6318. end;
  6319. end;
  6320. end;
  6321. if ( $7FFD <= bits32(zExp) ) then
  6322. begin
  6323. if ( ord( $7FFD < zExp )
  6324. or ( ord( zExp = $7FFD )
  6325. and eq128(
  6326. int64( $0001FFFFFFFFFFFF ),
  6327. int64( $FFFFFFFFFFFFFFFF ),
  6328. zSig0,
  6329. zSig1
  6330. )
  6331. and increment
  6332. )
  6333. )<>0 then
  6334. begin
  6335. float_raise( float_flag_overflow or float_flag_inexact );
  6336. if ( ord( roundingMode = float_round_to_zero )
  6337. or ( zSign and ord( roundingMode = float_round_up ) )
  6338. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6339. )<>0 then
  6340. begin
  6341. result :=
  6342. packFloat128(
  6343. zSign,
  6344. $7FFE,
  6345. int64( $0000FFFFFFFFFFFF ),
  6346. int64( $FFFFFFFFFFFFFFFF )
  6347. );
  6348. end;
  6349. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6350. end;
  6351. if ( zExp < 0 ) then
  6352. begin
  6353. isTiny :=
  6354. ord(( float_detect_tininess = float_tininess_before_rounding )
  6355. or ( zExp < -1 )
  6356. or not( increment<>0 )
  6357. or boolean(lt128(
  6358. zSig0,
  6359. zSig1,
  6360. int64( $0001FFFFFFFFFFFF ),
  6361. int64( $FFFFFFFFFFFFFFFF )
  6362. )));
  6363. shift128ExtraRightJamming(
  6364. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6365. zExp := 0;
  6366. if ( isTiny and zSig2 )<>0 then
  6367. float_raise( float_flag_underflow );
  6368. if ( roundNearestEven<>0 ) then
  6369. begin
  6370. increment := ord( sbits64(zSig2) < 0 );
  6371. end
  6372. else begin
  6373. if ( zSign<>0 ) then
  6374. begin
  6375. increment := ord( roundingMode = float_round_down ) and zSig2;
  6376. end
  6377. else begin
  6378. increment := ord( roundingMode = float_round_up ) and zSig2;
  6379. end;
  6380. end;
  6381. end;
  6382. end;
  6383. if ( zSig2<>0 ) then
  6384. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6385. if ( increment<>0 ) then
  6386. begin
  6387. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6388. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6389. end
  6390. else begin
  6391. if ( ( zSig0 or zSig1 ) = 0 ) then
  6392. zExp := 0;
  6393. end;
  6394. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6395. end;
  6396. {*----------------------------------------------------------------------------
  6397. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6398. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6399. | returns the proper quadruple-precision floating-point value corresponding
  6400. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6401. | except that the input significand has fewer bits and does not have to be
  6402. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6403. | point exponent.
  6404. *----------------------------------------------------------------------------*}
  6405. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6406. var
  6407. shiftCount: int8;
  6408. zSig2: bits64;
  6409. begin
  6410. if ( zSig0 = 0 ) then
  6411. begin
  6412. zSig0 := zSig1;
  6413. zSig1 := 0;
  6414. dec(zExp, 64);
  6415. end;
  6416. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6417. if ( 0 <= shiftCount ) then
  6418. begin
  6419. zSig2 := 0;
  6420. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6421. end
  6422. else begin
  6423. shift128ExtraRightJamming(
  6424. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6425. end;
  6426. dec(zExp, shiftCount);
  6427. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6428. end;
  6429. {*----------------------------------------------------------------------------
  6430. | Returns the result of converting the quadruple-precision floating-point
  6431. | value `a' to the 32-bit two's complement integer format. The conversion
  6432. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6433. | Arithmetic---which means in particular that the conversion is rounded
  6434. | according to the current rounding mode. If `a' is a NaN, the largest
  6435. | positive integer is returned. Otherwise, if the conversion overflows, the
  6436. | largest integer with the same sign as `a' is returned.
  6437. *----------------------------------------------------------------------------*}
  6438. function float128_to_int32(a: float128): int32;
  6439. var
  6440. aSign: flag;
  6441. aExp, shiftCount: int32;
  6442. aSig0, aSig1: bits64;
  6443. begin
  6444. aSig1 := extractFloat128Frac1( a );
  6445. aSig0 := extractFloat128Frac0( a );
  6446. aExp := extractFloat128Exp( a );
  6447. aSign := extractFloat128Sign( a );
  6448. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6449. aSign := 0;
  6450. if ( aExp<>0 ) then
  6451. aSig0 := aSig0 or int64( $0001000000000000 );
  6452. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6453. shiftCount := $4028 - aExp;
  6454. if ( 0 < shiftCount ) then
  6455. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6456. result := roundAndPackInt32( aSign, aSig0 );
  6457. end;
  6458. {*----------------------------------------------------------------------------
  6459. | Returns the result of converting the quadruple-precision floating-point
  6460. | value `a' to the 32-bit two's complement integer format. The conversion
  6461. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6462. | Arithmetic, except that the conversion is always rounded toward zero. If
  6463. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6464. | conversion overflows, the largest integer with the same sign as `a' is
  6465. | returned.
  6466. *----------------------------------------------------------------------------*}
  6467. function float128_to_int32_round_to_zero(a: float128): int32;
  6468. var
  6469. aSign: flag;
  6470. aExp, shiftCount: int32;
  6471. aSig0, aSig1, savedASig: bits64;
  6472. z: int32;
  6473. label
  6474. invalid;
  6475. begin
  6476. aSig1 := extractFloat128Frac1( a );
  6477. aSig0 := extractFloat128Frac0( a );
  6478. aExp := extractFloat128Exp( a );
  6479. aSign := extractFloat128Sign( a );
  6480. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6481. if ( $401E < aExp ) then
  6482. begin
  6483. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6484. aSign := 0;
  6485. goto invalid;
  6486. end
  6487. else if ( aExp < $3FFF ) then
  6488. begin
  6489. if ( aExp or aSig0 )<>0 then
  6490. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6491. result := 0;
  6492. exit;
  6493. end;
  6494. aSig0 := aSig0 or int64( $0001000000000000 );
  6495. shiftCount := $402F - aExp;
  6496. savedASig := aSig0;
  6497. aSig0 := aSig0 shr shiftCount;
  6498. z := aSig0;
  6499. if ( aSign )<>0 then
  6500. z := - z;
  6501. if ( ord( z < 0 ) xor aSign )<>0 then
  6502. begin
  6503. invalid:
  6504. float_raise( float_flag_invalid );
  6505. if aSign<>0 then
  6506. result:=$80000000
  6507. else
  6508. result:=$7FFFFFFF;
  6509. exit;
  6510. end;
  6511. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6512. begin
  6513. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6514. end;
  6515. result := z;
  6516. end;
  6517. {*----------------------------------------------------------------------------
  6518. | Returns the result of converting the quadruple-precision floating-point
  6519. | value `a' to the 64-bit two's complement integer format. The conversion
  6520. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6521. | Arithmetic---which means in particular that the conversion is rounded
  6522. | according to the current rounding mode. If `a' is a NaN, the largest
  6523. | positive integer is returned. Otherwise, if the conversion overflows, the
  6524. | largest integer with the same sign as `a' is returned.
  6525. *----------------------------------------------------------------------------*}
  6526. function float128_to_int64(a: float128): int64;
  6527. var
  6528. aSign: flag;
  6529. aExp, shiftCount: int32;
  6530. aSig0, aSig1: bits64;
  6531. begin
  6532. aSig1 := extractFloat128Frac1( a );
  6533. aSig0 := extractFloat128Frac0( a );
  6534. aExp := extractFloat128Exp( a );
  6535. aSign := extractFloat128Sign( a );
  6536. if ( aExp<>0 ) then
  6537. aSig0 := aSig0 or int64( $0001000000000000 );
  6538. shiftCount := $402F - aExp;
  6539. if ( shiftCount <= 0 ) then
  6540. begin
  6541. if ( $403E < aExp ) then
  6542. begin
  6543. float_raise( float_flag_invalid );
  6544. if ( (aSign=0)
  6545. or ( ( aExp = $7FFF )
  6546. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6547. )
  6548. ) then
  6549. begin
  6550. result := int64( $7FFFFFFFFFFFFFFF );
  6551. end;
  6552. result := int64( $8000000000000000 );
  6553. end;
  6554. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6555. end
  6556. else begin
  6557. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6558. end;
  6559. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6560. end;
  6561. {*----------------------------------------------------------------------------
  6562. | Returns the result of converting the quadruple-precision floating-point
  6563. | value `a' to the 64-bit two's complement integer format. The conversion
  6564. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6565. | Arithmetic, except that the conversion is always rounded toward zero.
  6566. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6567. | the conversion overflows, the largest integer with the same sign as `a' is
  6568. | returned.
  6569. *----------------------------------------------------------------------------*}
  6570. function float128_to_int64_round_to_zero(a: float128): int64;
  6571. var
  6572. aSign: flag;
  6573. aExp, shiftCount: int32;
  6574. aSig0, aSig1: bits64;
  6575. z: int64;
  6576. begin
  6577. aSig1 := extractFloat128Frac1( a );
  6578. aSig0 := extractFloat128Frac0( a );
  6579. aExp := extractFloat128Exp( a );
  6580. aSign := extractFloat128Sign( a );
  6581. if ( aExp<>0 ) then
  6582. aSig0 := aSig0 or int64( $0001000000000000 );
  6583. shiftCount := aExp - $402F;
  6584. if ( 0 < shiftCount ) then
  6585. begin
  6586. if ( $403E <= aExp ) then
  6587. begin
  6588. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6589. if ( ( a.high = int64( $C03E000000000000 ) )
  6590. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6591. begin
  6592. if ( aSig1<>0 ) then
  6593. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6594. end
  6595. else begin
  6596. float_raise( float_flag_invalid );
  6597. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6598. begin
  6599. result := int64( $7FFFFFFFFFFFFFFF );
  6600. exit;
  6601. end;
  6602. end;
  6603. result := int64( $8000000000000000 );
  6604. exit;
  6605. end;
  6606. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6607. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6608. begin
  6609. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6610. end;
  6611. end
  6612. else begin
  6613. if ( aExp < $3FFF ) then
  6614. begin
  6615. if ( aExp or aSig0 or aSig1 )<>0 then
  6616. begin
  6617. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6618. end;
  6619. result := 0;
  6620. exit;
  6621. end;
  6622. z := aSig0 shr ( - shiftCount );
  6623. if ( (aSig1<>0)
  6624. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6625. begin
  6626. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6627. end;
  6628. end;
  6629. if ( aSign<>0 ) then
  6630. z := - z;
  6631. result := z;
  6632. end;
  6633. {*----------------------------------------------------------------------------
  6634. | Returns the result of converting the quadruple-precision floating-point
  6635. | value `a' to the single-precision floating-point format. The conversion
  6636. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6637. | Arithmetic.
  6638. *----------------------------------------------------------------------------*}
  6639. function float128_to_float32(a: float128): float32;
  6640. var
  6641. aSign: flag;
  6642. aExp: int32;
  6643. aSig0, aSig1: bits64;
  6644. zSig: bits32;
  6645. begin
  6646. aSig1 := extractFloat128Frac1( a );
  6647. aSig0 := extractFloat128Frac0( a );
  6648. aExp := extractFloat128Exp( a );
  6649. aSign := extractFloat128Sign( a );
  6650. if ( aExp = $7FFF ) then
  6651. begin
  6652. if ( aSig0 or aSig1 )<>0 then
  6653. begin
  6654. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6655. exit;
  6656. end;
  6657. result := packFloat32( aSign, $FF, 0 );
  6658. exit;
  6659. end;
  6660. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6661. shift64RightJamming( aSig0, 18, aSig0 );
  6662. zSig := aSig0;
  6663. if ( aExp or zSig )<>0 then
  6664. begin
  6665. zSig := zSig or $40000000;
  6666. dec(aExp,$3F81);
  6667. end;
  6668. result := roundAndPackFloat32( aSign, aExp, zSig );
  6669. end;
  6670. {*----------------------------------------------------------------------------
  6671. | Returns the result of converting the quadruple-precision floating-point
  6672. | value `a' to the double-precision floating-point format. The conversion
  6673. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6674. | Arithmetic.
  6675. *----------------------------------------------------------------------------*}
  6676. function float128_to_float64(a: float128): float64;
  6677. var
  6678. aSign: flag;
  6679. aExp: int32;
  6680. aSig0, aSig1: bits64;
  6681. begin
  6682. aSig1 := extractFloat128Frac1( a );
  6683. aSig0 := extractFloat128Frac0( a );
  6684. aExp := extractFloat128Exp( a );
  6685. aSign := extractFloat128Sign( a );
  6686. if ( aExp = $7FFF ) then
  6687. begin
  6688. if ( aSig0 or aSig1 )<>0 then
  6689. begin
  6690. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  6691. exit;
  6692. end;
  6693. result:=packFloat64( aSign, $7FF, 0);
  6694. exit;
  6695. end;
  6696. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  6697. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6698. if ( aExp or aSig0 )<>0 then
  6699. begin
  6700. aSig0 := aSig0 or int64( $4000000000000000 );
  6701. dec(aExp,$3C01);
  6702. end;
  6703. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  6704. end;
  6705. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  6706. {*----------------------------------------------------------------------------
  6707. | Returns the result of converting the quadruple-precision floating-point
  6708. | value `a' to the extended double-precision floating-point format. The
  6709. | conversion is performed according to the IEC/IEEE Standard for Binary
  6710. | Floating-Point Arithmetic.
  6711. *----------------------------------------------------------------------------*}
  6712. function float128_to_floatx80(a: float128): floatx80;
  6713. var
  6714. aSign: flag;
  6715. aExp: int32;
  6716. aSig0, aSig1: bits64;
  6717. begin
  6718. aSig1 := extractFloat128Frac1( a );
  6719. aSig0 := extractFloat128Frac0( a );
  6720. aExp := extractFloat128Exp( a );
  6721. aSign := extractFloat128Sign( a );
  6722. if ( aExp = $7FFF ) begin
  6723. if ( aSig0 or aSig1 ) begin
  6724. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  6725. end;
  6726. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  6727. end;
  6728. if ( aExp = 0 ) begin
  6729. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloatx80( aSign, 0, 0 );
  6730. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  6731. end;
  6732. else begin
  6733. aSig0 or= int64( $0001000000000000 );
  6734. end;
  6735. shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
  6736. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  6737. end;
  6738. {$endif FPC_SOFTFLOAT_FLOATX80}
  6739. {*----------------------------------------------------------------------------
  6740. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  6741. | Returns the result as a quadruple-precision floating-point value. The
  6742. | operation is performed according to the IEC/IEEE Standard for Binary
  6743. | Floating-Point Arithmetic.
  6744. *----------------------------------------------------------------------------*}
  6745. function float128_round_to_int(a: float128): float128;
  6746. var
  6747. aSign: flag;
  6748. aExp: int32;
  6749. lastBitMask, roundBitsMask: bits64;
  6750. roundingMode: int8;
  6751. z: float128;
  6752. begin
  6753. aExp := extractFloat128Exp( a );
  6754. if ( $402F <= aExp ) then
  6755. begin
  6756. if ( $406F <= aExp ) then
  6757. begin
  6758. if ( ( aExp = $7FFF )
  6759. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  6760. ) then
  6761. begin
  6762. result := propagateFloat128NaN( a, a );
  6763. exit;
  6764. end;
  6765. result := a;
  6766. exit;
  6767. end;
  6768. lastBitMask := 1;
  6769. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  6770. roundBitsMask := lastBitMask - 1;
  6771. z := a;
  6772. roundingMode := float_rounding_mode;
  6773. if ( roundingMode = float_round_nearest_even ) then
  6774. begin
  6775. if ( lastBitMask )<>0 then
  6776. begin
  6777. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  6778. if ( ( z.low and roundBitsMask ) = 0 ) then
  6779. z.low := z.low and not(lastBitMask);
  6780. end
  6781. else begin
  6782. if ( sbits64(z.low) < 0 ) then
  6783. begin
  6784. inc(z.high);
  6785. if ( bits64( z.low shl 1 ) = 0 ) then
  6786. z.high := z.high and not(1);
  6787. end;
  6788. end;
  6789. end
  6790. else if ( roundingMode <> float_round_to_zero ) then
  6791. begin
  6792. if ( extractFloat128Sign( z )
  6793. xor ord( roundingMode = float_round_up ) )<>0 then
  6794. begin
  6795. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  6796. end;
  6797. end;
  6798. z.low := z.low and not(roundBitsMask);
  6799. end
  6800. else begin
  6801. if ( aExp < $3FFF ) then
  6802. begin
  6803. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  6804. begin
  6805. result := a;
  6806. exit;
  6807. end;
  6808. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6809. aSign := extractFloat128Sign( a );
  6810. case float_rounding_mode of
  6811. float_round_nearest_even:
  6812. if ( ( aExp = $3FFE )
  6813. and ( extractFloat128Frac0( a )
  6814. or extractFloat128Frac1( a ) )
  6815. ) begin
  6816. begin
  6817. result := packFloat128( aSign, $3FFF, 0, 0 );
  6818. exit;
  6819. end;
  6820. end;
  6821. float_round_down:
  6822. begin
  6823. result :=
  6824. aSign ? packFloat128( 1, $3FFF, 0, 0 )
  6825. : packFloat128( 0, 0, 0, 0 );
  6826. end;
  6827. float_round_up:
  6828. begin
  6829. result :=
  6830. aSign ? packFloat128( 1, 0, 0, 0 )
  6831. : packFloat128( 0, $3FFF, 0, 0 );
  6832. exit;
  6833. end;
  6834. end;
  6835. result := packFloat128( aSign, 0, 0, 0 );
  6836. exit;
  6837. end;
  6838. lastBitMask := 1;
  6839. lastBitMask shl = $402F - aExp;
  6840. roundBitsMask := lastBitMask - 1;
  6841. z.low := 0;
  6842. z.high := a.high;
  6843. roundingMode := float_rounding_mode;
  6844. if ( roundingMode = float_round_nearest_even ) begin
  6845. z.high += lastBitMask>>1;
  6846. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) begin
  6847. z.high &= ~ lastBitMask;
  6848. end;
  6849. end;
  6850. else if ( roundingMode <> float_round_to_zero ) begin
  6851. if ( extractFloat128Sign( z )
  6852. xor ( roundingMode = float_round_up ) ) begin
  6853. z.high or= ( a.low <> 0 );
  6854. z.high += roundBitsMask;
  6855. end;
  6856. end;
  6857. z.high &= ~ roundBitsMask;
  6858. end;
  6859. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) begin
  6860. softfloat_exception_flags or= float_flag_inexact;
  6861. end;
  6862. result := z;
  6863. end;
  6864. {*----------------------------------------------------------------------------
  6865. | Returns the result of adding the absolute values of the quadruple-precision
  6866. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  6867. | before being returned. `zSign' is ignored if the result is a NaN.
  6868. | The addition is performed according to the IEC/IEEE Standard for Binary
  6869. | Floating-Point Arithmetic.
  6870. *----------------------------------------------------------------------------*}
  6871. function addFloat128Sigs( float128 a, float128 b, flag zSign ): float128;
  6872. var
  6873. aExp, bExp, zExp: int32;
  6874. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  6875. expDiff: int32;
  6876. begin
  6877. aSig1 := extractFloat128Frac1( a );
  6878. aSig0 := extractFloat128Frac0( a );
  6879. aExp := extractFloat128Exp( a );
  6880. bSig1 := extractFloat128Frac1( b );
  6881. bSig0 := extractFloat128Frac0( b );
  6882. bExp := extractFloat128Exp( b );
  6883. expDiff := aExp - bExp;
  6884. if ( 0 < expDiff ) begin
  6885. if ( aExp = $7FFF ) begin
  6886. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  6887. result := a;
  6888. end;
  6889. if ( bExp = 0 ) begin
  6890. --expDiff;
  6891. end;
  6892. else begin
  6893. bSig0 or= int64( $0001000000000000 );
  6894. end;
  6895. shift128ExtraRightJamming(
  6896. bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
  6897. zExp := aExp;
  6898. end;
  6899. else if ( expDiff < 0 ) begin
  6900. if ( bExp = $7FFF ) begin
  6901. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  6902. result := packFloat128( zSign, $7FFF, 0, 0 );
  6903. end;
  6904. if ( aExp = 0 ) begin
  6905. ++expDiff;
  6906. end;
  6907. else begin
  6908. aSig0 or= int64( $0001000000000000 );
  6909. end;
  6910. shift128ExtraRightJamming(
  6911. aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
  6912. zExp := bExp;
  6913. end;
  6914. else begin
  6915. if ( aExp = $7FFF ) begin
  6916. if ( aSig0 or aSig1 or bSig0 or bSig1 ) begin
  6917. result := propagateFloat128NaN( a, b );
  6918. end;
  6919. result := a;
  6920. end;
  6921. add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  6922. if ( aExp = 0 ) result := packFloat128( zSign, 0, zSig0, zSig1 );
  6923. zSig2 := 0;
  6924. zSig0 or= int64( $0002000000000000 );
  6925. zExp := aExp;
  6926. goto shiftRight1;
  6927. end;
  6928. aSig0 or= int64( $0001000000000000 );
  6929. add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  6930. --zExp;
  6931. if ( zSig0 < int64( $0002000000000000 ) ) goto roundAndPack;
  6932. ++zExp;
  6933. shiftRight1:
  6934. shift128ExtraRightJamming(
  6935. zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  6936. roundAndPack:
  6937. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6938. end;
  6939. {*----------------------------------------------------------------------------
  6940. | Returns the result of subtracting the absolute values of the quadruple-
  6941. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  6942. | difference is negated before being returned. `zSign' is ignored if the
  6943. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6944. | Standard for Binary Floating-Point Arithmetic.
  6945. *----------------------------------------------------------------------------*}
  6946. function subFloat128Sigs( float128 a, float128 b, flag zSign ): float128;
  6947. var
  6948. aExp, bExp, zExp: int32;
  6949. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  6950. expDiff: int32;
  6951. z: float128;
  6952. begin
  6953. aSig1 := extractFloat128Frac1( a );
  6954. aSig0 := extractFloat128Frac0( a );
  6955. aExp := extractFloat128Exp( a );
  6956. bSig1 := extractFloat128Frac1( b );
  6957. bSig0 := extractFloat128Frac0( b );
  6958. bExp := extractFloat128Exp( b );
  6959. expDiff := aExp - bExp;
  6960. shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
  6961. shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
  6962. if ( 0 < expDiff ) goto aExpBigger;
  6963. if ( expDiff < 0 ) goto bExpBigger;
  6964. if ( aExp = $7FFF ) begin
  6965. if ( aSig0 or aSig1 or bSig0 or bSig1 ) begin
  6966. result := propagateFloat128NaN( a, b );
  6967. end;
  6968. float_raise( float_flag_invalid );
  6969. z.low := float128_default_nan_low;
  6970. z.high := float128_default_nan_high;
  6971. result := z;
  6972. end;
  6973. if ( aExp = 0 ) begin
  6974. aExp := 1;
  6975. bExp := 1;
  6976. end;
  6977. if ( bSig0 < aSig0 ) goto aBigger;
  6978. if ( aSig0 < bSig0 ) goto bBigger;
  6979. if ( bSig1 < aSig1 ) goto aBigger;
  6980. if ( aSig1 < bSig1 ) goto bBigger;
  6981. result := packFloat128( float_rounding_mode = float_round_down, 0, 0, 0 );
  6982. bExpBigger:
  6983. if ( bExp = $7FFF ) begin
  6984. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  6985. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  6986. end;
  6987. if ( aExp = 0 ) begin
  6988. ++expDiff;
  6989. end;
  6990. else begin
  6991. aSig0 or= int64( $4000000000000000 );
  6992. end;
  6993. shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  6994. bSig0 or= int64( $4000000000000000 );
  6995. bBigger:
  6996. sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
  6997. zExp := bExp;
  6998. zSign xor = 1;
  6999. goto normalizeRoundAndPack;
  7000. aExpBigger:
  7001. if ( aExp = $7FFF ) begin
  7002. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  7003. result := a;
  7004. end;
  7005. if ( bExp = 0 ) begin
  7006. --expDiff;
  7007. end;
  7008. else begin
  7009. bSig0 or= int64( $4000000000000000 );
  7010. end;
  7011. shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
  7012. aSig0 or= int64( $4000000000000000 );
  7013. aBigger:
  7014. sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  7015. zExp := aExp;
  7016. normalizeRoundAndPack:
  7017. --zExp;
  7018. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7019. end;
  7020. {*----------------------------------------------------------------------------
  7021. | Returns the result of adding the quadruple-precision floating-point values
  7022. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7023. | for Binary Floating-Point Arithmetic.
  7024. *----------------------------------------------------------------------------*}
  7025. function float128_add(a: float128; b: float128): float128;
  7026. var
  7027. aSign, bSign: flag;
  7028. begin
  7029. aSign := extractFloat128Sign( a );
  7030. bSign := extractFloat128Sign( b );
  7031. if ( aSign = bSign ) begin
  7032. result := addFloat128Sigs( a, b, aSign );
  7033. end;
  7034. else begin
  7035. result := subFloat128Sigs( a, b, aSign );
  7036. end;
  7037. end;
  7038. {*----------------------------------------------------------------------------
  7039. | Returns the result of subtracting the quadruple-precision floating-point
  7040. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7041. | Standard for Binary Floating-Point Arithmetic.
  7042. *----------------------------------------------------------------------------*}
  7043. function float128_sub(a: float128; b: float128): float128;
  7044. var
  7045. aSign, bSign: flag;
  7046. begin
  7047. aSign := extractFloat128Sign( a );
  7048. bSign := extractFloat128Sign( b );
  7049. if ( aSign = bSign ) begin
  7050. result := subFloat128Sigs( a, b, aSign );
  7051. end;
  7052. else begin
  7053. result := addFloat128Sigs( a, b, aSign );
  7054. end;
  7055. end;
  7056. {*----------------------------------------------------------------------------
  7057. | Returns the result of multiplying the quadruple-precision floating-point
  7058. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7059. | Standard for Binary Floating-Point Arithmetic.
  7060. *----------------------------------------------------------------------------*}
  7061. function float128_mul(a: float128; b: float128): float128;
  7062. var
  7063. aSign, bSign, zSign: flag;
  7064. aExp, bExp, zExp: int32;
  7065. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7066. z: float128;
  7067. begin
  7068. aSig1 := extractFloat128Frac1( a );
  7069. aSig0 := extractFloat128Frac0( a );
  7070. aExp := extractFloat128Exp( a );
  7071. aSign := extractFloat128Sign( a );
  7072. bSig1 := extractFloat128Frac1( b );
  7073. bSig0 := extractFloat128Frac0( b );
  7074. bExp := extractFloat128Exp( b );
  7075. bSign := extractFloat128Sign( b );
  7076. zSign := aSign xor bSign;
  7077. if ( aExp = $7FFF ) begin
  7078. if ( ( aSig0 or aSig1 )
  7079. or ( ( bExp = $7FFF ) and ( bSig0 or bSig1 ) ) ) begin
  7080. result := propagateFloat128NaN( a, b );
  7081. end;
  7082. if ( ( bExp or bSig0 or bSig1 ) = 0 ) goto invalid;
  7083. result := packFloat128( zSign, $7FFF, 0, 0 );
  7084. end;
  7085. if ( bExp = $7FFF ) begin
  7086. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7087. if ( ( aExp or aSig0 or aSig1 ) = 0 ) begin
  7088. invalid:
  7089. float_raise( float_flag_invalid );
  7090. z.low := float128_default_nan_low;
  7091. z.high := float128_default_nan_high;
  7092. result := z;
  7093. end;
  7094. result := packFloat128( zSign, $7FFF, 0, 0 );
  7095. end;
  7096. if ( aExp = 0 ) begin
  7097. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7098. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7099. end;
  7100. if ( bExp = 0 ) begin
  7101. if ( ( bSig0 or bSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7102. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7103. end;
  7104. zExp := aExp + bExp - $4000;
  7105. aSig0 or= int64( $0001000000000000 );
  7106. shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
  7107. mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
  7108. add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
  7109. zSig2 or= ( zSig3 <> 0 );
  7110. if ( int64( $0002000000000000 ) <= zSig0 ) begin
  7111. shift128ExtraRightJamming(
  7112. zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  7113. ++zExp;
  7114. end;
  7115. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7116. end;
  7117. {*----------------------------------------------------------------------------
  7118. | Returns the result of dividing the quadruple-precision floating-point value
  7119. | `a' by the corresponding value `b'. The operation is performed according to
  7120. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7121. *----------------------------------------------------------------------------*}
  7122. function float128_div(a: float128; b: float128): float128;
  7123. var
  7124. aSign, bSign, zSign: flag;
  7125. aExp, bExp, zExp: int32;
  7126. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7127. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7128. z: float128;
  7129. begin
  7130. aSig1 := extractFloat128Frac1( a );
  7131. aSig0 := extractFloat128Frac0( a );
  7132. aExp := extractFloat128Exp( a );
  7133. aSign := extractFloat128Sign( a );
  7134. bSig1 := extractFloat128Frac1( b );
  7135. bSig0 := extractFloat128Frac0( b );
  7136. bExp := extractFloat128Exp( b );
  7137. bSign := extractFloat128Sign( b );
  7138. zSign := aSign xor bSign;
  7139. if ( aExp = $7FFF ) begin
  7140. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  7141. if ( bExp = $7FFF ) begin
  7142. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7143. goto invalid;
  7144. end;
  7145. result := packFloat128( zSign, $7FFF, 0, 0 );
  7146. end;
  7147. if ( bExp = $7FFF ) begin
  7148. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7149. result := packFloat128( zSign, 0, 0, 0 );
  7150. end;
  7151. if ( bExp = 0 ) begin
  7152. if ( ( bSig0 or bSig1 ) = 0 ) begin
  7153. if ( ( aExp or aSig0 or aSig1 ) = 0 ) begin
  7154. invalid:
  7155. float_raise( float_flag_invalid );
  7156. z.low := float128_default_nan_low;
  7157. z.high := float128_default_nan_high;
  7158. result := z;
  7159. end;
  7160. float_raise( float_flag_divbyzero );
  7161. result := packFloat128( zSign, $7FFF, 0, 0 );
  7162. end;
  7163. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7164. end;
  7165. if ( aExp = 0 ) begin
  7166. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7167. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7168. end;
  7169. zExp := aExp - bExp + $3FFD;
  7170. shortShift128Left(
  7171. aSig0 or int64( $0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
  7172. shortShift128Left(
  7173. bSig0 or int64( $0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  7174. if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) begin
  7175. shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
  7176. ++zExp;
  7177. end;
  7178. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7179. mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
  7180. sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
  7181. while ( (sbits64) rem0 < 0 ) begin
  7182. --zSig0;
  7183. add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
  7184. end;
  7185. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7186. if ( ( zSig1 and $3FFF ) <= 4 ) begin
  7187. mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
  7188. sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
  7189. while ( (sbits64) rem1 < 0 ) begin
  7190. --zSig1;
  7191. add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
  7192. end;
  7193. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  7194. end;
  7195. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
  7196. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7197. end;
  7198. {*----------------------------------------------------------------------------
  7199. | Returns the remainder of the quadruple-precision floating-point value `a'
  7200. | with respect to the corresponding value `b'. The operation is performed
  7201. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7202. *----------------------------------------------------------------------------*}
  7203. function float128_rem(a: float128; b: float128): float128;
  7204. var
  7205. aSign, bSign, zSign: flag;
  7206. aExp, bExp, expDiff: int32;
  7207. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7208. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7209. sigMean0: sbits64;
  7210. z: float128;
  7211. begin
  7212. aSig1 := extractFloat128Frac1( a );
  7213. aSig0 := extractFloat128Frac0( a );
  7214. aExp := extractFloat128Exp( a );
  7215. aSign := extractFloat128Sign( a );
  7216. bSig1 := extractFloat128Frac1( b );
  7217. bSig0 := extractFloat128Frac0( b );
  7218. bExp := extractFloat128Exp( b );
  7219. bSign := extractFloat128Sign( b );
  7220. if ( aExp = $7FFF ) begin
  7221. if ( ( aSig0 or aSig1 )
  7222. or ( ( bExp = $7FFF ) and ( bSig0 or bSig1 ) ) ) begin
  7223. result := propagateFloat128NaN( a, b );
  7224. end;
  7225. goto invalid;
  7226. end;
  7227. if ( bExp = $7FFF ) begin
  7228. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7229. result := a;
  7230. end;
  7231. if ( bExp = 0 ) begin
  7232. if ( ( bSig0 or bSig1 ) = 0 ) begin
  7233. invalid:
  7234. float_raise( float_flag_invalid );
  7235. z.low := float128_default_nan_low;
  7236. z.high := float128_default_nan_high;
  7237. result := z;
  7238. end;
  7239. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7240. end;
  7241. if ( aExp = 0 ) begin
  7242. if ( ( aSig0 or aSig1 ) = 0 ) result := a;
  7243. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7244. end;
  7245. expDiff := aExp - bExp;
  7246. if ( expDiff < -1 ) result := a;
  7247. shortShift128Left(
  7248. aSig0 or int64( $0001000000000000 ),
  7249. aSig1,
  7250. 15 - ( expDiff < 0 ),
  7251. &aSig0,
  7252. &aSig1
  7253. );
  7254. shortShift128Left(
  7255. bSig0 or int64( $0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  7256. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7257. if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  7258. expDiff -= 64;
  7259. while ( 0 < expDiff ) begin
  7260. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7261. q := ( 4 < q ) ? q - 4 : 0;
  7262. mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  7263. shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
  7264. shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
  7265. sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
  7266. expDiff -= 61;
  7267. end;
  7268. if ( -64 < expDiff ) begin
  7269. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7270. q := ( 4 < q ) ? q - 4 : 0;
  7271. q >>= - expDiff;
  7272. shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  7273. expDiff += 52;
  7274. if ( expDiff < 0 ) begin
  7275. shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  7276. end;
  7277. else begin
  7278. shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
  7279. end;
  7280. mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  7281. sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
  7282. end;
  7283. else begin
  7284. shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
  7285. shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  7286. end;
  7287. do begin
  7288. alternateASig0 := aSig0;
  7289. alternateASig1 := aSig1;
  7290. ++q;
  7291. sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  7292. end; while ( 0 <= (sbits64) aSig0 );
  7293. add128(
  7294. aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
  7295. if ( ( sigMean0 < 0 )
  7296. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and ( q and 1 ) ) ) begin
  7297. aSig0 := alternateASig0;
  7298. aSig1 := alternateASig1;
  7299. end;
  7300. zSign := ( (sbits64) aSig0 < 0 );
  7301. if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
  7302. result :=
  7303. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7304. end;
  7305. {*----------------------------------------------------------------------------
  7306. | Returns the square root of the quadruple-precision floating-point value `a'.
  7307. | The operation is performed according to the IEC/IEEE Standard for Binary
  7308. | Floating-Point Arithmetic.
  7309. *----------------------------------------------------------------------------*}
  7310. function float128_sqrt(a: float128): float128;
  7311. var
  7312. aSign: flag;
  7313. aExp, zExp: int32;
  7314. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7315. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7316. z: float128;
  7317. label
  7318. invalid;
  7319. begin
  7320. aSig1 := extractFloat128Frac1( a );
  7321. aSig0 := extractFloat128Frac0( a );
  7322. aExp := extractFloat128Exp( a );
  7323. aSign := extractFloat128Sign( a );
  7324. if ( aExp = $7FFF ) begin
  7325. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, a );
  7326. if ( ! aSign ) result := a;
  7327. goto invalid;
  7328. end;
  7329. if ( aSign ) begin
  7330. if ( ( aExp or aSig0 or aSig1 ) = 0 ) result := a;
  7331. invalid:
  7332. float_raise( float_flag_invalid );
  7333. z.low := float128_default_nan_low;
  7334. z.high := float128_default_nan_high;
  7335. result := z;
  7336. end;
  7337. if ( aExp = 0 ) begin
  7338. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( 0, 0, 0, 0 );
  7339. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7340. end;
  7341. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7342. aSig0 := aSig0 or int64( $0001000000000000 );
  7343. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7344. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), &aSig0, &aSig1 );
  7345. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7346. doubleZSig0 := zSig0 shl 1;
  7347. mul64To128( zSig0, zSig0, &term0, &term1 );
  7348. sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  7349. while ( (sbits64) rem0 < 0 ) begin
  7350. --zSig0;
  7351. doubleZSig0 -= 2;
  7352. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, &rem0, &rem1 );
  7353. end;
  7354. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7355. if ( ( zSig1 and $1FFF ) <= 5 ) begin
  7356. if ( zSig1 = 0 ) zSig1 := 1;
  7357. mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  7358. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  7359. mul64To128( zSig1, zSig1, &term2, &term3 );
  7360. sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  7361. while ( (sbits64) rem1 < 0 ) begin
  7362. --zSig1;
  7363. shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  7364. term3 or= 1;
  7365. term2 or= doubleZSig0;
  7366. add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  7367. end;
  7368. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  7369. end;
  7370. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
  7371. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7372. end;
  7373. {*----------------------------------------------------------------------------
  7374. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7375. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7376. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7377. *----------------------------------------------------------------------------*}
  7378. function float128_eq(a: float128; b: float128): flag;
  7379. begin
  7380. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7381. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7382. or ( ( extractFloat128Exp( b ) = $7FFF )
  7383. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7384. ) begin
  7385. if ( float128_is_signaling_nan( a )
  7386. or float128_is_signaling_nan( b ) ) begin
  7387. float_raise( float_flag_invalid );
  7388. end;
  7389. result := 0;
  7390. end;
  7391. result :=
  7392. ( a.low = b.low )
  7393. and ( ( a.high = b.high )
  7394. or ( ( a.low = 0 )
  7395. and ( (bits64) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7396. );
  7397. end;
  7398. {*----------------------------------------------------------------------------
  7399. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7400. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7401. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7402. | Arithmetic.
  7403. *----------------------------------------------------------------------------*}
  7404. function float128_le(a: float128; b: float128): flag;
  7405. var
  7406. aSign, bSign: flag;
  7407. begin
  7408. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7409. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7410. or ( ( extractFloat128Exp( b ) = $7FFF )
  7411. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7412. ) begin
  7413. float_raise( float_flag_invalid );
  7414. result := 0;
  7415. end;
  7416. aSign := extractFloat128Sign( a );
  7417. bSign := extractFloat128Sign( b );
  7418. if ( aSign <> bSign ) begin
  7419. result :=
  7420. aSign
  7421. or ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7422. = 0 );
  7423. end;
  7424. result :=
  7425. aSign ? le128( b.high, b.low, a.high, a.low )
  7426. : le128( a.high, a.low, b.high, b.low );
  7427. end;
  7428. {*----------------------------------------------------------------------------
  7429. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7430. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7431. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7432. *----------------------------------------------------------------------------*}
  7433. function float128_lt(a: float128; b: float128): flag;
  7434. var
  7435. aSign, bSign: flag;
  7436. begin
  7437. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7438. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7439. or ( ( extractFloat128Exp( b ) = $7FFF )
  7440. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7441. ) begin
  7442. float_raise( float_flag_invalid );
  7443. result := 0;
  7444. end;
  7445. aSign := extractFloat128Sign( a );
  7446. bSign := extractFloat128Sign( b );
  7447. if ( aSign <> bSign ) begin
  7448. result :=
  7449. aSign
  7450. and ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7451. <> 0 );
  7452. end;
  7453. result :=
  7454. aSign ? lt128( b.high, b.low, a.high, a.low )
  7455. : lt128( a.high, a.low, b.high, b.low );
  7456. end;
  7457. {*----------------------------------------------------------------------------
  7458. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7459. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7460. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7461. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7462. *----------------------------------------------------------------------------*}
  7463. function float128_eq_signaling(a: float128; b: float128): flag;
  7464. begin
  7465. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7466. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7467. or ( ( extractFloat128Exp( b ) = $7FFF )
  7468. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7469. ) begin
  7470. float_raise( float_flag_invalid );
  7471. result := 0;
  7472. end;
  7473. result :=
  7474. ( a.low = b.low )
  7475. and ( ( a.high = b.high )
  7476. or ( ( a.low = 0 )
  7477. and ( (bits64) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7478. );
  7479. end;
  7480. {*----------------------------------------------------------------------------
  7481. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7482. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7483. | cause an exception. Otherwise, the comparison is performed according to the
  7484. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7485. *----------------------------------------------------------------------------*}
  7486. function float128_le_quiet(a: float128; b: float128): flag;
  7487. var
  7488. aSign, bSign: flag;
  7489. begin
  7490. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7491. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7492. or ( ( extractFloat128Exp( b ) = $7FFF )
  7493. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7494. ) begin
  7495. if ( float128_is_signaling_nan( a )
  7496. or float128_is_signaling_nan( b ) ) begin
  7497. float_raise( float_flag_invalid );
  7498. end;
  7499. result := 0;
  7500. end;
  7501. aSign := extractFloat128Sign( a );
  7502. bSign := extractFloat128Sign( b );
  7503. if ( aSign <> bSign ) begin
  7504. result :=
  7505. aSign
  7506. or ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7507. = 0 );
  7508. end;
  7509. result :=
  7510. aSign ? le128( b.high, b.low, a.high, a.low )
  7511. : le128( a.high, a.low, b.high, b.low );
  7512. end;
  7513. {*----------------------------------------------------------------------------
  7514. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7515. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7516. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7517. | Standard for Binary Floating-Point Arithmetic.
  7518. *----------------------------------------------------------------------------*}
  7519. function float128_lt_quiet(a: float128; b: float128): flag;
  7520. var
  7521. aSign, bSign: flag;
  7522. begin
  7523. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7524. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7525. or ( ( extractFloat128Exp( b ) = $7FFF )
  7526. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7527. ) begin
  7528. if ( float128_is_signaling_nan( a )
  7529. or float128_is_signaling_nan( b ) ) begin
  7530. float_raise( float_flag_invalid );
  7531. end;
  7532. result := 0;
  7533. end;
  7534. aSign := extractFloat128Sign( a );
  7535. bSign := extractFloat128Sign( b );
  7536. if ( aSign <> bSign ) begin
  7537. result :=
  7538. aSign
  7539. and ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7540. <> 0 );
  7541. end;
  7542. result :=
  7543. aSign ? lt128( b.high, b.low, a.high, a.low )
  7544. : lt128( a.high, a.low, b.high, b.low );
  7545. end;
  7546. {$endif FPC_SOFTFLOAT_FLOAT128}
  7547. {$endif not(defined(fpc_softfpu_interface))}
  7548. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  7549. end.
  7550. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}