softfpu.pp 277 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. { we use here a record in the function header because
  79. the record allows bitwise conversion to single }
  80. float32rec = record
  81. float32 : float32;
  82. end;
  83. flag = byte;
  84. uint8 = byte;
  85. int8 = shortint;
  86. uint16 = word;
  87. int16 = smallint;
  88. uint32 = longword;
  89. int32 = longint;
  90. bits8 = byte;
  91. sbits8 = shortint;
  92. bits16 = word;
  93. sbits16 = smallint;
  94. sbits32 = longint;
  95. bits32 = longword;
  96. {$ifndef fpc}
  97. qword = int64;
  98. {$endif}
  99. { now part of the system unit
  100. uint64 = qword;
  101. }
  102. bits64 = qword;
  103. sbits64 = int64;
  104. {$ifdef ENDIAN_LITTLE}
  105. float64 = packed record
  106. low: bits32;
  107. high: bits32;
  108. end;
  109. int64rec = packed record
  110. low: bits32;
  111. high: bits32;
  112. end;
  113. floatx80 = packed record
  114. low : qword;
  115. high : word;
  116. end;
  117. float128 = packed record
  118. low : qword;
  119. high : qword;
  120. end;
  121. {$else}
  122. float64 = packed record
  123. high,low : bits32;
  124. end;
  125. int64rec = packed record
  126. high,low : bits32;
  127. end;
  128. floatx80 = packed record
  129. high : word;
  130. low : qword;
  131. end;
  132. float128 = packed record
  133. high : qword;
  134. low : qword;
  135. end;
  136. {$endif}
  137. {*
  138. -------------------------------------------------------------------------------
  139. Returns 1 if the double-precision floating-point value `a' is less than
  140. the corresponding value `b', and 0 otherwise. The comparison is performed
  141. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  142. -------------------------------------------------------------------------------
  143. *}
  144. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  145. {*
  146. -------------------------------------------------------------------------------
  147. Returns 1 if the double-precision floating-point value `a' is less than
  148. or equal to the corresponding value `b', and 0 otherwise. The comparison
  149. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  150. Arithmetic.
  151. -------------------------------------------------------------------------------
  152. *}
  153. Function float64_le(a: float64;b: float64): flag; compilerproc;
  154. {*
  155. -------------------------------------------------------------------------------
  156. Returns 1 if the double-precision floating-point value `a' is equal to
  157. the corresponding value `b', and 0 otherwise. The comparison is performed
  158. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  159. -------------------------------------------------------------------------------
  160. *}
  161. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  162. {*
  163. -------------------------------------------------------------------------------
  164. Returns the square root of the double-precision floating-point value `a'.
  165. The operation is performed according to the IEC/IEEE Standard for Binary
  166. Floating-Point Arithmetic.
  167. -------------------------------------------------------------------------------
  168. *}
  169. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  170. {*
  171. -------------------------------------------------------------------------------
  172. Returns the remainder of the double-precision floating-point value `a'
  173. with respect to the corresponding value `b'. The operation is performed
  174. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  175. -------------------------------------------------------------------------------
  176. *}
  177. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  178. {*
  179. -------------------------------------------------------------------------------
  180. Returns the result of dividing the double-precision floating-point value `a'
  181. by the corresponding value `b'. The operation is performed according to the
  182. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  183. -------------------------------------------------------------------------------
  184. *}
  185. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  186. {*
  187. -------------------------------------------------------------------------------
  188. Returns the result of multiplying the double-precision floating-point values
  189. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  190. for Binary Floating-Point Arithmetic.
  191. -------------------------------------------------------------------------------
  192. *}
  193. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  194. {*
  195. -------------------------------------------------------------------------------
  196. Returns the result of subtracting the double-precision floating-point values
  197. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  198. for Binary Floating-Point Arithmetic.
  199. -------------------------------------------------------------------------------
  200. *}
  201. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  202. {*
  203. -------------------------------------------------------------------------------
  204. Returns the result of adding the double-precision floating-point values `a'
  205. and `b'. The operation is performed according to the IEC/IEEE Standard for
  206. Binary Floating-Point Arithmetic.
  207. -------------------------------------------------------------------------------
  208. *}
  209. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  210. {*
  211. -------------------------------------------------------------------------------
  212. Rounds the double-precision floating-point value `a' to an integer,
  213. and returns the result as a double-precision floating-point value. The
  214. operation is performed according to the IEC/IEEE Standard for Binary
  215. Floating-Point Arithmetic.
  216. -------------------------------------------------------------------------------
  217. *}
  218. Function float64_round_to_int(a: float64) : float64; compilerproc;
  219. {*
  220. -------------------------------------------------------------------------------
  221. Returns the result of converting the double-precision floating-point value
  222. `a' to the single-precision floating-point format. The conversion is
  223. performed according to the IEC/IEEE Standard for Binary Floating-Point
  224. Arithmetic.
  225. -------------------------------------------------------------------------------
  226. *}
  227. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  228. {*
  229. -------------------------------------------------------------------------------
  230. Returns the result of converting the double-precision floating-point value
  231. `a' to the 32-bit two's complement integer format. The conversion is
  232. performed according to the IEC/IEEE Standard for Binary Floating-Point
  233. Arithmetic, except that the conversion is always rounded toward zero.
  234. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  235. the conversion overflows, the largest integer with the same sign as `a' is
  236. returned.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the 32-bit two's complement integer format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic---which means in particular that the conversion is rounded
  246. according to the current rounding mode. If `a' is a NaN, the largest
  247. positive integer is returned. Otherwise, if the conversion overflows, the
  248. largest integer with the same sign as `a' is returned.
  249. -------------------------------------------------------------------------------
  250. *}
  251. Function float64_to_int32(a: float64): int32; compilerproc;
  252. {*
  253. -------------------------------------------------------------------------------
  254. Returns 1 if the single-precision floating-point value `a' is less than
  255. the corresponding value `b', and 0 otherwise. The comparison is performed
  256. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  257. -------------------------------------------------------------------------------
  258. *}
  259. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  260. {*
  261. -------------------------------------------------------------------------------
  262. Returns 1 if the single-precision floating-point value `a' is less than
  263. or equal to the corresponding value `b', and 0 otherwise. The comparison
  264. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  265. Arithmetic.
  266. -------------------------------------------------------------------------------
  267. *}
  268. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  269. {*
  270. -------------------------------------------------------------------------------
  271. Returns 1 if the single-precision floating-point value `a' is equal to
  272. the corresponding value `b', and 0 otherwise. The comparison is performed
  273. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  274. -------------------------------------------------------------------------------
  275. *}
  276. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  277. {*
  278. -------------------------------------------------------------------------------
  279. Returns the square root of the single-precision floating-point value `a'.
  280. The operation is performed according to the IEC/IEEE Standard for Binary
  281. Floating-Point Arithmetic.
  282. -------------------------------------------------------------------------------
  283. *}
  284. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  285. {*
  286. -------------------------------------------------------------------------------
  287. Returns the remainder of the single-precision floating-point value `a'
  288. with respect to the corresponding value `b'. The operation is performed
  289. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  290. -------------------------------------------------------------------------------
  291. *}
  292. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  293. {*
  294. -------------------------------------------------------------------------------
  295. Returns the result of dividing the single-precision floating-point value `a'
  296. by the corresponding value `b'. The operation is performed according to the
  297. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  298. -------------------------------------------------------------------------------
  299. *}
  300. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  301. {*
  302. -------------------------------------------------------------------------------
  303. Returns the result of multiplying the single-precision floating-point values
  304. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  305. for Binary Floating-Point Arithmetic.
  306. -------------------------------------------------------------------------------
  307. *}
  308. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  309. {*
  310. -------------------------------------------------------------------------------
  311. Returns the result of subtracting the single-precision floating-point values
  312. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  313. for Binary Floating-Point Arithmetic.
  314. -------------------------------------------------------------------------------
  315. *}
  316. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  317. {*
  318. -------------------------------------------------------------------------------
  319. Returns the result of adding the single-precision floating-point values `a'
  320. and `b'. The operation is performed according to the IEC/IEEE Standard for
  321. Binary Floating-Point Arithmetic.
  322. -------------------------------------------------------------------------------
  323. *}
  324. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  325. {*
  326. -------------------------------------------------------------------------------
  327. Rounds the single-precision floating-point value `a' to an integer,
  328. and returns the result as a single-precision floating-point value. The
  329. operation is performed according to the IEC/IEEE Standard for Binary
  330. Floating-Point Arithmetic.
  331. -------------------------------------------------------------------------------
  332. *}
  333. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  334. {*
  335. -------------------------------------------------------------------------------
  336. Returns the result of converting the single-precision floating-point value
  337. `a' to the double-precision floating-point format. The conversion is
  338. performed according to the IEC/IEEE Standard for Binary Floating-Point
  339. Arithmetic.
  340. -------------------------------------------------------------------------------
  341. *}
  342. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  343. {*
  344. -------------------------------------------------------------------------------
  345. Returns the result of converting the single-precision floating-point value
  346. `a' to the 32-bit two's complement integer format. The conversion is
  347. performed according to the IEC/IEEE Standard for Binary Floating-Point
  348. Arithmetic, except that the conversion is always rounded toward zero.
  349. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  350. the conversion overflows, the largest integer with the same sign as `a' is
  351. returned.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the 32-bit two's complement integer format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic---which means in particular that the conversion is rounded
  361. according to the current rounding mode. If `a' is a NaN, the largest
  362. positive integer is returned. Otherwise, if the conversion overflows, the
  363. largest integer with the same sign as `a' is returned.
  364. -------------------------------------------------------------------------------
  365. *}
  366. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  367. {*
  368. -------------------------------------------------------------------------------
  369. Returns the result of converting the 32-bit two's complement integer `a' to
  370. the double-precision floating-point format. The conversion is performed
  371. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  372. -------------------------------------------------------------------------------
  373. *}
  374. Function int32_to_float64( a: int32) : float64; compilerproc;
  375. {*
  376. -------------------------------------------------------------------------------
  377. Returns the result of converting the 32-bit two's complement integer `a' to
  378. the single-precision floating-point format. The conversion is performed
  379. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  380. -------------------------------------------------------------------------------
  381. *}
  382. Function int32_to_float32( a: int32): float32rec; compilerproc;
  383. {*----------------------------------------------------------------------------
  384. | Returns the result of converting the 64-bit two's complement integer `a'
  385. | to the double-precision floating-point format. The conversion is performed
  386. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. *----------------------------------------------------------------------------*}
  388. Function int64_to_float64( a: int64 ): float64; compilerproc;
  389. {*----------------------------------------------------------------------------
  390. | Returns the result of converting the 64-bit two's complement integer `a'
  391. | to the single-precision floating-point format. The conversion is performed
  392. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. *----------------------------------------------------------------------------*}
  394. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  395. CONST
  396. {-------------------------------------------------------------------------------
  397. Software IEC/IEEE floating-point underflow tininess-detection mode.
  398. -------------------------------------------------------------------------------
  399. *}
  400. float_tininess_after_rounding = 0;
  401. float_tininess_before_rounding = 1;
  402. {*
  403. -------------------------------------------------------------------------------
  404. Underflow tininess-detection mode, statically initialized to default value.
  405. (The declaration in `softfloat.h' must match the `int8' type here.)
  406. -------------------------------------------------------------------------------
  407. *}
  408. const float_detect_tininess: int8 = float_tininess_after_rounding;
  409. {$endif not(defined(fpc_softfpu_implementation))}
  410. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  411. implementation
  412. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  413. {$if not(defined(fpc_softfpu_interface))}
  414. (*****************************************************************************)
  415. (*----------------------------------------------------------------------------*)
  416. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  417. (* division and square root approximations. (Can be specialized to target if *)
  418. (* desired.) *)
  419. (* ---------------------------------------------------------------------------*)
  420. (*****************************************************************************)
  421. {*----------------------------------------------------------------------------
  422. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  423. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  424. | input. If `zSign' is 1, the input is negated before being converted to an
  425. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  426. | is simply rounded to an integer, with the inexact exception raised if the
  427. | input cannot be represented exactly as an integer. However, if the fixed-
  428. | point input is too large, the invalid exception is raised and the largest
  429. | positive or negative integer is returned.
  430. *----------------------------------------------------------------------------*}
  431. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  432. var
  433. roundingMode: int8;
  434. roundNearestEven: flag;
  435. roundIncrement, roundBits: int8;
  436. z: int32;
  437. begin
  438. roundingMode := softfloat_rounding_mode;
  439. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  440. roundIncrement := $40;
  441. if ( roundNearestEven=0 ) then
  442. begin
  443. if ( roundingMode = float_round_to_zero ) then
  444. begin
  445. roundIncrement := 0;
  446. end
  447. else begin
  448. roundIncrement := $7F;
  449. if ( zSign<>0 ) then
  450. begin
  451. if ( roundingMode = float_round_up ) then
  452. roundIncrement := 0;
  453. end
  454. else begin
  455. if ( roundingMode = float_round_down ) then
  456. roundIncrement := 0;
  457. end;
  458. end;
  459. end;
  460. roundBits := absZ and $7F;
  461. absZ := ( absZ + roundIncrement ) shr 7;
  462. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  463. z := absZ;
  464. if ( zSign<>0 ) then
  465. z := - z;
  466. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  467. begin
  468. float_raise( float_flag_invalid );
  469. if zSign<>0 then
  470. result:=sbits32($80000000)
  471. else
  472. result:=$7FFFFFFF;
  473. exit;
  474. end;
  475. if ( roundBits<>0 ) then
  476. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  477. result:=z;
  478. end;
  479. {*----------------------------------------------------------------------------
  480. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  481. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  482. | and returns the properly rounded 64-bit integer corresponding to the input.
  483. | If `zSign' is 1, the input is negated before being converted to an integer.
  484. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  485. | the inexact exception raised if the input cannot be represented exactly as
  486. | an integer. However, if the fixed-point input is too large, the invalid
  487. | exception is raised and the largest positive or negative integer is
  488. | returned.
  489. *----------------------------------------------------------------------------*}
  490. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  491. var
  492. roundingMode: int8;
  493. roundNearestEven, increment: flag;
  494. z: int64;
  495. label
  496. overflow;
  497. begin
  498. roundingMode := softfloat_rounding_mode;
  499. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  500. increment := ord( sbits64(absZ1) < 0 );
  501. if ( roundNearestEven=0 ) then
  502. begin
  503. if ( roundingMode = float_round_to_zero ) then
  504. begin
  505. increment := 0;
  506. end
  507. else begin
  508. if ( zSign<>0 ) then
  509. begin
  510. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  511. end
  512. else begin
  513. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  514. end;
  515. end;
  516. end;
  517. if ( increment<>0 ) then
  518. begin
  519. inc(absZ0);
  520. if ( absZ0 = 0 ) then
  521. goto overflow;
  522. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  523. end;
  524. z := absZ0;
  525. if ( zSign<>0 ) then
  526. z := - z;
  527. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  528. begin
  529. overflow:
  530. float_raise( float_flag_invalid );
  531. if zSign<>0 then
  532. result:=int64($8000000000000000)
  533. else
  534. result:=int64($7FFFFFFFFFFFFFFF);
  535. end;
  536. if ( absZ1<>0 ) then
  537. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  538. result:=z;
  539. end;
  540. {*
  541. -------------------------------------------------------------------------------
  542. Shifts `a' right by the number of bits given in `count'. If any nonzero
  543. bits are shifted off, they are ``jammed'' into the least significant bit of
  544. the result by setting the least significant bit to 1. The value of `count'
  545. can be arbitrarily large; in particular, if `count' is greater than 32, the
  546. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  547. The result is stored in the location pointed to by `zPtr'.
  548. -------------------------------------------------------------------------------
  549. *}
  550. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  551. var
  552. z: Bits32;
  553. Begin
  554. if ( count = 0 ) then
  555. z := a
  556. else
  557. if ( count < 32 ) then
  558. Begin
  559. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  560. End
  561. else
  562. Begin
  563. z := bits32( a <> 0 );
  564. End;
  565. zPtr := z;
  566. End;
  567. {*----------------------------------------------------------------------------
  568. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  569. | number of bits given in `count'. Any bits shifted off are lost. The value
  570. | of `count' can be arbitrarily large; in particular, if `count' is greater
  571. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  572. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  573. *----------------------------------------------------------------------------*}
  574. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  575. var
  576. z0, z1: bits64;
  577. negCount: int8;
  578. begin
  579. negCount := ( - count ) and 63;
  580. if ( count = 0 ) then
  581. begin
  582. z1 := a1;
  583. z0 := a0;
  584. end
  585. else if ( count < 64 ) then
  586. begin
  587. z1 := ( a0 shl negCount ) or ( a1 shr count );
  588. z0 := a0 shr count;
  589. end
  590. else
  591. begin
  592. if ( count shl 64 )<>0 then
  593. z1 := a0 shr ( count and 63 )
  594. else
  595. z1 := 0;
  596. z0 := 0;
  597. end;
  598. z1Ptr := z1;
  599. z0Ptr := z0;
  600. end;
  601. {*
  602. -------------------------------------------------------------------------------
  603. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  604. number of bits given in `count'. Any bits shifted off are lost. The value
  605. of `count' can be arbitrarily large; in particular, if `count' is greater
  606. than 64, the result will be 0. The result is broken into two 32-bit pieces
  607. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  608. -------------------------------------------------------------------------------
  609. *}
  610. Procedure
  611. shift64Right(
  612. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  613. Var
  614. z0, z1: bits32;
  615. negCount : int8;
  616. Begin
  617. negCount := ( - count ) AND 31;
  618. if ( count = 0 ) then
  619. Begin
  620. z1 := a1;
  621. z0 := a0;
  622. End
  623. else if ( count < 32 ) then
  624. Begin
  625. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  626. z0 := a0 shr count;
  627. End
  628. else
  629. Begin
  630. if (count < 64) then
  631. z1 := ( a0 shr ( count AND 31 ) )
  632. else
  633. z1 := 0;
  634. z0 := 0;
  635. End;
  636. z1Ptr := z1;
  637. z0Ptr := z0;
  638. End;
  639. {*
  640. -------------------------------------------------------------------------------
  641. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  642. number of bits given in `count'. If any nonzero bits are shifted off, they
  643. are ``jammed'' into the least significant bit of the result by setting the
  644. least significant bit to 1. The value of `count' can be arbitrarily large;
  645. in particular, if `count' is greater than 64, the result will be either 0
  646. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  647. nonzero. The result is broken into two 32-bit pieces which are stored at
  648. the locations pointed to by `z0Ptr' and `z1Ptr'.
  649. -------------------------------------------------------------------------------
  650. *}
  651. Procedure
  652. shift64RightJamming(
  653. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  654. VAR
  655. z0, z1 : bits32;
  656. negCount : int8;
  657. Begin
  658. negCount := ( - count ) AND 31;
  659. if ( count = 0 ) then
  660. Begin
  661. z1 := a1;
  662. z0 := a0;
  663. End
  664. else
  665. if ( count < 32 ) then
  666. Begin
  667. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  668. z0 := a0 shr count;
  669. End
  670. else
  671. Begin
  672. if ( count = 32 ) then
  673. Begin
  674. z1 := a0 OR bits32( a1 <> 0 );
  675. End
  676. else
  677. if ( count < 64 ) Then
  678. Begin
  679. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  680. End
  681. else
  682. Begin
  683. z1 := bits32( ( a0 OR a1 ) <> 0 );
  684. End;
  685. z0 := 0;
  686. End;
  687. z1Ptr := z1;
  688. z0Ptr := z0;
  689. End;
  690. {*----------------------------------------------------------------------------
  691. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  692. | bits are shifted off, they are ``jammed'' into the least significant bit of
  693. | the result by setting the least significant bit to 1. The value of `count'
  694. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  695. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  696. | The result is stored in the location pointed to by `zPtr'.
  697. *----------------------------------------------------------------------------*}
  698. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  699. var
  700. z: bits64;
  701. begin
  702. if ( count = 0 ) then
  703. begin
  704. z := a;
  705. end
  706. else if ( count < 64 ) then
  707. begin
  708. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  709. end
  710. else
  711. begin
  712. z := ord( a <> 0 );
  713. end;
  714. zPtr := z;
  715. end;
  716. {*
  717. -------------------------------------------------------------------------------
  718. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  719. by 32 _plus_ the number of bits given in `count'. The shifted result is
  720. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  721. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  722. off form a third 32-bit result as follows: The _last_ bit shifted off is
  723. the most-significant bit of the extra result, and the other 31 bits of the
  724. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  725. were all zero. This extra result is stored in the location pointed to by
  726. `z2Ptr'. The value of `count' can be arbitrarily large.
  727. (This routine makes more sense if `a0', `a1', and `a2' are considered
  728. to form a fixed-point value with binary point between `a1' and `a2'. This
  729. fixed-point value is shifted right by the number of bits given in `count',
  730. and the integer part of the result is returned at the locations pointed to
  731. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  732. corrupted as described above, and is returned at the location pointed to by
  733. `z2Ptr'.)
  734. -------------------------------------------------------------------------------
  735. }
  736. Procedure
  737. shift64ExtraRightJamming(
  738. a0: bits32;
  739. a1: bits32;
  740. a2: bits32;
  741. count: int16;
  742. VAR z0Ptr: bits32;
  743. VAR z1Ptr: bits32;
  744. VAR z2Ptr: bits32
  745. );
  746. Var
  747. z0, z1, z2: bits32;
  748. negCount : int8;
  749. Begin
  750. negCount := ( - count ) AND 31;
  751. if ( count = 0 ) then
  752. Begin
  753. z2 := a2;
  754. z1 := a1;
  755. z0 := a0;
  756. End
  757. else
  758. Begin
  759. if ( count < 32 ) Then
  760. Begin
  761. z2 := a1 shl negCount;
  762. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  763. z0 := a0 shr count;
  764. End
  765. else
  766. Begin
  767. if ( count = 32 ) then
  768. Begin
  769. z2 := a1;
  770. z1 := a0;
  771. End
  772. else
  773. Begin
  774. a2 := a2 or a1;
  775. if ( count < 64 ) then
  776. Begin
  777. z2 := a0 shl negCount;
  778. z1 := a0 shr ( count AND 31 );
  779. End
  780. else
  781. Begin
  782. if count = 64 then
  783. z2 := a0
  784. else
  785. z2 := bits32(a0 <> 0);
  786. z1 := 0;
  787. End;
  788. End;
  789. z0 := 0;
  790. End;
  791. z2 := z2 or bits32( a2 <> 0 );
  792. End;
  793. z2Ptr := z2;
  794. z1Ptr := z1;
  795. z0Ptr := z0;
  796. End;
  797. {*
  798. -------------------------------------------------------------------------------
  799. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  800. number of bits given in `count'. Any bits shifted off are lost. The value
  801. of `count' must be less than 32. The result is broken into two 32-bit
  802. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  803. -------------------------------------------------------------------------------
  804. *}
  805. Procedure
  806. shortShift64Left(
  807. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  808. Begin
  809. z1Ptr := a1 shl count;
  810. if count = 0 then
  811. z0Ptr := a0
  812. else
  813. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  814. End;
  815. {*
  816. -------------------------------------------------------------------------------
  817. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  818. by the number of bits given in `count'. Any bits shifted off are lost.
  819. The value of `count' must be less than 32. The result is broken into three
  820. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  821. `z1Ptr', and `z2Ptr'.
  822. -------------------------------------------------------------------------------
  823. *}
  824. Procedure
  825. shortShift96Left(
  826. a0: bits32;
  827. a1: bits32;
  828. a2: bits32;
  829. count: int16;
  830. VAR z0Ptr: bits32;
  831. VAR z1Ptr: bits32;
  832. VAR z2Ptr: bits32
  833. );
  834. Var
  835. z0, z1, z2: bits32;
  836. negCount: int8;
  837. Begin
  838. z2 := a2 shl count;
  839. z1 := a1 shl count;
  840. z0 := a0 shl count;
  841. if ( 0 < count ) then
  842. Begin
  843. negCount := ( ( - count ) AND 31 );
  844. z1 := z1 or (a2 shr negCount);
  845. z0 := z0 or (a1 shr negCount);
  846. End;
  847. z2Ptr := z2;
  848. z1Ptr := z1;
  849. z0Ptr := z0;
  850. End;
  851. {*----------------------------------------------------------------------------
  852. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  853. | number of bits given in `count'. Any bits shifted off are lost. The value
  854. | of `count' must be less than 64. The result is broken into two 64-bit
  855. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  856. *----------------------------------------------------------------------------*}
  857. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);inline;
  858. begin
  859. z1Ptr := a1 shl count;
  860. if count=0 then
  861. z0Ptr:=a0
  862. else
  863. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  864. end;
  865. {*
  866. -------------------------------------------------------------------------------
  867. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  868. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  869. any carry out is lost. The result is broken into two 32-bit pieces which
  870. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  871. -------------------------------------------------------------------------------
  872. *}
  873. Procedure
  874. add64(
  875. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  876. Var
  877. z1: bits32;
  878. Begin
  879. z1 := a1 + b1;
  880. z1Ptr := z1;
  881. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  882. End;
  883. {*
  884. -------------------------------------------------------------------------------
  885. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  886. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  887. modulo 2^96, so any carry out is lost. The result is broken into three
  888. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  889. `z1Ptr', and `z2Ptr'.
  890. -------------------------------------------------------------------------------
  891. *}
  892. Procedure
  893. add96(
  894. a0: bits32;
  895. a1: bits32;
  896. a2: bits32;
  897. b0: bits32;
  898. b1: bits32;
  899. b2: bits32;
  900. VAR z0Ptr: bits32;
  901. VAR z1Ptr: bits32;
  902. VAR z2Ptr: bits32
  903. );
  904. var
  905. z0, z1, z2: bits32;
  906. carry0, carry1: int8;
  907. Begin
  908. z2 := a2 + b2;
  909. carry1 := int8( z2 < a2 );
  910. z1 := a1 + b1;
  911. carry0 := int8( z1 < a1 );
  912. z0 := a0 + b0;
  913. z1 := z1 + carry1;
  914. z0 := z0 + bits32( z1 < carry1 );
  915. z0 := z0 + carry0;
  916. z2Ptr := z2;
  917. z1Ptr := z1;
  918. z0Ptr := z0;
  919. End;
  920. {*
  921. -------------------------------------------------------------------------------
  922. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  923. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  924. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  925. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  926. `z1Ptr'.
  927. -------------------------------------------------------------------------------
  928. *}
  929. Procedure
  930. sub64(
  931. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  932. Begin
  933. z1Ptr := a1 - b1;
  934. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  935. End;
  936. {*
  937. -------------------------------------------------------------------------------
  938. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  939. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  940. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  941. into three 32-bit pieces which are stored at the locations pointed to by
  942. `z0Ptr', `z1Ptr', and `z2Ptr'.
  943. -------------------------------------------------------------------------------
  944. *}
  945. Procedure
  946. sub96(
  947. a0:bits32;
  948. a1:bits32;
  949. a2:bits32;
  950. b0:bits32;
  951. b1:bits32;
  952. b2:bits32;
  953. VAR z0Ptr:bits32;
  954. VAR z1Ptr:bits32;
  955. VAR z2Ptr:bits32
  956. );
  957. Var
  958. z0, z1, z2: bits32;
  959. borrow0, borrow1: int8;
  960. Begin
  961. z2 := a2 - b2;
  962. borrow1 := int8( a2 < b2 );
  963. z1 := a1 - b1;
  964. borrow0 := int8( a1 < b1 );
  965. z0 := a0 - b0;
  966. z0 := z0 - bits32( z1 < borrow1 );
  967. z1 := z1 - borrow1;
  968. z0 := z0 -borrow0;
  969. z2Ptr := z2;
  970. z1Ptr := z1;
  971. z0Ptr := z0;
  972. End;
  973. {*
  974. -------------------------------------------------------------------------------
  975. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  976. into two 32-bit pieces which are stored at the locations pointed to by
  977. `z0Ptr' and `z1Ptr'.
  978. -------------------------------------------------------------------------------
  979. *}
  980. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  981. :bits32 );
  982. Var
  983. aHigh, aLow, bHigh, bLow: bits16;
  984. z0, zMiddleA, zMiddleB, z1: bits32;
  985. Begin
  986. aLow := a and $ffff;
  987. aHigh := a shr 16;
  988. bLow := b and $ffff;
  989. bHigh := b shr 16;
  990. z1 := ( bits32( aLow) ) * bLow;
  991. zMiddleA := ( bits32 (aLow) ) * bHigh;
  992. zMiddleB := ( bits32 (aHigh) ) * bLow;
  993. z0 := ( bits32 (aHigh) ) * bHigh;
  994. zMiddleA := zMiddleA + zMiddleB;
  995. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  996. zMiddleA := zmiddleA shl 16;
  997. z1 := z1 + zMiddleA;
  998. z0 := z0 + bits32( z1 < zMiddleA );
  999. z1Ptr := z1;
  1000. z0Ptr := z0;
  1001. End;
  1002. {*
  1003. -------------------------------------------------------------------------------
  1004. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1005. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1006. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1007. `z2Ptr'.
  1008. -------------------------------------------------------------------------------
  1009. *}
  1010. Procedure
  1011. mul64By32To96(
  1012. a0:bits32;
  1013. a1:bits32;
  1014. b:bits32;
  1015. VAR z0Ptr:bits32;
  1016. VAR z1Ptr:bits32;
  1017. VAR z2Ptr:bits32
  1018. );
  1019. Var
  1020. z0, z1, z2, more1: bits32;
  1021. Begin
  1022. mul32To64( a1, b, z1, z2 );
  1023. mul32To64( a0, b, z0, more1 );
  1024. add64( z0, more1, 0, z1, z0, z1 );
  1025. z2Ptr := z2;
  1026. z1Ptr := z1;
  1027. z0Ptr := z0;
  1028. End;
  1029. {*
  1030. -------------------------------------------------------------------------------
  1031. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1032. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1033. product. The product is broken into four 32-bit pieces which are stored at
  1034. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1035. -------------------------------------------------------------------------------
  1036. *}
  1037. Procedure
  1038. mul64To128(
  1039. a0:bits32;
  1040. a1:bits32;
  1041. b0:bits32;
  1042. b1:bits32;
  1043. VAR z0Ptr:bits32;
  1044. VAR z1Ptr:bits32;
  1045. VAR z2Ptr:bits32;
  1046. VAR z3Ptr:bits32
  1047. );
  1048. Var
  1049. z0, z1, z2, z3: bits32;
  1050. more1, more2: bits32;
  1051. Begin
  1052. mul32To64( a1, b1, z2, z3 );
  1053. mul32To64( a1, b0, z1, more2 );
  1054. add64( z1, more2, 0, z2, z1, z2 );
  1055. mul32To64( a0, b0, z0, more1 );
  1056. add64( z0, more1, 0, z1, z0, z1 );
  1057. mul32To64( a0, b1, more1, more2 );
  1058. add64( more1, more2, 0, z2, more1, z2 );
  1059. add64( z0, z1, 0, more1, z0, z1 );
  1060. z3Ptr := z3;
  1061. z2Ptr := z2;
  1062. z1Ptr := z1;
  1063. z0Ptr := z0;
  1064. End;
  1065. {*
  1066. -------------------------------------------------------------------------------
  1067. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1068. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1069. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1070. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1071. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1072. unsigned integer is returned.
  1073. -------------------------------------------------------------------------------
  1074. *}
  1075. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1076. Var
  1077. b0, b1: bits32;
  1078. rem0, rem1, term0, term1: bits32;
  1079. z: bits32;
  1080. Begin
  1081. if ( b <= a0 ) then
  1082. Begin
  1083. estimateDiv64To32 := $FFFFFFFF;
  1084. exit;
  1085. End;
  1086. b0 := b shr 16;
  1087. if ( b0 shl 16 <= a0 ) then
  1088. z:= $FFFF0000
  1089. else
  1090. z:= ( a0 div b0 ) shl 16;
  1091. mul32To64( b, z, term0, term1 );
  1092. sub64( a0, a1, term0, term1, rem0, rem1 );
  1093. while ( ( sbits32 (rem0) ) < 0 ) do
  1094. Begin
  1095. z := z - $10000;
  1096. b1 := b shl 16;
  1097. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1098. End;
  1099. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1100. if ( b0 shl 16 <= rem0 ) then
  1101. z := z or $FFFF
  1102. else
  1103. z := z or (rem0 div b0);
  1104. estimateDiv64To32 := z;
  1105. End;
  1106. {*
  1107. -------------------------------------------------------------------------------
  1108. Returns an approximation to the square root of the 32-bit significand given
  1109. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1110. `aExp' (the least significant bit) is 1, the integer returned approximates
  1111. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1112. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1113. case, the approximation returned lies strictly within +/-2 of the exact
  1114. value.
  1115. -------------------------------------------------------------------------------
  1116. *}
  1117. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1118. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1119. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1120. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1121. );
  1122. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1123. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1124. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1125. );
  1126. Var
  1127. index: int8;
  1128. z: bits32;
  1129. Begin
  1130. index := ( a shr 27 ) AND 15;
  1131. if ( aExp AND 1 ) <> 0 then
  1132. Begin
  1133. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1134. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1135. a := a shr 1;
  1136. End
  1137. else
  1138. Begin
  1139. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1140. z := a div z + z;
  1141. if ( $20000 <= z ) then
  1142. z := $FFFF8000
  1143. else
  1144. z := ( z shl 15 );
  1145. if ( z <= a ) then
  1146. Begin
  1147. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1148. exit;
  1149. End;
  1150. End;
  1151. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1152. End;
  1153. {*
  1154. -------------------------------------------------------------------------------
  1155. Returns the number of leading 0 bits before the most-significant 1 bit of
  1156. `a'. If `a' is zero, 32 is returned.
  1157. -------------------------------------------------------------------------------
  1158. *}
  1159. Function countLeadingZeros32( a:bits32 ): int8;
  1160. const countLeadingZerosHigh:array[0..255] of int8 = (
  1161. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1162. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1163. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1164. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1165. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1166. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1167. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1168. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1169. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1170. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1171. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1172. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1173. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1174. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1175. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1176. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1177. );
  1178. Var
  1179. shiftCount: int8;
  1180. Begin
  1181. shiftCount := 0;
  1182. if ( a < $10000 ) then
  1183. Begin
  1184. shiftCount := shiftcount + 16;
  1185. a := a shl 16;
  1186. End;
  1187. if ( a < $1000000 ) then
  1188. Begin
  1189. shiftCount := shiftcount + 8;
  1190. a := a shl 8;
  1191. end;
  1192. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1193. countLeadingZeros32:= shiftCount;
  1194. End;
  1195. {*----------------------------------------------------------------------------
  1196. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1197. | `a'. If `a' is zero, 64 is returned.
  1198. *----------------------------------------------------------------------------*}
  1199. function countLeadingZeros64( a : bits64): int8;
  1200. var
  1201. shiftcount : int8;
  1202. Begin
  1203. shiftCount := 0;
  1204. if ( a < (bits64(1) shl 32 )) then
  1205. shiftCount := shiftcount + 32
  1206. else
  1207. a := a shr 32;
  1208. shiftCount := shiftCount + countLeadingZeros32( a );
  1209. countLeadingZeros64:= shiftCount;
  1210. End;
  1211. {*
  1212. -------------------------------------------------------------------------------
  1213. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1214. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1215. returns 0.
  1216. -------------------------------------------------------------------------------
  1217. *}
  1218. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1219. Begin
  1220. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1221. End;
  1222. {*
  1223. -------------------------------------------------------------------------------
  1224. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1225. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1226. Otherwise, returns 0.
  1227. -------------------------------------------------------------------------------
  1228. *}
  1229. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1230. Begin
  1231. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1232. End;
  1233. {*
  1234. -------------------------------------------------------------------------------
  1235. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1236. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1237. returns 0.
  1238. -------------------------------------------------------------------------------
  1239. *}
  1240. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1241. Begin
  1242. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1243. End;
  1244. {*
  1245. -------------------------------------------------------------------------------
  1246. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1247. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1248. returns 0.
  1249. -------------------------------------------------------------------------------
  1250. *}
  1251. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1252. Begin
  1253. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1254. End;
  1255. (*****************************************************************************)
  1256. (* End Low-Level arithmetic *)
  1257. (*****************************************************************************)
  1258. {*
  1259. -------------------------------------------------------------------------------
  1260. Functions and definitions to determine: (1) whether tininess for underflow
  1261. is detected before or after rounding by default, (2) what (if anything)
  1262. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1263. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1264. are propagated from function inputs to output. These details are ENDIAN
  1265. specific
  1266. -------------------------------------------------------------------------------
  1267. *}
  1268. {$IFDEF ENDIAN_LITTLE}
  1269. {*
  1270. -------------------------------------------------------------------------------
  1271. Internal canonical NaN format.
  1272. -------------------------------------------------------------------------------
  1273. *}
  1274. TYPE
  1275. commonNaNT = packed record
  1276. sign: flag;
  1277. high, low : bits32;
  1278. end;
  1279. {*
  1280. -------------------------------------------------------------------------------
  1281. The pattern for a default generated single-precision NaN.
  1282. -------------------------------------------------------------------------------
  1283. *}
  1284. const float32_default_nan = $FFC00000;
  1285. {*
  1286. -------------------------------------------------------------------------------
  1287. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1288. otherwise returns 0.
  1289. -------------------------------------------------------------------------------
  1290. *}
  1291. Function float32_is_nan( a : float32 ): flag;
  1292. Begin
  1293. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1294. End;
  1295. {*
  1296. -------------------------------------------------------------------------------
  1297. Returns 1 if the single-precision floating-point value `a' is a signaling
  1298. NaN; otherwise returns 0.
  1299. -------------------------------------------------------------------------------
  1300. *}
  1301. Function float32_is_signaling_nan( a : float32 ): flag;
  1302. Begin
  1303. float32_is_signaling_nan := flag
  1304. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1305. End;
  1306. {*
  1307. -------------------------------------------------------------------------------
  1308. Returns the result of converting the single-precision floating-point NaN
  1309. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1310. exception is raised.
  1311. -------------------------------------------------------------------------------
  1312. *}
  1313. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1314. var
  1315. z : commonNaNT ;
  1316. Begin
  1317. if ( float32_is_signaling_nan( a ) <> 0) then
  1318. float_raise( float_flag_invalid );
  1319. z.sign := a shr 31;
  1320. z.low := 0;
  1321. z.high := a shl 9;
  1322. c := z;
  1323. End;
  1324. {*
  1325. -------------------------------------------------------------------------------
  1326. Returns the result of converting the canonical NaN `a' to the single-
  1327. precision floating-point format.
  1328. -------------------------------------------------------------------------------
  1329. *}
  1330. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1331. Begin
  1332. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1333. End;
  1334. {*
  1335. -------------------------------------------------------------------------------
  1336. Takes two single-precision floating-point values `a' and `b', one of which
  1337. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1338. signaling NaN, the invalid exception is raised.
  1339. -------------------------------------------------------------------------------
  1340. *}
  1341. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1342. Var
  1343. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1344. label returnLargerSignificand;
  1345. Begin
  1346. aIsNaN := float32_is_nan( a );
  1347. aIsSignalingNaN := float32_is_signaling_nan( a );
  1348. bIsNaN := float32_is_nan( b );
  1349. bIsSignalingNaN := float32_is_signaling_nan( b );
  1350. a := a or $00400000;
  1351. b := b or $00400000;
  1352. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1353. float_raise( float_flag_invalid );
  1354. if ( aIsSignalingNaN )<> 0 then
  1355. Begin
  1356. if ( bIsSignalingNaN ) <> 0 then
  1357. goto returnLargerSignificand;
  1358. if bIsNan <> 0 then
  1359. propagateFloat32NaN := b
  1360. else
  1361. propagateFloat32NaN := a;
  1362. exit;
  1363. End
  1364. else if ( aIsNaN <> 0) then
  1365. Begin
  1366. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1367. Begin
  1368. propagateFloat32NaN := a;
  1369. exit;
  1370. End;
  1371. returnLargerSignificand:
  1372. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1373. Begin
  1374. propagateFloat32NaN := b;
  1375. exit;
  1376. End;
  1377. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1378. Begin
  1379. propagateFloat32NaN := a;
  1380. End;
  1381. if a < b then
  1382. propagateFloat32NaN := a
  1383. else
  1384. propagateFloat32NaN := b;
  1385. exit;
  1386. End
  1387. else
  1388. Begin
  1389. propagateFloat32NaN := b;
  1390. exit;
  1391. End;
  1392. End;
  1393. {*
  1394. -------------------------------------------------------------------------------
  1395. The pattern for a default generated double-precision NaN. The `high' and
  1396. `low' values hold the most- and least-significant bits, respectively.
  1397. -------------------------------------------------------------------------------
  1398. *}
  1399. const
  1400. float64_default_nan_high = $FFF80000;
  1401. float64_default_nan_low = $00000000;
  1402. {*
  1403. -------------------------------------------------------------------------------
  1404. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1405. otherwise returns 0.
  1406. -------------------------------------------------------------------------------
  1407. *}
  1408. Function float64_is_nan( a : float64 ) : flag;
  1409. Begin
  1410. float64_is_nan :=
  1411. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1412. and ( a.low or ( a.high and $000FFFFF ) );
  1413. End;
  1414. {*
  1415. -------------------------------------------------------------------------------
  1416. Returns 1 if the double-precision floating-point value `a' is a signaling
  1417. NaN; otherwise returns 0.
  1418. -------------------------------------------------------------------------------
  1419. *}
  1420. Function float64_is_signaling_nan( a : float64 ): flag;
  1421. Begin
  1422. float64_is_signaling_nan :=
  1423. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1424. and ( a.low or ( a.high and $0007FFFF ) );
  1425. End;
  1426. {*
  1427. -------------------------------------------------------------------------------
  1428. Returns the result of converting the double-precision floating-point NaN
  1429. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1430. exception is raised.
  1431. -------------------------------------------------------------------------------
  1432. *}
  1433. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1434. Var
  1435. z : commonNaNT;
  1436. Begin
  1437. if ( float64_is_signaling_nan( a )<>0 ) then
  1438. float_raise( float_flag_invalid );
  1439. z.sign := a.high shr 31;
  1440. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1441. c := z;
  1442. End;
  1443. {*
  1444. -------------------------------------------------------------------------------
  1445. Returns the result of converting the canonical NaN `a' to the double-
  1446. precision floating-point format.
  1447. -------------------------------------------------------------------------------
  1448. *}
  1449. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1450. Var
  1451. z: float64;
  1452. Begin
  1453. shift64Right( a.high, a.low, 12, z.high, z.low );
  1454. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1455. c := z;
  1456. End;
  1457. {*
  1458. -------------------------------------------------------------------------------
  1459. Takes two double-precision floating-point values `a' and `b', one of which
  1460. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1461. signaling NaN, the invalid exception is raised.
  1462. -------------------------------------------------------------------------------
  1463. *}
  1464. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1465. Var
  1466. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1467. label returnLargerSignificand;
  1468. Begin
  1469. aIsNaN := float64_is_nan( a );
  1470. aIsSignalingNaN := float64_is_signaling_nan( a );
  1471. bIsNaN := float64_is_nan( b );
  1472. bIsSignalingNaN := float64_is_signaling_nan( b );
  1473. a.high := a.high or $00080000;
  1474. b.high := b.high or $00080000;
  1475. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1476. float_raise( float_flag_invalid );
  1477. if ( aIsSignalingNaN )<>0 then
  1478. Begin
  1479. if ( bIsSignalingNaN )<>0 then
  1480. goto returnLargerSignificand;
  1481. if bIsNan <> 0 then
  1482. c := b
  1483. else
  1484. c := a;
  1485. exit;
  1486. End
  1487. else if ( aIsNaN )<> 0 then
  1488. Begin
  1489. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1490. Begin
  1491. c := a;
  1492. exit;
  1493. End;
  1494. returnLargerSignificand:
  1495. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1496. Begin
  1497. c := b;
  1498. exit;
  1499. End;
  1500. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1501. Begin
  1502. c := a;
  1503. exit;
  1504. End;
  1505. if a.high < b.high then
  1506. c := a
  1507. else
  1508. c := b;
  1509. exit;
  1510. End
  1511. else
  1512. Begin
  1513. c := b;
  1514. exit;
  1515. End;
  1516. End;
  1517. {*----------------------------------------------------------------------------
  1518. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1519. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1520. | returns 0.
  1521. *----------------------------------------------------------------------------*}
  1522. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1523. begin
  1524. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1525. end;
  1526. {*----------------------------------------------------------------------------
  1527. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1528. | otherwise returns 0.
  1529. *----------------------------------------------------------------------------*}
  1530. function float128_is_nan( a : float128): flag;
  1531. begin
  1532. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1533. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1534. end;
  1535. {*----------------------------------------------------------------------------
  1536. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1537. | signaling NaN; otherwise returns 0.
  1538. *----------------------------------------------------------------------------*}
  1539. function float128_is_signaling_nan( a : float128): flag;
  1540. begin
  1541. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1542. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1543. end;
  1544. {*----------------------------------------------------------------------------
  1545. | Returns the result of converting the quadruple-precision floating-point NaN
  1546. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1547. | exception is raised.
  1548. *----------------------------------------------------------------------------*}
  1549. function float128ToCommonNaN( a : float128): commonNaNT;
  1550. var
  1551. z: commonNaNT;
  1552. qhigh,qlow : qword;
  1553. begin
  1554. if ( float128_is_signaling_nan( a )<>0) then
  1555. float_raise( float_flag_invalid );
  1556. z.sign := a.high shr 63;
  1557. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1558. z.high:=qhigh shr 32;
  1559. z.low:=qhigh and $ffffffff;
  1560. result:=z;
  1561. end;
  1562. {*----------------------------------------------------------------------------
  1563. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1564. | precision floating-point format.
  1565. *----------------------------------------------------------------------------*}
  1566. function commonNaNToFloat128( a : commonNaNT): float128;
  1567. var
  1568. z: float128;
  1569. begin
  1570. shift128Right( a.high, a.low, 16, z.high, z.low );
  1571. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1572. result:=z;
  1573. end;
  1574. {*----------------------------------------------------------------------------
  1575. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1576. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1577. | `b' is a signaling NaN, the invalid exception is raised.
  1578. *----------------------------------------------------------------------------*}
  1579. function propagateFloat128NaN( a: float128; b : float128): float128;
  1580. var
  1581. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1582. label
  1583. returnLargerSignificand;
  1584. begin
  1585. aIsNaN := float128_is_nan( a );
  1586. aIsSignalingNaN := float128_is_signaling_nan( a );
  1587. bIsNaN := float128_is_nan( b );
  1588. bIsSignalingNaN := float128_is_signaling_nan( b );
  1589. a.high := a.high or int64( $0000800000000000 );
  1590. b.high := b.high or int64( $0000800000000000 );
  1591. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1592. float_raise( float_flag_invalid );
  1593. if ( aIsSignalingNaN )<>0 then
  1594. begin
  1595. if ( bIsSignalingNaN )<>0 then
  1596. goto returnLargerSignificand;
  1597. if bIsNaN<>0 then
  1598. result := b
  1599. else
  1600. result := a;
  1601. exit;
  1602. end
  1603. else if ( aIsNaN )<>0 then
  1604. begin
  1605. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1606. begin
  1607. result := a;
  1608. exit;
  1609. end;
  1610. returnLargerSignificand:
  1611. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1612. begin
  1613. result := b;
  1614. exit;
  1615. end;
  1616. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1617. begin
  1618. result := a;
  1619. exit
  1620. end;
  1621. if ( a.high < b.high ) then
  1622. result := a
  1623. else
  1624. result := b;
  1625. exit;
  1626. end
  1627. else
  1628. result:=b;
  1629. end;
  1630. {$ELSE}
  1631. { Big endian code }
  1632. (*----------------------------------------------------------------------------
  1633. | Internal canonical NaN format.
  1634. *----------------------------------------------------------------------------*)
  1635. type
  1636. commonNANT = packed record
  1637. sign : flag;
  1638. high, low : bits32;
  1639. end;
  1640. (*----------------------------------------------------------------------------
  1641. | The pattern for a default generated single-precision NaN.
  1642. *----------------------------------------------------------------------------*)
  1643. const float32_default_nan = $7FFFFFFF;
  1644. (*----------------------------------------------------------------------------
  1645. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1646. | otherwise returns 0.
  1647. *----------------------------------------------------------------------------*)
  1648. function float32_is_nan(a: float32): flag;
  1649. begin
  1650. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1651. end;
  1652. (*----------------------------------------------------------------------------
  1653. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1654. | NaN; otherwise returns 0.
  1655. *----------------------------------------------------------------------------*)
  1656. function float32_is_signaling_nan(a: float32):flag;
  1657. begin
  1658. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1659. end;
  1660. (*----------------------------------------------------------------------------
  1661. | Returns the result of converting the single-precision floating-point NaN
  1662. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1663. | exception is raised.
  1664. *----------------------------------------------------------------------------*)
  1665. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1666. var
  1667. z: commonNANT;
  1668. begin
  1669. if float32_is_signaling_nan(a)<>0 then
  1670. float_raise(float_flag_invalid);
  1671. z.sign := a shr 31;
  1672. z.low := 0;
  1673. z.high := a shl 9;
  1674. c:=z;
  1675. end;
  1676. (*----------------------------------------------------------------------------
  1677. | Returns the result of converting the canonical NaN `a' to the single-
  1678. | precision floating-point format.
  1679. *----------------------------------------------------------------------------*)
  1680. function CommonNanToFloat32(a : CommonNaNT): float32;
  1681. begin
  1682. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1683. end;
  1684. (*----------------------------------------------------------------------------
  1685. | Takes two single-precision floating-point values `a' and `b', one of which
  1686. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1687. | signaling NaN, the invalid exception is raised.
  1688. *----------------------------------------------------------------------------*)
  1689. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1690. var
  1691. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1692. begin
  1693. aIsNaN := float32_is_nan( a );
  1694. aIsSignalingNaN := float32_is_signaling_nan( a );
  1695. bIsNaN := float32_is_nan( b );
  1696. bIsSignalingNaN := float32_is_signaling_nan( b );
  1697. a := a or $00400000;
  1698. b := b or $00400000;
  1699. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1700. float_raise( float_flag_invalid );
  1701. if bIsSignalingNaN<>0 then
  1702. propagateFloat32Nan := b
  1703. else if aIsSignalingNan<>0 then
  1704. propagateFloat32Nan := a
  1705. else if bIsNan<>0 then
  1706. propagateFloat32Nan := b
  1707. else
  1708. propagateFloat32Nan := a;
  1709. end;
  1710. (*----------------------------------------------------------------------------
  1711. | The pattern for a default generated double-precision NaN. The `high' and
  1712. | `low' values hold the most- and least-significant bits, respectively.
  1713. *----------------------------------------------------------------------------*)
  1714. const
  1715. float64_default_nan_high = $7FFFFFFF;
  1716. float64_default_nan_low = $FFFFFFFF;
  1717. (*----------------------------------------------------------------------------
  1718. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  1719. | otherwise returns 0.
  1720. *----------------------------------------------------------------------------*)
  1721. function float64_is_nan(a: float64): flag;
  1722. begin
  1723. float64_is_nan := flag (
  1724. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1725. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  1726. end;
  1727. (*----------------------------------------------------------------------------
  1728. | Returns 1 if the double-precision floating-point value `a' is a signaling
  1729. | NaN; otherwise returns 0.
  1730. *----------------------------------------------------------------------------*)
  1731. function float64_is_signaling_nan( a:float64): flag;
  1732. begin
  1733. float64_is_signaling_nan := flag(
  1734. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1735. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  1736. end;
  1737. (*----------------------------------------------------------------------------
  1738. | Returns the result of converting the double-precision floating-point NaN
  1739. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1740. | exception is raised.
  1741. *----------------------------------------------------------------------------*)
  1742. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1743. var
  1744. z : commonNaNT;
  1745. begin
  1746. if ( float64_is_signaling_nan( a )<>0 ) then
  1747. float_raise( float_flag_invalid );
  1748. z.sign := a.high shr 31;
  1749. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1750. c:=z;
  1751. end;
  1752. (*----------------------------------------------------------------------------
  1753. | Returns the result of converting the canonical NaN `a' to the double-
  1754. | precision floating-point format.
  1755. *----------------------------------------------------------------------------*)
  1756. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1757. var
  1758. z: float64;
  1759. begin
  1760. shift64Right( a.high, a.low, 12, z.high, z.low );
  1761. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1762. c:=z;
  1763. end;
  1764. (*----------------------------------------------------------------------------
  1765. | Takes two double-precision floating-point values `a' and `b', one of which
  1766. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1767. | signaling NaN, the invalid exception is raised.
  1768. *----------------------------------------------------------------------------*)
  1769. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1770. var
  1771. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  1772. begin
  1773. aIsNaN := float64_is_nan( a );
  1774. aIsSignalingNaN := float64_is_signaling_nan( a );
  1775. bIsNaN := float64_is_nan( b );
  1776. bIsSignalingNaN := float64_is_signaling_nan( b );
  1777. a.high := a.high or $00080000;
  1778. b.high := b.high or $00080000;
  1779. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  1780. float_raise( float_flag_invalid );
  1781. if bIsSignalingNaN<>0 then
  1782. c := b
  1783. else if aIsSignalingNan<>0 then
  1784. c := a
  1785. else if bIsNan<>0 then
  1786. c := b
  1787. else
  1788. c := a;
  1789. end;
  1790. {$ENDIF}
  1791. (****************************************************************************)
  1792. (* END ENDIAN SPECIFIC CODE *)
  1793. (****************************************************************************)
  1794. {*
  1795. -------------------------------------------------------------------------------
  1796. Returns the fraction bits of the single-precision floating-point value `a'.
  1797. -------------------------------------------------------------------------------
  1798. *}
  1799. Function ExtractFloat32Frac(a : Float32) : Bits32;
  1800. Begin
  1801. ExtractFloat32Frac := A AND $007FFFFF;
  1802. End;
  1803. {*
  1804. -------------------------------------------------------------------------------
  1805. Returns the exponent bits of the single-precision floating-point value `a'.
  1806. -------------------------------------------------------------------------------
  1807. *}
  1808. Function extractFloat32Exp( a: float32 ): Int16;
  1809. Begin
  1810. extractFloat32Exp := (a shr 23) AND $FF;
  1811. End;
  1812. {*
  1813. -------------------------------------------------------------------------------
  1814. Returns the sign bit of the single-precision floating-point value `a'.
  1815. -------------------------------------------------------------------------------
  1816. *}
  1817. Function extractFloat32Sign( a: float32 ): Flag;
  1818. Begin
  1819. extractFloat32Sign := a shr 31;
  1820. End;
  1821. {*
  1822. -------------------------------------------------------------------------------
  1823. Normalizes the subnormal single-precision floating-point value represented
  1824. by the denormalized significand `aSig'. The normalized exponent and
  1825. significand are stored at the locations pointed to by `zExpPtr' and
  1826. `zSigPtr', respectively.
  1827. -------------------------------------------------------------------------------
  1828. *}
  1829. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  1830. Var
  1831. ShiftCount : BYTE;
  1832. Begin
  1833. shiftCount := countLeadingZeros32( aSig ) - 8;
  1834. zSigPtr := aSig shl shiftCount;
  1835. zExpPtr := 1 - shiftCount;
  1836. End;
  1837. {*
  1838. -------------------------------------------------------------------------------
  1839. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  1840. single-precision floating-point value, returning the result. After being
  1841. shifted into the proper positions, the three fields are simply added
  1842. together to form the result. This means that any integer portion of `zSig'
  1843. will be added into the exponent. Since a properly normalized significand
  1844. will have an integer portion equal to 1, the `zExp' input should be 1 less
  1845. than the desired result exponent whenever `zSig' is a complete, normalized
  1846. significand.
  1847. -------------------------------------------------------------------------------
  1848. *}
  1849. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  1850. Begin
  1851. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  1852. + zSig;
  1853. End;
  1854. {*
  1855. -------------------------------------------------------------------------------
  1856. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1857. and significand `zSig', and returns the proper single-precision floating-
  1858. point value corresponding to the abstract input. Ordinarily, the abstract
  1859. value is simply rounded and packed into the single-precision format, with
  1860. the inexact exception raised if the abstract input cannot be represented
  1861. exactly. However, if the abstract value is too large, the overflow and
  1862. inexact exceptions are raised and an infinity or maximal finite value is
  1863. returned. If the abstract value is too small, the input value is rounded to
  1864. a subnormal number, and the underflow and inexact exceptions are raised if
  1865. the abstract input cannot be represented exactly as a subnormal single-
  1866. precision floating-point number.
  1867. The input significand `zSig' has its binary point between bits 30
  1868. and 29, which is 7 bits to the left of the usual location. This shifted
  1869. significand must be normalized or smaller. If `zSig' is not normalized,
  1870. `zExp' must be 0; in that case, the result returned is a subnormal number,
  1871. and it must not require rounding. In the usual case that `zSig' is
  1872. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  1873. The handling of underflow and overflow follows the IEC/IEEE Standard for
  1874. Binary Floating-Point Arithmetic.
  1875. -------------------------------------------------------------------------------
  1876. *}
  1877. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  1878. Var
  1879. roundingMode : BYTE;
  1880. roundNearestEven : Flag;
  1881. roundIncrement, roundBits : BYTE;
  1882. IsTiny : Flag;
  1883. Begin
  1884. roundingMode := softfloat_rounding_mode;
  1885. if (roundingMode = float_round_nearest_even) then
  1886. Begin
  1887. roundNearestEven := Flag(TRUE);
  1888. end
  1889. else
  1890. roundNearestEven := Flag(FALSE);
  1891. roundIncrement := $40;
  1892. if ( Boolean(roundNearestEven) = FALSE) then
  1893. Begin
  1894. if ( roundingMode = float_round_to_zero ) Then
  1895. Begin
  1896. roundIncrement := 0;
  1897. End
  1898. else
  1899. Begin
  1900. roundIncrement := $7F;
  1901. if ( zSign <> 0 ) then
  1902. Begin
  1903. if roundingMode = float_round_up then roundIncrement := 0;
  1904. End
  1905. else
  1906. Begin
  1907. if roundingMode = float_round_down then roundIncrement := 0;
  1908. End;
  1909. End
  1910. End;
  1911. roundBits := zSig AND $7F;
  1912. if ($FD <= bits16 (zExp) ) then
  1913. Begin
  1914. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  1915. Begin
  1916. float_raise( float_flag_overflow OR float_flag_inexact );
  1917. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  1918. exit;
  1919. End;
  1920. if ( zExp < 0 ) then
  1921. Begin
  1922. isTiny :=
  1923. flag(( float_detect_tininess = float_tininess_before_rounding )
  1924. OR ( zExp < -1 )
  1925. OR ( (zSig + roundIncrement) < $80000000 ));
  1926. shift32RightJamming( zSig, - zExp, zSig );
  1927. zExp := 0;
  1928. roundBits := zSig AND $7F;
  1929. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  1930. float_raise( float_flag_underflow );
  1931. End;
  1932. End;
  1933. if ( roundBits )<> 0 then
  1934. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  1935. zSig := ( zSig + roundIncrement ) shr 7;
  1936. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  1937. if ( zSig = 0 ) then zExp := 0;
  1938. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  1939. exit;
  1940. End;
  1941. {*
  1942. -------------------------------------------------------------------------------
  1943. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1944. and significand `zSig', and returns the proper single-precision floating-
  1945. point value corresponding to the abstract input. This routine is just like
  1946. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  1947. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  1948. floating-point exponent.
  1949. -------------------------------------------------------------------------------
  1950. *}
  1951. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  1952. Var
  1953. ShiftCount : int8;
  1954. Begin
  1955. shiftCount := countLeadingZeros32( zSig ) - 1;
  1956. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  1957. End;
  1958. {*
  1959. -------------------------------------------------------------------------------
  1960. Returns the most-significant 20 fraction bits of the double-precision
  1961. floating-point value `a'.
  1962. -------------------------------------------------------------------------------
  1963. *}
  1964. Function extractFloat64Frac0(a: float64): bits32;
  1965. Begin
  1966. extractFloat64Frac0 := a.high and $000FFFFF;
  1967. End;
  1968. {*
  1969. -------------------------------------------------------------------------------
  1970. Returns the least-significant 32 fraction bits of the double-precision
  1971. floating-point value `a'.
  1972. -------------------------------------------------------------------------------
  1973. *}
  1974. Function extractFloat64Frac1(a: float64): bits32;
  1975. Begin
  1976. extractFloat64Frac1 := a.low;
  1977. End;
  1978. {*
  1979. -------------------------------------------------------------------------------
  1980. Returns the exponent bits of the double-precision floating-point value `a'.
  1981. -------------------------------------------------------------------------------
  1982. *}
  1983. Function extractFloat64Exp(a: float64): int16;
  1984. Begin
  1985. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  1986. End;
  1987. {*
  1988. -------------------------------------------------------------------------------
  1989. Returns the sign bit of the double-precision floating-point value `a'.
  1990. -------------------------------------------------------------------------------
  1991. *}
  1992. Function extractFloat64Sign(a: float64) : flag;
  1993. Begin
  1994. extractFloat64Sign := a.high shr 31;
  1995. End;
  1996. {*
  1997. -------------------------------------------------------------------------------
  1998. Normalizes the subnormal double-precision floating-point value represented
  1999. by the denormalized significand formed by the concatenation of `aSig0' and
  2000. `aSig1'. The normalized exponent is stored at the location pointed to by
  2001. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2002. stored at the location pointed to by `zSig0Ptr', and the least significant
  2003. 32 bits of the normalized significand are stored at the location pointed to
  2004. by `zSig1Ptr'.
  2005. -------------------------------------------------------------------------------
  2006. *}
  2007. Procedure normalizeFloat64Subnormal(
  2008. aSig0: bits32;
  2009. aSig1: bits32;
  2010. VAR zExpPtr : Int16;
  2011. VAR zSig0Ptr : Bits32;
  2012. VAR zSig1Ptr : Bits32
  2013. );
  2014. Var
  2015. ShiftCount : Int8;
  2016. Begin
  2017. if ( aSig0 = 0 ) then
  2018. Begin
  2019. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2020. if ( shiftCount < 0 ) then
  2021. Begin
  2022. zSig0Ptr := aSig1 shr ( - shiftCount );
  2023. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2024. End
  2025. else
  2026. Begin
  2027. zSig0Ptr := aSig1 shl shiftCount;
  2028. zSig1Ptr := 0;
  2029. End;
  2030. zExpPtr := - shiftCount - 31;
  2031. End
  2032. else
  2033. Begin
  2034. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2035. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2036. zExpPtr := 1 - shiftCount;
  2037. End;
  2038. End;
  2039. {*
  2040. -------------------------------------------------------------------------------
  2041. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2042. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2043. point value, returning the result. After being shifted into the proper
  2044. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2045. together to form the most significant 32 bits of the result. This means
  2046. that any integer portion of `zSig0' will be added into the exponent. Since
  2047. a properly normalized significand will have an integer portion equal to 1,
  2048. the `zExp' input should be 1 less than the desired result exponent whenever
  2049. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2050. -------------------------------------------------------------------------------
  2051. *}
  2052. Procedure
  2053. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2054. var
  2055. z: Float64;
  2056. Begin
  2057. z.low := zSig1;
  2058. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2059. c := z;
  2060. End;
  2061. {*----------------------------------------------------------------------------
  2062. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2063. | double-precision floating-point value, returning the result. After being
  2064. | shifted into the proper positions, the three fields are simply added
  2065. | together to form the result. This means that any integer portion of `zSig'
  2066. | will be added into the exponent. Since a properly normalized significand
  2067. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2068. | than the desired result exponent whenever `zSig' is a complete, normalized
  2069. | significand.
  2070. *----------------------------------------------------------------------------*}
  2071. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2072. begin
  2073. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2074. end;
  2075. {*
  2076. -------------------------------------------------------------------------------
  2077. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2078. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2079. and `zSig2', and returns the proper double-precision floating-point value
  2080. corresponding to the abstract input. Ordinarily, the abstract value is
  2081. simply rounded and packed into the double-precision format, with the inexact
  2082. exception raised if the abstract input cannot be represented exactly.
  2083. However, if the abstract value is too large, the overflow and inexact
  2084. exceptions are raised and an infinity or maximal finite value is returned.
  2085. If the abstract value is too small, the input value is rounded to a
  2086. subnormal number, and the underflow and inexact exceptions are raised if the
  2087. abstract input cannot be represented exactly as a subnormal double-precision
  2088. floating-point number.
  2089. The input significand must be normalized or smaller. If the input
  2090. significand is not normalized, `zExp' must be 0; in that case, the result
  2091. returned is a subnormal number, and it must not require rounding. In the
  2092. usual case that the input significand is normalized, `zExp' must be 1 less
  2093. than the ``true'' floating-point exponent. The handling of underflow and
  2094. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2095. -------------------------------------------------------------------------------
  2096. *}
  2097. Procedure
  2098. roundAndPackFloat64(
  2099. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2100. Var
  2101. roundingMode : Int8;
  2102. roundNearestEven, increment, isTiny : Flag;
  2103. Begin
  2104. roundingMode := softfloat_rounding_mode;
  2105. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2106. increment := flag( sbits32 (zSig2) < 0 );
  2107. if ( roundNearestEven = flag(FALSE) ) then
  2108. Begin
  2109. if ( roundingMode = float_round_to_zero ) then
  2110. increment := 0
  2111. else
  2112. Begin
  2113. if ( zSign )<> 0 then
  2114. Begin
  2115. increment := flag( roundingMode = float_round_down ) and zSig2;
  2116. End
  2117. else
  2118. Begin
  2119. increment := flag( roundingMode = float_round_up ) and zSig2;
  2120. End
  2121. End
  2122. End;
  2123. if ( $7FD <= bits16 (zExp) ) then
  2124. Begin
  2125. if (( $7FD < zExp )
  2126. or (( zExp = $7FD )
  2127. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2128. and (increment<>0)
  2129. )
  2130. ) then
  2131. Begin
  2132. float_raise( float_flag_overflow OR float_flag_inexact );
  2133. if (( roundingMode = float_round_to_zero )
  2134. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2135. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2136. ) then
  2137. Begin
  2138. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2139. exit;
  2140. End;
  2141. packFloat64( zSign, $7FF, 0, 0, c );
  2142. exit;
  2143. End;
  2144. if ( zExp < 0 ) then
  2145. Begin
  2146. isTiny :=
  2147. flag( float_detect_tininess = float_tininess_before_rounding )
  2148. or flag( zExp < -1 )
  2149. or flag(increment = 0)
  2150. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2151. shift64ExtraRightJamming(
  2152. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2153. zExp := 0;
  2154. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2155. if ( roundNearestEven )<>0 then
  2156. Begin
  2157. increment := flag( sbits32 (zSig2) < 0 );
  2158. End
  2159. else
  2160. Begin
  2161. if ( zSign )<>0 then
  2162. Begin
  2163. increment := flag( roundingMode = float_round_down ) and zSig2;
  2164. End
  2165. else
  2166. Begin
  2167. increment := flag( roundingMode = float_round_up ) and zSig2;
  2168. End
  2169. End;
  2170. End;
  2171. End;
  2172. if ( zSig2 )<>0 then
  2173. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2174. if ( increment )<>0 then
  2175. Begin
  2176. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2177. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2178. End
  2179. else
  2180. Begin
  2181. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2182. End;
  2183. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2184. End;
  2185. {*----------------------------------------------------------------------------
  2186. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2187. | and significand `zSig', and returns the proper double-precision floating-
  2188. | point value corresponding to the abstract input. Ordinarily, the abstract
  2189. | value is simply rounded and packed into the double-precision format, with
  2190. | the inexact exception raised if the abstract input cannot be represented
  2191. | exactly. However, if the abstract value is too large, the overflow and
  2192. | inexact exceptions are raised and an infinity or maximal finite value is
  2193. | returned. If the abstract value is too small, the input value is rounded
  2194. | to a subnormal number, and the underflow and inexact exceptions are raised
  2195. | if the abstract input cannot be represented exactly as a subnormal double-
  2196. | precision floating-point number.
  2197. | The input significand `zSig' has its binary point between bits 62
  2198. | and 61, which is 10 bits to the left of the usual location. This shifted
  2199. | significand must be normalized or smaller. If `zSig' is not normalized,
  2200. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2201. | and it must not require rounding. In the usual case that `zSig' is
  2202. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2203. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2204. | Binary Floating-Point Arithmetic.
  2205. *----------------------------------------------------------------------------*}
  2206. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2207. var
  2208. roundingMode: int8;
  2209. roundNearestEven: flag;
  2210. roundIncrement, roundBits: int16;
  2211. isTiny: flag;
  2212. begin
  2213. roundingMode := softfloat_rounding_mode;
  2214. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2215. roundIncrement := $200;
  2216. if ( roundNearestEven=0 ) then
  2217. begin
  2218. if ( roundingMode = float_round_to_zero ) then
  2219. begin
  2220. roundIncrement := 0;
  2221. end
  2222. else begin
  2223. roundIncrement := $3FF;
  2224. if ( zSign<>0 ) then
  2225. begin
  2226. if ( roundingMode = float_round_up ) then
  2227. roundIncrement := 0;
  2228. end
  2229. else begin
  2230. if ( roundingMode = float_round_down ) then
  2231. roundIncrement := 0;
  2232. end
  2233. end
  2234. end;
  2235. roundBits := zSig and $3FF;
  2236. if ( $7FD <= bits16(zExp) ) then
  2237. begin
  2238. if ( ( $7FD < zExp )
  2239. or ( ( zExp = $7FD )
  2240. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2241. ) then
  2242. begin
  2243. float_raise( float_flag_overflow or float_flag_inexact );
  2244. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2245. exit;
  2246. end;
  2247. if ( zExp < 0 ) then
  2248. begin
  2249. isTiny := ord(
  2250. ( float_detect_tininess = float_tininess_before_rounding )
  2251. or ( zExp < -1 )
  2252. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2253. shift64RightJamming( zSig, - zExp, zSig );
  2254. zExp := 0;
  2255. roundBits := zSig and $3FF;
  2256. if ( isTiny and roundBits )<>0 then
  2257. float_raise( float_flag_underflow );
  2258. end
  2259. end;
  2260. if ( roundBits<>0 ) then
  2261. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2262. zSig := ( zSig + roundIncrement ) shr 10;
  2263. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2264. if ( zSig = 0 ) then
  2265. zExp := 0;
  2266. result:=packFloat64( zSign, zExp, zSig );
  2267. end;
  2268. {*
  2269. -------------------------------------------------------------------------------
  2270. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2271. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2272. returns the proper double-precision floating-point value corresponding
  2273. to the abstract input. This routine is just like `roundAndPackFloat64'
  2274. except that the input significand has fewer bits and does not have to be
  2275. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2276. point exponent.
  2277. -------------------------------------------------------------------------------
  2278. *}
  2279. Procedure
  2280. normalizeRoundAndPackFloat64(
  2281. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2282. Var
  2283. shiftCount : int8;
  2284. zSig2 : bits32;
  2285. Begin
  2286. if ( zSig0 = 0 ) then
  2287. Begin
  2288. zSig0 := zSig1;
  2289. zSig1 := 0;
  2290. zExp := zExp -32;
  2291. End;
  2292. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2293. if ( 0 <= shiftCount ) then
  2294. Begin
  2295. zSig2 := 0;
  2296. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2297. End
  2298. else
  2299. Begin
  2300. shift64ExtraRightJamming
  2301. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2302. End;
  2303. zExp := zExp - shiftCount;
  2304. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2305. End;
  2306. {*
  2307. -------------------------------------------------------------------------------
  2308. Returns the result of converting the 32-bit two's complement integer `a' to
  2309. the single-precision floating-point format. The conversion is performed
  2310. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2311. -------------------------------------------------------------------------------
  2312. *}
  2313. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2314. Var
  2315. zSign : Flag;
  2316. Begin
  2317. if ( a = 0 ) then
  2318. Begin
  2319. int32_to_float32.float32 := 0;
  2320. exit;
  2321. End;
  2322. if ( a = sbits32 ($80000000) ) then
  2323. Begin
  2324. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2325. exit;
  2326. end;
  2327. zSign := flag( a < 0 );
  2328. If zSign<>0 then
  2329. a := -a;
  2330. int32_to_float32.float32:=
  2331. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2332. End;
  2333. {*
  2334. -------------------------------------------------------------------------------
  2335. Returns the result of converting the 32-bit two's complement integer `a' to
  2336. the double-precision floating-point format. The conversion is performed
  2337. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2338. -------------------------------------------------------------------------------
  2339. *}
  2340. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2341. var
  2342. zSign : flag;
  2343. absA : bits32;
  2344. shiftCount : int8;
  2345. zSig0, zSig1 : bits32;
  2346. Begin
  2347. if ( a = 0 ) then
  2348. Begin
  2349. packFloat64( 0, 0, 0, 0, result );
  2350. exit;
  2351. end;
  2352. zSign := flag( a < 0 );
  2353. if ZSign<>0 then
  2354. AbsA := -a
  2355. else
  2356. AbsA := a;
  2357. shiftCount := countLeadingZeros32( absA ) - 11;
  2358. if ( 0 <= shiftCount ) then
  2359. Begin
  2360. zSig0 := absA shl shiftCount;
  2361. zSig1 := 0;
  2362. End
  2363. else
  2364. Begin
  2365. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2366. End;
  2367. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2368. End;
  2369. {*
  2370. -------------------------------------------------------------------------------
  2371. Returns the result of converting the single-precision floating-point value
  2372. `a' to the 32-bit two's complement integer format. The conversion is
  2373. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2374. Arithmetic---which means in particular that the conversion is rounded
  2375. according to the current rounding mode. If `a' is a NaN, the largest
  2376. positive integer is returned. Otherwise, if the conversion overflows, the
  2377. largest integer with the same sign as `a' is returned.
  2378. -------------------------------------------------------------------------------
  2379. *}
  2380. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2381. Var
  2382. aSign: flag;
  2383. aExp, shiftCount: int16;
  2384. aSig, aSigExtra: bits32;
  2385. z: int32;
  2386. roundingMode: int8;
  2387. Begin
  2388. aSig := extractFloat32Frac( a.float32 );
  2389. aExp := extractFloat32Exp( a.float32 );
  2390. aSign := extractFloat32Sign( a.float32 );
  2391. shiftCount := aExp - $96;
  2392. if ( 0 <= shiftCount ) then
  2393. Begin
  2394. if ( $9E <= aExp ) then
  2395. Begin
  2396. if ( a.float32 <> $CF000000 ) then
  2397. Begin
  2398. float_raise( float_flag_invalid );
  2399. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2400. Begin
  2401. float32_to_int32 := $7FFFFFFF;
  2402. exit;
  2403. End;
  2404. End;
  2405. float32_to_int32 := sbits32 ($80000000);
  2406. exit;
  2407. End;
  2408. z := ( aSig or $00800000 ) shl shiftCount;
  2409. if ( aSign<>0 ) then z := - z;
  2410. End
  2411. else
  2412. Begin
  2413. if ( aExp < $7E ) then
  2414. Begin
  2415. aSigExtra := aExp OR aSig;
  2416. z := 0;
  2417. End
  2418. else
  2419. Begin
  2420. aSig := aSig OR $00800000;
  2421. aSigExtra := aSig shl ( shiftCount and 31 );
  2422. z := aSig shr ( - shiftCount );
  2423. End;
  2424. if ( aSigExtra<>0 ) then
  2425. softfloat_exception_flags := softfloat_exception_flags
  2426. or float_flag_inexact;
  2427. roundingMode := softfloat_rounding_mode;
  2428. if ( roundingMode = float_round_nearest_even ) then
  2429. Begin
  2430. if ( sbits32 (aSigExtra) < 0 ) then
  2431. Begin
  2432. Inc(z);
  2433. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2434. z := z and not 1;
  2435. End;
  2436. if ( aSign<>0 ) then
  2437. z := - z;
  2438. End
  2439. else
  2440. Begin
  2441. aSigExtra := flag( aSigExtra <> 0 );
  2442. if ( aSign<>0 ) then
  2443. Begin
  2444. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2445. z := - z;
  2446. End
  2447. else
  2448. Begin
  2449. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2450. End
  2451. End;
  2452. End;
  2453. float32_to_int32 := z;
  2454. End;
  2455. {*
  2456. -------------------------------------------------------------------------------
  2457. Returns the result of converting the single-precision floating-point value
  2458. `a' to the 32-bit two's complement integer format. The conversion is
  2459. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2460. Arithmetic, except that the conversion is always rounded toward zero.
  2461. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2462. the conversion overflows, the largest integer with the same sign as `a' is
  2463. returned.
  2464. -------------------------------------------------------------------------------
  2465. *}
  2466. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2467. Var
  2468. aSign : flag;
  2469. aExp, shiftCount : int16;
  2470. aSig : bits32;
  2471. z : int32;
  2472. Begin
  2473. aSig := extractFloat32Frac( a.float32 );
  2474. aExp := extractFloat32Exp( a.float32 );
  2475. aSign := extractFloat32Sign( a.float32 );
  2476. shiftCount := aExp - $9E;
  2477. if ( 0 <= shiftCount ) then
  2478. Begin
  2479. if ( a.float32 <> $CF000000 ) then
  2480. Begin
  2481. float_raise( float_flag_invalid );
  2482. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2483. Begin
  2484. float32_to_int32_round_to_zero := $7FFFFFFF;
  2485. exit;
  2486. end;
  2487. End;
  2488. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2489. exit;
  2490. End
  2491. else
  2492. if ( aExp <= $7E ) then
  2493. Begin
  2494. if ( aExp or aSig )<>0 then
  2495. softfloat_exception_flags :=
  2496. softfloat_exception_flags or float_flag_inexact;
  2497. float32_to_int32_round_to_zero := 0;
  2498. exit;
  2499. End;
  2500. aSig := ( aSig or $00800000 ) shl 8;
  2501. z := aSig shr ( - shiftCount );
  2502. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2503. Begin
  2504. softfloat_exception_flags :=
  2505. softfloat_exception_flags or float_flag_inexact;
  2506. End;
  2507. if ( aSign<>0 ) then z := - z;
  2508. float32_to_int32_round_to_zero := z;
  2509. End;
  2510. {*
  2511. -------------------------------------------------------------------------------
  2512. Returns the result of converting the single-precision floating-point value
  2513. `a' to the double-precision floating-point format. The conversion is
  2514. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2515. Arithmetic.
  2516. -------------------------------------------------------------------------------
  2517. *}
  2518. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2519. Var
  2520. aSign : flag;
  2521. aExp : int16;
  2522. aSig, zSig0, zSig1: bits32;
  2523. tmp : CommonNanT;
  2524. Begin
  2525. aSig := extractFloat32Frac( a.float32 );
  2526. aExp := extractFloat32Exp( a.float32 );
  2527. aSign := extractFloat32Sign( a.float32 );
  2528. if ( aExp = $FF ) then
  2529. Begin
  2530. if ( aSig<>0 ) then
  2531. Begin
  2532. float32ToCommonNaN(a.float32, tmp);
  2533. commonNaNToFloat64(tmp , result);
  2534. exit;
  2535. End;
  2536. packFloat64( aSign, $7FF, 0, 0, result);
  2537. exit;
  2538. End;
  2539. if ( aExp = 0 ) then
  2540. Begin
  2541. if ( aSig = 0 ) then
  2542. Begin
  2543. packFloat64( aSign, 0, 0, 0, result );
  2544. exit;
  2545. end;
  2546. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2547. Dec(aExp);
  2548. End;
  2549. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2550. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2551. End;
  2552. {*
  2553. -------------------------------------------------------------------------------
  2554. Rounds the single-precision floating-point value `a' to an integer,
  2555. and returns the result as a single-precision floating-point value. The
  2556. operation is performed according to the IEC/IEEE Standard for Binary
  2557. Floating-Point Arithmetic.
  2558. -------------------------------------------------------------------------------
  2559. *}
  2560. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2561. Var
  2562. aSign: flag;
  2563. aExp: int16;
  2564. lastBitMask, roundBitsMask: bits32;
  2565. roundingMode: int8;
  2566. z: float32;
  2567. Begin
  2568. aExp := extractFloat32Exp( a.float32 );
  2569. if ( $96 <= aExp ) then
  2570. Begin
  2571. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2572. Begin
  2573. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2574. exit;
  2575. End;
  2576. float32_round_to_int:=a;
  2577. exit;
  2578. End;
  2579. if ( aExp <= $7E ) then
  2580. Begin
  2581. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2582. Begin
  2583. float32_round_to_int:=a;
  2584. exit;
  2585. end;
  2586. softfloat_exception_flags
  2587. := softfloat_exception_flags OR float_flag_inexact;
  2588. aSign := extractFloat32Sign( a.float32 );
  2589. case ( softfloat_rounding_mode ) of
  2590. float_round_nearest_even:
  2591. Begin
  2592. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2593. Begin
  2594. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2595. exit;
  2596. End;
  2597. End;
  2598. float_round_down:
  2599. Begin
  2600. if aSign <> 0 then
  2601. float32_round_to_int.float32 := $BF800000
  2602. else
  2603. float32_round_to_int.float32 := 0;
  2604. exit;
  2605. End;
  2606. float_round_up:
  2607. Begin
  2608. if aSign <> 0 then
  2609. float32_round_to_int.float32 := $80000000
  2610. else
  2611. float32_round_to_int.float32 := $3F800000;
  2612. exit;
  2613. End;
  2614. end;
  2615. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2616. End;
  2617. lastBitMask := 1;
  2618. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2619. lastBitMask := lastBitMask shl ($96 - aExp);
  2620. roundBitsMask := lastBitMask - 1;
  2621. z := a.float32;
  2622. roundingMode := softfloat_rounding_mode;
  2623. if ( roundingMode = float_round_nearest_even ) then
  2624. Begin
  2625. z := z + (lastBitMask shr 1);
  2626. if ( ( z and roundBitsMask ) = 0 ) then
  2627. z := z and not lastBitMask;
  2628. End
  2629. else if ( roundingMode <> float_round_to_zero ) then
  2630. Begin
  2631. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2632. Begin
  2633. z := z + roundBitsMask;
  2634. End;
  2635. End;
  2636. z := z and not roundBitsMask;
  2637. if ( z <> a.float32 ) then
  2638. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2639. float32_round_to_int.float32 := z;
  2640. End;
  2641. {*
  2642. -------------------------------------------------------------------------------
  2643. Returns the result of adding the absolute values of the single-precision
  2644. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2645. before being returned. `zSign' is ignored if the result is a NaN.
  2646. The addition is performed according to the IEC/IEEE Standard for Binary
  2647. Floating-Point Arithmetic.
  2648. -------------------------------------------------------------------------------
  2649. *}
  2650. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2651. Var
  2652. aExp, bExp, zExp: int16;
  2653. aSig, bSig, zSig: bits32;
  2654. expDiff: int16;
  2655. label roundAndPack;
  2656. Begin
  2657. aSig:=extractFloat32Frac( a );
  2658. aExp:=extractFloat32Exp( a );
  2659. bSig:=extractFloat32Frac( b );
  2660. bExp := extractFloat32Exp( b );
  2661. expDiff := aExp - bExp;
  2662. aSig := aSig shl 6;
  2663. bSig := bSig shl 6;
  2664. if ( 0 < expDiff ) then
  2665. Begin
  2666. if ( aExp = $FF ) then
  2667. Begin
  2668. if ( aSig <> 0) then
  2669. Begin
  2670. addFloat32Sigs := propagateFloat32NaN( a, b );
  2671. exit;
  2672. End;
  2673. addFloat32Sigs := a;
  2674. exit;
  2675. End;
  2676. if ( bExp = 0 ) then
  2677. Begin
  2678. Dec(expDiff);
  2679. End
  2680. else
  2681. Begin
  2682. bSig := bSig or $20000000;
  2683. End;
  2684. shift32RightJamming( bSig, expDiff, bSig );
  2685. zExp := aExp;
  2686. End
  2687. else
  2688. If ( expDiff < 0 ) then
  2689. Begin
  2690. if ( bExp = $FF ) then
  2691. Begin
  2692. if ( bSig<>0 ) then
  2693. Begin
  2694. addFloat32Sigs := propagateFloat32NaN( a, b );
  2695. exit;
  2696. end;
  2697. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  2698. exit;
  2699. End;
  2700. if ( aExp = 0 ) then
  2701. Begin
  2702. Inc(expDiff);
  2703. End
  2704. else
  2705. Begin
  2706. aSig := aSig OR $20000000;
  2707. End;
  2708. shift32RightJamming( aSig, - expDiff, aSig );
  2709. zExp := bExp;
  2710. End
  2711. else
  2712. Begin
  2713. if ( aExp = $FF ) then
  2714. Begin
  2715. if ( aSig OR bSig )<> 0 then
  2716. Begin
  2717. addFloat32Sigs := propagateFloat32NaN( a, b );
  2718. exit;
  2719. end;
  2720. addFloat32Sigs := a;
  2721. exit;
  2722. End;
  2723. if ( aExp = 0 ) then
  2724. Begin
  2725. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  2726. exit;
  2727. end;
  2728. zSig := $40000000 + aSig + bSig;
  2729. zExp := aExp;
  2730. goto roundAndPack;
  2731. End;
  2732. aSig := aSig OR $20000000;
  2733. zSig := ( aSig + bSig ) shl 1;
  2734. Dec(zExp);
  2735. if ( sbits32 (zSig) < 0 ) then
  2736. Begin
  2737. zSig := aSig + bSig;
  2738. Inc(zExp);
  2739. End;
  2740. roundAndPack:
  2741. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  2742. End;
  2743. {*
  2744. -------------------------------------------------------------------------------
  2745. Returns the result of subtracting the absolute values of the single-
  2746. precision floating-point values `a' and `b'. If `zSign' is 1, the
  2747. difference is negated before being returned. `zSign' is ignored if the
  2748. result is a NaN. The subtraction is performed according to the IEC/IEEE
  2749. Standard for Binary Floating-Point Arithmetic.
  2750. -------------------------------------------------------------------------------
  2751. *}
  2752. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  2753. Var
  2754. aExp, bExp, zExp: int16;
  2755. aSig, bSig, zSig: bits32;
  2756. expDiff : int16;
  2757. label aExpBigger;
  2758. label bExpBigger;
  2759. label aBigger;
  2760. label bBigger;
  2761. label normalizeRoundAndPack;
  2762. Begin
  2763. aSig := extractFloat32Frac( a );
  2764. aExp := extractFloat32Exp( a );
  2765. bSig := extractFloat32Frac( b );
  2766. bExp := extractFloat32Exp( b );
  2767. expDiff := aExp - bExp;
  2768. aSig := aSig shl 7;
  2769. bSig := bSig shl 7;
  2770. if ( 0 < expDiff ) then goto aExpBigger;
  2771. if ( expDiff < 0 ) then goto bExpBigger;
  2772. if ( aExp = $FF ) then
  2773. Begin
  2774. if ( aSig OR bSig )<> 0 then
  2775. Begin
  2776. subFloat32Sigs := propagateFloat32NaN( a, b );
  2777. exit;
  2778. End;
  2779. float_raise( float_flag_invalid );
  2780. subFloat32Sigs := float32_default_nan;
  2781. exit;
  2782. End;
  2783. if ( aExp = 0 ) then
  2784. Begin
  2785. aExp := 1;
  2786. bExp := 1;
  2787. End;
  2788. if ( bSig < aSig ) Then goto aBigger;
  2789. if ( aSig < bSig ) Then goto bBigger;
  2790. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  2791. exit;
  2792. bExpBigger:
  2793. if ( bExp = $FF ) then
  2794. Begin
  2795. if ( bSig<>0 ) then
  2796. Begin
  2797. subFloat32Sigs := propagateFloat32NaN( a, b );
  2798. exit;
  2799. End;
  2800. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  2801. exit;
  2802. End;
  2803. if ( aExp = 0 ) then
  2804. Begin
  2805. Inc(expDiff);
  2806. End
  2807. else
  2808. Begin
  2809. aSig := aSig OR $40000000;
  2810. End;
  2811. shift32RightJamming( aSig, - expDiff, aSig );
  2812. bSig := bSig OR $40000000;
  2813. bBigger:
  2814. zSig := bSig - aSig;
  2815. zExp := bExp;
  2816. zSign := zSign xor 1;
  2817. goto normalizeRoundAndPack;
  2818. aExpBigger:
  2819. if ( aExp = $FF ) then
  2820. Begin
  2821. if ( aSig <> 0) then
  2822. Begin
  2823. subFloat32Sigs := propagateFloat32NaN( a, b );
  2824. exit;
  2825. End;
  2826. subFloat32Sigs := a;
  2827. exit;
  2828. End;
  2829. if ( bExp = 0 ) then
  2830. Begin
  2831. Dec(expDiff);
  2832. End
  2833. else
  2834. Begin
  2835. bSig := bSig OR $40000000;
  2836. End;
  2837. shift32RightJamming( bSig, expDiff, bSig );
  2838. aSig := aSig OR $40000000;
  2839. aBigger:
  2840. zSig := aSig - bSig;
  2841. zExp := aExp;
  2842. normalizeRoundAndPack:
  2843. Dec(zExp);
  2844. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  2845. End;
  2846. {*
  2847. -------------------------------------------------------------------------------
  2848. Returns the result of adding the single-precision floating-point values `a'
  2849. and `b'. The operation is performed according to the IEC/IEEE Standard for
  2850. Binary Floating-Point Arithmetic.
  2851. -------------------------------------------------------------------------------
  2852. *}
  2853. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  2854. Var
  2855. aSign, bSign: Flag;
  2856. Begin
  2857. aSign := extractFloat32Sign( a.float32 );
  2858. bSign := extractFloat32Sign( b.float32 );
  2859. if ( aSign = bSign ) then
  2860. Begin
  2861. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  2862. End
  2863. else
  2864. Begin
  2865. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  2866. End;
  2867. End;
  2868. {*
  2869. -------------------------------------------------------------------------------
  2870. Returns the result of subtracting the single-precision floating-point values
  2871. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2872. for Binary Floating-Point Arithmetic.
  2873. -------------------------------------------------------------------------------
  2874. *}
  2875. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  2876. Var
  2877. aSign, bSign: flag;
  2878. Begin
  2879. aSign := extractFloat32Sign( a.float32 );
  2880. bSign := extractFloat32Sign( b.float32 );
  2881. if ( aSign = bSign ) then
  2882. Begin
  2883. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  2884. End
  2885. else
  2886. Begin
  2887. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  2888. End;
  2889. End;
  2890. {*
  2891. -------------------------------------------------------------------------------
  2892. Returns the result of multiplying the single-precision floating-point values
  2893. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2894. for Binary Floating-Point Arithmetic.
  2895. -------------------------------------------------------------------------------
  2896. *}
  2897. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  2898. Var
  2899. aSign, bSign, zSign: flag;
  2900. aExp, bExp, zExp : int16;
  2901. aSig, bSig, zSig0, zSig1: bits32;
  2902. Begin
  2903. aSig := extractFloat32Frac( a.float32 );
  2904. aExp := extractFloat32Exp( a.float32 );
  2905. aSign := extractFloat32Sign( a.float32 );
  2906. bSig := extractFloat32Frac( b.float32 );
  2907. bExp := extractFloat32Exp( b.float32 );
  2908. bSign := extractFloat32Sign( b.float32 );
  2909. zSign := aSign xor bSign;
  2910. if ( aExp = $FF ) then
  2911. Begin
  2912. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  2913. Begin
  2914. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  2915. End;
  2916. if ( ( bExp OR bSig ) = 0 ) then
  2917. Begin
  2918. float_raise( float_flag_invalid );
  2919. float32_mul.float32 := float32_default_nan;
  2920. exit;
  2921. End;
  2922. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  2923. exit;
  2924. End;
  2925. if ( bExp = $FF ) then
  2926. Begin
  2927. if ( bSig <> 0 ) then
  2928. Begin
  2929. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  2930. exit;
  2931. End;
  2932. if ( ( aExp OR aSig ) = 0 ) then
  2933. Begin
  2934. float_raise( float_flag_invalid );
  2935. float32_mul.float32 := float32_default_nan;
  2936. exit;
  2937. End;
  2938. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  2939. exit;
  2940. End;
  2941. if ( aExp = 0 ) then
  2942. Begin
  2943. if ( aSig = 0 ) then
  2944. Begin
  2945. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  2946. exit;
  2947. End;
  2948. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2949. End;
  2950. if ( bExp = 0 ) then
  2951. Begin
  2952. if ( bSig = 0 ) then
  2953. Begin
  2954. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  2955. exit;
  2956. End;
  2957. normalizeFloat32Subnormal( bSig, bExp, bSig );
  2958. End;
  2959. zExp := aExp + bExp - $7F;
  2960. aSig := ( aSig OR $00800000 ) shl 7;
  2961. bSig := ( bSig OR $00800000 ) shl 8;
  2962. mul32To64( aSig, bSig, zSig0, zSig1 );
  2963. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  2964. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  2965. Begin
  2966. zSig0 := zSig0 shl 1;
  2967. Dec(zExp);
  2968. End;
  2969. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  2970. End;
  2971. {*
  2972. -------------------------------------------------------------------------------
  2973. Returns the result of dividing the single-precision floating-point value `a'
  2974. by the corresponding value `b'. The operation is performed according to the
  2975. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2976. -------------------------------------------------------------------------------
  2977. *}
  2978. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  2979. Var
  2980. aSign, bSign, zSign: flag;
  2981. aExp, bExp, zExp: int16;
  2982. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  2983. Begin
  2984. aSig := extractFloat32Frac( a.float32 );
  2985. aExp := extractFloat32Exp( a.float32 );
  2986. aSign := extractFloat32Sign( a.float32 );
  2987. bSig := extractFloat32Frac( b.float32 );
  2988. bExp := extractFloat32Exp( b.float32 );
  2989. bSign := extractFloat32Sign( b.float32 );
  2990. zSign := aSign xor bSign;
  2991. if ( aExp = $FF ) then
  2992. Begin
  2993. if ( aSig <> 0 ) then
  2994. Begin
  2995. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  2996. exit;
  2997. End;
  2998. if ( bExp = $FF ) then
  2999. Begin
  3000. if ( bSig <> 0) then
  3001. Begin
  3002. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3003. End;
  3004. float_raise( float_flag_invalid );
  3005. float32_div.float32 := float32_default_nan;
  3006. exit;
  3007. End;
  3008. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3009. exit;
  3010. End;
  3011. if ( bExp = $FF ) then
  3012. Begin
  3013. if ( bSig <> 0) then
  3014. Begin
  3015. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3016. exit;
  3017. End;
  3018. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3019. exit;
  3020. End;
  3021. if ( bExp = 0 ) Then
  3022. Begin
  3023. if ( bSig = 0 ) Then
  3024. Begin
  3025. if ( ( aExp OR aSig ) = 0 ) then
  3026. Begin
  3027. float_raise( float_flag_invalid );
  3028. float32_div.float32 := float32_default_nan;
  3029. exit;
  3030. End;
  3031. float_raise( float_flag_divbyzero );
  3032. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3033. exit;
  3034. End;
  3035. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3036. End;
  3037. if ( aExp = 0 ) Then
  3038. Begin
  3039. if ( aSig = 0 ) Then
  3040. Begin
  3041. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3042. exit;
  3043. End;
  3044. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3045. End;
  3046. zExp := aExp - bExp + $7D;
  3047. aSig := ( aSig OR $00800000 ) shl 7;
  3048. bSig := ( bSig OR $00800000 ) shl 8;
  3049. if ( bSig <= ( aSig + aSig ) ) then
  3050. Begin
  3051. aSig := aSig shr 1;
  3052. Inc(zExp);
  3053. End;
  3054. zSig := estimateDiv64To32( aSig, 0, bSig );
  3055. if ( ( zSig and $3F ) <= 2 ) then
  3056. Begin
  3057. mul32To64( bSig, zSig, term0, term1 );
  3058. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3059. while ( sbits32 (rem0) < 0 ) do
  3060. Begin
  3061. Dec(zSig);
  3062. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3063. End;
  3064. zSig := zSig or bits32( rem1 <> 0 );
  3065. End;
  3066. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3067. End;
  3068. {*
  3069. -------------------------------------------------------------------------------
  3070. Returns the remainder of the single-precision floating-point value `a'
  3071. with respect to the corresponding value `b'. The operation is performed
  3072. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3073. -------------------------------------------------------------------------------
  3074. *}
  3075. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3076. Var
  3077. aSign, bSign, zSign: flag;
  3078. aExp, bExp, expDiff: int16;
  3079. aSig, bSig, q, allZero, alternateASig: bits32;
  3080. sigMean: sbits32;
  3081. Begin
  3082. aSig := extractFloat32Frac( a.float32 );
  3083. aExp := extractFloat32Exp( a.float32 );
  3084. aSign := extractFloat32Sign( a.float32 );
  3085. bSig := extractFloat32Frac( b.float32 );
  3086. bExp := extractFloat32Exp( b.float32 );
  3087. bSign := extractFloat32Sign( b.float32 );
  3088. if ( aExp = $FF ) then
  3089. Begin
  3090. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3091. Begin
  3092. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3093. exit;
  3094. End;
  3095. float_raise( float_flag_invalid );
  3096. float32_rem.float32 := float32_default_nan;
  3097. exit;
  3098. End;
  3099. if ( bExp = $FF ) then
  3100. Begin
  3101. if ( bSig <> 0 ) then
  3102. Begin
  3103. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3104. exit;
  3105. End;
  3106. float32_rem := a;
  3107. exit;
  3108. End;
  3109. if ( bExp = 0 ) then
  3110. Begin
  3111. if ( bSig = 0 ) then
  3112. Begin
  3113. float_raise( float_flag_invalid );
  3114. float32_rem.float32 := float32_default_nan;
  3115. exit;
  3116. End;
  3117. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3118. End;
  3119. if ( aExp = 0 ) then
  3120. Begin
  3121. if ( aSig = 0 ) then
  3122. Begin
  3123. float32_rem := a;
  3124. exit;
  3125. End;
  3126. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3127. End;
  3128. expDiff := aExp - bExp;
  3129. aSig := ( aSig OR $00800000 ) shl 8;
  3130. bSig := ( bSig OR $00800000 ) shl 8;
  3131. if ( expDiff < 0 ) then
  3132. Begin
  3133. if ( expDiff < -1 ) then
  3134. Begin
  3135. float32_rem := a;
  3136. exit;
  3137. End;
  3138. aSig := aSig shr 1;
  3139. End;
  3140. q := bits32( bSig <= aSig );
  3141. if ( q <> 0) then
  3142. aSig := aSig - bSig;
  3143. expDiff := expDiff - 32;
  3144. while ( 0 < expDiff ) do
  3145. Begin
  3146. q := estimateDiv64To32( aSig, 0, bSig );
  3147. if (2 < q) then
  3148. q := q - 2
  3149. else
  3150. q := 0;
  3151. aSig := - ( ( bSig shr 2 ) * q );
  3152. expDiff := expDiff - 30;
  3153. End;
  3154. expDiff := expDiff + 32;
  3155. if ( 0 < expDiff ) then
  3156. Begin
  3157. q := estimateDiv64To32( aSig, 0, bSig );
  3158. if (2 < q) then
  3159. q := q - 2
  3160. else
  3161. q := 0;
  3162. q := q shr (32 - expDiff);
  3163. bSig := bSig shr 2;
  3164. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3165. End
  3166. else
  3167. Begin
  3168. aSig := aSig shr 2;
  3169. bSig := bSig shr 2;
  3170. End;
  3171. Repeat
  3172. alternateASig := aSig;
  3173. Inc(q);
  3174. aSig := aSig - bSig;
  3175. Until not ( 0 <= sbits32 (aSig) );
  3176. sigMean := aSig + alternateASig;
  3177. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3178. Begin
  3179. aSig := alternateASig;
  3180. End;
  3181. zSign := flag( sbits32 (aSig) < 0 );
  3182. if ( zSign<>0 ) then
  3183. aSig := - aSig;
  3184. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3185. End;
  3186. {*
  3187. -------------------------------------------------------------------------------
  3188. Returns the square root of the single-precision floating-point value `a'.
  3189. The operation is performed according to the IEC/IEEE Standard for Binary
  3190. Floating-Point Arithmetic.
  3191. -------------------------------------------------------------------------------
  3192. *}
  3193. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3194. Var
  3195. aSign : flag;
  3196. aExp, zExp : int16;
  3197. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3198. label roundAndPack;
  3199. Begin
  3200. aSig := extractFloat32Frac( a.float32 );
  3201. aExp := extractFloat32Exp( a.float32 );
  3202. aSign := extractFloat32Sign( a.float32 );
  3203. if ( aExp = $FF ) then
  3204. Begin
  3205. if ( aSig <> 0) then
  3206. Begin
  3207. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3208. exit;
  3209. End;
  3210. if ( aSign = 0) then
  3211. Begin
  3212. float32_sqrt := a;
  3213. exit;
  3214. End;
  3215. float_raise( float_flag_invalid );
  3216. float32_sqrt.float32 := float32_default_nan;
  3217. exit;
  3218. End;
  3219. if ( aSign <> 0) then
  3220. Begin
  3221. if ( ( aExp OR aSig ) = 0 ) then
  3222. Begin
  3223. float32_sqrt := a;
  3224. exit;
  3225. End;
  3226. float_raise( float_flag_invalid );
  3227. float32_sqrt.float32 := float32_default_nan;
  3228. exit;
  3229. End;
  3230. if ( aExp = 0 ) then
  3231. Begin
  3232. if ( aSig = 0 ) then
  3233. Begin
  3234. float32_sqrt.float32 := 0;
  3235. exit;
  3236. End;
  3237. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3238. End;
  3239. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3240. aSig := ( aSig OR $00800000 ) shl 8;
  3241. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3242. if ( ( zSig and $7F ) <= 5 ) then
  3243. Begin
  3244. if ( zSig < 2 ) then
  3245. Begin
  3246. zSig := $7FFFFFFF;
  3247. goto roundAndPack;
  3248. End
  3249. else
  3250. Begin
  3251. aSig := aSig shr (aExp and 1);
  3252. mul32To64( zSig, zSig, term0, term1 );
  3253. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3254. while ( sbits32 (rem0) < 0 ) do
  3255. Begin
  3256. Dec(zSig);
  3257. shortShift64Left( 0, zSig, 1, term0, term1 );
  3258. term1 := term1 or 1;
  3259. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3260. End;
  3261. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3262. End;
  3263. End;
  3264. shift32RightJamming( zSig, 1, zSig );
  3265. roundAndPack:
  3266. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3267. End;
  3268. {*
  3269. -------------------------------------------------------------------------------
  3270. Returns 1 if the single-precision floating-point value `a' is equal to
  3271. the corresponding value `b', and 0 otherwise. The comparison is performed
  3272. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3273. -------------------------------------------------------------------------------
  3274. *}
  3275. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3276. Begin
  3277. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3278. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3279. ) then
  3280. Begin
  3281. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3282. Begin
  3283. float_raise( float_flag_invalid );
  3284. End;
  3285. float32_eq := 0;
  3286. exit;
  3287. End;
  3288. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3289. End;
  3290. {*
  3291. -------------------------------------------------------------------------------
  3292. Returns 1 if the single-precision floating-point value `a' is less than
  3293. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3294. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3295. Arithmetic.
  3296. -------------------------------------------------------------------------------
  3297. *}
  3298. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3299. var
  3300. aSign, bSign: flag;
  3301. Begin
  3302. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3303. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3304. ) then
  3305. Begin
  3306. float_raise( float_flag_invalid );
  3307. float32_le := 0;
  3308. exit;
  3309. End;
  3310. aSign := extractFloat32Sign( a.float32 );
  3311. bSign := extractFloat32Sign( b.float32 );
  3312. if ( aSign <> bSign ) then
  3313. Begin
  3314. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3315. exit;
  3316. End;
  3317. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3318. End;
  3319. {*
  3320. -------------------------------------------------------------------------------
  3321. Returns 1 if the single-precision floating-point value `a' is less than
  3322. the corresponding value `b', and 0 otherwise. The comparison is performed
  3323. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3324. -------------------------------------------------------------------------------
  3325. *}
  3326. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3327. var
  3328. aSign, bSign: flag;
  3329. Begin
  3330. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3331. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3332. ) then
  3333. Begin
  3334. float_raise( float_flag_invalid );
  3335. float32_lt :=0;
  3336. exit;
  3337. End;
  3338. aSign := extractFloat32Sign( a.float32 );
  3339. bSign := extractFloat32Sign( b.float32 );
  3340. if ( aSign <> bSign ) then
  3341. Begin
  3342. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3343. exit;
  3344. End;
  3345. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3346. End;
  3347. {*
  3348. -------------------------------------------------------------------------------
  3349. Returns 1 if the single-precision floating-point value `a' is equal to
  3350. the corresponding value `b', and 0 otherwise. The invalid exception is
  3351. raised if either operand is a NaN. Otherwise, the comparison is performed
  3352. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3353. -------------------------------------------------------------------------------
  3354. *}
  3355. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3356. Begin
  3357. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3358. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3359. ) then
  3360. Begin
  3361. float_raise( float_flag_invalid );
  3362. float32_eq_signaling := 0;
  3363. exit;
  3364. End;
  3365. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3366. End;
  3367. {*
  3368. -------------------------------------------------------------------------------
  3369. Returns 1 if the single-precision floating-point value `a' is less than or
  3370. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3371. cause an exception. Otherwise, the comparison is performed according to the
  3372. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3373. -------------------------------------------------------------------------------
  3374. *}
  3375. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3376. Var
  3377. aSign, bSign: flag;
  3378. aExp, bExp: int16;
  3379. Begin
  3380. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3381. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3382. ) then
  3383. Begin
  3384. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3385. Begin
  3386. float_raise( float_flag_invalid );
  3387. End;
  3388. float32_le_quiet := 0;
  3389. exit;
  3390. End;
  3391. aSign := extractFloat32Sign( a );
  3392. bSign := extractFloat32Sign( b );
  3393. if ( aSign <> bSign ) then
  3394. Begin
  3395. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3396. exit;
  3397. End;
  3398. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3399. End;
  3400. {*
  3401. -------------------------------------------------------------------------------
  3402. Returns 1 if the single-precision floating-point value `a' is less than
  3403. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3404. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3405. Standard for Binary Floating-Point Arithmetic.
  3406. -------------------------------------------------------------------------------
  3407. *}
  3408. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3409. Var
  3410. aSign, bSign: flag;
  3411. Begin
  3412. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3413. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3414. ) then
  3415. Begin
  3416. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3417. Begin
  3418. float_raise( float_flag_invalid );
  3419. End;
  3420. float32_lt_quiet := 0;
  3421. exit;
  3422. End;
  3423. aSign := extractFloat32Sign( a );
  3424. bSign := extractFloat32Sign( b );
  3425. if ( aSign <> bSign ) then
  3426. Begin
  3427. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3428. exit;
  3429. End;
  3430. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3431. End;
  3432. {*
  3433. -------------------------------------------------------------------------------
  3434. Returns the result of converting the double-precision floating-point value
  3435. `a' to the 32-bit two's complement integer format. The conversion is
  3436. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3437. Arithmetic---which means in particular that the conversion is rounded
  3438. according to the current rounding mode. If `a' is a NaN, the largest
  3439. positive integer is returned. Otherwise, if the conversion overflows, the
  3440. largest integer with the same sign as `a' is returned.
  3441. -------------------------------------------------------------------------------
  3442. *}
  3443. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3444. var
  3445. aSign: flag;
  3446. aExp, shiftCount: int16;
  3447. aSig0, aSig1, absZ, aSigExtra: bits32;
  3448. z: int32;
  3449. roundingMode: int8;
  3450. label invalid;
  3451. Begin
  3452. aSig1 := extractFloat64Frac1( a );
  3453. aSig0 := extractFloat64Frac0( a );
  3454. aExp := extractFloat64Exp( a );
  3455. aSign := extractFloat64Sign( a );
  3456. shiftCount := aExp - $413;
  3457. if ( 0 <= shiftCount ) then
  3458. Begin
  3459. if ( $41E < aExp ) then
  3460. Begin
  3461. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3462. aSign := 0;
  3463. goto invalid;
  3464. End;
  3465. shortShift64Left(
  3466. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3467. if ( $80000000 < absZ ) then
  3468. goto invalid;
  3469. End
  3470. else
  3471. Begin
  3472. aSig1 := flag( aSig1 <> 0 );
  3473. if ( aExp < $3FE ) then
  3474. Begin
  3475. aSigExtra := aExp OR aSig0 OR aSig1;
  3476. absZ := 0;
  3477. End
  3478. else
  3479. Begin
  3480. aSig0 := aSig0 OR $00100000;
  3481. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3482. absZ := aSig0 shr ( - shiftCount );
  3483. End;
  3484. End;
  3485. roundingMode := softfloat_rounding_mode;
  3486. if ( roundingMode = float_round_nearest_even ) then
  3487. Begin
  3488. if ( sbits32(aSigExtra) < 0 ) then
  3489. Begin
  3490. Inc(absZ);
  3491. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3492. absZ := absZ and not 1;
  3493. End;
  3494. if aSign <> 0 then
  3495. z := - absZ
  3496. else
  3497. z := absZ;
  3498. End
  3499. else
  3500. Begin
  3501. aSigExtra := bits32( aSigExtra <> 0 );
  3502. if ( aSign <> 0) then
  3503. Begin
  3504. z := - ( absZ
  3505. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3506. End
  3507. else
  3508. Begin
  3509. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3510. End
  3511. End;
  3512. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3513. Begin
  3514. invalid:
  3515. float_raise( float_flag_invalid );
  3516. if (aSign <> 0 ) then
  3517. float64_to_int32 := sbits32 ($80000000)
  3518. else
  3519. float64_to_int32 := $7FFFFFFF;
  3520. exit;
  3521. End;
  3522. if ( aSigExtra <> 0) then
  3523. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3524. float64_to_int32 := z;
  3525. End;
  3526. {*
  3527. -------------------------------------------------------------------------------
  3528. Returns the result of converting the double-precision floating-point value
  3529. `a' to the 32-bit two's complement integer format. The conversion is
  3530. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3531. Arithmetic, except that the conversion is always rounded toward zero.
  3532. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3533. the conversion overflows, the largest integer with the same sign as `a' is
  3534. returned.
  3535. -------------------------------------------------------------------------------
  3536. *}
  3537. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3538. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3539. Var
  3540. aSign: flag;
  3541. aExp, shiftCount: int16;
  3542. aSig0, aSig1, absZ, aSigExtra: bits32;
  3543. z: int32;
  3544. label invalid;
  3545. Begin
  3546. aSig1 := extractFloat64Frac1( a );
  3547. aSig0 := extractFloat64Frac0( a );
  3548. aExp := extractFloat64Exp( a );
  3549. aSign := extractFloat64Sign( a );
  3550. shiftCount := aExp - $413;
  3551. if ( 0 <= shiftCount ) then
  3552. Begin
  3553. if ( $41E < aExp ) then
  3554. Begin
  3555. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3556. aSign := 0;
  3557. goto invalid;
  3558. End;
  3559. shortShift64Left(
  3560. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3561. End
  3562. else
  3563. Begin
  3564. if ( aExp < $3FF ) then
  3565. Begin
  3566. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3567. Begin
  3568. softfloat_exception_flags :=
  3569. softfloat_exception_flags or float_flag_inexact;
  3570. End;
  3571. float64_to_int32_round_to_zero := 0;
  3572. exit;
  3573. End;
  3574. aSig0 := aSig0 or $00100000;
  3575. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3576. absZ := aSig0 shr ( - shiftCount );
  3577. End;
  3578. if aSign <> 0 then
  3579. z := - absZ
  3580. else
  3581. z := absZ;
  3582. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3583. Begin
  3584. invalid:
  3585. float_raise( float_flag_invalid );
  3586. if (aSign <> 0) then
  3587. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3588. else
  3589. float64_to_int32_round_to_zero := $7FFFFFFF;
  3590. exit;
  3591. End;
  3592. if ( aSigExtra <> 0) then
  3593. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3594. float64_to_int32_round_to_zero := z;
  3595. End;
  3596. {*
  3597. -------------------------------------------------------------------------------
  3598. Returns the result of converting the double-precision floating-point value
  3599. `a' to the single-precision floating-point format. The conversion is
  3600. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3601. Arithmetic.
  3602. -------------------------------------------------------------------------------
  3603. *}
  3604. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3605. Var
  3606. aSign: flag;
  3607. aExp: int16;
  3608. aSig0, aSig1, zSig: bits32;
  3609. allZero: bits32;
  3610. tmp : CommonNanT;
  3611. Begin
  3612. aSig1 := extractFloat64Frac1( a );
  3613. aSig0 := extractFloat64Frac0( a );
  3614. aExp := extractFloat64Exp( a );
  3615. aSign := extractFloat64Sign( a );
  3616. if ( aExp = $7FF ) then
  3617. Begin
  3618. if ( aSig0 OR aSig1 ) <> 0 then
  3619. Begin
  3620. float64ToCommonNaN( a, tmp );
  3621. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3622. exit;
  3623. End;
  3624. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3625. exit;
  3626. End;
  3627. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3628. if ( aExp <> 0) then
  3629. zSig := zSig OR $40000000;
  3630. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3631. End;
  3632. {*
  3633. -------------------------------------------------------------------------------
  3634. Rounds the double-precision floating-point value `a' to an integer,
  3635. and returns the result as a double-precision floating-point value. The
  3636. operation is performed according to the IEC/IEEE Standard for Binary
  3637. Floating-Point Arithmetic.
  3638. -------------------------------------------------------------------------------
  3639. *}
  3640. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3641. Var
  3642. aSign: flag;
  3643. aExp: int16;
  3644. lastBitMask, roundBitsMask: bits32;
  3645. roundingMode: int8;
  3646. z: float64;
  3647. Begin
  3648. aExp := extractFloat64Exp( a );
  3649. if ( $413 <= aExp ) then
  3650. Begin
  3651. if ( $433 <= aExp ) then
  3652. Begin
  3653. if ( ( aExp = $7FF )
  3654. AND
  3655. (
  3656. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3657. ) <>0)
  3658. ) then
  3659. Begin
  3660. propagateFloat64NaN( a, a, result );
  3661. exit;
  3662. End;
  3663. result := a;
  3664. exit;
  3665. End;
  3666. lastBitMask := 1;
  3667. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3668. roundBitsMask := lastBitMask - 1;
  3669. z := a;
  3670. roundingMode := softfloat_rounding_mode;
  3671. if ( roundingMode = float_round_nearest_even ) then
  3672. Begin
  3673. if ( lastBitMask <> 0) then
  3674. Begin
  3675. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3676. if ( ( z.low and roundBitsMask ) = 0 ) then
  3677. z.low := z.low and not lastBitMask;
  3678. End
  3679. else
  3680. Begin
  3681. if ( sbits32 (z.low) < 0 ) then
  3682. Begin
  3683. Inc(z.high);
  3684. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3685. z.high := z.high and not 1;
  3686. End;
  3687. End;
  3688. End
  3689. else if ( roundingMode <> float_round_to_zero ) then
  3690. Begin
  3691. if ( extractFloat64Sign( z )
  3692. xor flag( roundingMode = float_round_up ) )<> 0 then
  3693. Begin
  3694. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3695. End;
  3696. End;
  3697. z.low := z.low and not roundBitsMask;
  3698. End
  3699. else
  3700. Begin
  3701. if ( aExp <= $3FE ) then
  3702. Begin
  3703. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  3704. Begin
  3705. result := a;
  3706. exit;
  3707. End;
  3708. softfloat_exception_flags := softfloat_exception_flags or
  3709. float_flag_inexact;
  3710. aSign := extractFloat64Sign( a );
  3711. case ( softfloat_rounding_mode ) of
  3712. float_round_nearest_even:
  3713. Begin
  3714. if ( ( aExp = $3FE )
  3715. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  3716. ) then
  3717. Begin
  3718. packFloat64( aSign, $3FF, 0, 0, result );
  3719. exit;
  3720. End;
  3721. End;
  3722. float_round_down:
  3723. Begin
  3724. if aSign<>0 then
  3725. packFloat64( 1, $3FF, 0, 0, result )
  3726. else
  3727. packFloat64( 0, 0, 0, 0, result );
  3728. exit;
  3729. End;
  3730. float_round_up:
  3731. Begin
  3732. if aSign <> 0 then
  3733. packFloat64( 1, 0, 0, 0, result )
  3734. else
  3735. packFloat64( 0, $3FF, 0, 0, result );
  3736. exit;
  3737. End;
  3738. end;
  3739. packFloat64( aSign, 0, 0, 0, result );
  3740. exit;
  3741. End;
  3742. lastBitMask := 1;
  3743. lastBitMask := lastBitMask shl ($413 - aExp);
  3744. roundBitsMask := lastBitMask - 1;
  3745. z.low := 0;
  3746. z.high := a.high;
  3747. roundingMode := softfloat_rounding_mode;
  3748. if ( roundingMode = float_round_nearest_even ) then
  3749. Begin
  3750. z.high := z.high + lastBitMask shr 1;
  3751. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  3752. Begin
  3753. z.high := z.high and not lastBitMask;
  3754. End;
  3755. End
  3756. else if ( roundingMode <> float_round_to_zero ) then
  3757. Begin
  3758. if ( extractFloat64Sign( z )
  3759. xor flag( roundingMode = float_round_up ) )<> 0 then
  3760. Begin
  3761. z.high := z.high or bits32( a.low <> 0 );
  3762. z.high := z.high + roundBitsMask;
  3763. End;
  3764. End;
  3765. z.high := z.high and not roundBitsMask;
  3766. End;
  3767. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  3768. Begin
  3769. softfloat_exception_flags :=
  3770. softfloat_exception_flags or float_flag_inexact;
  3771. End;
  3772. result := z;
  3773. End;
  3774. {*
  3775. -------------------------------------------------------------------------------
  3776. Returns the result of adding the absolute values of the double-precision
  3777. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3778. before being returned. `zSign' is ignored if the result is a NaN.
  3779. The addition is performed according to the IEC/IEEE Standard for Binary
  3780. Floating-Point Arithmetic.
  3781. -------------------------------------------------------------------------------
  3782. *}
  3783. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  3784. Var
  3785. aExp, bExp, zExp: int16;
  3786. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  3787. expDiff: int16;
  3788. label shiftRight1;
  3789. label roundAndPack;
  3790. Begin
  3791. aSig1 := extractFloat64Frac1( a );
  3792. aSig0 := extractFloat64Frac0( a );
  3793. aExp := extractFloat64Exp( a );
  3794. bSig1 := extractFloat64Frac1( b );
  3795. bSig0 := extractFloat64Frac0( b );
  3796. bExp := extractFloat64Exp( b );
  3797. expDiff := aExp - bExp;
  3798. if ( 0 < expDiff ) then
  3799. Begin
  3800. if ( aExp = $7FF ) then
  3801. Begin
  3802. if ( aSig0 OR aSig1 ) <> 0 then
  3803. Begin
  3804. propagateFloat64NaN( a, b, out );
  3805. exit;
  3806. end;
  3807. out := a;
  3808. exit;
  3809. End;
  3810. if ( bExp = 0 ) then
  3811. Begin
  3812. Dec(expDiff);
  3813. End
  3814. else
  3815. Begin
  3816. bSig0 := bSig0 or $00100000;
  3817. End;
  3818. shift64ExtraRightJamming(
  3819. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  3820. zExp := aExp;
  3821. End
  3822. else if ( expDiff < 0 ) then
  3823. Begin
  3824. if ( bExp = $7FF ) then
  3825. Begin
  3826. if ( bSig0 OR bSig1 ) <> 0 then
  3827. Begin
  3828. propagateFloat64NaN( a, b, out );
  3829. exit;
  3830. End;
  3831. packFloat64( zSign, $7FF, 0, 0, out );
  3832. End;
  3833. if ( aExp = 0 ) then
  3834. Begin
  3835. Inc(expDiff);
  3836. End
  3837. else
  3838. Begin
  3839. aSig0 := aSig0 or $00100000;
  3840. End;
  3841. shift64ExtraRightJamming(
  3842. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  3843. zExp := bExp;
  3844. End
  3845. else
  3846. Begin
  3847. if ( aExp = $7FF ) then
  3848. Begin
  3849. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  3850. Begin
  3851. propagateFloat64NaN( a, b, out );
  3852. exit;
  3853. End;
  3854. out := a;
  3855. exit;
  3856. End;
  3857. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3858. if ( aExp = 0 ) then
  3859. Begin
  3860. packFloat64( zSign, 0, zSig0, zSig1, out );
  3861. exit;
  3862. End;
  3863. zSig2 := 0;
  3864. zSig0 := zSig0 or $00200000;
  3865. zExp := aExp;
  3866. goto shiftRight1;
  3867. End;
  3868. aSig0 := aSig0 or $00100000;
  3869. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3870. Dec(zExp);
  3871. if ( zSig0 < $00200000 ) then
  3872. goto roundAndPack;
  3873. Inc(zExp);
  3874. shiftRight1:
  3875. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  3876. roundAndPack:
  3877. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  3878. End;
  3879. {*
  3880. -------------------------------------------------------------------------------
  3881. Returns the result of subtracting the absolute values of the double-
  3882. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3883. difference is negated before being returned. `zSign' is ignored if the
  3884. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3885. Standard for Binary Floating-Point Arithmetic.
  3886. -------------------------------------------------------------------------------
  3887. *}
  3888. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  3889. Var
  3890. aExp, bExp, zExp: int16;
  3891. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  3892. expDiff: int16;
  3893. z: float64;
  3894. label aExpBigger;
  3895. label bExpBigger;
  3896. label aBigger;
  3897. label bBigger;
  3898. label normalizeRoundAndPack;
  3899. Begin
  3900. aSig1 := extractFloat64Frac1( a );
  3901. aSig0 := extractFloat64Frac0( a );
  3902. aExp := extractFloat64Exp( a );
  3903. bSig1 := extractFloat64Frac1( b );
  3904. bSig0 := extractFloat64Frac0( b );
  3905. bExp := extractFloat64Exp( b );
  3906. expDiff := aExp - bExp;
  3907. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  3908. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  3909. if ( 0 < expDiff ) then goto aExpBigger;
  3910. if ( expDiff < 0 ) then goto bExpBigger;
  3911. if ( aExp = $7FF ) then
  3912. Begin
  3913. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  3914. Begin
  3915. propagateFloat64NaN( a, b, out );
  3916. exit;
  3917. End;
  3918. float_raise( float_flag_invalid );
  3919. z.low := float64_default_nan_low;
  3920. z.high := float64_default_nan_high;
  3921. out := z;
  3922. exit;
  3923. End;
  3924. if ( aExp = 0 ) then
  3925. Begin
  3926. aExp := 1;
  3927. bExp := 1;
  3928. End;
  3929. if ( bSig0 < aSig0 ) then goto aBigger;
  3930. if ( aSig0 < bSig0 ) then goto bBigger;
  3931. if ( bSig1 < aSig1 ) then goto aBigger;
  3932. if ( aSig1 < bSig1 ) then goto bBigger;
  3933. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  3934. exit;
  3935. bExpBigger:
  3936. if ( bExp = $7FF ) then
  3937. Begin
  3938. if ( bSig0 OR bSig1 ) <> 0 then
  3939. Begin
  3940. propagateFloat64NaN( a, b, out );
  3941. exit;
  3942. End;
  3943. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  3944. exit;
  3945. End;
  3946. if ( aExp = 0 ) then
  3947. Begin
  3948. Inc(expDiff);
  3949. End
  3950. else
  3951. Begin
  3952. aSig0 := aSig0 or $40000000;
  3953. End;
  3954. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  3955. bSig0 := bSig0 or $40000000;
  3956. bBigger:
  3957. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  3958. zExp := bExp;
  3959. zSign := zSign xor 1;
  3960. goto normalizeRoundAndPack;
  3961. aExpBigger:
  3962. if ( aExp = $7FF ) then
  3963. Begin
  3964. if ( aSig0 OR aSig1 ) <> 0 then
  3965. Begin
  3966. propagateFloat64NaN( a, b, out );
  3967. exit;
  3968. End;
  3969. out := a;
  3970. exit;
  3971. End;
  3972. if ( bExp = 0 ) then
  3973. Begin
  3974. Dec(expDiff);
  3975. End
  3976. else
  3977. Begin
  3978. bSig0 := bSig0 or $40000000;
  3979. End;
  3980. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  3981. aSig0 := aSig0 or $40000000;
  3982. aBigger:
  3983. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  3984. zExp := aExp;
  3985. normalizeRoundAndPack:
  3986. Dec(zExp);
  3987. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  3988. End;
  3989. {*
  3990. -------------------------------------------------------------------------------
  3991. Returns the result of adding the double-precision floating-point values `a'
  3992. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3993. Binary Floating-Point Arithmetic.
  3994. -------------------------------------------------------------------------------
  3995. *}
  3996. Function float64_add( a: float64; b : float64) : Float64;
  3997. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  3998. Var
  3999. aSign, bSign: flag;
  4000. Begin
  4001. aSign := extractFloat64Sign( a );
  4002. bSign := extractFloat64Sign( b );
  4003. if ( aSign = bSign ) then
  4004. Begin
  4005. addFloat64Sigs( a, b, aSign, result );
  4006. End
  4007. else
  4008. Begin
  4009. subFloat64Sigs( a, b, aSign, result );
  4010. End;
  4011. End;
  4012. {*
  4013. -------------------------------------------------------------------------------
  4014. Returns the result of subtracting the double-precision floating-point values
  4015. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4016. for Binary Floating-Point Arithmetic.
  4017. -------------------------------------------------------------------------------
  4018. *}
  4019. Function float64_sub(a: float64; b : float64) : Float64;
  4020. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4021. Var
  4022. aSign, bSign: flag;
  4023. Begin
  4024. aSign := extractFloat64Sign( a );
  4025. bSign := extractFloat64Sign( b );
  4026. if ( aSign = bSign ) then
  4027. Begin
  4028. subFloat64Sigs( a, b, aSign, result );
  4029. End
  4030. else
  4031. Begin
  4032. addFloat64Sigs( a, b, aSign, result );
  4033. End;
  4034. End;
  4035. {*
  4036. -------------------------------------------------------------------------------
  4037. Returns the result of multiplying the double-precision floating-point values
  4038. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4039. for Binary Floating-Point Arithmetic.
  4040. -------------------------------------------------------------------------------
  4041. *}
  4042. Function float64_mul( a: float64; b:float64) : Float64;
  4043. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4044. Var
  4045. aSign, bSign, zSign: flag;
  4046. aExp, bExp, zExp: int16;
  4047. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4048. z: float64;
  4049. label invalid;
  4050. Begin
  4051. aSig1 := extractFloat64Frac1( a );
  4052. aSig0 := extractFloat64Frac0( a );
  4053. aExp := extractFloat64Exp( a );
  4054. aSign := extractFloat64Sign( a );
  4055. bSig1 := extractFloat64Frac1( b );
  4056. bSig0 := extractFloat64Frac0( b );
  4057. bExp := extractFloat64Exp( b );
  4058. bSign := extractFloat64Sign( b );
  4059. zSign := aSign xor bSign;
  4060. if ( aExp = $7FF ) then
  4061. Begin
  4062. if ( (( aSig0 OR aSig1 ) <>0)
  4063. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4064. Begin
  4065. propagateFloat64NaN( a, b, result );
  4066. exit;
  4067. End;
  4068. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4069. packFloat64( zSign, $7FF, 0, 0, result );
  4070. exit;
  4071. End;
  4072. if ( bExp = $7FF ) then
  4073. Begin
  4074. if ( bSig0 OR bSig1 )<> 0 then
  4075. Begin
  4076. propagateFloat64NaN( a, b, result );
  4077. exit;
  4078. End;
  4079. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4080. Begin
  4081. invalid:
  4082. float_raise( float_flag_invalid );
  4083. z.low := float64_default_nan_low;
  4084. z.high := float64_default_nan_high;
  4085. result := z;
  4086. exit;
  4087. End;
  4088. packFloat64( zSign, $7FF, 0, 0, result );
  4089. exit;
  4090. End;
  4091. if ( aExp = 0 ) then
  4092. Begin
  4093. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4094. Begin
  4095. packFloat64( zSign, 0, 0, 0, result );
  4096. exit;
  4097. End;
  4098. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4099. End;
  4100. if ( bExp = 0 ) then
  4101. Begin
  4102. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4103. Begin
  4104. packFloat64( zSign, 0, 0, 0, result );
  4105. exit;
  4106. End;
  4107. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4108. End;
  4109. zExp := aExp + bExp - $400;
  4110. aSig0 := aSig0 or $00100000;
  4111. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4112. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4113. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4114. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4115. if ( $00200000 <= zSig0 ) then
  4116. Begin
  4117. shift64ExtraRightJamming(
  4118. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4119. Inc(zExp);
  4120. End;
  4121. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4122. End;
  4123. {*
  4124. -------------------------------------------------------------------------------
  4125. Returns the result of dividing the double-precision floating-point value `a'
  4126. by the corresponding value `b'. The operation is performed according to the
  4127. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4128. -------------------------------------------------------------------------------
  4129. *}
  4130. Function float64_div(a: float64; b : float64) : Float64;
  4131. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4132. Var
  4133. aSign, bSign, zSign: flag;
  4134. aExp, bExp, zExp: int16;
  4135. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4136. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4137. z: float64;
  4138. label invalid;
  4139. Begin
  4140. aSig1 := extractFloat64Frac1( a );
  4141. aSig0 := extractFloat64Frac0( a );
  4142. aExp := extractFloat64Exp( a );
  4143. aSign := extractFloat64Sign( a );
  4144. bSig1 := extractFloat64Frac1( b );
  4145. bSig0 := extractFloat64Frac0( b );
  4146. bExp := extractFloat64Exp( b );
  4147. bSign := extractFloat64Sign( b );
  4148. zSign := aSign xor bSign;
  4149. if ( aExp = $7FF ) then
  4150. Begin
  4151. if ( aSig0 OR aSig1 )<> 0 then
  4152. Begin
  4153. propagateFloat64NaN( a, b, result );
  4154. exit;
  4155. end;
  4156. if ( bExp = $7FF ) then
  4157. Begin
  4158. if ( bSig0 OR bSig1 )<>0 then
  4159. Begin
  4160. propagateFloat64NaN( a, b, result );
  4161. exit;
  4162. End;
  4163. goto invalid;
  4164. End;
  4165. packFloat64( zSign, $7FF, 0, 0, result );
  4166. exit;
  4167. End;
  4168. if ( bExp = $7FF ) then
  4169. Begin
  4170. if ( bSig0 OR bSig1 )<> 0 then
  4171. Begin
  4172. propagateFloat64NaN( a, b, result );
  4173. exit;
  4174. End;
  4175. packFloat64( zSign, 0, 0, 0, result );
  4176. exit;
  4177. End;
  4178. if ( bExp = 0 ) then
  4179. Begin
  4180. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4181. Begin
  4182. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4183. Begin
  4184. invalid:
  4185. float_raise( float_flag_invalid );
  4186. z.low := float64_default_nan_low;
  4187. z.high := float64_default_nan_high;
  4188. result := z;
  4189. exit;
  4190. End;
  4191. float_raise( float_flag_divbyzero );
  4192. packFloat64( zSign, $7FF, 0, 0, result );
  4193. exit;
  4194. End;
  4195. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4196. End;
  4197. if ( aExp = 0 ) then
  4198. Begin
  4199. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4200. Begin
  4201. packFloat64( zSign, 0, 0, 0, result );
  4202. exit;
  4203. End;
  4204. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4205. End;
  4206. zExp := aExp - bExp + $3FD;
  4207. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4208. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4209. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4210. Begin
  4211. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4212. Inc(zExp);
  4213. End;
  4214. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4215. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4216. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4217. while ( sbits32 (rem0) < 0 ) do
  4218. Begin
  4219. Dec(zSig0);
  4220. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4221. End;
  4222. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4223. if ( ( zSig1 and $3FF ) <= 4 ) then
  4224. Begin
  4225. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4226. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4227. while ( sbits32 (rem1) < 0 ) do
  4228. Begin
  4229. Dec(zSig1);
  4230. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4231. End;
  4232. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4233. End;
  4234. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4235. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4236. End;
  4237. {*
  4238. -------------------------------------------------------------------------------
  4239. Returns the remainder of the double-precision floating-point value `a'
  4240. with respect to the corresponding value `b'. The operation is performed
  4241. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4242. -------------------------------------------------------------------------------
  4243. *}
  4244. Function float64_rem(a: float64; b : float64) : float64;
  4245. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4246. Var
  4247. aSign, bSign, zSign: flag;
  4248. aExp, bExp, expDiff: int16;
  4249. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4250. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4251. sigMean0: sbits32;
  4252. z: float64;
  4253. label invalid;
  4254. Begin
  4255. aSig1 := extractFloat64Frac1( a );
  4256. aSig0 := extractFloat64Frac0( a );
  4257. aExp := extractFloat64Exp( a );
  4258. aSign := extractFloat64Sign( a );
  4259. bSig1 := extractFloat64Frac1( b );
  4260. bSig0 := extractFloat64Frac0( b );
  4261. bExp := extractFloat64Exp( b );
  4262. bSign := extractFloat64Sign( b );
  4263. if ( aExp = $7FF ) then
  4264. Begin
  4265. if ((( aSig0 OR aSig1 )<>0)
  4266. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4267. Begin
  4268. propagateFloat64NaN( a, b, result );
  4269. exit;
  4270. End;
  4271. goto invalid;
  4272. End;
  4273. if ( bExp = $7FF ) then
  4274. Begin
  4275. if ( bSig0 OR bSig1 ) <> 0 then
  4276. Begin
  4277. propagateFloat64NaN( a, b, result );
  4278. exit;
  4279. End;
  4280. result := a;
  4281. exit;
  4282. End;
  4283. if ( bExp = 0 ) then
  4284. Begin
  4285. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4286. Begin
  4287. invalid:
  4288. float_raise( float_flag_invalid );
  4289. z.low := float64_default_nan_low;
  4290. z.high := float64_default_nan_high;
  4291. result := z;
  4292. exit;
  4293. End;
  4294. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4295. End;
  4296. if ( aExp = 0 ) then
  4297. Begin
  4298. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4299. Begin
  4300. result := a;
  4301. exit;
  4302. End;
  4303. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4304. End;
  4305. expDiff := aExp - bExp;
  4306. if ( expDiff < -1 ) then
  4307. Begin
  4308. result := a;
  4309. exit;
  4310. End;
  4311. shortShift64Left(
  4312. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4313. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4314. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4315. if ( q )<>0 then
  4316. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4317. expDiff := expDiff - 32;
  4318. while ( 0 < expDiff ) do
  4319. Begin
  4320. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4321. if 4 < q then
  4322. q:= q - 4
  4323. else
  4324. q := 0;
  4325. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4326. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4327. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4328. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4329. expDiff := expDiff - 29;
  4330. End;
  4331. if ( -32 < expDiff ) then
  4332. Begin
  4333. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4334. if 4 < q then
  4335. q := q - 4
  4336. else
  4337. q := 0;
  4338. q := q shr (- expDiff);
  4339. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4340. expDiff := expDiff + 24;
  4341. if ( expDiff < 0 ) then
  4342. Begin
  4343. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4344. End
  4345. else
  4346. Begin
  4347. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4348. End;
  4349. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4350. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4351. End
  4352. else
  4353. Begin
  4354. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4355. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4356. End;
  4357. Repeat
  4358. alternateASig0 := aSig0;
  4359. alternateASig1 := aSig1;
  4360. Inc(q);
  4361. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4362. Until not ( 0 <= sbits32 (aSig0) );
  4363. add64(
  4364. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4365. if ( ( sigMean0 < 0 )
  4366. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4367. Begin
  4368. aSig0 := alternateASig0;
  4369. aSig1 := alternateASig1;
  4370. End;
  4371. zSign := flag( sbits32 (aSig0) < 0 );
  4372. if ( zSign <> 0 ) then
  4373. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4374. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4375. End;
  4376. {*
  4377. -------------------------------------------------------------------------------
  4378. Returns the square root of the double-precision floating-point value `a'.
  4379. The operation is performed according to the IEC/IEEE Standard for Binary
  4380. Floating-Point Arithmetic.
  4381. -------------------------------------------------------------------------------
  4382. *}
  4383. Procedure float64_sqrt( a: float64; var out: float64 );
  4384. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4385. Var
  4386. aSign: flag;
  4387. aExp, zExp: int16;
  4388. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4389. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4390. z: float64;
  4391. label invalid;
  4392. Begin
  4393. aSig1 := extractFloat64Frac1( a );
  4394. aSig0 := extractFloat64Frac0( a );
  4395. aExp := extractFloat64Exp( a );
  4396. aSign := extractFloat64Sign( a );
  4397. if ( aExp = $7FF ) then
  4398. Begin
  4399. if ( aSig0 OR aSig1 ) <> 0 then
  4400. Begin
  4401. propagateFloat64NaN( a, a, out );
  4402. exit;
  4403. End;
  4404. if ( aSign = 0) then
  4405. Begin
  4406. out := a;
  4407. exit;
  4408. End;
  4409. goto invalid;
  4410. End;
  4411. if ( aSign <> 0 ) then
  4412. Begin
  4413. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4414. Begin
  4415. out := a;
  4416. exit;
  4417. End;
  4418. invalid:
  4419. float_raise( float_flag_invalid );
  4420. z.low := float64_default_nan_low;
  4421. z.high := float64_default_nan_high;
  4422. out := z;
  4423. exit;
  4424. End;
  4425. if ( aExp = 0 ) then
  4426. Begin
  4427. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4428. Begin
  4429. packFloat64( 0, 0, 0, 0, out );
  4430. exit;
  4431. End;
  4432. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4433. End;
  4434. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4435. aSig0 := aSig0 or $00100000;
  4436. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4437. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4438. if ( zSig0 = 0 ) then
  4439. zSig0 := $7FFFFFFF;
  4440. doubleZSig0 := zSig0 + zSig0;
  4441. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4442. mul32To64( zSig0, zSig0, term0, term1 );
  4443. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4444. while ( sbits32 (rem0) < 0 ) do
  4445. Begin
  4446. Dec(zSig0);
  4447. doubleZSig0 := doubleZSig0 - 2;
  4448. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4449. End;
  4450. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4451. if ( ( zSig1 and $1FF ) <= 5 ) then
  4452. Begin
  4453. if ( zSig1 = 0 ) then
  4454. zSig1 := 1;
  4455. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4456. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4457. mul32To64( zSig1, zSig1, term2, term3 );
  4458. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4459. while ( sbits32 (rem1) < 0 ) do
  4460. Begin
  4461. Dec(zSig1);
  4462. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4463. term3 := term3 or 1;
  4464. term2 := term2 or doubleZSig0;
  4465. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4466. End;
  4467. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4468. End;
  4469. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4470. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4471. End;
  4472. {*
  4473. -------------------------------------------------------------------------------
  4474. Returns 1 if the double-precision floating-point value `a' is equal to
  4475. the corresponding value `b', and 0 otherwise. The comparison is performed
  4476. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4477. -------------------------------------------------------------------------------
  4478. *}
  4479. Function float64_eq(a: float64; b: float64): flag;
  4480. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4481. Begin
  4482. if
  4483. (
  4484. ( extractFloat64Exp( a ) = $7FF )
  4485. AND
  4486. (
  4487. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4488. )
  4489. )
  4490. OR (
  4491. ( extractFloat64Exp( b ) = $7FF )
  4492. AND (
  4493. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4494. )
  4495. )
  4496. ) then
  4497. Begin
  4498. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4499. float_raise( float_flag_invalid );
  4500. float64_eq := 0;
  4501. exit;
  4502. End;
  4503. float64_eq := flag(
  4504. ( a.low = b.low )
  4505. AND ( ( a.high = b.high )
  4506. OR ( ( a.low = 0 )
  4507. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4508. ));
  4509. End;
  4510. {*
  4511. -------------------------------------------------------------------------------
  4512. Returns 1 if the double-precision floating-point value `a' is less than
  4513. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4514. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4515. Arithmetic.
  4516. -------------------------------------------------------------------------------
  4517. *}
  4518. Function float64_le(a: float64;b: float64): flag;
  4519. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4520. Var
  4521. aSign, bSign: flag;
  4522. Begin
  4523. if
  4524. (
  4525. ( extractFloat64Exp( a ) = $7FF )
  4526. AND
  4527. (
  4528. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4529. )
  4530. )
  4531. OR (
  4532. ( extractFloat64Exp( b ) = $7FF )
  4533. AND (
  4534. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4535. )
  4536. )
  4537. ) then
  4538. Begin
  4539. float_raise( float_flag_invalid );
  4540. float64_le := 0;
  4541. exit;
  4542. End;
  4543. aSign := extractFloat64Sign( a );
  4544. bSign := extractFloat64Sign( b );
  4545. if ( aSign <> bSign ) then
  4546. Begin
  4547. float64_le := flag(
  4548. (aSign <> 0)
  4549. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4550. = 0 ));
  4551. exit;
  4552. End;
  4553. if aSign <> 0 then
  4554. float64_le := le64( b.high, b.low, a.high, a.low )
  4555. else
  4556. float64_le := le64( a.high, a.low, b.high, b.low );
  4557. End;
  4558. {*
  4559. -------------------------------------------------------------------------------
  4560. Returns 1 if the double-precision floating-point value `a' is less than
  4561. the corresponding value `b', and 0 otherwise. The comparison is performed
  4562. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4563. -------------------------------------------------------------------------------
  4564. *}
  4565. Function float64_lt(a: float64;b: float64): flag;
  4566. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4567. Var
  4568. aSign, bSign: flag;
  4569. Begin
  4570. if
  4571. (
  4572. ( extractFloat64Exp( a ) = $7FF )
  4573. AND
  4574. (
  4575. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4576. )
  4577. )
  4578. OR (
  4579. ( extractFloat64Exp( b ) = $7FF )
  4580. AND (
  4581. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4582. )
  4583. )
  4584. ) then
  4585. Begin
  4586. float_raise( float_flag_invalid );
  4587. float64_lt := 0;
  4588. exit;
  4589. End;
  4590. aSign := extractFloat64Sign( a );
  4591. bSign := extractFloat64Sign( b );
  4592. if ( aSign <> bSign ) then
  4593. Begin
  4594. float64_lt := flag(
  4595. (aSign <> 0)
  4596. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4597. <> 0 ));
  4598. exit;
  4599. End;
  4600. if aSign <> 0 then
  4601. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4602. else
  4603. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4604. End;
  4605. {*
  4606. -------------------------------------------------------------------------------
  4607. Returns 1 if the double-precision floating-point value `a' is equal to
  4608. the corresponding value `b', and 0 otherwise. The invalid exception is
  4609. raised if either operand is a NaN. Otherwise, the comparison is performed
  4610. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4611. -------------------------------------------------------------------------------
  4612. *}
  4613. Function float64_eq_signaling( a: float64; b: float64): flag;
  4614. Begin
  4615. if
  4616. (
  4617. ( extractFloat64Exp( a ) = $7FF )
  4618. AND
  4619. (
  4620. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4621. )
  4622. )
  4623. OR (
  4624. ( extractFloat64Exp( b ) = $7FF )
  4625. AND (
  4626. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4627. )
  4628. )
  4629. ) then
  4630. Begin
  4631. float_raise( float_flag_invalid );
  4632. float64_eq_signaling := 0;
  4633. exit;
  4634. End;
  4635. float64_eq_signaling := flag(
  4636. ( a.low = b.low )
  4637. AND ( ( a.high = b.high )
  4638. OR ( ( a.low = 0 )
  4639. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4640. ));
  4641. End;
  4642. {*
  4643. -------------------------------------------------------------------------------
  4644. Returns 1 if the double-precision floating-point value `a' is less than or
  4645. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4646. cause an exception. Otherwise, the comparison is performed according to the
  4647. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4648. -------------------------------------------------------------------------------
  4649. *}
  4650. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4651. Var
  4652. aSign, bSign : flag;
  4653. Begin
  4654. if
  4655. (
  4656. ( extractFloat64Exp( a ) = $7FF )
  4657. AND
  4658. (
  4659. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4660. )
  4661. )
  4662. OR (
  4663. ( extractFloat64Exp( b ) = $7FF )
  4664. AND (
  4665. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4666. )
  4667. )
  4668. ) then
  4669. Begin
  4670. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4671. float_raise( float_flag_invalid );
  4672. float64_le_quiet := 0;
  4673. exit;
  4674. End;
  4675. aSign := extractFloat64Sign( a );
  4676. bSign := extractFloat64Sign( b );
  4677. if ( aSign <> bSign ) then
  4678. Begin
  4679. float64_le_quiet := flag
  4680. ((aSign <> 0)
  4681. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4682. = 0 ));
  4683. exit;
  4684. End;
  4685. if aSign <> 0 then
  4686. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4687. else
  4688. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4689. End;
  4690. {*
  4691. -------------------------------------------------------------------------------
  4692. Returns 1 if the double-precision floating-point value `a' is less than
  4693. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4694. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4695. Standard for Binary Floating-Point Arithmetic.
  4696. -------------------------------------------------------------------------------
  4697. *}
  4698. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  4699. Var
  4700. aSign, bSign: flag;
  4701. Begin
  4702. if
  4703. (
  4704. ( extractFloat64Exp( a ) = $7FF )
  4705. AND
  4706. (
  4707. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4708. )
  4709. )
  4710. OR (
  4711. ( extractFloat64Exp( b ) = $7FF )
  4712. AND (
  4713. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4714. )
  4715. )
  4716. ) then
  4717. Begin
  4718. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4719. float_raise( float_flag_invalid );
  4720. float64_lt_quiet := 0;
  4721. exit;
  4722. End;
  4723. aSign := extractFloat64Sign( a );
  4724. bSign := extractFloat64Sign( b );
  4725. if ( aSign <> bSign ) then
  4726. Begin
  4727. float64_lt_quiet := flag(
  4728. (aSign<>0)
  4729. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4730. <> 0 ));
  4731. exit;
  4732. End;
  4733. If aSign <> 0 then
  4734. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  4735. else
  4736. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  4737. End;
  4738. {*----------------------------------------------------------------------------
  4739. | Returns the result of converting the 64-bit two's complement integer `a'
  4740. | to the single-precision floating-point format. The conversion is performed
  4741. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4742. *----------------------------------------------------------------------------*}
  4743. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  4744. var
  4745. zSign : flag;
  4746. absA : uint64;
  4747. shiftCount: int8;
  4748. zSig : bits32;
  4749. intval : int64rec;
  4750. Begin
  4751. if ( a = 0 ) then
  4752. begin
  4753. int64_to_float32.float32 := 0;
  4754. exit;
  4755. end;
  4756. if a < 0 then
  4757. zSign := flag(TRUE)
  4758. else
  4759. zSign := flag(FALSE);
  4760. if zSign<>0 then
  4761. absA := -a
  4762. else
  4763. absA := a;
  4764. shiftCount := countLeadingZeros64( absA ) - 40;
  4765. if ( 0 <= shiftCount ) then
  4766. begin
  4767. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  4768. end
  4769. else
  4770. begin
  4771. shiftCount := shiftCount + 7;
  4772. if ( shiftCount < 0 ) then
  4773. begin
  4774. intval.low := int64rec(AbsA).low;
  4775. intval.high := int64rec(AbsA).high;
  4776. shift64RightJamming( intval.low, intval.high, - shiftCount,
  4777. intval.low, intval.high);
  4778. int64rec(absA).low := intval.low;
  4779. int64rec(absA).high := intval.high;
  4780. end
  4781. else
  4782. absA := absA shl shiftCount;
  4783. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  4784. end;
  4785. End;
  4786. {*----------------------------------------------------------------------------
  4787. | Returns the result of converting the 64-bit two's complement integer `a'
  4788. | to the double-precision floating-point format. The conversion is performed
  4789. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4790. *----------------------------------------------------------------------------*}
  4791. function int64_to_float64( a: int64 ): float64;
  4792. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  4793. var
  4794. zSign : flag;
  4795. float_result : float64;
  4796. intval : int64rec;
  4797. AbsA : bits64;
  4798. shiftcount : int8;
  4799. zSig0, zSig1 : bits32;
  4800. Begin
  4801. if ( a = 0 ) then
  4802. Begin
  4803. packFloat64( 0, 0, 0, 0, result );
  4804. exit;
  4805. end;
  4806. zSign := flag( a < 0 );
  4807. if ZSign<>0 then
  4808. AbsA := -a
  4809. else
  4810. AbsA := a;
  4811. shiftCount := countLeadingZeros64( absA ) - 11;
  4812. if ( 0 <= shiftCount ) then
  4813. Begin
  4814. absA := absA shl shiftcount;
  4815. zSig0:=int64rec(absA).high;
  4816. zSig1:=int64rec(absA).low;
  4817. End
  4818. else
  4819. Begin
  4820. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  4821. End;
  4822. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  4823. int64_to_float64:= float_result;
  4824. End;
  4825. {*----------------------------------------------------------------------------
  4826. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  4827. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  4828. | Otherwise, returns 0.
  4829. *----------------------------------------------------------------------------*}
  4830. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  4831. begin
  4832. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  4833. end;
  4834. {*----------------------------------------------------------------------------
  4835. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  4836. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  4837. | any carry out is lost. The result is broken into two 64-bit pieces which
  4838. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  4839. *----------------------------------------------------------------------------*}
  4840. procedure add128(a0: bits64; a1: bits64; b0: bits64; b1: bits64; var z0Ptr: bits64; var z1Ptr : bits64);inline;
  4841. var
  4842. z1: bits64;
  4843. begin
  4844. z1 := a1 + b1;
  4845. z1Ptr := z1;
  4846. z0Ptr := a0 + b0 + ord( z1 < a1 );
  4847. end;
  4848. {*----------------------------------------------------------------------------
  4849. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  4850. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  4851. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  4852. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  4853. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  4854. | the most-significant bit of the extra result, and the other 63 bits of the
  4855. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  4856. | were all zero. This extra result is stored in the location pointed to by
  4857. | `z2Ptr'. The value of `count' can be arbitrarily large.
  4858. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  4859. | to form a fixed-point value with binary point between `a1' and `a2'. This
  4860. | fixed-point value is shifted right by the number of bits given in `count',
  4861. | and the integer part of the result is returned at the locations pointed to
  4862. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  4863. | corrupted as described above, and is returned at the location pointed to by
  4864. | `z2Ptr'.)
  4865. *----------------------------------------------------------------------------*}
  4866. procedure shift128ExtraRightJamming(
  4867. a0: bits64;
  4868. a1: bits64;
  4869. a2: bits64;
  4870. count: int16;
  4871. var z0Ptr: bits64;
  4872. var z1Ptr: bits64;
  4873. var z2Ptr: bits64);
  4874. var
  4875. z0, z1, z2: bits64;
  4876. negCount: int8;
  4877. begin
  4878. negCount := ( - count ) and 63;
  4879. if ( count = 0 ) then
  4880. begin
  4881. z2 := a2;
  4882. z1 := a1;
  4883. z0 := a0;
  4884. end
  4885. else begin
  4886. if ( count < 64 ) then
  4887. begin
  4888. z2 := a1 shr negCount;
  4889. z1 := ( a0 shl negCount ) or ( a1 shr count );
  4890. z0 := a0 shr count;
  4891. end
  4892. else begin
  4893. if ( count = 64 ) then
  4894. begin
  4895. z2 := a1;
  4896. z1 := a0;
  4897. end
  4898. else begin
  4899. a2 := a2 or a1;
  4900. if ( count < 128 ) then
  4901. begin
  4902. z2 := a0 shl negCount;
  4903. z1 := a0 shr ( count and 63 );
  4904. end
  4905. else begin
  4906. if ( count = 128 ) then
  4907. z2 := a0
  4908. else
  4909. z2 := ord( a0 <> 0 );
  4910. z1 := 0;
  4911. end;
  4912. end;
  4913. z0 := 0;
  4914. end;
  4915. z2 := z2 or ord( a2 <> 0 );
  4916. end;
  4917. z2Ptr := z2;
  4918. z1Ptr := z1;
  4919. z0Ptr := z0;
  4920. end;
  4921. {*----------------------------------------------------------------------------
  4922. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  4923. | _plus_ the number of bits given in `count'. The shifted result is at most
  4924. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  4925. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  4926. | shifted off is the most-significant bit of the extra result, and the other
  4927. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  4928. | bits shifted off were all zero. This extra result is stored in the location
  4929. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  4930. | (This routine makes more sense if `a0' and `a1' are considered to form
  4931. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  4932. | point value is shifted right by the number of bits given in `count', and
  4933. | the integer part of the result is returned at the location pointed to by
  4934. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  4935. | described above, and is returned at the location pointed to by `z1Ptr'.)
  4936. *----------------------------------------------------------------------------*}
  4937. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  4938. var
  4939. z0, z1: bits64;
  4940. negCount: int8;
  4941. begin
  4942. negCount := ( - count ) and 63;
  4943. if ( count = 0 ) then
  4944. begin
  4945. z1 := a1;
  4946. z0 := a0;
  4947. end
  4948. else if ( count < 64 ) then
  4949. begin
  4950. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  4951. z0 := a0 shr count;
  4952. end
  4953. else begin
  4954. if ( count = 64 ) then
  4955. begin
  4956. z1 := a0 or ord( a1 <> 0 );
  4957. end
  4958. else begin
  4959. z1 := ord( ( a0 or a1 ) <> 0 );
  4960. end;
  4961. z0 := 0;
  4962. end;
  4963. z1Ptr := z1;
  4964. z0Ptr := z0;
  4965. end;
  4966. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4967. {*----------------------------------------------------------------------------
  4968. | Returns the fraction bits of the extended double-precision floating-point
  4969. | value `a'.
  4970. *----------------------------------------------------------------------------*}
  4971. function extractFloatx80Frac(a : floatx80): bits64;inline;
  4972. begin
  4973. result:=a.low;
  4974. end;
  4975. {*----------------------------------------------------------------------------
  4976. | Returns the exponent bits of the extended double-precision floating-point
  4977. | value `a'.
  4978. *----------------------------------------------------------------------------*}
  4979. function extractFloatx80Exp(a : floatx80): int32;inline;
  4980. begin
  4981. result:=a.high and $7FFF;
  4982. end;
  4983. {*----------------------------------------------------------------------------
  4984. | Returns the sign bit of the extended double-precision floating-point value
  4985. | `a'.
  4986. *----------------------------------------------------------------------------*}
  4987. function extractFloatx80Sign(a : floatx80): flag;inline;
  4988. begin
  4989. result:=a.high shr 15;
  4990. end;
  4991. {*----------------------------------------------------------------------------
  4992. | Normalizes the subnormal extended double-precision floating-point value
  4993. | represented by the denormalized significand `aSig'. The normalized exponent
  4994. | and significand are stored at the locations pointed to by `zExpPtr' and
  4995. | `zSigPtr', respectively.
  4996. *----------------------------------------------------------------------------*}
  4997. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  4998. var
  4999. shiftCount: int8;
  5000. begin
  5001. shiftCount := countLeadingZeros64( aSig );
  5002. zSigPtr := aSig shl shiftCount;
  5003. zExpPtr := 1 - shiftCount;
  5004. end;
  5005. {*----------------------------------------------------------------------------
  5006. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5007. | extended double-precision floating-point value, returning the result.
  5008. *----------------------------------------------------------------------------*}
  5009. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5010. var
  5011. z: floatx80;
  5012. begin
  5013. z.low := zSig;
  5014. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5015. result:=z;
  5016. end;
  5017. {*----------------------------------------------------------------------------
  5018. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5019. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5020. | and returns the proper extended double-precision floating-point value
  5021. | corresponding to the abstract input. Ordinarily, the abstract value is
  5022. | rounded and packed into the extended double-precision format, with the
  5023. | inexact exception raised if the abstract input cannot be represented
  5024. | exactly. However, if the abstract value is too large, the overflow and
  5025. | inexact exceptions are raised and an infinity or maximal finite value is
  5026. | returned. If the abstract value is too small, the input value is rounded to
  5027. | a subnormal number, and the underflow and inexact exceptions are raised if
  5028. | the abstract input cannot be represented exactly as a subnormal extended
  5029. | double-precision floating-point number.
  5030. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5031. | number of bits as single or double precision, respectively. Otherwise, the
  5032. | result is rounded to the full precision of the extended double-precision
  5033. | format.
  5034. | The input significand must be normalized or smaller. If the input
  5035. | significand is not normalized, `zExp' must be 0; in that case, the result
  5036. | returned is a subnormal number, and it must not require rounding. The
  5037. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5038. | Floating-Point Arithmetic.
  5039. *----------------------------------------------------------------------------*}
  5040. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5041. var
  5042. roundingMode: int8;
  5043. roundNearestEven, increment, isTiny: flag;
  5044. roundIncrement, roundMask, roundBits: int64;
  5045. label
  5046. precision80;
  5047. begin
  5048. roundingMode := softfloat_rounding_mode;
  5049. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5050. if ( roundingPrecision = 80 ) then
  5051. goto precision80;
  5052. if ( roundingPrecision = 64 ) then
  5053. begin
  5054. roundIncrement := int64( $0000000000000400 );
  5055. roundMask := int64( $00000000000007FF );
  5056. end
  5057. else if ( roundingPrecision = 32 ) then
  5058. begin
  5059. roundIncrement := int64( $0000008000000000 );
  5060. roundMask := int64( $000000FFFFFFFFFF );
  5061. end
  5062. else begin
  5063. goto precision80;
  5064. end;
  5065. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5066. if ( not (roundNearestEven<>0) ) then
  5067. begin
  5068. if ( roundingMode = float_round_to_zero ) then
  5069. begin
  5070. roundIncrement := 0;
  5071. end
  5072. else begin
  5073. roundIncrement := roundMask;
  5074. if ( zSign<>0 ) then
  5075. begin
  5076. if ( roundingMode = float_round_up ) then
  5077. roundIncrement := 0;
  5078. end
  5079. else begin
  5080. if ( roundingMode = float_round_down ) then
  5081. roundIncrement := 0;
  5082. end;
  5083. end;
  5084. end;
  5085. roundBits := zSig0 and roundMask;
  5086. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5087. if ( ( $7FFE < zExp )
  5088. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5089. ) begin
  5090. goto overflow;
  5091. end;
  5092. if ( zExp <= 0 ) begin
  5093. isTiny =
  5094. ( float_detect_tininess = float_tininess_before_rounding )
  5095. or ( zExp < 0 )
  5096. or ( zSig0 <= zSig0 + roundIncrement );
  5097. shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
  5098. zExp := 0;
  5099. roundBits := zSig0 and roundMask;
  5100. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5101. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5102. zSig0 += roundIncrement;
  5103. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5104. roundIncrement := roundMask + 1;
  5105. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5106. roundMask |= roundIncrement;
  5107. end;
  5108. zSig0 &= ~ roundMask;
  5109. result:=packFloatx80( zSign, zExp, zSig0 );
  5110. end;
  5111. end;
  5112. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5113. zSig0 += roundIncrement;
  5114. if ( zSig0 < roundIncrement ) begin
  5115. ++zExp;
  5116. zSig0 := LIT64( $8000000000000000 );
  5117. end;
  5118. roundIncrement := roundMask + 1;
  5119. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5120. roundMask |= roundIncrement;
  5121. end;
  5122. zSig0 &= ~ roundMask;
  5123. if ( zSig0 = 0 ) zExp := 0;
  5124. result:=packFloatx80( zSign, zExp, zSig0 );
  5125. precision80:
  5126. increment := ( (sbits64) zSig1 < 0 );
  5127. if ( ! roundNearestEven ) begin
  5128. if ( roundingMode = float_round_to_zero ) begin
  5129. increment := 0;
  5130. end;
  5131. else begin
  5132. if ( zSign ) begin
  5133. increment := ( roundingMode = float_round_down ) and zSig1;
  5134. end;
  5135. else begin
  5136. increment := ( roundingMode = float_round_up ) and zSig1;
  5137. end;
  5138. end;
  5139. end;
  5140. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5141. if ( ( $7FFE < zExp )
  5142. or ( ( zExp = $7FFE )
  5143. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5144. and increment
  5145. )
  5146. ) begin
  5147. roundMask := 0;
  5148. overflow:
  5149. float_raise( float_flag_overflow or float_flag_inexact );
  5150. if ( ( roundingMode = float_round_to_zero )
  5151. or ( zSign and ( roundingMode = float_round_up ) )
  5152. or ( ! zSign and ( roundingMode = float_round_down ) )
  5153. ) begin
  5154. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5155. end;
  5156. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5157. end;
  5158. if ( zExp <= 0 ) begin
  5159. isTiny =
  5160. ( float_detect_tininess = float_tininess_before_rounding )
  5161. or ( zExp < 0 )
  5162. or ! increment
  5163. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5164. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
  5165. zExp := 0;
  5166. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5167. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5168. if ( roundNearestEven ) begin
  5169. increment := ( (sbits64) zSig1 < 0 );
  5170. end;
  5171. else begin
  5172. if ( zSign ) begin
  5173. increment := ( roundingMode = float_round_down ) and zSig1;
  5174. end;
  5175. else begin
  5176. increment := ( roundingMode = float_round_up ) and zSig1;
  5177. end;
  5178. end;
  5179. if ( increment ) begin
  5180. ++zSig0;
  5181. zSig0 &=
  5182. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5183. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5184. end;
  5185. result:=packFloatx80( zSign, zExp, zSig0 );
  5186. end;
  5187. end;
  5188. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5189. if ( increment ) begin
  5190. ++zSig0;
  5191. if ( zSig0 = 0 ) begin
  5192. ++zExp;
  5193. zSig0 := LIT64( $8000000000000000 );
  5194. end;
  5195. else begin
  5196. zSig0 &= ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5197. end;
  5198. end;
  5199. else begin
  5200. if ( zSig0 = 0 ) zExp := 0;
  5201. end;
  5202. result:=packFloatx80( zSign, zExp, zSig0 );
  5203. end;
  5204. {*----------------------------------------------------------------------------
  5205. | Takes an abstract floating-point value having sign `zSign', exponent
  5206. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5207. | and returns the proper extended double-precision floating-point value
  5208. | corresponding to the abstract input. This routine is just like
  5209. | `roundAndPackFloatx80' except that the input significand does not have to be
  5210. | normalized.
  5211. *----------------------------------------------------------------------------*}
  5212. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5213. var
  5214. shiftCount: int8;
  5215. begin
  5216. if ( zSig0 = 0 ) begin
  5217. zSig0 := zSig1;
  5218. zSig1 := 0;
  5219. zExp -= 64;
  5220. end;
  5221. shiftCount := countLeadingZeros64( zSig0 );
  5222. shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  5223. zExp := eExp - shiftCount;
  5224. return
  5225. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5226. end;
  5227. {*----------------------------------------------------------------------------
  5228. | Returns the result of converting the extended double-precision floating-
  5229. | point value `a' to the 32-bit two's complement integer format. The
  5230. | conversion is performed according to the IEC/IEEE Standard for Binary
  5231. | Floating-Point Arithmetic---which means in particular that the conversion
  5232. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5233. | largest positive integer is returned. Otherwise, if the conversion
  5234. | overflows, the largest integer with the same sign as `a' is returned.
  5235. *----------------------------------------------------------------------------*}
  5236. function floatx80_to_int32(a: floatx80): int32;
  5237. var
  5238. aSign: flag;
  5239. aExp, shiftCount: int32;
  5240. aSig: bits64;
  5241. begin
  5242. aSig := extractFloatx80Frac( a );
  5243. aExp := extractFloatx80Exp( a );
  5244. aSign := extractFloatx80Sign( a );
  5245. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5246. shiftCount := $4037 - aExp;
  5247. if ( shiftCount <= 0 ) shiftCount := 1;
  5248. shift64RightJamming( aSig, shiftCount, &aSig );
  5249. result := roundAndPackInt32( aSign, aSig );
  5250. end;
  5251. {*----------------------------------------------------------------------------
  5252. | Returns the result of converting the extended double-precision floating-
  5253. | point value `a' to the 32-bit two's complement integer format. The
  5254. | conversion is performed according to the IEC/IEEE Standard for Binary
  5255. | Floating-Point Arithmetic, except that the conversion is always rounded
  5256. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5257. | Otherwise, if the conversion overflows, the largest integer with the same
  5258. | sign as `a' is returned.
  5259. *----------------------------------------------------------------------------*}
  5260. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5261. var
  5262. aSign: flag;
  5263. aExp, shiftCount: int32;
  5264. aSig, savedASig: bits64;
  5265. z: int32;
  5266. begin
  5267. aSig := extractFloatx80Frac( a );
  5268. aExp := extractFloatx80Exp( a );
  5269. aSign := extractFloatx80Sign( a );
  5270. if ( $401E < aExp ) begin
  5271. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5272. goto invalid;
  5273. end;
  5274. else if ( aExp < $3FFF ) begin
  5275. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5276. result := 0;
  5277. end;
  5278. shiftCount := $403E - aExp;
  5279. savedASig := aSig;
  5280. aSig >>= shiftCount;
  5281. z := aSig;
  5282. if ( aSign ) z := - z;
  5283. if ( ( z < 0 ) xor aSign ) begin
  5284. invalid:
  5285. float_raise( float_flag_invalid );
  5286. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5287. end;
  5288. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5289. softfloat_exception_flags or= float_flag_inexact;
  5290. end;
  5291. result := z;
  5292. end;
  5293. {*----------------------------------------------------------------------------
  5294. | Returns the result of converting the extended double-precision floating-
  5295. | point value `a' to the 64-bit two's complement integer format. The
  5296. | conversion is performed according to the IEC/IEEE Standard for Binary
  5297. | Floating-Point Arithmetic---which means in particular that the conversion
  5298. | is rounded according to the current rounding mode. If `a' is a NaN,
  5299. | the largest positive integer is returned. Otherwise, if the conversion
  5300. | overflows, the largest integer with the same sign as `a' is returned.
  5301. *----------------------------------------------------------------------------*}
  5302. function floatx80_to_int64(a: floatx80): int64;
  5303. var
  5304. aSign: flag;
  5305. aExp, shiftCount: int32;
  5306. aSig, aSigExtra: bits64;
  5307. begin
  5308. aSig := extractFloatx80Frac( a );
  5309. aExp := extractFloatx80Exp( a );
  5310. aSign := extractFloatx80Sign( a );
  5311. shiftCount := $403E - aExp;
  5312. if ( shiftCount <= 0 ) begin
  5313. if ( shiftCount ) begin
  5314. float_raise( float_flag_invalid );
  5315. if ( ! aSign
  5316. or ( ( aExp = $7FFF )
  5317. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5318. ) begin
  5319. result := LIT64( $7FFFFFFFFFFFFFFF );
  5320. end;
  5321. result := (sbits64) LIT64( $8000000000000000 );
  5322. end;
  5323. aSigExtra := 0;
  5324. end;
  5325. else begin
  5326. shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
  5327. end;
  5328. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5329. end;
  5330. {*----------------------------------------------------------------------------
  5331. | Returns the result of converting the extended double-precision floating-
  5332. | point value `a' to the 64-bit two's complement integer format. The
  5333. | conversion is performed according to the IEC/IEEE Standard for Binary
  5334. | Floating-Point Arithmetic, except that the conversion is always rounded
  5335. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5336. | Otherwise, if the conversion overflows, the largest integer with the same
  5337. | sign as `a' is returned.
  5338. *----------------------------------------------------------------------------*}
  5339. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5340. var
  5341. aSign: flag;
  5342. aExp, shiftCount: int32;
  5343. aSig: bits64;
  5344. z: int64;
  5345. begin
  5346. aSig := extractFloatx80Frac( a );
  5347. aExp := extractFloatx80Exp( a );
  5348. aSign := extractFloatx80Sign( a );
  5349. shiftCount := aExp - $403E;
  5350. if ( 0 <= shiftCount ) begin
  5351. aSig &= LIT64( $7FFFFFFFFFFFFFFF );
  5352. if ( ( a.high <> $C03E ) or aSig ) begin
  5353. float_raise( float_flag_invalid );
  5354. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5355. result := LIT64( $7FFFFFFFFFFFFFFF );
  5356. end;
  5357. end;
  5358. result := (sbits64) LIT64( $8000000000000000 );
  5359. end;
  5360. else if ( aExp < $3FFF ) begin
  5361. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5362. result := 0;
  5363. end;
  5364. z := aSig>>( - shiftCount );
  5365. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5366. softfloat_exception_flags or= float_flag_inexact;
  5367. end;
  5368. if ( aSign ) z := - z;
  5369. result := z;
  5370. end;
  5371. {*----------------------------------------------------------------------------
  5372. | Returns the result of converting the extended double-precision floating-
  5373. | point value `a' to the single-precision floating-point format. The
  5374. | conversion is performed according to the IEC/IEEE Standard for Binary
  5375. | Floating-Point Arithmetic.
  5376. *----------------------------------------------------------------------------*}
  5377. function floatx80_to_float32(a: floatx80): float32;
  5378. var
  5379. aSign: flag;
  5380. aExp: int32;
  5381. aSig: bits64;
  5382. begin
  5383. aSig := extractFloatx80Frac( a );
  5384. aExp := extractFloatx80Exp( a );
  5385. aSign := extractFloatx80Sign( a );
  5386. if ( aExp = $7FFF ) begin
  5387. if ( (bits64) ( aSig shl 1 ) ) begin
  5388. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5389. end;
  5390. result := packFloat32( aSign, $FF, 0 );
  5391. end;
  5392. shift64RightJamming( aSig, 33, &aSig );
  5393. if ( aExp or aSig ) aExp -= $3F81;
  5394. result := roundAndPackFloat32( aSign, aExp, aSig );
  5395. end;
  5396. {*----------------------------------------------------------------------------
  5397. | Returns the result of converting the extended double-precision floating-
  5398. | point value `a' to the double-precision floating-point format. The
  5399. | conversion is performed according to the IEC/IEEE Standard for Binary
  5400. | Floating-Point Arithmetic.
  5401. *----------------------------------------------------------------------------*}
  5402. function floatx80_to_float64(a: floatx80): float64;
  5403. var
  5404. aSign: flag;
  5405. aExp: int32;
  5406. aSig, zSig: bits64;
  5407. begin
  5408. aSig := extractFloatx80Frac( a );
  5409. aExp := extractFloatx80Exp( a );
  5410. aSign := extractFloatx80Sign( a );
  5411. if ( aExp = $7FFF ) begin
  5412. if ( (bits64) ( aSig shl 1 ) ) begin
  5413. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5414. end;
  5415. result := packFloat64( aSign, $7FF, 0 );
  5416. end;
  5417. shift64RightJamming( aSig, 1, &zSig );
  5418. if ( aExp or aSig ) aExp -= $3C01;
  5419. result := roundAndPackFloat64( aSign, aExp, zSig );
  5420. end;
  5421. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5422. {*----------------------------------------------------------------------------
  5423. | Returns the result of converting the extended double-precision floating-
  5424. | point value `a' to the quadruple-precision floating-point format. The
  5425. | conversion is performed according to the IEC/IEEE Standard for Binary
  5426. | Floating-Point Arithmetic.
  5427. *----------------------------------------------------------------------------*}
  5428. function floatx80_to_float128(a: floatx80): float128;
  5429. var
  5430. aSign: flag;
  5431. aExp: int16;
  5432. aSig, zSig0, zSig1: bits64;
  5433. begin
  5434. aSig := extractFloatx80Frac( a );
  5435. aExp := extractFloatx80Exp( a );
  5436. aSign := extractFloatx80Sign( a );
  5437. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5438. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5439. end;
  5440. shift128Right( aSig shl 1, 0, 16, &zSig0, &zSig1 );
  5441. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5442. end;
  5443. {$endif FPC_SOFTFLOAT_FLOAT128}
  5444. {*----------------------------------------------------------------------------
  5445. | Rounds the extended double-precision floating-point value `a' to an integer,
  5446. | and Returns the result as an extended quadruple-precision floating-point
  5447. | value. The operation is performed according to the IEC/IEEE Standard for
  5448. | Binary Floating-Point Arithmetic.
  5449. *----------------------------------------------------------------------------*}
  5450. function floatx80_round_to_int(a: floatx80): floatx80;
  5451. var
  5452. aSign: flag;
  5453. aExp: int32;
  5454. lastBitMask, roundBitsMask: bits64;
  5455. roundingMode: int8;
  5456. z: floatx80;
  5457. begin
  5458. aExp := extractFloatx80Exp( a );
  5459. if ( $403E <= aExp ) begin
  5460. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5461. result := propagateFloatx80NaN( a, a );
  5462. end;
  5463. result := a;
  5464. end;
  5465. if ( aExp < $3FFF ) begin
  5466. if ( ( aExp = 0 )
  5467. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5468. result := a;
  5469. end;
  5470. softfloat_exception_flags or= float_flag_inexact;
  5471. aSign := extractFloatx80Sign( a );
  5472. switch ( softfloat_rounding_mode ) begin
  5473. case float_round_nearest_even:
  5474. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5475. ) begin
  5476. result :=
  5477. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5478. end;
  5479. break;
  5480. case float_round_down:
  5481. result :=
  5482. aSign ?
  5483. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5484. : packFloatx80( 0, 0, 0 );
  5485. case float_round_up:
  5486. result :=
  5487. aSign ? packFloatx80( 1, 0, 0 )
  5488. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5489. end;
  5490. result := packFloatx80( aSign, 0, 0 );
  5491. end;
  5492. lastBitMask := 1;
  5493. lastBitMask shl = $403E - aExp;
  5494. roundBitsMask := lastBitMask - 1;
  5495. z := a;
  5496. roundingMode := softfloat_rounding_mode;
  5497. if ( roundingMode = float_round_nearest_even ) begin
  5498. z.low += lastBitMask>>1;
  5499. if ( ( z.low and roundBitsMask ) = 0 ) z.low &= ~ lastBitMask;
  5500. end;
  5501. else if ( roundingMode <> float_round_to_zero ) begin
  5502. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5503. z.low += roundBitsMask;
  5504. end;
  5505. end;
  5506. z.low &= ~ roundBitsMask;
  5507. if ( z.low = 0 ) begin
  5508. ++z.high;
  5509. z.low := LIT64( $8000000000000000 );
  5510. end;
  5511. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5512. result := z;
  5513. end;
  5514. {*----------------------------------------------------------------------------
  5515. | Returns the result of adding the absolute values of the extended double-
  5516. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5517. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5518. | The addition is performed according to the IEC/IEEE Standard for Binary
  5519. | Floating-Point Arithmetic.
  5520. *----------------------------------------------------------------------------*}
  5521. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5522. var
  5523. aExp, bExp, zExp: int32;
  5524. aSig, bSig, zSig0, zSig1: bits64;
  5525. expDiff: int32;
  5526. begin
  5527. aSig := extractFloatx80Frac( a );
  5528. aExp := extractFloatx80Exp( a );
  5529. bSig := extractFloatx80Frac( b );
  5530. bExp := extractFloatx80Exp( b );
  5531. expDiff := aExp - bExp;
  5532. if ( 0 < expDiff ) begin
  5533. if ( aExp = $7FFF ) begin
  5534. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5535. result := a;
  5536. end;
  5537. if ( bExp = 0 ) --expDiff;
  5538. shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  5539. zExp := aExp;
  5540. end;
  5541. else if ( expDiff < 0 ) begin
  5542. if ( bExp = $7FFF ) begin
  5543. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5544. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5545. end;
  5546. if ( aExp = 0 ) ++expDiff;
  5547. shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  5548. zExp := bExp;
  5549. end;
  5550. else begin
  5551. if ( aExp = $7FFF ) begin
  5552. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5553. result := propagateFloatx80NaN( a, b );
  5554. end;
  5555. result := a;
  5556. end;
  5557. zSig1 := 0;
  5558. zSig0 := aSig + bSig;
  5559. if ( aExp = 0 ) begin
  5560. normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
  5561. goto roundAndPack;
  5562. end;
  5563. zExp := aExp;
  5564. goto shiftRight1;
  5565. end;
  5566. zSig0 := aSig + bSig;
  5567. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5568. shiftRight1:
  5569. shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
  5570. zSig0 or= LIT64( $8000000000000000 );
  5571. ++zExp;
  5572. roundAndPack:
  5573. result :=
  5574. roundAndPackFloatx80(
  5575. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5576. end;
  5577. {*----------------------------------------------------------------------------
  5578. | Returns the result of subtracting the absolute values of the extended
  5579. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5580. | difference is negated before being returned. `zSign' is ignored if the
  5581. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5582. | Standard for Binary Floating-Point Arithmetic.
  5583. *----------------------------------------------------------------------------*}
  5584. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5585. var
  5586. aExp, bExp, zExp: int32;
  5587. aSig, bSig, zSig0, zSig1: bits64;
  5588. expDiff: int32;
  5589. z: floatx80;
  5590. begin
  5591. aSig := extractFloatx80Frac( a );
  5592. aExp := extractFloatx80Exp( a );
  5593. bSig := extractFloatx80Frac( b );
  5594. bExp := extractFloatx80Exp( b );
  5595. expDiff := aExp - bExp;
  5596. if ( 0 < expDiff ) goto aExpBigger;
  5597. if ( expDiff < 0 ) goto bExpBigger;
  5598. if ( aExp = $7FFF ) begin
  5599. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5600. result := propagateFloatx80NaN( a, b );
  5601. end;
  5602. float_raise( float_flag_invalid );
  5603. z.low := floatx80_default_nan_low;
  5604. z.high := floatx80_default_nan_high;
  5605. result := z;
  5606. end;
  5607. if ( aExp = 0 ) begin
  5608. aExp := 1;
  5609. bExp := 1;
  5610. end;
  5611. zSig1 := 0;
  5612. if ( bSig < aSig ) goto aBigger;
  5613. if ( aSig < bSig ) goto bBigger;
  5614. result := packFloatx80( softfloat_rounding_mode = float_round_down, 0, 0 );
  5615. bExpBigger:
  5616. if ( bExp = $7FFF ) begin
  5617. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5618. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5619. end;
  5620. if ( aExp = 0 ) ++expDiff;
  5621. shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  5622. bBigger:
  5623. sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
  5624. zExp := bExp;
  5625. zSign xor = 1;
  5626. goto normalizeRoundAndPack;
  5627. aExpBigger:
  5628. if ( aExp = $7FFF ) begin
  5629. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5630. result := a;
  5631. end;
  5632. if ( bExp = 0 ) --expDiff;
  5633. shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  5634. aBigger:
  5635. sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
  5636. zExp := aExp;
  5637. normalizeRoundAndPack:
  5638. result :=
  5639. normalizeRoundAndPackFloatx80(
  5640. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5641. end;
  5642. {*----------------------------------------------------------------------------
  5643. | Returns the result of adding the extended double-precision floating-point
  5644. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  5645. | Standard for Binary Floating-Point Arithmetic.
  5646. *----------------------------------------------------------------------------*}
  5647. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  5648. var
  5649. aSign, bSign: flag;
  5650. begin
  5651. aSign := extractFloatx80Sign( a );
  5652. bSign := extractFloatx80Sign( b );
  5653. if ( aSign = bSign ) begin
  5654. result := addFloatx80Sigs( a, b, aSign );
  5655. end;
  5656. else begin
  5657. result := subFloatx80Sigs( a, b, aSign );
  5658. end;
  5659. end;
  5660. {*----------------------------------------------------------------------------
  5661. | Returns the result of subtracting the extended double-precision floating-
  5662. | point values `a' and `b'. The operation is performed according to the
  5663. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5664. *----------------------------------------------------------------------------*}
  5665. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  5666. var
  5667. aSign, bSign: flag;
  5668. begin
  5669. aSign := extractFloatx80Sign( a );
  5670. bSign := extractFloatx80Sign( b );
  5671. if ( aSign = bSign ) begin
  5672. result := subFloatx80Sigs( a, b, aSign );
  5673. end;
  5674. else begin
  5675. result := addFloatx80Sigs( a, b, aSign );
  5676. end;
  5677. end;
  5678. {*----------------------------------------------------------------------------
  5679. | Returns the result of multiplying the extended double-precision floating-
  5680. | point values `a' and `b'. The operation is performed according to the
  5681. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5682. *----------------------------------------------------------------------------*}
  5683. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  5684. var
  5685. aSign, bSign, zSign: flag;
  5686. aExp, bExp, zExp: int32;
  5687. aSig, bSig, zSig0, zSig1: bits64;
  5688. z: floatx80;
  5689. begin
  5690. aSig := extractFloatx80Frac( a );
  5691. aExp := extractFloatx80Exp( a );
  5692. aSign := extractFloatx80Sign( a );
  5693. bSig := extractFloatx80Frac( b );
  5694. bExp := extractFloatx80Exp( b );
  5695. bSign := extractFloatx80Sign( b );
  5696. zSign := aSign xor bSign;
  5697. if ( aExp = $7FFF ) begin
  5698. if ( (bits64) ( aSig shl 1 )
  5699. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  5700. result := propagateFloatx80NaN( a, b );
  5701. end;
  5702. if ( ( bExp or bSig ) = 0 ) goto invalid;
  5703. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5704. end;
  5705. if ( bExp = $7FFF ) begin
  5706. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5707. if ( ( aExp or aSig ) = 0 ) begin
  5708. invalid:
  5709. float_raise( float_flag_invalid );
  5710. z.low := floatx80_default_nan_low;
  5711. z.high := floatx80_default_nan_high;
  5712. result := z;
  5713. end;
  5714. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5715. end;
  5716. if ( aExp = 0 ) begin
  5717. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5718. normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  5719. end;
  5720. if ( bExp = 0 ) begin
  5721. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5722. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5723. end;
  5724. zExp := aExp + bExp - $3FFE;
  5725. mul64To128( aSig, bSig, &zSig0, &zSig1 );
  5726. if ( 0 < (sbits64) zSig0 ) begin
  5727. shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
  5728. --zExp;
  5729. end;
  5730. result :=
  5731. roundAndPackFloatx80(
  5732. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5733. end;
  5734. {*----------------------------------------------------------------------------
  5735. | Returns the result of dividing the extended double-precision floating-point
  5736. | value `a' by the corresponding value `b'. The operation is performed
  5737. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5738. *----------------------------------------------------------------------------*}
  5739. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  5740. var
  5741. aSign, bSign, zSign: flag;
  5742. aExp, bExp, zExp: int32;
  5743. aSig, bSig, zSig0, zSig1: bits64;
  5744. rem0, rem1, rem2, term0, term1, term2: bits64;
  5745. z: floatx80;
  5746. begin
  5747. aSig := extractFloatx80Frac( a );
  5748. aExp := extractFloatx80Exp( a );
  5749. aSign := extractFloatx80Sign( a );
  5750. bSig := extractFloatx80Frac( b );
  5751. bExp := extractFloatx80Exp( b );
  5752. bSign := extractFloatx80Sign( b );
  5753. zSign := aSign xor bSign;
  5754. if ( aExp = $7FFF ) begin
  5755. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5756. if ( bExp = $7FFF ) begin
  5757. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5758. goto invalid;
  5759. end;
  5760. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5761. end;
  5762. if ( bExp = $7FFF ) begin
  5763. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5764. result := packFloatx80( zSign, 0, 0 );
  5765. end;
  5766. if ( bExp = 0 ) begin
  5767. if ( bSig = 0 ) begin
  5768. if ( ( aExp or aSig ) = 0 ) begin
  5769. invalid:
  5770. float_raise( float_flag_invalid );
  5771. z.low := floatx80_default_nan_low;
  5772. z.high := floatx80_default_nan_high;
  5773. result := z;
  5774. end;
  5775. float_raise( float_flag_divbyzero );
  5776. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5777. end;
  5778. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5779. end;
  5780. if ( aExp = 0 ) begin
  5781. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5782. normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  5783. end;
  5784. zExp := aExp - bExp + $3FFE;
  5785. rem1 := 0;
  5786. if ( bSig <= aSig ) begin
  5787. shift128Right( aSig, 0, 1, &aSig, &rem1 );
  5788. ++zExp;
  5789. end;
  5790. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  5791. mul64To128( bSig, zSig0, &term0, &term1 );
  5792. sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
  5793. while ( (sbits64) rem0 < 0 ) begin
  5794. --zSig0;
  5795. add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
  5796. end;
  5797. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  5798. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  5799. mul64To128( bSig, zSig1, &term1, &term2 );
  5800. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  5801. while ( (sbits64) rem1 < 0 ) begin
  5802. --zSig1;
  5803. add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
  5804. end;
  5805. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  5806. end;
  5807. result :=
  5808. roundAndPackFloatx80(
  5809. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5810. end;
  5811. {*----------------------------------------------------------------------------
  5812. | Returns the remainder of the extended double-precision floating-point value
  5813. | `a' with respect to the corresponding value `b'. The operation is performed
  5814. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5815. *----------------------------------------------------------------------------*}
  5816. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  5817. var
  5818. aSign, bSign, zSign: flag;
  5819. aExp, bExp, expDiff: int32;
  5820. aSig0, aSig1, bSig: bits64;
  5821. q, term0, term1, alternateASig0, alternateASig1: bits64;
  5822. z: floatx80;
  5823. begin
  5824. aSig0 := extractFloatx80Frac( a );
  5825. aExp := extractFloatx80Exp( a );
  5826. aSign := extractFloatx80Sign( a );
  5827. bSig := extractFloatx80Frac( b );
  5828. bExp := extractFloatx80Exp( b );
  5829. bSign := extractFloatx80Sign( b );
  5830. if ( aExp = $7FFF ) begin
  5831. if ( (bits64) ( aSig0 shl 1 )
  5832. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  5833. result := propagateFloatx80NaN( a, b );
  5834. end;
  5835. goto invalid;
  5836. end;
  5837. if ( bExp = $7FFF ) begin
  5838. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5839. result := a;
  5840. end;
  5841. if ( bExp = 0 ) begin
  5842. if ( bSig = 0 ) begin
  5843. invalid:
  5844. float_raise( float_flag_invalid );
  5845. z.low := floatx80_default_nan_low;
  5846. z.high := floatx80_default_nan_high;
  5847. result := z;
  5848. end;
  5849. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5850. end;
  5851. if ( aExp = 0 ) begin
  5852. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  5853. normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  5854. end;
  5855. bSig or= LIT64( $8000000000000000 );
  5856. zSign := aSign;
  5857. expDiff := aExp - bExp;
  5858. aSig1 := 0;
  5859. if ( expDiff < 0 ) begin
  5860. if ( expDiff < -1 ) result := a;
  5861. shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
  5862. expDiff := 0;
  5863. end;
  5864. q := ( bSig <= aSig0 );
  5865. if ( q ) aSig0 -= bSig;
  5866. expDiff -= 64;
  5867. while ( 0 < expDiff ) begin
  5868. q := estimateDiv128To64( aSig0, aSig1, bSig );
  5869. q := ( 2 < q ) ? q - 2 : 0;
  5870. mul64To128( bSig, q, &term0, &term1 );
  5871. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5872. shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
  5873. expDiff -= 62;
  5874. end;
  5875. expDiff += 64;
  5876. if ( 0 < expDiff ) begin
  5877. q := estimateDiv128To64( aSig0, aSig1, bSig );
  5878. q := ( 2 < q ) ? q - 2 : 0;
  5879. q >>= 64 - expDiff;
  5880. mul64To128( bSig, q shl ( 64 - expDiff ), &term0, &term1 );
  5881. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5882. shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
  5883. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  5884. ++q;
  5885. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5886. end;
  5887. end;
  5888. else begin
  5889. term1 := 0;
  5890. term0 := bSig;
  5891. end;
  5892. sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
  5893. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  5894. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  5895. and ( q and 1 ) )
  5896. ) begin
  5897. aSig0 := alternateASig0;
  5898. aSig1 := alternateASig1;
  5899. zSign := ! zSign;
  5900. end;
  5901. result :=
  5902. normalizeRoundAndPackFloatx80(
  5903. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  5904. end;
  5905. {*----------------------------------------------------------------------------
  5906. | Returns the square root of the extended double-precision floating-point
  5907. | value `a'. The operation is performed according to the IEC/IEEE Standard
  5908. | for Binary Floating-Point Arithmetic.
  5909. *----------------------------------------------------------------------------*}
  5910. function floatx80_sqrt(a: floatx80): floatx80;
  5911. var
  5912. aSign: flag;
  5913. aExp, zExp: int32;
  5914. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  5915. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  5916. z: floatx80;
  5917. label
  5918. invalid;
  5919. begin
  5920. aSig0 := extractFloatx80Frac( a );
  5921. aExp := extractFloatx80Exp( a );
  5922. aSign := extractFloatx80Sign( a );
  5923. if ( aExp = $7FFF ) begin
  5924. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  5925. if ( ! aSign ) result := a;
  5926. goto invalid;
  5927. end;
  5928. if ( aSign ) begin
  5929. if ( ( aExp or aSig0 ) = 0 ) result := a;
  5930. invalid:
  5931. float_raise( float_flag_invalid );
  5932. z.low := floatx80_default_nan_low;
  5933. z.high := floatx80_default_nan_high;
  5934. result := z;
  5935. end;
  5936. if ( aExp = 0 ) begin
  5937. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  5938. normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  5939. end;
  5940. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  5941. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  5942. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), &aSig0, &aSig1 );
  5943. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  5944. doubleZSig0 := zSig0 shl 1;
  5945. mul64To128( zSig0, zSig0, &term0, &term1 );
  5946. sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  5947. while ( (sbits64) rem0 < 0 ) begin
  5948. --zSig0;
  5949. doubleZSig0 -= 2;
  5950. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, &rem0, &rem1 );
  5951. end;
  5952. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  5953. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  5954. if ( zSig1 = 0 ) zSig1 := 1;
  5955. mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  5956. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  5957. mul64To128( zSig1, zSig1, &term2, &term3 );
  5958. sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  5959. while ( (sbits64) rem1 < 0 ) begin
  5960. --zSig1;
  5961. shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  5962. term3 or= 1;
  5963. term2 or= doubleZSig0;
  5964. add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  5965. end;
  5966. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  5967. end;
  5968. shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
  5969. zSig0 or= doubleZSig0;
  5970. result :=
  5971. roundAndPackFloatx80(
  5972. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  5973. end;
  5974. {*----------------------------------------------------------------------------
  5975. | Returns 1 if the extended double-precision floating-point value `a' is
  5976. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  5977. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  5978. | Arithmetic.
  5979. *----------------------------------------------------------------------------*}
  5980. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  5981. begin
  5982. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  5983. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  5984. or ( ( extractFloatx80Exp( b ) = $7FFF )
  5985. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  5986. ) begin
  5987. if ( floatx80_is_signaling_nan( a )
  5988. or floatx80_is_signaling_nan( b ) ) begin
  5989. float_raise( float_flag_invalid );
  5990. end;
  5991. result := 0;
  5992. end;
  5993. result :=
  5994. ( a.low = b.low )
  5995. and ( ( a.high = b.high )
  5996. or ( ( a.low = 0 )
  5997. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  5998. );
  5999. end;
  6000. {*----------------------------------------------------------------------------
  6001. | Returns 1 if the extended double-precision floating-point value `a' is
  6002. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6003. | comparison is performed according to the IEC/IEEE Standard for Binary
  6004. | Floating-Point Arithmetic.
  6005. *----------------------------------------------------------------------------*}
  6006. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6007. var
  6008. aSign, bSign: flag;
  6009. begin
  6010. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6011. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6012. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6013. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6014. ) begin
  6015. float_raise( float_flag_invalid );
  6016. result := 0;
  6017. end;
  6018. aSign := extractFloatx80Sign( a );
  6019. bSign := extractFloatx80Sign( b );
  6020. if ( aSign <> bSign ) begin
  6021. result :=
  6022. aSign
  6023. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6024. = 0 );
  6025. end;
  6026. result :=
  6027. aSign ? le128( b.high, b.low, a.high, a.low )
  6028. : le128( a.high, a.low, b.high, b.low );
  6029. end;
  6030. {*----------------------------------------------------------------------------
  6031. | Returns 1 if the extended double-precision floating-point value `a' is
  6032. | less than the corresponding value `b', and 0 otherwise. The comparison
  6033. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6034. | Arithmetic.
  6035. *----------------------------------------------------------------------------*}
  6036. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6037. var
  6038. aSign, bSign: flag;
  6039. begin
  6040. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6041. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6042. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6043. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6044. ) begin
  6045. float_raise( float_flag_invalid );
  6046. result := 0;
  6047. end;
  6048. aSign := extractFloatx80Sign( a );
  6049. bSign := extractFloatx80Sign( b );
  6050. if ( aSign <> bSign ) begin
  6051. result :=
  6052. aSign
  6053. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6054. <> 0 );
  6055. end;
  6056. result :=
  6057. aSign ? lt128( b.high, b.low, a.high, a.low )
  6058. : lt128( a.high, a.low, b.high, b.low );
  6059. end;
  6060. {*----------------------------------------------------------------------------
  6061. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6062. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6063. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6064. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6065. *----------------------------------------------------------------------------*}
  6066. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6067. begin
  6068. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6069. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6070. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6071. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6072. ) begin
  6073. float_raise( float_flag_invalid );
  6074. result := 0;
  6075. end;
  6076. result :=
  6077. ( a.low = b.low )
  6078. and ( ( a.high = b.high )
  6079. or ( ( a.low = 0 )
  6080. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6081. );
  6082. end;
  6083. {*----------------------------------------------------------------------------
  6084. | Returns 1 if the extended double-precision floating-point value `a' is less
  6085. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6086. | do not cause an exception. Otherwise, the comparison is performed according
  6087. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6088. *----------------------------------------------------------------------------*}
  6089. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6090. var
  6091. aSign, bSign: flag;
  6092. begin
  6093. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6094. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6095. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6096. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6097. ) begin
  6098. if ( floatx80_is_signaling_nan( a )
  6099. or floatx80_is_signaling_nan( b ) ) begin
  6100. float_raise( float_flag_invalid );
  6101. end;
  6102. result := 0;
  6103. end;
  6104. aSign := extractFloatx80Sign( a );
  6105. bSign := extractFloatx80Sign( b );
  6106. if ( aSign <> bSign ) begin
  6107. result :=
  6108. aSign
  6109. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6110. = 0 );
  6111. end;
  6112. result :=
  6113. aSign ? le128( b.high, b.low, a.high, a.low )
  6114. : le128( a.high, a.low, b.high, b.low );
  6115. end;
  6116. {*----------------------------------------------------------------------------
  6117. | Returns 1 if the extended double-precision floating-point value `a' is less
  6118. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6119. | an exception. Otherwise, the comparison is performed according to the
  6120. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6121. *----------------------------------------------------------------------------*}
  6122. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6123. var
  6124. aSign, bSign: flag;
  6125. begin
  6126. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6127. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6128. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6129. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6130. ) begin
  6131. if ( floatx80_is_signaling_nan( a )
  6132. or floatx80_is_signaling_nan( b ) ) begin
  6133. float_raise( float_flag_invalid );
  6134. end;
  6135. result := 0;
  6136. end;
  6137. aSign := extractFloatx80Sign( a );
  6138. bSign := extractFloatx80Sign( b );
  6139. if ( aSign <> bSign ) begin
  6140. result :=
  6141. aSign
  6142. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6143. <> 0 );
  6144. end;
  6145. result :=
  6146. aSign ? lt128( b.high, b.low, a.high, a.low )
  6147. : lt128( a.high, a.low, b.high, b.low );
  6148. end;
  6149. {$endif FPC_SOFTFLOAT_FLOATX80}
  6150. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6151. {*----------------------------------------------------------------------------
  6152. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6153. | floating-point value `a'.
  6154. *----------------------------------------------------------------------------*}
  6155. function extractFloat128Frac1(a : float128): bits64;
  6156. begin
  6157. result:=a.low;
  6158. end;
  6159. {*----------------------------------------------------------------------------
  6160. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6161. | floating-point value `a'.
  6162. *----------------------------------------------------------------------------*}
  6163. function extractFloat128Frac0(a : float128): bits64;
  6164. begin
  6165. result:=a.high and int64($0000FFFFFFFFFFFF);
  6166. end;
  6167. {*----------------------------------------------------------------------------
  6168. | Returns the exponent bits of the quadruple-precision floating-point value
  6169. | `a'.
  6170. *----------------------------------------------------------------------------*}
  6171. function extractFloat128Exp(a : float128): int32;
  6172. begin
  6173. result:=( a.high shr 48 ) and $7FFF;
  6174. end;
  6175. {*----------------------------------------------------------------------------
  6176. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6177. *----------------------------------------------------------------------------*}
  6178. function extractFloat128Sign(a : float128): flag;
  6179. begin
  6180. result:=a.high shr 63;
  6181. end;
  6182. {*----------------------------------------------------------------------------
  6183. | Normalizes the subnormal quadruple-precision floating-point value
  6184. | represented by the denormalized significand formed by the concatenation of
  6185. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6186. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6187. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6188. | least significant 64 bits of the normalized significand are stored at the
  6189. | location pointed to by `zSig1Ptr'.
  6190. *----------------------------------------------------------------------------*}
  6191. procedure normalizeFloat128Subnormal(
  6192. aSig0: bits64;
  6193. aSig1: bits64;
  6194. var zExpPtr: int32;
  6195. var zSig0Ptr: bits64;
  6196. var zSig1Ptr: bits64);
  6197. var
  6198. shiftCount: int8;
  6199. begin
  6200. if ( aSig0 = 0 ) then
  6201. begin
  6202. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6203. if ( shiftCount < 0 ) then
  6204. begin
  6205. zSig0Ptr := aSig1 shr ( - shiftCount );
  6206. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6207. end
  6208. else begin
  6209. zSig0Ptr := aSig1 shl shiftCount;
  6210. zSig1Ptr := 0;
  6211. end;
  6212. zExpPtr := - shiftCount - 63;
  6213. end
  6214. else begin
  6215. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6216. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6217. zExpPtr := 1 - shiftCount;
  6218. end;
  6219. end;
  6220. {*----------------------------------------------------------------------------
  6221. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6222. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6223. | floating-point value, returning the result. After being shifted into the
  6224. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6225. | added together to form the most significant 32 bits of the result. This
  6226. | means that any integer portion of `zSig0' will be added into the exponent.
  6227. | Since a properly normalized significand will have an integer portion equal
  6228. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6229. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6230. | significand.
  6231. *----------------------------------------------------------------------------*}
  6232. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6233. var
  6234. z: float128;
  6235. begin
  6236. z.low := zSig1;
  6237. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6238. result:=z;
  6239. end;
  6240. {*----------------------------------------------------------------------------
  6241. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6242. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6243. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6244. | corresponding to the abstract input. Ordinarily, the abstract value is
  6245. | simply rounded and packed into the quadruple-precision format, with the
  6246. | inexact exception raised if the abstract input cannot be represented
  6247. | exactly. However, if the abstract value is too large, the overflow and
  6248. | inexact exceptions are raised and an infinity or maximal finite value is
  6249. | returned. If the abstract value is too small, the input value is rounded to
  6250. | a subnormal number, and the underflow and inexact exceptions are raised if
  6251. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6252. | precision floating-point number.
  6253. | The input significand must be normalized or smaller. If the input
  6254. | significand is not normalized, `zExp' must be 0; in that case, the result
  6255. | returned is a subnormal number, and it must not require rounding. In the
  6256. | usual case that the input significand is normalized, `zExp' must be 1 less
  6257. | than the ``true'' floating-point exponent. The handling of underflow and
  6258. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6259. *----------------------------------------------------------------------------*}
  6260. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6261. var
  6262. roundingMode: int8;
  6263. roundNearestEven, increment, isTiny: flag;
  6264. begin
  6265. roundingMode := softfloat_rounding_mode;
  6266. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6267. increment := ord( sbits64(zSig2) < 0 );
  6268. if ( roundNearestEven=0 ) then
  6269. begin
  6270. if ( roundingMode = float_round_to_zero ) then
  6271. begin
  6272. increment := 0;
  6273. end
  6274. else begin
  6275. if ( zSign<>0 ) then
  6276. begin
  6277. increment := ord( roundingMode = float_round_down ) and zSig2;
  6278. end
  6279. else begin
  6280. increment := ord( roundingMode = float_round_up ) and zSig2;
  6281. end;
  6282. end;
  6283. end;
  6284. if ( $7FFD <= bits32(zExp) ) then
  6285. begin
  6286. if ( ord( $7FFD < zExp )
  6287. or ( ord( zExp = $7FFD )
  6288. and eq128(
  6289. int64( $0001FFFFFFFFFFFF ),
  6290. int64( $FFFFFFFFFFFFFFFF ),
  6291. zSig0,
  6292. zSig1
  6293. )
  6294. and increment
  6295. )
  6296. )<>0 then
  6297. begin
  6298. float_raise( float_flag_overflow or float_flag_inexact );
  6299. if ( ord( roundingMode = float_round_to_zero )
  6300. or ( zSign and ord( roundingMode = float_round_up ) )
  6301. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6302. )<>0 then
  6303. begin
  6304. result :=
  6305. packFloat128(
  6306. zSign,
  6307. $7FFE,
  6308. int64( $0000FFFFFFFFFFFF ),
  6309. int64( $FFFFFFFFFFFFFFFF )
  6310. );
  6311. end;
  6312. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6313. end;
  6314. if ( zExp < 0 ) then
  6315. begin
  6316. isTiny :=
  6317. ord(( float_detect_tininess = float_tininess_before_rounding )
  6318. or ( zExp < -1 )
  6319. or not( increment<>0 )
  6320. or boolean(lt128(
  6321. zSig0,
  6322. zSig1,
  6323. int64( $0001FFFFFFFFFFFF ),
  6324. int64( $FFFFFFFFFFFFFFFF )
  6325. )));
  6326. shift128ExtraRightJamming(
  6327. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6328. zExp := 0;
  6329. if ( isTiny and zSig2 )<>0 then
  6330. float_raise( float_flag_underflow );
  6331. if ( roundNearestEven<>0 ) then
  6332. begin
  6333. increment := ord( sbits64(zSig2) < 0 );
  6334. end
  6335. else begin
  6336. if ( zSign<>0 ) then
  6337. begin
  6338. increment := ord( roundingMode = float_round_down ) and zSig2;
  6339. end
  6340. else begin
  6341. increment := ord( roundingMode = float_round_up ) and zSig2;
  6342. end;
  6343. end;
  6344. end;
  6345. end;
  6346. if ( zSig2<>0 ) then
  6347. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6348. if ( increment<>0 ) then
  6349. begin
  6350. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6351. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6352. end
  6353. else begin
  6354. if ( ( zSig0 or zSig1 ) = 0 ) then
  6355. zExp := 0;
  6356. end;
  6357. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6358. end;
  6359. {*----------------------------------------------------------------------------
  6360. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6361. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6362. | returns the proper quadruple-precision floating-point value corresponding
  6363. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6364. | except that the input significand has fewer bits and does not have to be
  6365. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6366. | point exponent.
  6367. *----------------------------------------------------------------------------*}
  6368. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6369. var
  6370. shiftCount: int8;
  6371. zSig2: bits64;
  6372. begin
  6373. if ( zSig0 = 0 ) then
  6374. begin
  6375. zSig0 := zSig1;
  6376. zSig1 := 0;
  6377. dec(zExp, 64);
  6378. end;
  6379. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6380. if ( 0 <= shiftCount ) then
  6381. begin
  6382. zSig2 := 0;
  6383. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6384. end
  6385. else begin
  6386. shift128ExtraRightJamming(
  6387. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6388. end;
  6389. dec(zExp, shiftCount);
  6390. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6391. end;
  6392. {*----------------------------------------------------------------------------
  6393. | Returns the result of converting the quadruple-precision floating-point
  6394. | value `a' to the 32-bit two's complement integer format. The conversion
  6395. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6396. | Arithmetic---which means in particular that the conversion is rounded
  6397. | according to the current rounding mode. If `a' is a NaN, the largest
  6398. | positive integer is returned. Otherwise, if the conversion overflows, the
  6399. | largest integer with the same sign as `a' is returned.
  6400. *----------------------------------------------------------------------------*}
  6401. function float128_to_int32(a: float128): int32;
  6402. var
  6403. aSign: flag;
  6404. aExp, shiftCount: int32;
  6405. aSig0, aSig1: bits64;
  6406. begin
  6407. aSig1 := extractFloat128Frac1( a );
  6408. aSig0 := extractFloat128Frac0( a );
  6409. aExp := extractFloat128Exp( a );
  6410. aSign := extractFloat128Sign( a );
  6411. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6412. aSign := 0;
  6413. if ( aExp<>0 ) then
  6414. aSig0 := aSig0 or int64( $0001000000000000 );
  6415. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6416. shiftCount := $4028 - aExp;
  6417. if ( 0 < shiftCount ) then
  6418. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6419. result := roundAndPackInt32( aSign, aSig0 );
  6420. end;
  6421. {*----------------------------------------------------------------------------
  6422. | Returns the result of converting the quadruple-precision floating-point
  6423. | value `a' to the 32-bit two's complement integer format. The conversion
  6424. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6425. | Arithmetic, except that the conversion is always rounded toward zero. If
  6426. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6427. | conversion overflows, the largest integer with the same sign as `a' is
  6428. | returned.
  6429. *----------------------------------------------------------------------------*}
  6430. function float128_to_int32_round_to_zero(a: float128): int32;
  6431. var
  6432. aSign: flag;
  6433. aExp, shiftCount: int32;
  6434. aSig0, aSig1, savedASig: bits64;
  6435. z: int32;
  6436. label
  6437. invalid;
  6438. begin
  6439. aSig1 := extractFloat128Frac1( a );
  6440. aSig0 := extractFloat128Frac0( a );
  6441. aExp := extractFloat128Exp( a );
  6442. aSign := extractFloat128Sign( a );
  6443. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6444. if ( $401E < aExp ) then
  6445. begin
  6446. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6447. aSign := 0;
  6448. goto invalid;
  6449. end
  6450. else if ( aExp < $3FFF ) then
  6451. begin
  6452. if ( aExp or aSig0 )<>0 then
  6453. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6454. result := 0;
  6455. exit;
  6456. end;
  6457. aSig0 := aSig0 or int64( $0001000000000000 );
  6458. shiftCount := $402F - aExp;
  6459. savedASig := aSig0;
  6460. aSig0 := aSig0 shr shiftCount;
  6461. z := aSig0;
  6462. if ( aSign )<>0 then
  6463. z := - z;
  6464. if ( ord( z < 0 ) xor aSign )<>0 then
  6465. begin
  6466. invalid:
  6467. float_raise( float_flag_invalid );
  6468. if aSign<>0 then
  6469. result:=$80000000
  6470. else
  6471. result:=$7FFFFFFF;
  6472. exit;
  6473. end;
  6474. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6475. begin
  6476. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6477. end;
  6478. result := z;
  6479. end;
  6480. {*----------------------------------------------------------------------------
  6481. | Returns the result of converting the quadruple-precision floating-point
  6482. | value `a' to the 64-bit two's complement integer format. The conversion
  6483. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6484. | Arithmetic---which means in particular that the conversion is rounded
  6485. | according to the current rounding mode. If `a' is a NaN, the largest
  6486. | positive integer is returned. Otherwise, if the conversion overflows, the
  6487. | largest integer with the same sign as `a' is returned.
  6488. *----------------------------------------------------------------------------*}
  6489. function float128_to_int64(a: float128): int64;
  6490. var
  6491. aSign: flag;
  6492. aExp, shiftCount: int32;
  6493. aSig0, aSig1: bits64;
  6494. begin
  6495. aSig1 := extractFloat128Frac1( a );
  6496. aSig0 := extractFloat128Frac0( a );
  6497. aExp := extractFloat128Exp( a );
  6498. aSign := extractFloat128Sign( a );
  6499. if ( aExp<>0 ) then
  6500. aSig0 := aSig0 or int64( $0001000000000000 );
  6501. shiftCount := $402F - aExp;
  6502. if ( shiftCount <= 0 ) then
  6503. begin
  6504. if ( $403E < aExp ) then
  6505. begin
  6506. float_raise( float_flag_invalid );
  6507. if ( (aSign=0)
  6508. or ( ( aExp = $7FFF )
  6509. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6510. )
  6511. ) then
  6512. begin
  6513. result := int64( $7FFFFFFFFFFFFFFF );
  6514. end;
  6515. result := int64( $8000000000000000 );
  6516. end;
  6517. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6518. end
  6519. else begin
  6520. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6521. end;
  6522. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6523. end;
  6524. {*----------------------------------------------------------------------------
  6525. | Returns the result of converting the quadruple-precision floating-point
  6526. | value `a' to the 64-bit two's complement integer format. The conversion
  6527. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6528. | Arithmetic, except that the conversion is always rounded toward zero.
  6529. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6530. | the conversion overflows, the largest integer with the same sign as `a' is
  6531. | returned.
  6532. *----------------------------------------------------------------------------*}
  6533. function float128_to_int64_round_to_zero(a: float128): int64;
  6534. var
  6535. aSign: flag;
  6536. aExp, shiftCount: int32;
  6537. aSig0, aSig1: bits64;
  6538. z: int64;
  6539. begin
  6540. aSig1 := extractFloat128Frac1( a );
  6541. aSig0 := extractFloat128Frac0( a );
  6542. aExp := extractFloat128Exp( a );
  6543. aSign := extractFloat128Sign( a );
  6544. if ( aExp<>0 ) then
  6545. aSig0 := aSig0 or int64( $0001000000000000 );
  6546. shiftCount := aExp - $402F;
  6547. if ( 0 < shiftCount ) then
  6548. begin
  6549. if ( $403E <= aExp ) then
  6550. begin
  6551. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6552. if ( ( a.high = int64( $C03E000000000000 ) )
  6553. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6554. begin
  6555. if ( aSig1<>0 ) then
  6556. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6557. end
  6558. else begin
  6559. float_raise( float_flag_invalid );
  6560. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6561. begin
  6562. result := int64( $7FFFFFFFFFFFFFFF );
  6563. exit;
  6564. end;
  6565. end;
  6566. result := int64( $8000000000000000 );
  6567. exit;
  6568. end;
  6569. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6570. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6571. begin
  6572. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6573. end;
  6574. end
  6575. else begin
  6576. if ( aExp < $3FFF ) then
  6577. begin
  6578. if ( aExp or aSig0 or aSig1 )<>0 then
  6579. begin
  6580. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6581. end;
  6582. result := 0;
  6583. exit;
  6584. end;
  6585. z := aSig0 shr ( - shiftCount );
  6586. if ( (aSig1<>0)
  6587. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6588. begin
  6589. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6590. end;
  6591. end;
  6592. if ( aSign<>0 ) then
  6593. z := - z;
  6594. result := z;
  6595. end;
  6596. {*----------------------------------------------------------------------------
  6597. | Returns the result of converting the quadruple-precision floating-point
  6598. | value `a' to the single-precision floating-point format. The conversion
  6599. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6600. | Arithmetic.
  6601. *----------------------------------------------------------------------------*}
  6602. function float128_to_float32(a: float128): float32;
  6603. var
  6604. aSign: flag;
  6605. aExp: int32;
  6606. aSig0, aSig1: bits64;
  6607. zSig: bits32;
  6608. begin
  6609. aSig1 := extractFloat128Frac1( a );
  6610. aSig0 := extractFloat128Frac0( a );
  6611. aExp := extractFloat128Exp( a );
  6612. aSign := extractFloat128Sign( a );
  6613. if ( aExp = $7FFF ) then
  6614. begin
  6615. if ( aSig0 or aSig1 )<>0 then
  6616. begin
  6617. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6618. exit;
  6619. end;
  6620. result := packFloat32( aSign, $FF, 0 );
  6621. exit;
  6622. end;
  6623. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6624. shift64RightJamming( aSig0, 18, aSig0 );
  6625. zSig := aSig0;
  6626. if ( aExp or zSig )<>0 then
  6627. begin
  6628. zSig := zSig or $40000000;
  6629. dec(aExp,$3F81);
  6630. end;
  6631. result := roundAndPackFloat32( aSign, aExp, zSig );
  6632. end;
  6633. {*----------------------------------------------------------------------------
  6634. | Returns the result of converting the quadruple-precision floating-point
  6635. | value `a' to the double-precision floating-point format. The conversion
  6636. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6637. | Arithmetic.
  6638. *----------------------------------------------------------------------------*}
  6639. function float128_to_float64(a: float128): float64;
  6640. var
  6641. aSign: flag;
  6642. aExp: int32;
  6643. aSig0, aSig1: bits64;
  6644. begin
  6645. aSig1 := extractFloat128Frac1( a );
  6646. aSig0 := extractFloat128Frac0( a );
  6647. aExp := extractFloat128Exp( a );
  6648. aSign := extractFloat128Sign( a );
  6649. if ( aExp = $7FFF ) then
  6650. begin
  6651. if ( aSig0 or aSig1 )<>0 then
  6652. begin
  6653. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  6654. exit;
  6655. end;
  6656. result:=packFloat64( aSign, $7FF, 0);
  6657. exit;
  6658. end;
  6659. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  6660. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6661. if ( aExp or aSig0 )<>0 then
  6662. begin
  6663. aSig0 := aSig0 or int64( $4000000000000000 );
  6664. dec(aExp,$3C01);
  6665. end;
  6666. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  6667. end;
  6668. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  6669. {*----------------------------------------------------------------------------
  6670. | Returns the result of converting the quadruple-precision floating-point
  6671. | value `a' to the extended double-precision floating-point format. The
  6672. | conversion is performed according to the IEC/IEEE Standard for Binary
  6673. | Floating-Point Arithmetic.
  6674. *----------------------------------------------------------------------------*}
  6675. function float128_to_floatx80(a: float128): floatx80;
  6676. var
  6677. aSign: flag;
  6678. aExp: int32;
  6679. aSig0, aSig1: bits64;
  6680. begin
  6681. aSig1 := extractFloat128Frac1( a );
  6682. aSig0 := extractFloat128Frac0( a );
  6683. aExp := extractFloat128Exp( a );
  6684. aSign := extractFloat128Sign( a );
  6685. if ( aExp = $7FFF ) begin
  6686. if ( aSig0 or aSig1 ) begin
  6687. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  6688. end;
  6689. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  6690. end;
  6691. if ( aExp = 0 ) begin
  6692. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloatx80( aSign, 0, 0 );
  6693. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  6694. end;
  6695. else begin
  6696. aSig0 or= int64( $0001000000000000 );
  6697. end;
  6698. shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
  6699. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  6700. end;
  6701. {$endif FPC_SOFTFLOAT_FLOATX80}
  6702. {*----------------------------------------------------------------------------
  6703. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  6704. | Returns the result as a quadruple-precision floating-point value. The
  6705. | operation is performed according to the IEC/IEEE Standard for Binary
  6706. | Floating-Point Arithmetic.
  6707. *----------------------------------------------------------------------------*}
  6708. function float128_round_to_int(a: float128): float128;
  6709. var
  6710. aSign: flag;
  6711. aExp: int32;
  6712. lastBitMask, roundBitsMask: bits64;
  6713. roundingMode: int8;
  6714. z: float128;
  6715. begin
  6716. aExp := extractFloat128Exp( a );
  6717. if ( $402F <= aExp ) then
  6718. begin
  6719. if ( $406F <= aExp ) then
  6720. begin
  6721. if ( ( aExp = $7FFF )
  6722. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  6723. ) then
  6724. begin
  6725. result := propagateFloat128NaN( a, a );
  6726. exit;
  6727. end;
  6728. result := a;
  6729. exit;
  6730. end;
  6731. lastBitMask := 1;
  6732. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  6733. roundBitsMask := lastBitMask - 1;
  6734. z := a;
  6735. roundingMode := softfloat_rounding_mode;
  6736. if ( roundingMode = float_round_nearest_even ) then
  6737. begin
  6738. if ( lastBitMask )<>0 then
  6739. begin
  6740. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  6741. if ( ( z.low and roundBitsMask ) = 0 ) then
  6742. z.low := z.low and not(lastBitMask);
  6743. end
  6744. else begin
  6745. if ( sbits64(z.low) < 0 ) then
  6746. begin
  6747. inc(z.high);
  6748. if ( bits64( z.low shl 1 ) = 0 ) then
  6749. z.high := z.high and not(1);
  6750. end;
  6751. end;
  6752. end
  6753. else if ( roundingMode <> float_round_to_zero ) then
  6754. begin
  6755. if ( extractFloat128Sign( z )
  6756. xor ord( roundingMode = float_round_up ) )<>0 then
  6757. begin
  6758. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  6759. end;
  6760. end;
  6761. z.low := z.low and not(roundBitsMask);
  6762. end
  6763. else begin
  6764. if ( aExp < $3FFF ) then
  6765. begin
  6766. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  6767. begin
  6768. result := a;
  6769. exit;
  6770. end;
  6771. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6772. aSign := extractFloat128Sign( a );
  6773. case softfloat_rounding_mode of
  6774. float_round_nearest_even:
  6775. if ( ( aExp = $3FFE )
  6776. and ( extractFloat128Frac0( a )
  6777. or extractFloat128Frac1( a ) )
  6778. ) begin
  6779. begin
  6780. result := packFloat128( aSign, $3FFF, 0, 0 );
  6781. exit;
  6782. end;
  6783. end;
  6784. float_round_down:
  6785. begin
  6786. result :=
  6787. aSign ? packFloat128( 1, $3FFF, 0, 0 )
  6788. : packFloat128( 0, 0, 0, 0 );
  6789. end;
  6790. float_round_up:
  6791. begin
  6792. result :=
  6793. aSign ? packFloat128( 1, 0, 0, 0 )
  6794. : packFloat128( 0, $3FFF, 0, 0 );
  6795. exit;
  6796. end;
  6797. end;
  6798. result := packFloat128( aSign, 0, 0, 0 );
  6799. exit;
  6800. end;
  6801. lastBitMask := 1;
  6802. lastBitMask shl = $402F - aExp;
  6803. roundBitsMask := lastBitMask - 1;
  6804. z.low := 0;
  6805. z.high := a.high;
  6806. roundingMode := softfloat_rounding_mode;
  6807. if ( roundingMode = float_round_nearest_even ) begin
  6808. z.high += lastBitMask>>1;
  6809. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) begin
  6810. z.high &= ~ lastBitMask;
  6811. end;
  6812. end;
  6813. else if ( roundingMode <> float_round_to_zero ) begin
  6814. if ( extractFloat128Sign( z )
  6815. xor ( roundingMode = float_round_up ) ) begin
  6816. z.high or= ( a.low <> 0 );
  6817. z.high += roundBitsMask;
  6818. end;
  6819. end;
  6820. z.high &= ~ roundBitsMask;
  6821. end;
  6822. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) begin
  6823. softfloat_exception_flags or= float_flag_inexact;
  6824. end;
  6825. result := z;
  6826. end;
  6827. {*----------------------------------------------------------------------------
  6828. | Returns the result of adding the absolute values of the quadruple-precision
  6829. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  6830. | before being returned. `zSign' is ignored if the result is a NaN.
  6831. | The addition is performed according to the IEC/IEEE Standard for Binary
  6832. | Floating-Point Arithmetic.
  6833. *----------------------------------------------------------------------------*}
  6834. function addFloat128Sigs( float128 a, float128 b, flag zSign ): float128;
  6835. var
  6836. aExp, bExp, zExp: int32;
  6837. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  6838. expDiff: int32;
  6839. begin
  6840. aSig1 := extractFloat128Frac1( a );
  6841. aSig0 := extractFloat128Frac0( a );
  6842. aExp := extractFloat128Exp( a );
  6843. bSig1 := extractFloat128Frac1( b );
  6844. bSig0 := extractFloat128Frac0( b );
  6845. bExp := extractFloat128Exp( b );
  6846. expDiff := aExp - bExp;
  6847. if ( 0 < expDiff ) begin
  6848. if ( aExp = $7FFF ) begin
  6849. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  6850. result := a;
  6851. end;
  6852. if ( bExp = 0 ) begin
  6853. --expDiff;
  6854. end;
  6855. else begin
  6856. bSig0 or= int64( $0001000000000000 );
  6857. end;
  6858. shift128ExtraRightJamming(
  6859. bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
  6860. zExp := aExp;
  6861. end;
  6862. else if ( expDiff < 0 ) begin
  6863. if ( bExp = $7FFF ) begin
  6864. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  6865. result := packFloat128( zSign, $7FFF, 0, 0 );
  6866. end;
  6867. if ( aExp = 0 ) begin
  6868. ++expDiff;
  6869. end;
  6870. else begin
  6871. aSig0 or= int64( $0001000000000000 );
  6872. end;
  6873. shift128ExtraRightJamming(
  6874. aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
  6875. zExp := bExp;
  6876. end;
  6877. else begin
  6878. if ( aExp = $7FFF ) begin
  6879. if ( aSig0 or aSig1 or bSig0 or bSig1 ) begin
  6880. result := propagateFloat128NaN( a, b );
  6881. end;
  6882. result := a;
  6883. end;
  6884. add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  6885. if ( aExp = 0 ) result := packFloat128( zSign, 0, zSig0, zSig1 );
  6886. zSig2 := 0;
  6887. zSig0 or= int64( $0002000000000000 );
  6888. zExp := aExp;
  6889. goto shiftRight1;
  6890. end;
  6891. aSig0 or= int64( $0001000000000000 );
  6892. add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  6893. --zExp;
  6894. if ( zSig0 < int64( $0002000000000000 ) ) goto roundAndPack;
  6895. ++zExp;
  6896. shiftRight1:
  6897. shift128ExtraRightJamming(
  6898. zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  6899. roundAndPack:
  6900. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6901. end;
  6902. {*----------------------------------------------------------------------------
  6903. | Returns the result of subtracting the absolute values of the quadruple-
  6904. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  6905. | difference is negated before being returned. `zSign' is ignored if the
  6906. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6907. | Standard for Binary Floating-Point Arithmetic.
  6908. *----------------------------------------------------------------------------*}
  6909. function subFloat128Sigs( float128 a, float128 b, flag zSign ): float128;
  6910. var
  6911. aExp, bExp, zExp: int32;
  6912. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  6913. expDiff: int32;
  6914. z: float128;
  6915. begin
  6916. aSig1 := extractFloat128Frac1( a );
  6917. aSig0 := extractFloat128Frac0( a );
  6918. aExp := extractFloat128Exp( a );
  6919. bSig1 := extractFloat128Frac1( b );
  6920. bSig0 := extractFloat128Frac0( b );
  6921. bExp := extractFloat128Exp( b );
  6922. expDiff := aExp - bExp;
  6923. shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
  6924. shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
  6925. if ( 0 < expDiff ) goto aExpBigger;
  6926. if ( expDiff < 0 ) goto bExpBigger;
  6927. if ( aExp = $7FFF ) begin
  6928. if ( aSig0 or aSig1 or bSig0 or bSig1 ) begin
  6929. result := propagateFloat128NaN( a, b );
  6930. end;
  6931. float_raise( float_flag_invalid );
  6932. z.low := float128_default_nan_low;
  6933. z.high := float128_default_nan_high;
  6934. result := z;
  6935. end;
  6936. if ( aExp = 0 ) begin
  6937. aExp := 1;
  6938. bExp := 1;
  6939. end;
  6940. if ( bSig0 < aSig0 ) then goto aBigger;
  6941. if ( aSig0 < bSig0 ) then goto bBigger;
  6942. if ( bSig1 < aSig1 ) then goto aBigger;
  6943. if ( aSig1 < bSig1 ) then goto bBigger;
  6944. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  6945. exit;
  6946. bExpBigger:
  6947. if ( bExp = $7FFF ) begin
  6948. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  6949. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  6950. end;
  6951. if ( aExp = 0 ) begin
  6952. ++expDiff;
  6953. end;
  6954. else begin
  6955. aSig0 or= int64( $4000000000000000 );
  6956. end;
  6957. shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  6958. bSig0 or= int64( $4000000000000000 );
  6959. bBigger:
  6960. sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
  6961. zExp := bExp;
  6962. zSign xor = 1;
  6963. goto normalizeRoundAndPack;
  6964. aExpBigger:
  6965. if ( aExp = $7FFF ) begin
  6966. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  6967. result := a;
  6968. end;
  6969. if ( bExp = 0 ) begin
  6970. --expDiff;
  6971. end;
  6972. else begin
  6973. bSig0 or= int64( $4000000000000000 );
  6974. end;
  6975. shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
  6976. aSig0 or= int64( $4000000000000000 );
  6977. aBigger:
  6978. sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  6979. zExp := aExp;
  6980. normalizeRoundAndPack:
  6981. --zExp;
  6982. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  6983. end;
  6984. {*----------------------------------------------------------------------------
  6985. | Returns the result of adding the quadruple-precision floating-point values
  6986. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  6987. | for Binary Floating-Point Arithmetic.
  6988. *----------------------------------------------------------------------------*}
  6989. function float128_add(a: float128; b: float128): float128;
  6990. var
  6991. aSign, bSign: flag;
  6992. begin
  6993. aSign := extractFloat128Sign( a );
  6994. bSign := extractFloat128Sign( b );
  6995. if ( aSign = bSign ) begin
  6996. result := addFloat128Sigs( a, b, aSign );
  6997. end;
  6998. else begin
  6999. result := subFloat128Sigs( a, b, aSign );
  7000. end;
  7001. end;
  7002. {*----------------------------------------------------------------------------
  7003. | Returns the result of subtracting the quadruple-precision floating-point
  7004. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7005. | Standard for Binary Floating-Point Arithmetic.
  7006. *----------------------------------------------------------------------------*}
  7007. function float128_sub(a: float128; b: float128): float128;
  7008. var
  7009. aSign, bSign: flag;
  7010. begin
  7011. aSign := extractFloat128Sign( a );
  7012. bSign := extractFloat128Sign( b );
  7013. if ( aSign = bSign ) begin
  7014. result := subFloat128Sigs( a, b, aSign );
  7015. end;
  7016. else begin
  7017. result := addFloat128Sigs( a, b, aSign );
  7018. end;
  7019. end;
  7020. {*----------------------------------------------------------------------------
  7021. | Returns the result of multiplying the quadruple-precision floating-point
  7022. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7023. | Standard for Binary Floating-Point Arithmetic.
  7024. *----------------------------------------------------------------------------*}
  7025. function float128_mul(a: float128; b: float128): float128;
  7026. var
  7027. aSign, bSign, zSign: flag;
  7028. aExp, bExp, zExp: int32;
  7029. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7030. z: float128;
  7031. begin
  7032. aSig1 := extractFloat128Frac1( a );
  7033. aSig0 := extractFloat128Frac0( a );
  7034. aExp := extractFloat128Exp( a );
  7035. aSign := extractFloat128Sign( a );
  7036. bSig1 := extractFloat128Frac1( b );
  7037. bSig0 := extractFloat128Frac0( b );
  7038. bExp := extractFloat128Exp( b );
  7039. bSign := extractFloat128Sign( b );
  7040. zSign := aSign xor bSign;
  7041. if ( aExp = $7FFF ) begin
  7042. if ( ( aSig0 or aSig1 )
  7043. or ( ( bExp = $7FFF ) and ( bSig0 or bSig1 ) ) ) begin
  7044. result := propagateFloat128NaN( a, b );
  7045. end;
  7046. if ( ( bExp or bSig0 or bSig1 ) = 0 ) goto invalid;
  7047. result := packFloat128( zSign, $7FFF, 0, 0 );
  7048. end;
  7049. if ( bExp = $7FFF ) begin
  7050. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7051. if ( ( aExp or aSig0 or aSig1 ) = 0 ) begin
  7052. invalid:
  7053. float_raise( float_flag_invalid );
  7054. z.low := float128_default_nan_low;
  7055. z.high := float128_default_nan_high;
  7056. result := z;
  7057. end;
  7058. result := packFloat128( zSign, $7FFF, 0, 0 );
  7059. end;
  7060. if ( aExp = 0 ) begin
  7061. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7062. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7063. end;
  7064. if ( bExp = 0 ) begin
  7065. if ( ( bSig0 or bSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7066. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7067. end;
  7068. zExp := aExp + bExp - $4000;
  7069. aSig0 or= int64( $0001000000000000 );
  7070. shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
  7071. mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
  7072. add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
  7073. zSig2 or= ( zSig3 <> 0 );
  7074. if ( int64( $0002000000000000 ) <= zSig0 ) begin
  7075. shift128ExtraRightJamming(
  7076. zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  7077. ++zExp;
  7078. end;
  7079. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7080. end;
  7081. {*----------------------------------------------------------------------------
  7082. | Returns the result of dividing the quadruple-precision floating-point value
  7083. | `a' by the corresponding value `b'. The operation is performed according to
  7084. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7085. *----------------------------------------------------------------------------*}
  7086. function float128_div(a: float128; b: float128): float128;
  7087. var
  7088. aSign, bSign, zSign: flag;
  7089. aExp, bExp, zExp: int32;
  7090. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7091. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7092. z: float128;
  7093. begin
  7094. aSig1 := extractFloat128Frac1( a );
  7095. aSig0 := extractFloat128Frac0( a );
  7096. aExp := extractFloat128Exp( a );
  7097. aSign := extractFloat128Sign( a );
  7098. bSig1 := extractFloat128Frac1( b );
  7099. bSig0 := extractFloat128Frac0( b );
  7100. bExp := extractFloat128Exp( b );
  7101. bSign := extractFloat128Sign( b );
  7102. zSign := aSign xor bSign;
  7103. if ( aExp = $7FFF ) begin
  7104. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
  7105. if ( bExp = $7FFF ) begin
  7106. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7107. goto invalid;
  7108. end;
  7109. result := packFloat128( zSign, $7FFF, 0, 0 );
  7110. end;
  7111. if ( bExp = $7FFF ) begin
  7112. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7113. result := packFloat128( zSign, 0, 0, 0 );
  7114. end;
  7115. if ( bExp = 0 ) begin
  7116. if ( ( bSig0 or bSig1 ) = 0 ) begin
  7117. if ( ( aExp or aSig0 or aSig1 ) = 0 ) begin
  7118. invalid:
  7119. float_raise( float_flag_invalid );
  7120. z.low := float128_default_nan_low;
  7121. z.high := float128_default_nan_high;
  7122. result := z;
  7123. end;
  7124. float_raise( float_flag_divbyzero );
  7125. result := packFloat128( zSign, $7FFF, 0, 0 );
  7126. end;
  7127. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7128. end;
  7129. if ( aExp = 0 ) begin
  7130. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
  7131. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7132. end;
  7133. zExp := aExp - bExp + $3FFD;
  7134. shortShift128Left(
  7135. aSig0 or int64( $0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
  7136. shortShift128Left(
  7137. bSig0 or int64( $0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  7138. if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) begin
  7139. shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
  7140. ++zExp;
  7141. end;
  7142. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7143. mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
  7144. sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
  7145. while ( (sbits64) rem0 < 0 ) begin
  7146. --zSig0;
  7147. add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
  7148. end;
  7149. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7150. if ( ( zSig1 and $3FFF ) <= 4 ) begin
  7151. mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
  7152. sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
  7153. while ( (sbits64) rem1 < 0 ) begin
  7154. --zSig1;
  7155. add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
  7156. end;
  7157. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  7158. end;
  7159. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
  7160. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7161. end;
  7162. {*----------------------------------------------------------------------------
  7163. | Returns the remainder of the quadruple-precision floating-point value `a'
  7164. | with respect to the corresponding value `b'. The operation is performed
  7165. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7166. *----------------------------------------------------------------------------*}
  7167. function float128_rem(a: float128; b: float128): float128;
  7168. var
  7169. aSign, bSign, zSign: flag;
  7170. aExp, bExp, expDiff: int32;
  7171. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7172. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7173. sigMean0: sbits64;
  7174. z: float128;
  7175. begin
  7176. aSig1 := extractFloat128Frac1( a );
  7177. aSig0 := extractFloat128Frac0( a );
  7178. aExp := extractFloat128Exp( a );
  7179. aSign := extractFloat128Sign( a );
  7180. bSig1 := extractFloat128Frac1( b );
  7181. bSig0 := extractFloat128Frac0( b );
  7182. bExp := extractFloat128Exp( b );
  7183. bSign := extractFloat128Sign( b );
  7184. if ( aExp = $7FFF ) begin
  7185. if ( ( aSig0 or aSig1 )
  7186. or ( ( bExp = $7FFF ) and ( bSig0 or bSig1 ) ) ) begin
  7187. result := propagateFloat128NaN( a, b );
  7188. end;
  7189. goto invalid;
  7190. end;
  7191. if ( bExp = $7FFF ) begin
  7192. if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
  7193. result := a;
  7194. end;
  7195. if ( bExp = 0 ) begin
  7196. if ( ( bSig0 or bSig1 ) = 0 ) begin
  7197. invalid:
  7198. float_raise( float_flag_invalid );
  7199. z.low := float128_default_nan_low;
  7200. z.high := float128_default_nan_high;
  7201. result := z;
  7202. end;
  7203. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  7204. end;
  7205. if ( aExp = 0 ) begin
  7206. if ( ( aSig0 or aSig1 ) = 0 ) result := a;
  7207. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7208. end;
  7209. expDiff := aExp - bExp;
  7210. if ( expDiff < -1 ) result := a;
  7211. shortShift128Left(
  7212. aSig0 or int64( $0001000000000000 ),
  7213. aSig1,
  7214. 15 - ( expDiff < 0 ),
  7215. &aSig0,
  7216. &aSig1
  7217. );
  7218. shortShift128Left(
  7219. bSig0 or int64( $0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  7220. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7221. if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  7222. expDiff -= 64;
  7223. while ( 0 < expDiff ) begin
  7224. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7225. q := ( 4 < q ) ? q - 4 : 0;
  7226. mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  7227. shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
  7228. shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
  7229. sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
  7230. expDiff -= 61;
  7231. end;
  7232. if ( -64 < expDiff ) begin
  7233. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7234. q := ( 4 < q ) ? q - 4 : 0;
  7235. q >>= - expDiff;
  7236. shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  7237. expDiff += 52;
  7238. if ( expDiff < 0 ) begin
  7239. shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  7240. end;
  7241. else begin
  7242. shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
  7243. end;
  7244. mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  7245. sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
  7246. end;
  7247. else begin
  7248. shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
  7249. shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  7250. end;
  7251. do begin
  7252. alternateASig0 := aSig0;
  7253. alternateASig1 := aSig1;
  7254. ++q;
  7255. sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  7256. end; while ( 0 <= (sbits64) aSig0 );
  7257. add128(
  7258. aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
  7259. if ( ( sigMean0 < 0 )
  7260. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and ( q and 1 ) ) ) begin
  7261. aSig0 := alternateASig0;
  7262. aSig1 := alternateASig1;
  7263. end;
  7264. zSign := ( (sbits64) aSig0 < 0 );
  7265. if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
  7266. result :=
  7267. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7268. end;
  7269. {*----------------------------------------------------------------------------
  7270. | Returns the square root of the quadruple-precision floating-point value `a'.
  7271. | The operation is performed according to the IEC/IEEE Standard for Binary
  7272. | Floating-Point Arithmetic.
  7273. *----------------------------------------------------------------------------*}
  7274. function float128_sqrt(a: float128): float128;
  7275. var
  7276. aSign: flag;
  7277. aExp, zExp: int32;
  7278. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7279. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7280. z: float128;
  7281. label
  7282. invalid;
  7283. begin
  7284. aSig1 := extractFloat128Frac1( a );
  7285. aSig0 := extractFloat128Frac0( a );
  7286. aExp := extractFloat128Exp( a );
  7287. aSign := extractFloat128Sign( a );
  7288. if ( aExp = $7FFF ) begin
  7289. if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, a );
  7290. if ( ! aSign ) result := a;
  7291. goto invalid;
  7292. end;
  7293. if ( aSign ) begin
  7294. if ( ( aExp or aSig0 or aSig1 ) = 0 ) result := a;
  7295. invalid:
  7296. float_raise( float_flag_invalid );
  7297. z.low := float128_default_nan_low;
  7298. z.high := float128_default_nan_high;
  7299. result := z;
  7300. end;
  7301. if ( aExp = 0 ) begin
  7302. if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( 0, 0, 0, 0 );
  7303. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  7304. end;
  7305. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7306. aSig0 := aSig0 or int64( $0001000000000000 );
  7307. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7308. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), &aSig0, &aSig1 );
  7309. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7310. doubleZSig0 := zSig0 shl 1;
  7311. mul64To128( zSig0, zSig0, &term0, &term1 );
  7312. sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  7313. while ( (sbits64) rem0 < 0 ) begin
  7314. --zSig0;
  7315. doubleZSig0 -= 2;
  7316. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, &rem0, &rem1 );
  7317. end;
  7318. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7319. if ( ( zSig1 and $1FFF ) <= 5 ) begin
  7320. if ( zSig1 = 0 ) zSig1 := 1;
  7321. mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  7322. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  7323. mul64To128( zSig1, zSig1, &term2, &term3 );
  7324. sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  7325. while ( (sbits64) rem1 < 0 ) begin
  7326. --zSig1;
  7327. shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  7328. term3 or= 1;
  7329. term2 or= doubleZSig0;
  7330. add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  7331. end;
  7332. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  7333. end;
  7334. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
  7335. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7336. end;
  7337. {*----------------------------------------------------------------------------
  7338. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7339. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7340. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7341. *----------------------------------------------------------------------------*}
  7342. function float128_eq(a: float128; b: float128): flag;
  7343. begin
  7344. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7345. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7346. or ( ( extractFloat128Exp( b ) = $7FFF )
  7347. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7348. ) begin
  7349. if ( float128_is_signaling_nan( a )
  7350. or float128_is_signaling_nan( b ) ) begin
  7351. float_raise( float_flag_invalid );
  7352. end;
  7353. result := 0;
  7354. end;
  7355. result :=
  7356. ( a.low = b.low )
  7357. and ( ( a.high = b.high )
  7358. or ( ( a.low = 0 )
  7359. and ( (bits64) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7360. );
  7361. end;
  7362. {*----------------------------------------------------------------------------
  7363. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7364. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7365. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7366. | Arithmetic.
  7367. *----------------------------------------------------------------------------*}
  7368. function float128_le(a: float128; b: float128): flag;
  7369. var
  7370. aSign, bSign: flag;
  7371. begin
  7372. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7373. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7374. or ( ( extractFloat128Exp( b ) = $7FFF )
  7375. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7376. ) begin
  7377. float_raise( float_flag_invalid );
  7378. result := 0;
  7379. end;
  7380. aSign := extractFloat128Sign( a );
  7381. bSign := extractFloat128Sign( b );
  7382. if ( aSign <> bSign ) begin
  7383. result :=
  7384. aSign
  7385. or ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7386. = 0 );
  7387. end;
  7388. result :=
  7389. aSign ? le128( b.high, b.low, a.high, a.low )
  7390. : le128( a.high, a.low, b.high, b.low );
  7391. end;
  7392. {*----------------------------------------------------------------------------
  7393. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7394. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7395. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7396. *----------------------------------------------------------------------------*}
  7397. function float128_lt(a: float128; b: float128): flag;
  7398. var
  7399. aSign, bSign: flag;
  7400. begin
  7401. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7402. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7403. or ( ( extractFloat128Exp( b ) = $7FFF )
  7404. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7405. ) begin
  7406. float_raise( float_flag_invalid );
  7407. result := 0;
  7408. end;
  7409. aSign := extractFloat128Sign( a );
  7410. bSign := extractFloat128Sign( b );
  7411. if ( aSign <> bSign ) begin
  7412. result :=
  7413. aSign
  7414. and ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7415. <> 0 );
  7416. end;
  7417. result :=
  7418. aSign ? lt128( b.high, b.low, a.high, a.low )
  7419. : lt128( a.high, a.low, b.high, b.low );
  7420. end;
  7421. {*----------------------------------------------------------------------------
  7422. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7423. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7424. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7425. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7426. *----------------------------------------------------------------------------*}
  7427. function float128_eq_signaling(a: float128; b: float128): flag;
  7428. begin
  7429. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7430. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7431. or ( ( extractFloat128Exp( b ) = $7FFF )
  7432. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7433. ) begin
  7434. float_raise( float_flag_invalid );
  7435. result := 0;
  7436. end;
  7437. result :=
  7438. ( a.low = b.low )
  7439. and ( ( a.high = b.high )
  7440. or ( ( a.low = 0 )
  7441. and ( (bits64) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7442. );
  7443. end;
  7444. {*----------------------------------------------------------------------------
  7445. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7446. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7447. | cause an exception. Otherwise, the comparison is performed according to the
  7448. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7449. *----------------------------------------------------------------------------*}
  7450. function float128_le_quiet(a: float128; b: float128): flag;
  7451. var
  7452. aSign, bSign: flag;
  7453. begin
  7454. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7455. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7456. or ( ( extractFloat128Exp( b ) = $7FFF )
  7457. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7458. ) begin
  7459. if ( float128_is_signaling_nan( a )
  7460. or float128_is_signaling_nan( b ) ) begin
  7461. float_raise( float_flag_invalid );
  7462. end;
  7463. result := 0;
  7464. end;
  7465. aSign := extractFloat128Sign( a );
  7466. bSign := extractFloat128Sign( b );
  7467. if ( aSign <> bSign ) begin
  7468. result :=
  7469. aSign
  7470. or ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7471. = 0 );
  7472. end;
  7473. result :=
  7474. aSign ? le128( b.high, b.low, a.high, a.low )
  7475. : le128( a.high, a.low, b.high, b.low );
  7476. end;
  7477. {*----------------------------------------------------------------------------
  7478. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7479. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7480. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7481. | Standard for Binary Floating-Point Arithmetic.
  7482. *----------------------------------------------------------------------------*}
  7483. function float128_lt_quiet(a: float128; b: float128): flag;
  7484. var
  7485. aSign, bSign: flag;
  7486. begin
  7487. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7488. and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
  7489. or ( ( extractFloat128Exp( b ) = $7FFF )
  7490. and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
  7491. ) begin
  7492. if ( float128_is_signaling_nan( a )
  7493. or float128_is_signaling_nan( b ) ) begin
  7494. float_raise( float_flag_invalid );
  7495. end;
  7496. result := 0;
  7497. end;
  7498. aSign := extractFloat128Sign( a );
  7499. bSign := extractFloat128Sign( b );
  7500. if ( aSign <> bSign ) begin
  7501. result :=
  7502. aSign
  7503. and ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7504. <> 0 );
  7505. end;
  7506. result :=
  7507. aSign ? lt128( b.high, b.low, a.high, a.low )
  7508. : lt128( a.high, a.low, b.high, b.low );
  7509. end;
  7510. {$endif FPC_SOFTFLOAT_FLOAT128}
  7511. {$endif not(defined(fpc_softfpu_interface))}
  7512. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  7513. end.
  7514. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}