softfpu.pp 295 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768277827882798280828182828283828482858286828782888289829082918292829382948295829682978298829983008301830283038304830583068307830883098310831183128313831483158316831783188319832083218322832383248325832683278328832983308331833283338334833583368337833883398340834183428343834483458346834783488349835083518352835383548355835683578358835983608361836283638364836583668367836883698370837183728373837483758376837783788379838083818382838383848385838683878388838983908391839283938394839583968397839883998400840184028403840484058406840784088409841084118412841384148415841684178418841984208421842284238424842584268427842884298430843184328433843484358436843784388439844084418442844384448445844684478448844984508451845284538454845584568457845884598460846184628463846484658466846784688469847084718472847384748475847684778478847984808481848284838484848584868487848884898490849184928493849484958496849784988499850085018502850385048505850685078508850985108511851285138514851585168517851885198520852185228523852485258526852785288529
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. uint8 = byte;
  88. int8 = shortint;
  89. uint16 = word;
  90. int16 = smallint;
  91. uint32 = longword;
  92. int32 = longint;
  93. bits8 = byte;
  94. sbits8 = shortint;
  95. bits16 = word;
  96. sbits16 = smallint;
  97. sbits32 = longint;
  98. bits32 = longword;
  99. {$ifndef fpc}
  100. qword = int64;
  101. {$endif}
  102. { now part of the system unit
  103. uint64 = qword;
  104. }
  105. bits64 = qword;
  106. sbits64 = int64;
  107. {$ifdef ENDIAN_LITTLE}
  108. float64 = packed record
  109. low: bits32;
  110. high: bits32;
  111. end;
  112. int64rec = packed record
  113. low: bits32;
  114. high: bits32;
  115. end;
  116. floatx80 = packed record
  117. low : qword;
  118. high : word;
  119. end;
  120. float128 = packed record
  121. low : qword;
  122. high : qword;
  123. end;
  124. {$else}
  125. float64 = record
  126. case byte of
  127. 1: (high,low : bits32);
  128. // force the record to be aligned like a double
  129. // else *_to_double will fail for cpus like sparc
  130. 2: (dummy : double);
  131. end;
  132. int64rec = packed record
  133. high,low : bits32;
  134. end;
  135. floatx80 = packed record
  136. high : word;
  137. low : qword;
  138. end;
  139. float128 = packed record
  140. high : qword;
  141. low : qword;
  142. end;
  143. {$endif}
  144. {$define FPC_SYSTEM_HAS_float64}
  145. {*
  146. -------------------------------------------------------------------------------
  147. Returns 1 if the double-precision floating-point value `a' is less than
  148. the corresponding value `b', and 0 otherwise. The comparison is performed
  149. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  150. -------------------------------------------------------------------------------
  151. *}
  152. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  153. {*
  154. -------------------------------------------------------------------------------
  155. Returns 1 if the double-precision floating-point value `a' is less than
  156. or equal to the corresponding value `b', and 0 otherwise. The comparison
  157. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  158. Arithmetic.
  159. -------------------------------------------------------------------------------
  160. *}
  161. Function float64_le(a: float64;b: float64): flag; compilerproc;
  162. {*
  163. -------------------------------------------------------------------------------
  164. Returns 1 if the double-precision floating-point value `a' is equal to
  165. the corresponding value `b', and 0 otherwise. The comparison is performed
  166. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  167. -------------------------------------------------------------------------------
  168. *}
  169. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  170. {*
  171. -------------------------------------------------------------------------------
  172. Returns the square root of the double-precision floating-point value `a'.
  173. The operation is performed according to the IEC/IEEE Standard for Binary
  174. Floating-Point Arithmetic.
  175. -------------------------------------------------------------------------------
  176. *}
  177. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  178. {*
  179. -------------------------------------------------------------------------------
  180. Returns the remainder of the double-precision floating-point value `a'
  181. with respect to the corresponding value `b'. The operation is performed
  182. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  183. -------------------------------------------------------------------------------
  184. *}
  185. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  186. {*
  187. -------------------------------------------------------------------------------
  188. Returns the result of dividing the double-precision floating-point value `a'
  189. by the corresponding value `b'. The operation is performed according to the
  190. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  191. -------------------------------------------------------------------------------
  192. *}
  193. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  194. {*
  195. -------------------------------------------------------------------------------
  196. Returns the result of multiplying the double-precision floating-point values
  197. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  198. for Binary Floating-Point Arithmetic.
  199. -------------------------------------------------------------------------------
  200. *}
  201. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  202. {*
  203. -------------------------------------------------------------------------------
  204. Returns the result of subtracting the double-precision floating-point values
  205. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  206. for Binary Floating-Point Arithmetic.
  207. -------------------------------------------------------------------------------
  208. *}
  209. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  210. {*
  211. -------------------------------------------------------------------------------
  212. Returns the result of adding the double-precision floating-point values `a'
  213. and `b'. The operation is performed according to the IEC/IEEE Standard for
  214. Binary Floating-Point Arithmetic.
  215. -------------------------------------------------------------------------------
  216. *}
  217. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  218. {*
  219. -------------------------------------------------------------------------------
  220. Rounds the double-precision floating-point value `a' to an integer,
  221. and returns the result as a double-precision floating-point value. The
  222. operation is performed according to the IEC/IEEE Standard for Binary
  223. Floating-Point Arithmetic.
  224. -------------------------------------------------------------------------------
  225. *}
  226. Function float64_round_to_int(a: float64) : float64; compilerproc;
  227. {*
  228. -------------------------------------------------------------------------------
  229. Returns the result of converting the double-precision floating-point value
  230. `a' to the single-precision floating-point format. The conversion is
  231. performed according to the IEC/IEEE Standard for Binary Floating-Point
  232. Arithmetic.
  233. -------------------------------------------------------------------------------
  234. *}
  235. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  236. {*
  237. -------------------------------------------------------------------------------
  238. Returns the result of converting the double-precision floating-point value
  239. `a' to the 32-bit two's complement integer format. The conversion is
  240. performed according to the IEC/IEEE Standard for Binary Floating-Point
  241. Arithmetic, except that the conversion is always rounded toward zero.
  242. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  243. the conversion overflows, the largest integer with the same sign as `a' is
  244. returned.
  245. -------------------------------------------------------------------------------
  246. *}
  247. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  248. {*
  249. -------------------------------------------------------------------------------
  250. Returns the result of converting the double-precision floating-point value
  251. `a' to the 32-bit two's complement integer format. The conversion is
  252. performed according to the IEC/IEEE Standard for Binary Floating-Point
  253. Arithmetic---which means in particular that the conversion is rounded
  254. according to the current rounding mode. If `a' is a NaN, the largest
  255. positive integer is returned. Otherwise, if the conversion overflows, the
  256. largest integer with the same sign as `a' is returned.
  257. -------------------------------------------------------------------------------
  258. *}
  259. Function float64_to_int32(a: float64): int32; compilerproc;
  260. {*
  261. -------------------------------------------------------------------------------
  262. Returns 1 if the single-precision floating-point value `a' is less than
  263. the corresponding value `b', and 0 otherwise. The comparison is performed
  264. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  265. -------------------------------------------------------------------------------
  266. *}
  267. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  268. {*
  269. -------------------------------------------------------------------------------
  270. Returns 1 if the single-precision floating-point value `a' is less than
  271. or equal to the corresponding value `b', and 0 otherwise. The comparison
  272. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  273. Arithmetic.
  274. -------------------------------------------------------------------------------
  275. *}
  276. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  277. {*
  278. -------------------------------------------------------------------------------
  279. Returns 1 if the single-precision floating-point value `a' is equal to
  280. the corresponding value `b', and 0 otherwise. The comparison is performed
  281. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  282. -------------------------------------------------------------------------------
  283. *}
  284. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  285. {*
  286. -------------------------------------------------------------------------------
  287. Returns the square root of the single-precision floating-point value `a'.
  288. The operation is performed according to the IEC/IEEE Standard for Binary
  289. Floating-Point Arithmetic.
  290. -------------------------------------------------------------------------------
  291. *}
  292. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  293. {*
  294. -------------------------------------------------------------------------------
  295. Returns the remainder of the single-precision floating-point value `a'
  296. with respect to the corresponding value `b'. The operation is performed
  297. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  298. -------------------------------------------------------------------------------
  299. *}
  300. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  301. {*
  302. -------------------------------------------------------------------------------
  303. Returns the result of dividing the single-precision floating-point value `a'
  304. by the corresponding value `b'. The operation is performed according to the
  305. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  306. -------------------------------------------------------------------------------
  307. *}
  308. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  309. {*
  310. -------------------------------------------------------------------------------
  311. Returns the result of multiplying the single-precision floating-point values
  312. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  313. for Binary Floating-Point Arithmetic.
  314. -------------------------------------------------------------------------------
  315. *}
  316. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  317. {*
  318. -------------------------------------------------------------------------------
  319. Returns the result of subtracting the single-precision floating-point values
  320. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  321. for Binary Floating-Point Arithmetic.
  322. -------------------------------------------------------------------------------
  323. *}
  324. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  325. {*
  326. -------------------------------------------------------------------------------
  327. Returns the result of adding the single-precision floating-point values `a'
  328. and `b'. The operation is performed according to the IEC/IEEE Standard for
  329. Binary Floating-Point Arithmetic.
  330. -------------------------------------------------------------------------------
  331. *}
  332. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  333. {*
  334. -------------------------------------------------------------------------------
  335. Rounds the single-precision floating-point value `a' to an integer,
  336. and returns the result as a single-precision floating-point value. The
  337. operation is performed according to the IEC/IEEE Standard for Binary
  338. Floating-Point Arithmetic.
  339. -------------------------------------------------------------------------------
  340. *}
  341. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  342. {*
  343. -------------------------------------------------------------------------------
  344. Returns the result of converting the single-precision floating-point value
  345. `a' to the double-precision floating-point format. The conversion is
  346. performed according to the IEC/IEEE Standard for Binary Floating-Point
  347. Arithmetic.
  348. -------------------------------------------------------------------------------
  349. *}
  350. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  351. {*
  352. -------------------------------------------------------------------------------
  353. Returns the result of converting the single-precision floating-point value
  354. `a' to the 32-bit two's complement integer format. The conversion is
  355. performed according to the IEC/IEEE Standard for Binary Floating-Point
  356. Arithmetic, except that the conversion is always rounded toward zero.
  357. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  358. the conversion overflows, the largest integer with the same sign as `a' is
  359. returned.
  360. -------------------------------------------------------------------------------
  361. *}
  362. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  363. {*
  364. -------------------------------------------------------------------------------
  365. Returns the result of converting the single-precision floating-point value
  366. `a' to the 32-bit two's complement integer format. The conversion is
  367. performed according to the IEC/IEEE Standard for Binary Floating-Point
  368. Arithmetic---which means in particular that the conversion is rounded
  369. according to the current rounding mode. If `a' is a NaN, the largest
  370. positive integer is returned. Otherwise, if the conversion overflows, the
  371. largest integer with the same sign as `a' is returned.
  372. -------------------------------------------------------------------------------
  373. *}
  374. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  375. {*
  376. -------------------------------------------------------------------------------
  377. Returns the result of converting the 32-bit two's complement integer `a' to
  378. the double-precision floating-point format. The conversion is performed
  379. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  380. -------------------------------------------------------------------------------
  381. *}
  382. Function int32_to_float64( a: int32) : float64; compilerproc;
  383. {*
  384. -------------------------------------------------------------------------------
  385. Returns the result of converting the 32-bit two's complement integer `a' to
  386. the single-precision floating-point format. The conversion is performed
  387. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  388. -------------------------------------------------------------------------------
  389. *}
  390. Function int32_to_float32( a: int32): float32rec; compilerproc;
  391. {*----------------------------------------------------------------------------
  392. | Returns the result of converting the 64-bit two's complement integer `a'
  393. | to the double-precision floating-point format. The conversion is performed
  394. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  395. *----------------------------------------------------------------------------*}
  396. Function int64_to_float64( a: int64 ): float64; compilerproc;
  397. Function qword_to_float64( a: qword ): float64; compilerproc;
  398. {*----------------------------------------------------------------------------
  399. | Returns the result of converting the 64-bit two's complement integer `a'
  400. | to the single-precision floating-point format. The conversion is performed
  401. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  402. *----------------------------------------------------------------------------*}
  403. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  404. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  405. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  406. function float128_is_nan( a : float128): flag;
  407. function float128_is_signaling_nan( a : float128): flag;
  408. function float128_to_int32(a: float128): int32;
  409. function float128_to_int32_round_to_zero(a: float128): int32;
  410. function float128_to_int64(a: float128): int64;
  411. function float128_to_int64_round_to_zero(a: float128): int64;
  412. function float128_to_float32(a: float128): float32;
  413. function float128_to_float64(a: float128): float64;
  414. function float64_to_float128( a : float64) : float128;
  415. {$ifdef FPC_SOFTFLOAT_FLOAT80}
  416. function float128_to_floatx80(a: float128): floatx80;
  417. {$endif FPC_SOFTFLOAT_FLOAT80}
  418. function float128_round_to_int(a: float128): float128;
  419. function float128_add(a: float128; b: float128): float128;
  420. function float128_sub(a: float128; b: float128): float128;
  421. function float128_mul(a: float128; b: float128): float128;
  422. function float128_div(a: float128; b: float128): float128;
  423. function float128_rem(a: float128; b: float128): float128;
  424. function float128_sqrt(a: float128): float128;
  425. function float128_eq(a: float128; b: float128): flag;
  426. function float128_le(a: float128; b: float128): flag;
  427. function float128_lt(a: float128; b: float128): flag;
  428. function float128_eq_signaling(a: float128; b: float128): flag;
  429. function float128_le_quiet(a: float128; b: float128): flag;
  430. function float128_lt_quiet(a: float128; b: float128): flag;
  431. {$endif FPC_SOFTFLOAT_FLOAT128}
  432. CONST
  433. {-------------------------------------------------------------------------------
  434. Software IEC/IEEE floating-point underflow tininess-detection mode.
  435. -------------------------------------------------------------------------------
  436. *}
  437. float_tininess_after_rounding = 0;
  438. float_tininess_before_rounding = 1;
  439. {*
  440. -------------------------------------------------------------------------------
  441. Underflow tininess-detection mode, statically initialized to default value.
  442. (The declaration in `softfloat.h' must match the `int8' type here.)
  443. -------------------------------------------------------------------------------
  444. *}
  445. const float_detect_tininess: int8 = float_tininess_after_rounding;
  446. {$endif not(defined(fpc_softfpu_implementation))}
  447. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  448. implementation
  449. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  450. {$if not(defined(fpc_softfpu_interface))}
  451. (*****************************************************************************)
  452. (*----------------------------------------------------------------------------*)
  453. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  454. (* division and square root approximations. (Can be specialized to target if *)
  455. (* desired.) *)
  456. (* ---------------------------------------------------------------------------*)
  457. (*****************************************************************************)
  458. {*----------------------------------------------------------------------------
  459. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  460. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  461. | input. If `zSign' is 1, the input is negated before being converted to an
  462. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  463. | is simply rounded to an integer, with the inexact exception raised if the
  464. | input cannot be represented exactly as an integer. However, if the fixed-
  465. | point input is too large, the invalid exception is raised and the largest
  466. | positive or negative integer is returned.
  467. *----------------------------------------------------------------------------*}
  468. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  469. var
  470. roundingMode: int8;
  471. roundNearestEven: flag;
  472. roundIncrement, roundBits: int8;
  473. z: int32;
  474. begin
  475. roundingMode := softfloat_rounding_mode;
  476. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  477. roundIncrement := $40;
  478. if ( roundNearestEven=0 ) then
  479. begin
  480. if ( roundingMode = float_round_to_zero ) then
  481. begin
  482. roundIncrement := 0;
  483. end
  484. else begin
  485. roundIncrement := $7F;
  486. if ( zSign<>0 ) then
  487. begin
  488. if ( roundingMode = float_round_up ) then
  489. roundIncrement := 0;
  490. end
  491. else begin
  492. if ( roundingMode = float_round_down ) then
  493. roundIncrement := 0;
  494. end;
  495. end;
  496. end;
  497. roundBits := absZ and $7F;
  498. absZ := ( absZ + roundIncrement ) shr 7;
  499. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  500. z := absZ;
  501. if ( zSign<>0 ) then
  502. z := - z;
  503. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  504. begin
  505. float_raise( float_flag_invalid );
  506. if zSign<>0 then
  507. result:=sbits32($80000000)
  508. else
  509. result:=$7FFFFFFF;
  510. exit;
  511. end;
  512. if ( roundBits<>0 ) then
  513. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  514. result:=z;
  515. end;
  516. {*----------------------------------------------------------------------------
  517. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  518. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  519. | and returns the properly rounded 64-bit integer corresponding to the input.
  520. | If `zSign' is 1, the input is negated before being converted to an integer.
  521. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  522. | the inexact exception raised if the input cannot be represented exactly as
  523. | an integer. However, if the fixed-point input is too large, the invalid
  524. | exception is raised and the largest positive or negative integer is
  525. | returned.
  526. *----------------------------------------------------------------------------*}
  527. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  528. var
  529. roundingMode: int8;
  530. roundNearestEven, increment: flag;
  531. z: int64;
  532. label
  533. overflow;
  534. begin
  535. roundingMode := softfloat_rounding_mode;
  536. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  537. increment := ord( sbits64(absZ1) < 0 );
  538. if ( roundNearestEven=0 ) then
  539. begin
  540. if ( roundingMode = float_round_to_zero ) then
  541. begin
  542. increment := 0;
  543. end
  544. else begin
  545. if ( zSign<>0 ) then
  546. begin
  547. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  548. end
  549. else begin
  550. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  551. end;
  552. end;
  553. end;
  554. if ( increment<>0 ) then
  555. begin
  556. inc(absZ0);
  557. if ( absZ0 = 0 ) then
  558. goto overflow;
  559. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  560. end;
  561. z := absZ0;
  562. if ( zSign<>0 ) then
  563. z := - z;
  564. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  565. begin
  566. overflow:
  567. float_raise( float_flag_invalid );
  568. if zSign<>0 then
  569. result:=int64($8000000000000000)
  570. else
  571. result:=int64($7FFFFFFFFFFFFFFF);
  572. end;
  573. if ( absZ1<>0 ) then
  574. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  575. result:=z;
  576. end;
  577. {*
  578. -------------------------------------------------------------------------------
  579. Shifts `a' right by the number of bits given in `count'. If any nonzero
  580. bits are shifted off, they are ``jammed'' into the least significant bit of
  581. the result by setting the least significant bit to 1. The value of `count'
  582. can be arbitrarily large; in particular, if `count' is greater than 32, the
  583. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  584. The result is stored in the location pointed to by `zPtr'.
  585. -------------------------------------------------------------------------------
  586. *}
  587. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  588. var
  589. z: Bits32;
  590. Begin
  591. if ( count = 0 ) then
  592. z := a
  593. else
  594. if ( count < 32 ) then
  595. Begin
  596. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  597. End
  598. else
  599. Begin
  600. z := bits32( a <> 0 );
  601. End;
  602. zPtr := z;
  603. End;
  604. {*----------------------------------------------------------------------------
  605. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  606. | number of bits given in `count'. Any bits shifted off are lost. The value
  607. | of `count' can be arbitrarily large; in particular, if `count' is greater
  608. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  609. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  610. *----------------------------------------------------------------------------*}
  611. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  612. var
  613. z0, z1: bits64;
  614. negCount: int8;
  615. begin
  616. negCount := ( - count ) and 63;
  617. if ( count = 0 ) then
  618. begin
  619. z1 := a1;
  620. z0 := a0;
  621. end
  622. else if ( count < 64 ) then
  623. begin
  624. z1 := ( a0 shl negCount ) or ( a1 shr count );
  625. z0 := a0 shr count;
  626. end
  627. else
  628. begin
  629. if ( count shl 64 )<>0 then
  630. z1 := a0 shr ( count and 63 )
  631. else
  632. z1 := 0;
  633. z0 := 0;
  634. end;
  635. z1Ptr := z1;
  636. z0Ptr := z0;
  637. end;
  638. {*----------------------------------------------------------------------------
  639. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  640. | number of bits given in `count'. If any nonzero bits are shifted off, they
  641. | are ``jammed'' into the least significant bit of the result by setting the
  642. | least significant bit to 1. The value of `count' can be arbitrarily large;
  643. | in particular, if `count' is greater than 128, the result will be either
  644. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  645. | nonzero. The result is broken into two 64-bit pieces which are stored at
  646. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  647. *----------------------------------------------------------------------------*}
  648. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  649. var
  650. z0,z1 : bits64;
  651. negCount : int8;
  652. begin
  653. negCount := ( - count ) and 63;
  654. if ( count = 0 ) then begin
  655. z1 := a1;
  656. z0 := a0;
  657. end
  658. else if ( count < 64 ) then begin
  659. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  660. z0 := a0>>count;
  661. end
  662. else begin
  663. if ( count = 64 ) then begin
  664. z1 := a0 or ord( a1 <> 0 );
  665. end
  666. else if ( count < 128 ) then begin
  667. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  668. end
  669. else begin
  670. z1 := ord( ( a0 or a1 ) <> 0 );
  671. end;
  672. z0 := 0;
  673. end;
  674. z1Ptr := z1;
  675. z0Ptr := z0;
  676. end;
  677. {*
  678. -------------------------------------------------------------------------------
  679. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  680. number of bits given in `count'. Any bits shifted off are lost. The value
  681. of `count' can be arbitrarily large; in particular, if `count' is greater
  682. than 64, the result will be 0. The result is broken into two 32-bit pieces
  683. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  684. -------------------------------------------------------------------------------
  685. *}
  686. Procedure
  687. shift64Right(
  688. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  689. Var
  690. z0, z1: bits32;
  691. negCount : int8;
  692. Begin
  693. negCount := ( - count ) AND 31;
  694. if ( count = 0 ) then
  695. Begin
  696. z1 := a1;
  697. z0 := a0;
  698. End
  699. else if ( count < 32 ) then
  700. Begin
  701. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  702. z0 := a0 shr count;
  703. End
  704. else
  705. Begin
  706. if (count < 64) then
  707. z1 := ( a0 shr ( count AND 31 ) )
  708. else
  709. z1 := 0;
  710. z0 := 0;
  711. End;
  712. z1Ptr := z1;
  713. z0Ptr := z0;
  714. End;
  715. {*
  716. -------------------------------------------------------------------------------
  717. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  718. number of bits given in `count'. If any nonzero bits are shifted off, they
  719. are ``jammed'' into the least significant bit of the result by setting the
  720. least significant bit to 1. The value of `count' can be arbitrarily large;
  721. in particular, if `count' is greater than 64, the result will be either 0
  722. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  723. nonzero. The result is broken into two 32-bit pieces which are stored at
  724. the locations pointed to by `z0Ptr' and `z1Ptr'.
  725. -------------------------------------------------------------------------------
  726. *}
  727. Procedure
  728. shift64RightJamming(
  729. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  730. VAR
  731. z0, z1 : bits32;
  732. negCount : int8;
  733. Begin
  734. negCount := ( - count ) AND 31;
  735. if ( count = 0 ) then
  736. Begin
  737. z1 := a1;
  738. z0 := a0;
  739. End
  740. else
  741. if ( count < 32 ) then
  742. Begin
  743. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  744. z0 := a0 shr count;
  745. End
  746. else
  747. Begin
  748. if ( count = 32 ) then
  749. Begin
  750. z1 := a0 OR bits32( a1 <> 0 );
  751. End
  752. else
  753. if ( count < 64 ) Then
  754. Begin
  755. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  756. End
  757. else
  758. Begin
  759. z1 := bits32( ( a0 OR a1 ) <> 0 );
  760. End;
  761. z0 := 0;
  762. End;
  763. z1Ptr := z1;
  764. z0Ptr := z0;
  765. End;
  766. {*----------------------------------------------------------------------------
  767. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  768. | bits are shifted off, they are ``jammed'' into the least significant bit of
  769. | the result by setting the least significant bit to 1. The value of `count'
  770. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  771. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  772. | The result is stored in the location pointed to by `zPtr'.
  773. *----------------------------------------------------------------------------*}
  774. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  775. var
  776. z: bits64;
  777. begin
  778. if ( count = 0 ) then
  779. begin
  780. z := a;
  781. end
  782. else if ( count < 64 ) then
  783. begin
  784. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  785. end
  786. else
  787. begin
  788. z := ord( a <> 0 );
  789. end;
  790. zPtr := z;
  791. end;
  792. {*
  793. -------------------------------------------------------------------------------
  794. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  795. by 32 _plus_ the number of bits given in `count'. The shifted result is
  796. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  797. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  798. off form a third 32-bit result as follows: The _last_ bit shifted off is
  799. the most-significant bit of the extra result, and the other 31 bits of the
  800. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  801. were all zero. This extra result is stored in the location pointed to by
  802. `z2Ptr'. The value of `count' can be arbitrarily large.
  803. (This routine makes more sense if `a0', `a1', and `a2' are considered
  804. to form a fixed-point value with binary point between `a1' and `a2'. This
  805. fixed-point value is shifted right by the number of bits given in `count',
  806. and the integer part of the result is returned at the locations pointed to
  807. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  808. corrupted as described above, and is returned at the location pointed to by
  809. `z2Ptr'.)
  810. -------------------------------------------------------------------------------
  811. }
  812. Procedure
  813. shift64ExtraRightJamming(
  814. a0: bits32;
  815. a1: bits32;
  816. a2: bits32;
  817. count: int16;
  818. VAR z0Ptr: bits32;
  819. VAR z1Ptr: bits32;
  820. VAR z2Ptr: bits32
  821. );
  822. Var
  823. z0, z1, z2: bits32;
  824. negCount : int8;
  825. Begin
  826. negCount := ( - count ) AND 31;
  827. if ( count = 0 ) then
  828. Begin
  829. z2 := a2;
  830. z1 := a1;
  831. z0 := a0;
  832. End
  833. else
  834. Begin
  835. if ( count < 32 ) Then
  836. Begin
  837. z2 := a1 shl negCount;
  838. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  839. z0 := a0 shr count;
  840. End
  841. else
  842. Begin
  843. if ( count = 32 ) then
  844. Begin
  845. z2 := a1;
  846. z1 := a0;
  847. End
  848. else
  849. Begin
  850. a2 := a2 or a1;
  851. if ( count < 64 ) then
  852. Begin
  853. z2 := a0 shl negCount;
  854. z1 := a0 shr ( count AND 31 );
  855. End
  856. else
  857. Begin
  858. if count = 64 then
  859. z2 := a0
  860. else
  861. z2 := bits32(a0 <> 0);
  862. z1 := 0;
  863. End;
  864. End;
  865. z0 := 0;
  866. End;
  867. z2 := z2 or bits32( a2 <> 0 );
  868. End;
  869. z2Ptr := z2;
  870. z1Ptr := z1;
  871. z0Ptr := z0;
  872. End;
  873. {*
  874. -------------------------------------------------------------------------------
  875. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  876. number of bits given in `count'. Any bits shifted off are lost. The value
  877. of `count' must be less than 32. The result is broken into two 32-bit
  878. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  879. -------------------------------------------------------------------------------
  880. *}
  881. Procedure
  882. shortShift64Left(
  883. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  884. Begin
  885. z1Ptr := a1 shl count;
  886. if count = 0 then
  887. z0Ptr := a0
  888. else
  889. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  890. End;
  891. {*
  892. -------------------------------------------------------------------------------
  893. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  894. by the number of bits given in `count'. Any bits shifted off are lost.
  895. The value of `count' must be less than 32. The result is broken into three
  896. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  897. `z1Ptr', and `z2Ptr'.
  898. -------------------------------------------------------------------------------
  899. *}
  900. Procedure
  901. shortShift96Left(
  902. a0: bits32;
  903. a1: bits32;
  904. a2: bits32;
  905. count: int16;
  906. VAR z0Ptr: bits32;
  907. VAR z1Ptr: bits32;
  908. VAR z2Ptr: bits32
  909. );
  910. Var
  911. z0, z1, z2: bits32;
  912. negCount: int8;
  913. Begin
  914. z2 := a2 shl count;
  915. z1 := a1 shl count;
  916. z0 := a0 shl count;
  917. if ( 0 < count ) then
  918. Begin
  919. negCount := ( ( - count ) AND 31 );
  920. z1 := z1 or (a2 shr negCount);
  921. z0 := z0 or (a1 shr negCount);
  922. End;
  923. z2Ptr := z2;
  924. z1Ptr := z1;
  925. z0Ptr := z0;
  926. End;
  927. {*----------------------------------------------------------------------------
  928. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  929. | number of bits given in `count'. Any bits shifted off are lost. The value
  930. | of `count' must be less than 64. The result is broken into two 64-bit
  931. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  932. *----------------------------------------------------------------------------*}
  933. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  934. begin
  935. z1Ptr := a1 shl count;
  936. if count=0 then
  937. z0Ptr:=a0
  938. else
  939. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  940. end;
  941. {*
  942. -------------------------------------------------------------------------------
  943. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  944. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  945. any carry out is lost. The result is broken into two 32-bit pieces which
  946. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  947. -------------------------------------------------------------------------------
  948. *}
  949. Procedure
  950. add64(
  951. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  952. Var
  953. z1: bits32;
  954. Begin
  955. z1 := a1 + b1;
  956. z1Ptr := z1;
  957. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  958. End;
  959. {*
  960. -------------------------------------------------------------------------------
  961. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  962. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  963. modulo 2^96, so any carry out is lost. The result is broken into three
  964. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  965. `z1Ptr', and `z2Ptr'.
  966. -------------------------------------------------------------------------------
  967. *}
  968. Procedure
  969. add96(
  970. a0: bits32;
  971. a1: bits32;
  972. a2: bits32;
  973. b0: bits32;
  974. b1: bits32;
  975. b2: bits32;
  976. VAR z0Ptr: bits32;
  977. VAR z1Ptr: bits32;
  978. VAR z2Ptr: bits32
  979. );
  980. var
  981. z0, z1, z2: bits32;
  982. carry0, carry1: int8;
  983. Begin
  984. z2 := a2 + b2;
  985. carry1 := int8( z2 < a2 );
  986. z1 := a1 + b1;
  987. carry0 := int8( z1 < a1 );
  988. z0 := a0 + b0;
  989. z1 := z1 + carry1;
  990. z0 := z0 + bits32( z1 < carry1 );
  991. z0 := z0 + carry0;
  992. z2Ptr := z2;
  993. z1Ptr := z1;
  994. z0Ptr := z0;
  995. End;
  996. {*----------------------------------------------------------------------------
  997. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  998. | by the number of bits given in `count'. Any bits shifted off are lost.
  999. | The value of `count' must be less than 64. The result is broken into three
  1000. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1001. | `z1Ptr', and `z2Ptr'.
  1002. *----------------------------------------------------------------------------*}
  1003. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1004. var
  1005. z0, z1, z2 : bits64;
  1006. negCount : int8;
  1007. begin
  1008. z2 := a2 shl count;
  1009. z1 := a1 shl count;
  1010. z0 := a0 shl count;
  1011. if ( 0 < count ) then
  1012. begin
  1013. negCount := ( ( - count ) and 63 );
  1014. z1 := z1 or (a2 shr negCount);
  1015. z0 := z0 or (a1 shr negCount);
  1016. end;
  1017. z2Ptr := z2;
  1018. z1Ptr := z1;
  1019. z0Ptr := z0;
  1020. end;
  1021. {*----------------------------------------------------------------------------
  1022. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1023. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1024. | any carry out is lost. The result is broken into two 64-bit pieces which
  1025. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1026. *----------------------------------------------------------------------------*}
  1027. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1028. var
  1029. z1 : bits64;
  1030. begin
  1031. z1 := a1 + b1;
  1032. z1Ptr := z1;
  1033. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1034. end;
  1035. {*----------------------------------------------------------------------------
  1036. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1037. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1038. | modulo 2^192, so any carry out is lost. The result is broken into three
  1039. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1040. | `z1Ptr', and `z2Ptr'.
  1041. *----------------------------------------------------------------------------*}
  1042. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1043. var
  1044. z0, z1, z2 : bits64;
  1045. carry0, carry1 : int8;
  1046. begin
  1047. z2 := a2 + b2;
  1048. carry1 := ord( z2 < a2 );
  1049. z1 := a1 + b1;
  1050. carry0 := ord( z1 < a1 );
  1051. z0 := a0 + b0;
  1052. inc(z1, carry1);
  1053. inc(z0, ord( z1 < carry1 ));
  1054. inc(z0, carry0);
  1055. z2Ptr := z2;
  1056. z1Ptr := z1;
  1057. z0Ptr := z0;
  1058. end;
  1059. {*
  1060. -------------------------------------------------------------------------------
  1061. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1062. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1063. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1064. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1065. `z1Ptr'.
  1066. -------------------------------------------------------------------------------
  1067. *}
  1068. Procedure
  1069. sub64(
  1070. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1071. Begin
  1072. z1Ptr := a1 - b1;
  1073. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1074. End;
  1075. {*
  1076. -------------------------------------------------------------------------------
  1077. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1078. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1079. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1080. into three 32-bit pieces which are stored at the locations pointed to by
  1081. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1082. -------------------------------------------------------------------------------
  1083. *}
  1084. Procedure
  1085. sub96(
  1086. a0:bits32;
  1087. a1:bits32;
  1088. a2:bits32;
  1089. b0:bits32;
  1090. b1:bits32;
  1091. b2:bits32;
  1092. VAR z0Ptr:bits32;
  1093. VAR z1Ptr:bits32;
  1094. VAR z2Ptr:bits32
  1095. );
  1096. Var
  1097. z0, z1, z2: bits32;
  1098. borrow0, borrow1: int8;
  1099. Begin
  1100. z2 := a2 - b2;
  1101. borrow1 := int8( a2 < b2 );
  1102. z1 := a1 - b1;
  1103. borrow0 := int8( a1 < b1 );
  1104. z0 := a0 - b0;
  1105. z0 := z0 - bits32( z1 < borrow1 );
  1106. z1 := z1 - borrow1;
  1107. z0 := z0 -borrow0;
  1108. z2Ptr := z2;
  1109. z1Ptr := z1;
  1110. z0Ptr := z0;
  1111. End;
  1112. {*----------------------------------------------------------------------------
  1113. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1114. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1115. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1116. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1117. | `z1Ptr'.
  1118. *----------------------------------------------------------------------------*}
  1119. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1120. begin
  1121. z1Ptr := a1 - b1;
  1122. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1123. end;
  1124. {*----------------------------------------------------------------------------
  1125. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1126. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1127. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1128. | result is broken into three 64-bit pieces which are stored at the locations
  1129. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1130. *----------------------------------------------------------------------------*}
  1131. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1132. var
  1133. z0, z1, z2 : bits64;
  1134. borrow0, borrow1 : int8;
  1135. begin
  1136. z2 := a2 - b2;
  1137. borrow1 := ord( a2 < b2 );
  1138. z1 := a1 - b1;
  1139. borrow0 := ord( a1 < b1 );
  1140. z0 := a0 - b0;
  1141. dec(z0, ord( z1 < borrow1 ));
  1142. dec(z1, borrow1);
  1143. dec(z0, borrow0);
  1144. z2Ptr := z2;
  1145. z1Ptr := z1;
  1146. z0Ptr := z0;
  1147. end;
  1148. {*
  1149. -------------------------------------------------------------------------------
  1150. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1151. into two 32-bit pieces which are stored at the locations pointed to by
  1152. `z0Ptr' and `z1Ptr'.
  1153. -------------------------------------------------------------------------------
  1154. *}
  1155. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1156. :bits32 );
  1157. Var
  1158. aHigh, aLow, bHigh, bLow: bits16;
  1159. z0, zMiddleA, zMiddleB, z1: bits32;
  1160. Begin
  1161. aLow := a and $ffff;
  1162. aHigh := a shr 16;
  1163. bLow := b and $ffff;
  1164. bHigh := b shr 16;
  1165. z1 := ( bits32( aLow) ) * bLow;
  1166. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1167. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1168. z0 := ( bits32 (aHigh) ) * bHigh;
  1169. zMiddleA := zMiddleA + zMiddleB;
  1170. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1171. zMiddleA := zmiddleA shl 16;
  1172. z1 := z1 + zMiddleA;
  1173. z0 := z0 + bits32( z1 < zMiddleA );
  1174. z1Ptr := z1;
  1175. z0Ptr := z0;
  1176. End;
  1177. {*
  1178. -------------------------------------------------------------------------------
  1179. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1180. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1181. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1182. `z2Ptr'.
  1183. -------------------------------------------------------------------------------
  1184. *}
  1185. Procedure
  1186. mul64By32To96(
  1187. a0:bits32;
  1188. a1:bits32;
  1189. b:bits32;
  1190. VAR z0Ptr:bits32;
  1191. VAR z1Ptr:bits32;
  1192. VAR z2Ptr:bits32
  1193. );
  1194. Var
  1195. z0, z1, z2, more1: bits32;
  1196. Begin
  1197. mul32To64( a1, b, z1, z2 );
  1198. mul32To64( a0, b, z0, more1 );
  1199. add64( z0, more1, 0, z1, z0, z1 );
  1200. z2Ptr := z2;
  1201. z1Ptr := z1;
  1202. z0Ptr := z0;
  1203. End;
  1204. {*
  1205. -------------------------------------------------------------------------------
  1206. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1207. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1208. product. The product is broken into four 32-bit pieces which are stored at
  1209. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1210. -------------------------------------------------------------------------------
  1211. *}
  1212. Procedure
  1213. mul64To128(
  1214. a0:bits32;
  1215. a1:bits32;
  1216. b0:bits32;
  1217. b1:bits32;
  1218. VAR z0Ptr:bits32;
  1219. VAR z1Ptr:bits32;
  1220. VAR z2Ptr:bits32;
  1221. VAR z3Ptr:bits32
  1222. );
  1223. Var
  1224. z0, z1, z2, z3: bits32;
  1225. more1, more2: bits32;
  1226. Begin
  1227. mul32To64( a1, b1, z2, z3 );
  1228. mul32To64( a1, b0, z1, more2 );
  1229. add64( z1, more2, 0, z2, z1, z2 );
  1230. mul32To64( a0, b0, z0, more1 );
  1231. add64( z0, more1, 0, z1, z0, z1 );
  1232. mul32To64( a0, b1, more1, more2 );
  1233. add64( more1, more2, 0, z2, more1, z2 );
  1234. add64( z0, z1, 0, more1, z0, z1 );
  1235. z3Ptr := z3;
  1236. z2Ptr := z2;
  1237. z1Ptr := z1;
  1238. z0Ptr := z0;
  1239. End;
  1240. {*----------------------------------------------------------------------------
  1241. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1242. | into two 64-bit pieces which are stored at the locations pointed to by
  1243. | `z0Ptr' and `z1Ptr'.
  1244. *----------------------------------------------------------------------------*}
  1245. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1246. var
  1247. aHigh, aLow, bHigh, bLow : bits32;
  1248. z0, zMiddleA, zMiddleB, z1 : bits64;
  1249. begin
  1250. aLow := a;
  1251. aHigh := a shr 32;
  1252. bLow := b;
  1253. bHigh := b shr 32;
  1254. z1 := ( bits64(aLow) ) * bLow;
  1255. zMiddleA := ( bits64( aLow )) * bHigh;
  1256. zMiddleB := ( bits64( aHigh )) * bLow;
  1257. z0 := ( bits64(aHigh) ) * bHigh;
  1258. inc(zMiddleA, zMiddleB);
  1259. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1260. zMiddleA := zMiddleA shl 32;
  1261. inc(z1, zMiddleA);
  1262. inc(z0, ord( z1 < zMiddleA ));
  1263. z1Ptr := z1;
  1264. z0Ptr := z0;
  1265. end;
  1266. {*----------------------------------------------------------------------------
  1267. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1268. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1269. | product. The product is broken into four 64-bit pieces which are stored at
  1270. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1271. *----------------------------------------------------------------------------*}
  1272. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1273. var
  1274. z0,z1,z2,z3,more1,more2 : bits64;
  1275. begin
  1276. mul64To128( a1, b1, z2, z3 );
  1277. mul64To128( a1, b0, z1, more2 );
  1278. add128( z1, more2, 0, z2, z1, z2 );
  1279. mul64To128( a0, b0, z0, more1 );
  1280. add128( z0, more1, 0, z1, z0, z1 );
  1281. mul64To128( a0, b1, more1, more2 );
  1282. add128( more1, more2, 0, z2, more1, z2 );
  1283. add128( z0, z1, 0, more1, z0, z1 );
  1284. z3Ptr := z3;
  1285. z2Ptr := z2;
  1286. z1Ptr := z1;
  1287. z0Ptr := z0;
  1288. end;
  1289. {*----------------------------------------------------------------------------
  1290. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1291. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1292. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1293. | `z2Ptr'.
  1294. *----------------------------------------------------------------------------*}
  1295. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1296. var
  1297. z0, z1, z2, more1 : bits64;
  1298. begin
  1299. mul64To128( a1, b, z1, z2 );
  1300. mul64To128( a0, b, z0, more1 );
  1301. add128( z0, more1, 0, z1, z0, z1 );
  1302. z2Ptr := z2;
  1303. z1Ptr := z1;
  1304. z0Ptr := z0;
  1305. end;
  1306. {*----------------------------------------------------------------------------
  1307. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1308. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1309. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1310. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1311. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1312. | unsigned integer is returned.
  1313. *----------------------------------------------------------------------------*}
  1314. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1315. var
  1316. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1317. begin
  1318. if ( b <= a0 ) then
  1319. begin
  1320. result:=qword( $FFFFFFFFFFFFFFFF );
  1321. exit;
  1322. end;
  1323. b0 := b shr 32;
  1324. if ( b0 shl 32 <= a0 ) then
  1325. z:=qword( $FFFFFFFF00000000 )
  1326. else
  1327. z:=( a0 div b0 ) shl 32;
  1328. mul64To128( b, z, term0, term1 );
  1329. sub128( a0, a1, term0, term1, rem0, rem1 );
  1330. while ( ( sbits64(rem0) ) < 0 ) do begin
  1331. dec(z,qword( $100000000 ));
  1332. b1 := b shl 32;
  1333. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1334. end;
  1335. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1336. if ( b0 shl 32 <= rem0 ) then
  1337. z:=z or $FFFFFFFF
  1338. else
  1339. z:=z or rem0 div b0;
  1340. result:=z;
  1341. end;
  1342. {*
  1343. -------------------------------------------------------------------------------
  1344. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1345. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1346. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1347. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1348. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1349. unsigned integer is returned.
  1350. -------------------------------------------------------------------------------
  1351. *}
  1352. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1353. Var
  1354. b0, b1: bits32;
  1355. rem0, rem1, term0, term1: bits32;
  1356. z: bits32;
  1357. Begin
  1358. if ( b <= a0 ) then
  1359. Begin
  1360. estimateDiv64To32 := $FFFFFFFF;
  1361. exit;
  1362. End;
  1363. b0 := b shr 16;
  1364. if ( b0 shl 16 <= a0 ) then
  1365. z:= $FFFF0000
  1366. else
  1367. z:= ( a0 div b0 ) shl 16;
  1368. mul32To64( b, z, term0, term1 );
  1369. sub64( a0, a1, term0, term1, rem0, rem1 );
  1370. while ( ( sbits32 (rem0) ) < 0 ) do
  1371. Begin
  1372. z := z - $10000;
  1373. b1 := b shl 16;
  1374. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1375. End;
  1376. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1377. if ( b0 shl 16 <= rem0 ) then
  1378. z := z or $FFFF
  1379. else
  1380. z := z or (rem0 div b0);
  1381. estimateDiv64To32 := z;
  1382. End;
  1383. {*
  1384. -------------------------------------------------------------------------------
  1385. Returns an approximation to the square root of the 32-bit significand given
  1386. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1387. `aExp' (the least significant bit) is 1, the integer returned approximates
  1388. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1389. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1390. case, the approximation returned lies strictly within +/-2 of the exact
  1391. value.
  1392. -------------------------------------------------------------------------------
  1393. *}
  1394. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1395. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1396. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1397. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1398. );
  1399. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1400. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1401. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1402. );
  1403. Var
  1404. index: int8;
  1405. z: bits32;
  1406. Begin
  1407. index := ( a shr 27 ) AND 15;
  1408. if ( aExp AND 1 ) <> 0 then
  1409. Begin
  1410. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1411. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1412. a := a shr 1;
  1413. End
  1414. else
  1415. Begin
  1416. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1417. z := a div z + z;
  1418. if ( $20000 <= z ) then
  1419. z := $FFFF8000
  1420. else
  1421. z := ( z shl 15 );
  1422. if ( z <= a ) then
  1423. Begin
  1424. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1425. exit;
  1426. End;
  1427. End;
  1428. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1429. End;
  1430. {*
  1431. -------------------------------------------------------------------------------
  1432. Returns the number of leading 0 bits before the most-significant 1 bit of
  1433. `a'. If `a' is zero, 32 is returned.
  1434. -------------------------------------------------------------------------------
  1435. *}
  1436. Function countLeadingZeros32( a:bits32 ): int8;
  1437. const countLeadingZerosHigh:array[0..255] of int8 = (
  1438. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1439. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1440. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1441. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1442. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1443. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1444. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1445. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1446. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1447. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1448. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1449. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1450. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1451. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1452. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1453. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1454. );
  1455. Var
  1456. shiftCount: int8;
  1457. Begin
  1458. shiftCount := 0;
  1459. if ( a < $10000 ) then
  1460. Begin
  1461. shiftCount := shiftcount + 16;
  1462. a := a shl 16;
  1463. End;
  1464. if ( a < $1000000 ) then
  1465. Begin
  1466. shiftCount := shiftcount + 8;
  1467. a := a shl 8;
  1468. end;
  1469. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1470. countLeadingZeros32:= shiftCount;
  1471. End;
  1472. {*----------------------------------------------------------------------------
  1473. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1474. | `a'. If `a' is zero, 64 is returned.
  1475. *----------------------------------------------------------------------------*}
  1476. function countLeadingZeros64( a : bits64): int8;
  1477. var
  1478. shiftcount : int8;
  1479. Begin
  1480. shiftCount := 0;
  1481. if ( a < bits64(bits64(1) shl 32 )) then
  1482. shiftCount := shiftcount + 32
  1483. else
  1484. a := a shr 32;
  1485. shiftCount := shiftCount + countLeadingZeros32( a );
  1486. countLeadingZeros64:= shiftCount;
  1487. End;
  1488. {*
  1489. -------------------------------------------------------------------------------
  1490. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1491. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1492. returns 0.
  1493. -------------------------------------------------------------------------------
  1494. *}
  1495. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1496. Begin
  1497. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1498. End;
  1499. {*
  1500. -------------------------------------------------------------------------------
  1501. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1502. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1503. Otherwise, returns 0.
  1504. -------------------------------------------------------------------------------
  1505. *}
  1506. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1507. Begin
  1508. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1509. End;
  1510. {*
  1511. -------------------------------------------------------------------------------
  1512. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1513. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1514. returns 0.
  1515. -------------------------------------------------------------------------------
  1516. *}
  1517. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1518. Begin
  1519. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1520. End;
  1521. {*
  1522. -------------------------------------------------------------------------------
  1523. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1524. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1525. returns 0.
  1526. -------------------------------------------------------------------------------
  1527. *}
  1528. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1529. Begin
  1530. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1531. End;
  1532. const
  1533. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1534. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1535. (*****************************************************************************)
  1536. (* End Low-Level arithmetic *)
  1537. (*****************************************************************************)
  1538. {*
  1539. -------------------------------------------------------------------------------
  1540. Functions and definitions to determine: (1) whether tininess for underflow
  1541. is detected before or after rounding by default, (2) what (if anything)
  1542. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1543. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1544. are propagated from function inputs to output. These details are ENDIAN
  1545. specific
  1546. -------------------------------------------------------------------------------
  1547. *}
  1548. {$IFDEF ENDIAN_LITTLE}
  1549. {*
  1550. -------------------------------------------------------------------------------
  1551. Internal canonical NaN format.
  1552. -------------------------------------------------------------------------------
  1553. *}
  1554. TYPE
  1555. commonNaNT = packed record
  1556. sign: flag;
  1557. high, low : bits32;
  1558. end;
  1559. {*
  1560. -------------------------------------------------------------------------------
  1561. The pattern for a default generated single-precision NaN.
  1562. -------------------------------------------------------------------------------
  1563. *}
  1564. const float32_default_nan = $FFC00000;
  1565. {*
  1566. -------------------------------------------------------------------------------
  1567. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1568. otherwise returns 0.
  1569. -------------------------------------------------------------------------------
  1570. *}
  1571. Function float32_is_nan( a : float32 ): flag;
  1572. Begin
  1573. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1574. End;
  1575. {*
  1576. -------------------------------------------------------------------------------
  1577. Returns 1 if the single-precision floating-point value `a' is a signaling
  1578. NaN; otherwise returns 0.
  1579. -------------------------------------------------------------------------------
  1580. *}
  1581. Function float32_is_signaling_nan( a : float32 ): flag;
  1582. Begin
  1583. float32_is_signaling_nan := flag
  1584. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1585. End;
  1586. {*
  1587. -------------------------------------------------------------------------------
  1588. Returns the result of converting the single-precision floating-point NaN
  1589. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1590. exception is raised.
  1591. -------------------------------------------------------------------------------
  1592. *}
  1593. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1594. var
  1595. z : commonNaNT ;
  1596. Begin
  1597. if ( float32_is_signaling_nan( a ) <> 0) then
  1598. float_raise( float_flag_invalid );
  1599. z.sign := a shr 31;
  1600. z.low := 0;
  1601. z.high := a shl 9;
  1602. c := z;
  1603. End;
  1604. {*
  1605. -------------------------------------------------------------------------------
  1606. Returns the result of converting the canonical NaN `a' to the single-
  1607. precision floating-point format.
  1608. -------------------------------------------------------------------------------
  1609. *}
  1610. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1611. Begin
  1612. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1613. End;
  1614. {*
  1615. -------------------------------------------------------------------------------
  1616. Takes two single-precision floating-point values `a' and `b', one of which
  1617. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1618. signaling NaN, the invalid exception is raised.
  1619. -------------------------------------------------------------------------------
  1620. *}
  1621. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1622. Var
  1623. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1624. label returnLargerSignificand;
  1625. Begin
  1626. aIsNaN := float32_is_nan( a );
  1627. aIsSignalingNaN := float32_is_signaling_nan( a );
  1628. bIsNaN := float32_is_nan( b );
  1629. bIsSignalingNaN := float32_is_signaling_nan( b );
  1630. a := a or $00400000;
  1631. b := b or $00400000;
  1632. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1633. float_raise( float_flag_invalid );
  1634. if ( aIsSignalingNaN )<> 0 then
  1635. Begin
  1636. if ( bIsSignalingNaN ) <> 0 then
  1637. goto returnLargerSignificand;
  1638. if bIsNan <> 0 then
  1639. propagateFloat32NaN := b
  1640. else
  1641. propagateFloat32NaN := a;
  1642. exit;
  1643. End
  1644. else if ( aIsNaN <> 0) then
  1645. Begin
  1646. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1647. Begin
  1648. propagateFloat32NaN := a;
  1649. exit;
  1650. End;
  1651. returnLargerSignificand:
  1652. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1653. Begin
  1654. propagateFloat32NaN := b;
  1655. exit;
  1656. End;
  1657. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1658. Begin
  1659. propagateFloat32NaN := a;
  1660. End;
  1661. if a < b then
  1662. propagateFloat32NaN := a
  1663. else
  1664. propagateFloat32NaN := b;
  1665. exit;
  1666. End
  1667. else
  1668. Begin
  1669. propagateFloat32NaN := b;
  1670. exit;
  1671. End;
  1672. End;
  1673. {*
  1674. -------------------------------------------------------------------------------
  1675. The pattern for a default generated double-precision NaN. The `high' and
  1676. `low' values hold the most- and least-significant bits, respectively.
  1677. -------------------------------------------------------------------------------
  1678. *}
  1679. const
  1680. float64_default_nan_high = $FFF80000;
  1681. float64_default_nan_low = $00000000;
  1682. {*
  1683. -------------------------------------------------------------------------------
  1684. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1685. otherwise returns 0.
  1686. -------------------------------------------------------------------------------
  1687. *}
  1688. Function float64_is_nan( a : float64 ) : flag;
  1689. Begin
  1690. float64_is_nan :=
  1691. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1692. and ( a.low or ( a.high and $000FFFFF ) );
  1693. End;
  1694. {*
  1695. -------------------------------------------------------------------------------
  1696. Returns 1 if the double-precision floating-point value `a' is a signaling
  1697. NaN; otherwise returns 0.
  1698. -------------------------------------------------------------------------------
  1699. *}
  1700. Function float64_is_signaling_nan( a : float64 ): flag;
  1701. Begin
  1702. float64_is_signaling_nan :=
  1703. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1704. and ( a.low or ( a.high and $0007FFFF ) );
  1705. End;
  1706. {*
  1707. -------------------------------------------------------------------------------
  1708. Returns the result of converting the double-precision floating-point NaN
  1709. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1710. exception is raised.
  1711. -------------------------------------------------------------------------------
  1712. *}
  1713. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1714. Var
  1715. z : commonNaNT;
  1716. Begin
  1717. if ( float64_is_signaling_nan( a )<>0 ) then
  1718. float_raise( float_flag_invalid );
  1719. z.sign := a.high shr 31;
  1720. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1721. c := z;
  1722. End;
  1723. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1724. Var
  1725. z : commonNaNT;
  1726. Begin
  1727. if ( float64_is_signaling_nan( a )<>0 ) then
  1728. float_raise( float_flag_invalid );
  1729. z.sign := a.high shr 31;
  1730. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1731. result := z;
  1732. End;
  1733. {*
  1734. -------------------------------------------------------------------------------
  1735. Returns the result of converting the canonical NaN `a' to the double-
  1736. precision floating-point format.
  1737. -------------------------------------------------------------------------------
  1738. *}
  1739. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1740. Var
  1741. z: float64;
  1742. Begin
  1743. shift64Right( a.high, a.low, 12, z.high, z.low );
  1744. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1745. c := z;
  1746. End;
  1747. {*
  1748. -------------------------------------------------------------------------------
  1749. Takes two double-precision floating-point values `a' and `b', one of which
  1750. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1751. signaling NaN, the invalid exception is raised.
  1752. -------------------------------------------------------------------------------
  1753. *}
  1754. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1755. Var
  1756. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1757. label returnLargerSignificand;
  1758. Begin
  1759. aIsNaN := float64_is_nan( a );
  1760. aIsSignalingNaN := float64_is_signaling_nan( a );
  1761. bIsNaN := float64_is_nan( b );
  1762. bIsSignalingNaN := float64_is_signaling_nan( b );
  1763. a.high := a.high or $00080000;
  1764. b.high := b.high or $00080000;
  1765. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1766. float_raise( float_flag_invalid );
  1767. if ( aIsSignalingNaN )<>0 then
  1768. Begin
  1769. if ( bIsSignalingNaN )<>0 then
  1770. goto returnLargerSignificand;
  1771. if bIsNan <> 0 then
  1772. c := b
  1773. else
  1774. c := a;
  1775. exit;
  1776. End
  1777. else if ( aIsNaN )<> 0 then
  1778. Begin
  1779. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1780. Begin
  1781. c := a;
  1782. exit;
  1783. End;
  1784. returnLargerSignificand:
  1785. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1786. Begin
  1787. c := b;
  1788. exit;
  1789. End;
  1790. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1791. Begin
  1792. c := a;
  1793. exit;
  1794. End;
  1795. if a.high < b.high then
  1796. c := a
  1797. else
  1798. c := b;
  1799. exit;
  1800. End
  1801. else
  1802. Begin
  1803. c := b;
  1804. exit;
  1805. End;
  1806. End;
  1807. {*----------------------------------------------------------------------------
  1808. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1809. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1810. | returns 0.
  1811. *----------------------------------------------------------------------------*}
  1812. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1813. begin
  1814. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1815. end;
  1816. {*----------------------------------------------------------------------------
  1817. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1818. | otherwise returns 0.
  1819. *----------------------------------------------------------------------------*}
  1820. function float128_is_nan( a : float128): flag;
  1821. begin
  1822. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1823. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1824. end;
  1825. {*----------------------------------------------------------------------------
  1826. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1827. | signaling NaN; otherwise returns 0.
  1828. *----------------------------------------------------------------------------*}
  1829. function float128_is_signaling_nan( a : float128): flag;
  1830. begin
  1831. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1832. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1833. end;
  1834. {*----------------------------------------------------------------------------
  1835. | Returns the result of converting the quadruple-precision floating-point NaN
  1836. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1837. | exception is raised.
  1838. *----------------------------------------------------------------------------*}
  1839. function float128ToCommonNaN( a : float128): commonNaNT;
  1840. var
  1841. z: commonNaNT;
  1842. qhigh,qlow : qword;
  1843. begin
  1844. if ( float128_is_signaling_nan( a )<>0) then
  1845. float_raise( float_flag_invalid );
  1846. z.sign := a.high shr 63;
  1847. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1848. z.high:=qhigh shr 32;
  1849. z.low:=qhigh and $ffffffff;
  1850. result:=z;
  1851. end;
  1852. {*----------------------------------------------------------------------------
  1853. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1854. | precision floating-point format.
  1855. *----------------------------------------------------------------------------*}
  1856. function commonNaNToFloat128( a : commonNaNT): float128;
  1857. var
  1858. z: float128;
  1859. begin
  1860. shift128Right( a.high, a.low, 16, z.high, z.low );
  1861. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1862. result:=z;
  1863. end;
  1864. {*----------------------------------------------------------------------------
  1865. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1866. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1867. | `b' is a signaling NaN, the invalid exception is raised.
  1868. *----------------------------------------------------------------------------*}
  1869. function propagateFloat128NaN( a: float128; b : float128): float128;
  1870. var
  1871. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1872. label
  1873. returnLargerSignificand;
  1874. begin
  1875. aIsNaN := float128_is_nan( a );
  1876. aIsSignalingNaN := float128_is_signaling_nan( a );
  1877. bIsNaN := float128_is_nan( b );
  1878. bIsSignalingNaN := float128_is_signaling_nan( b );
  1879. a.high := a.high or int64( $0000800000000000 );
  1880. b.high := b.high or int64( $0000800000000000 );
  1881. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1882. float_raise( float_flag_invalid );
  1883. if ( aIsSignalingNaN )<>0 then
  1884. begin
  1885. if ( bIsSignalingNaN )<>0 then
  1886. goto returnLargerSignificand;
  1887. if bIsNaN<>0 then
  1888. result := b
  1889. else
  1890. result := a;
  1891. exit;
  1892. end
  1893. else if ( aIsNaN )<>0 then
  1894. begin
  1895. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1896. begin
  1897. result := a;
  1898. exit;
  1899. end;
  1900. returnLargerSignificand:
  1901. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1902. begin
  1903. result := b;
  1904. exit;
  1905. end;
  1906. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1907. begin
  1908. result := a;
  1909. exit
  1910. end;
  1911. if ( a.high < b.high ) then
  1912. result := a
  1913. else
  1914. result := b;
  1915. exit;
  1916. end
  1917. else
  1918. result:=b;
  1919. end;
  1920. {$ELSE}
  1921. { Big endian code }
  1922. (*----------------------------------------------------------------------------
  1923. | Internal canonical NaN format.
  1924. *----------------------------------------------------------------------------*)
  1925. type
  1926. commonNANT = packed record
  1927. sign : flag;
  1928. high, low : bits32;
  1929. end;
  1930. (*----------------------------------------------------------------------------
  1931. | The pattern for a default generated single-precision NaN.
  1932. *----------------------------------------------------------------------------*)
  1933. const float32_default_nan = $7FFFFFFF;
  1934. (*----------------------------------------------------------------------------
  1935. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1936. | otherwise returns 0.
  1937. *----------------------------------------------------------------------------*)
  1938. function float32_is_nan(a: float32): flag;
  1939. begin
  1940. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1941. end;
  1942. (*----------------------------------------------------------------------------
  1943. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1944. | NaN; otherwise returns 0.
  1945. *----------------------------------------------------------------------------*)
  1946. function float32_is_signaling_nan(a: float32):flag;
  1947. begin
  1948. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1949. end;
  1950. (*----------------------------------------------------------------------------
  1951. | Returns the result of converting the single-precision floating-point NaN
  1952. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1953. | exception is raised.
  1954. *----------------------------------------------------------------------------*)
  1955. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1956. var
  1957. z: commonNANT;
  1958. begin
  1959. if float32_is_signaling_nan(a)<>0 then
  1960. float_raise(float_flag_invalid);
  1961. z.sign := a shr 31;
  1962. z.low := 0;
  1963. z.high := a shl 9;
  1964. c:=z;
  1965. end;
  1966. (*----------------------------------------------------------------------------
  1967. | Returns the result of converting the canonical NaN `a' to the single-
  1968. | precision floating-point format.
  1969. *----------------------------------------------------------------------------*)
  1970. function CommonNanToFloat32(a : CommonNaNT): float32;
  1971. begin
  1972. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1973. end;
  1974. (*----------------------------------------------------------------------------
  1975. | Takes two single-precision floating-point values `a' and `b', one of which
  1976. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1977. | signaling NaN, the invalid exception is raised.
  1978. *----------------------------------------------------------------------------*)
  1979. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1980. var
  1981. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1982. begin
  1983. aIsNaN := float32_is_nan( a );
  1984. aIsSignalingNaN := float32_is_signaling_nan( a );
  1985. bIsNaN := float32_is_nan( b );
  1986. bIsSignalingNaN := float32_is_signaling_nan( b );
  1987. a := a or $00400000;
  1988. b := b or $00400000;
  1989. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1990. float_raise( float_flag_invalid );
  1991. if bIsSignalingNaN<>0 then
  1992. propagateFloat32Nan := b
  1993. else if aIsSignalingNan<>0 then
  1994. propagateFloat32Nan := a
  1995. else if bIsNan<>0 then
  1996. propagateFloat32Nan := b
  1997. else
  1998. propagateFloat32Nan := a;
  1999. end;
  2000. (*----------------------------------------------------------------------------
  2001. | The pattern for a default generated double-precision NaN. The `high' and
  2002. | `low' values hold the most- and least-significant bits, respectively.
  2003. *----------------------------------------------------------------------------*)
  2004. const
  2005. float64_default_nan_high = $7FFFFFFF;
  2006. float64_default_nan_low = $FFFFFFFF;
  2007. (*----------------------------------------------------------------------------
  2008. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2009. | otherwise returns 0.
  2010. *----------------------------------------------------------------------------*)
  2011. function float64_is_nan(a: float64): flag;
  2012. begin
  2013. float64_is_nan := flag (
  2014. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2015. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2016. end;
  2017. (*----------------------------------------------------------------------------
  2018. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2019. | NaN; otherwise returns 0.
  2020. *----------------------------------------------------------------------------*)
  2021. function float64_is_signaling_nan( a:float64): flag;
  2022. begin
  2023. float64_is_signaling_nan := flag(
  2024. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2025. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2026. end;
  2027. (*----------------------------------------------------------------------------
  2028. | Returns the result of converting the double-precision floating-point NaN
  2029. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2030. | exception is raised.
  2031. *----------------------------------------------------------------------------*)
  2032. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2033. var
  2034. z : commonNaNT;
  2035. begin
  2036. if ( float64_is_signaling_nan( a )<>0 ) then
  2037. float_raise( float_flag_invalid );
  2038. z.sign := a.high shr 31;
  2039. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2040. c:=z;
  2041. end;
  2042. (*----------------------------------------------------------------------------
  2043. | Returns the result of converting the canonical NaN `a' to the double-
  2044. | precision floating-point format.
  2045. *----------------------------------------------------------------------------*)
  2046. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2047. var
  2048. z: float64;
  2049. begin
  2050. shift64Right( a.high, a.low, 12, z.high, z.low );
  2051. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2052. c:=z;
  2053. end;
  2054. (*----------------------------------------------------------------------------
  2055. | Takes two double-precision floating-point values `a' and `b', one of which
  2056. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2057. | signaling NaN, the invalid exception is raised.
  2058. *----------------------------------------------------------------------------*)
  2059. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2060. var
  2061. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2062. begin
  2063. aIsNaN := float64_is_nan( a );
  2064. aIsSignalingNaN := float64_is_signaling_nan( a );
  2065. bIsNaN := float64_is_nan( b );
  2066. bIsSignalingNaN := float64_is_signaling_nan( b );
  2067. a.high := a.high or $00080000;
  2068. b.high := b.high or $00080000;
  2069. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2070. float_raise( float_flag_invalid );
  2071. if bIsSignalingNaN<>0 then
  2072. c := b
  2073. else if aIsSignalingNan<>0 then
  2074. c := a
  2075. else if bIsNan<>0 then
  2076. c := b
  2077. else
  2078. c := a;
  2079. end;
  2080. {$ENDIF}
  2081. (****************************************************************************)
  2082. (* END ENDIAN SPECIFIC CODE *)
  2083. (****************************************************************************)
  2084. {*
  2085. -------------------------------------------------------------------------------
  2086. Returns the fraction bits of the single-precision floating-point value `a'.
  2087. -------------------------------------------------------------------------------
  2088. *}
  2089. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2090. Begin
  2091. ExtractFloat32Frac := A AND $007FFFFF;
  2092. End;
  2093. {*
  2094. -------------------------------------------------------------------------------
  2095. Returns the exponent bits of the single-precision floating-point value `a'.
  2096. -------------------------------------------------------------------------------
  2097. *}
  2098. Function extractFloat32Exp( a: float32 ): Int16;
  2099. Begin
  2100. extractFloat32Exp := (a shr 23) AND $FF;
  2101. End;
  2102. {*
  2103. -------------------------------------------------------------------------------
  2104. Returns the sign bit of the single-precision floating-point value `a'.
  2105. -------------------------------------------------------------------------------
  2106. *}
  2107. Function extractFloat32Sign( a: float32 ): Flag;
  2108. Begin
  2109. extractFloat32Sign := a shr 31;
  2110. End;
  2111. {*
  2112. -------------------------------------------------------------------------------
  2113. Normalizes the subnormal single-precision floating-point value represented
  2114. by the denormalized significand `aSig'. The normalized exponent and
  2115. significand are stored at the locations pointed to by `zExpPtr' and
  2116. `zSigPtr', respectively.
  2117. -------------------------------------------------------------------------------
  2118. *}
  2119. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2120. Var
  2121. ShiftCount : BYTE;
  2122. Begin
  2123. shiftCount := countLeadingZeros32( aSig ) - 8;
  2124. zSigPtr := aSig shl shiftCount;
  2125. zExpPtr := 1 - shiftCount;
  2126. End;
  2127. {*
  2128. -------------------------------------------------------------------------------
  2129. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2130. single-precision floating-point value, returning the result. After being
  2131. shifted into the proper positions, the three fields are simply added
  2132. together to form the result. This means that any integer portion of `zSig'
  2133. will be added into the exponent. Since a properly normalized significand
  2134. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2135. than the desired result exponent whenever `zSig' is a complete, normalized
  2136. significand.
  2137. -------------------------------------------------------------------------------
  2138. *}
  2139. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2140. Begin
  2141. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2142. + zSig;
  2143. End;
  2144. {*
  2145. -------------------------------------------------------------------------------
  2146. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2147. and significand `zSig', and returns the proper single-precision floating-
  2148. point value corresponding to the abstract input. Ordinarily, the abstract
  2149. value is simply rounded and packed into the single-precision format, with
  2150. the inexact exception raised if the abstract input cannot be represented
  2151. exactly. However, if the abstract value is too large, the overflow and
  2152. inexact exceptions are raised and an infinity or maximal finite value is
  2153. returned. If the abstract value is too small, the input value is rounded to
  2154. a subnormal number, and the underflow and inexact exceptions are raised if
  2155. the abstract input cannot be represented exactly as a subnormal single-
  2156. precision floating-point number.
  2157. The input significand `zSig' has its binary point between bits 30
  2158. and 29, which is 7 bits to the left of the usual location. This shifted
  2159. significand must be normalized or smaller. If `zSig' is not normalized,
  2160. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2161. and it must not require rounding. In the usual case that `zSig' is
  2162. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2163. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2164. Binary Floating-Point Arithmetic.
  2165. -------------------------------------------------------------------------------
  2166. *}
  2167. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2168. Var
  2169. roundingMode : BYTE;
  2170. roundNearestEven : Flag;
  2171. roundIncrement, roundBits : BYTE;
  2172. IsTiny : Flag;
  2173. Begin
  2174. roundingMode := softfloat_rounding_mode;
  2175. if (roundingMode = float_round_nearest_even) then
  2176. Begin
  2177. roundNearestEven := Flag(TRUE);
  2178. end
  2179. else
  2180. roundNearestEven := Flag(FALSE);
  2181. roundIncrement := $40;
  2182. if ( Boolean(roundNearestEven) = FALSE) then
  2183. Begin
  2184. if ( roundingMode = float_round_to_zero ) Then
  2185. Begin
  2186. roundIncrement := 0;
  2187. End
  2188. else
  2189. Begin
  2190. roundIncrement := $7F;
  2191. if ( zSign <> 0 ) then
  2192. Begin
  2193. if roundingMode = float_round_up then roundIncrement := 0;
  2194. End
  2195. else
  2196. Begin
  2197. if roundingMode = float_round_down then roundIncrement := 0;
  2198. End;
  2199. End
  2200. End;
  2201. roundBits := zSig AND $7F;
  2202. if ($FD <= bits16 (zExp) ) then
  2203. Begin
  2204. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2205. Begin
  2206. float_raise( float_flag_overflow OR float_flag_inexact );
  2207. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2208. exit;
  2209. End;
  2210. if ( zExp < 0 ) then
  2211. Begin
  2212. isTiny :=
  2213. flag(( float_detect_tininess = float_tininess_before_rounding )
  2214. OR ( zExp < -1 )
  2215. OR ( (zSig + roundIncrement) < $80000000 ));
  2216. shift32RightJamming( zSig, - zExp, zSig );
  2217. zExp := 0;
  2218. roundBits := zSig AND $7F;
  2219. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2220. float_raise( float_flag_underflow );
  2221. End;
  2222. End;
  2223. if ( roundBits )<> 0 then
  2224. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2225. zSig := ( zSig + roundIncrement ) shr 7;
  2226. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2227. if ( zSig = 0 ) then zExp := 0;
  2228. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2229. exit;
  2230. End;
  2231. {*
  2232. -------------------------------------------------------------------------------
  2233. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2234. and significand `zSig', and returns the proper single-precision floating-
  2235. point value corresponding to the abstract input. This routine is just like
  2236. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2237. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2238. floating-point exponent.
  2239. -------------------------------------------------------------------------------
  2240. *}
  2241. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2242. Var
  2243. ShiftCount : int8;
  2244. Begin
  2245. shiftCount := countLeadingZeros32( zSig ) - 1;
  2246. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2247. End;
  2248. {*
  2249. -------------------------------------------------------------------------------
  2250. Returns the most-significant 20 fraction bits of the double-precision
  2251. floating-point value `a'.
  2252. -------------------------------------------------------------------------------
  2253. *}
  2254. Function extractFloat64Frac0(a: float64): bits32;
  2255. Begin
  2256. extractFloat64Frac0 := a.high and $000FFFFF;
  2257. End;
  2258. {*
  2259. -------------------------------------------------------------------------------
  2260. Returns the least-significant 32 fraction bits of the double-precision
  2261. floating-point value `a'.
  2262. -------------------------------------------------------------------------------
  2263. *}
  2264. Function extractFloat64Frac1(a: float64): bits32;
  2265. Begin
  2266. extractFloat64Frac1 := a.low;
  2267. End;
  2268. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2269. Function extractFloat64Frac(a: float64): bits64;
  2270. Begin
  2271. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2272. End;
  2273. {*
  2274. -------------------------------------------------------------------------------
  2275. Returns the exponent bits of the double-precision floating-point value `a'.
  2276. -------------------------------------------------------------------------------
  2277. *}
  2278. Function extractFloat64Exp(a: float64): int16;
  2279. Begin
  2280. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2281. End;
  2282. {*
  2283. -------------------------------------------------------------------------------
  2284. Returns the sign bit of the double-precision floating-point value `a'.
  2285. -------------------------------------------------------------------------------
  2286. *}
  2287. Function extractFloat64Sign(a: float64) : flag;
  2288. Begin
  2289. extractFloat64Sign := a.high shr 31;
  2290. End;
  2291. {*
  2292. -------------------------------------------------------------------------------
  2293. Normalizes the subnormal double-precision floating-point value represented
  2294. by the denormalized significand formed by the concatenation of `aSig0' and
  2295. `aSig1'. The normalized exponent is stored at the location pointed to by
  2296. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2297. stored at the location pointed to by `zSig0Ptr', and the least significant
  2298. 32 bits of the normalized significand are stored at the location pointed to
  2299. by `zSig1Ptr'.
  2300. -------------------------------------------------------------------------------
  2301. *}
  2302. Procedure normalizeFloat64Subnormal(
  2303. aSig0: bits32;
  2304. aSig1: bits32;
  2305. VAR zExpPtr : Int16;
  2306. VAR zSig0Ptr : Bits32;
  2307. VAR zSig1Ptr : Bits32
  2308. );
  2309. Var
  2310. ShiftCount : Int8;
  2311. Begin
  2312. if ( aSig0 = 0 ) then
  2313. Begin
  2314. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2315. if ( shiftCount < 0 ) then
  2316. Begin
  2317. zSig0Ptr := aSig1 shr ( - shiftCount );
  2318. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2319. End
  2320. else
  2321. Begin
  2322. zSig0Ptr := aSig1 shl shiftCount;
  2323. zSig1Ptr := 0;
  2324. End;
  2325. zExpPtr := - shiftCount - 31;
  2326. End
  2327. else
  2328. Begin
  2329. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2330. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2331. zExpPtr := 1 - shiftCount;
  2332. End;
  2333. End;
  2334. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2335. var
  2336. shiftCount : int8;
  2337. begin
  2338. shiftCount := countLeadingZeros64( aSig ) - 11;
  2339. zSigPtr := aSig shl shiftCount;
  2340. zExpPtr := 1 - shiftCount;
  2341. end;
  2342. {*
  2343. -------------------------------------------------------------------------------
  2344. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2345. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2346. point value, returning the result. After being shifted into the proper
  2347. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2348. together to form the most significant 32 bits of the result. This means
  2349. that any integer portion of `zSig0' will be added into the exponent. Since
  2350. a properly normalized significand will have an integer portion equal to 1,
  2351. the `zExp' input should be 1 less than the desired result exponent whenever
  2352. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2353. -------------------------------------------------------------------------------
  2354. *}
  2355. Procedure
  2356. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2357. var
  2358. z: Float64;
  2359. Begin
  2360. z.low := zSig1;
  2361. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2362. c := z;
  2363. End;
  2364. {*----------------------------------------------------------------------------
  2365. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2366. | double-precision floating-point value, returning the result. After being
  2367. | shifted into the proper positions, the three fields are simply added
  2368. | together to form the result. This means that any integer portion of `zSig'
  2369. | will be added into the exponent. Since a properly normalized significand
  2370. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2371. | than the desired result exponent whenever `zSig' is a complete, normalized
  2372. | significand.
  2373. *----------------------------------------------------------------------------*}
  2374. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2375. begin
  2376. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2377. end;
  2378. {*
  2379. -------------------------------------------------------------------------------
  2380. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2381. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2382. and `zSig2', and returns the proper double-precision floating-point value
  2383. corresponding to the abstract input. Ordinarily, the abstract value is
  2384. simply rounded and packed into the double-precision format, with the inexact
  2385. exception raised if the abstract input cannot be represented exactly.
  2386. However, if the abstract value is too large, the overflow and inexact
  2387. exceptions are raised and an infinity or maximal finite value is returned.
  2388. If the abstract value is too small, the input value is rounded to a
  2389. subnormal number, and the underflow and inexact exceptions are raised if the
  2390. abstract input cannot be represented exactly as a subnormal double-precision
  2391. floating-point number.
  2392. The input significand must be normalized or smaller. If the input
  2393. significand is not normalized, `zExp' must be 0; in that case, the result
  2394. returned is a subnormal number, and it must not require rounding. In the
  2395. usual case that the input significand is normalized, `zExp' must be 1 less
  2396. than the ``true'' floating-point exponent. The handling of underflow and
  2397. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2398. -------------------------------------------------------------------------------
  2399. *}
  2400. Procedure
  2401. roundAndPackFloat64(
  2402. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2403. Var
  2404. roundingMode : Int8;
  2405. roundNearestEven, increment, isTiny : Flag;
  2406. Begin
  2407. roundingMode := softfloat_rounding_mode;
  2408. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2409. increment := flag( sbits32 (zSig2) < 0 );
  2410. if ( roundNearestEven = flag(FALSE) ) then
  2411. Begin
  2412. if ( roundingMode = float_round_to_zero ) then
  2413. increment := 0
  2414. else
  2415. Begin
  2416. if ( zSign )<> 0 then
  2417. Begin
  2418. increment := flag( roundingMode = float_round_down ) and zSig2;
  2419. End
  2420. else
  2421. Begin
  2422. increment := flag( roundingMode = float_round_up ) and zSig2;
  2423. End
  2424. End
  2425. End;
  2426. if ( $7FD <= bits16 (zExp) ) then
  2427. Begin
  2428. if (( $7FD < zExp )
  2429. or (( zExp = $7FD )
  2430. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2431. and (increment<>0)
  2432. )
  2433. ) then
  2434. Begin
  2435. float_raise( float_flag_overflow OR float_flag_inexact );
  2436. if (( roundingMode = float_round_to_zero )
  2437. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2438. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2439. ) then
  2440. Begin
  2441. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2442. exit;
  2443. End;
  2444. packFloat64( zSign, $7FF, 0, 0, c );
  2445. exit;
  2446. End;
  2447. if ( zExp < 0 ) then
  2448. Begin
  2449. isTiny :=
  2450. flag( float_detect_tininess = float_tininess_before_rounding )
  2451. or flag( zExp < -1 )
  2452. or flag(increment = 0)
  2453. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2454. shift64ExtraRightJamming(
  2455. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2456. zExp := 0;
  2457. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2458. if ( roundNearestEven )<>0 then
  2459. Begin
  2460. increment := flag( sbits32 (zSig2) < 0 );
  2461. End
  2462. else
  2463. Begin
  2464. if ( zSign )<>0 then
  2465. Begin
  2466. increment := flag( roundingMode = float_round_down ) and zSig2;
  2467. End
  2468. else
  2469. Begin
  2470. increment := flag( roundingMode = float_round_up ) and zSig2;
  2471. End
  2472. End;
  2473. End;
  2474. End;
  2475. if ( zSig2 )<>0 then
  2476. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2477. if ( increment )<>0 then
  2478. Begin
  2479. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2480. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2481. End
  2482. else
  2483. Begin
  2484. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2485. End;
  2486. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2487. End;
  2488. {*----------------------------------------------------------------------------
  2489. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2490. | and significand `zSig', and returns the proper double-precision floating-
  2491. | point value corresponding to the abstract input. Ordinarily, the abstract
  2492. | value is simply rounded and packed into the double-precision format, with
  2493. | the inexact exception raised if the abstract input cannot be represented
  2494. | exactly. However, if the abstract value is too large, the overflow and
  2495. | inexact exceptions are raised and an infinity or maximal finite value is
  2496. | returned. If the abstract value is too small, the input value is rounded
  2497. | to a subnormal number, and the underflow and inexact exceptions are raised
  2498. | if the abstract input cannot be represented exactly as a subnormal double-
  2499. | precision floating-point number.
  2500. | The input significand `zSig' has its binary point between bits 62
  2501. | and 61, which is 10 bits to the left of the usual location. This shifted
  2502. | significand must be normalized or smaller. If `zSig' is not normalized,
  2503. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2504. | and it must not require rounding. In the usual case that `zSig' is
  2505. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2506. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2507. | Binary Floating-Point Arithmetic.
  2508. *----------------------------------------------------------------------------*}
  2509. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2510. var
  2511. roundingMode: int8;
  2512. roundNearestEven: flag;
  2513. roundIncrement, roundBits: int16;
  2514. isTiny: flag;
  2515. begin
  2516. roundingMode := softfloat_rounding_mode;
  2517. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2518. roundIncrement := $200;
  2519. if ( roundNearestEven=0 ) then
  2520. begin
  2521. if ( roundingMode = float_round_to_zero ) then
  2522. begin
  2523. roundIncrement := 0;
  2524. end
  2525. else begin
  2526. roundIncrement := $3FF;
  2527. if ( zSign<>0 ) then
  2528. begin
  2529. if ( roundingMode = float_round_up ) then
  2530. roundIncrement := 0;
  2531. end
  2532. else begin
  2533. if ( roundingMode = float_round_down ) then
  2534. roundIncrement := 0;
  2535. end
  2536. end
  2537. end;
  2538. roundBits := zSig and $3FF;
  2539. if ( $7FD <= bits16(zExp) ) then
  2540. begin
  2541. if ( ( $7FD < zExp )
  2542. or ( ( zExp = $7FD )
  2543. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2544. ) then
  2545. begin
  2546. float_raise( float_flag_overflow or float_flag_inexact );
  2547. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2548. exit;
  2549. end;
  2550. if ( zExp < 0 ) then
  2551. begin
  2552. isTiny := ord(
  2553. ( float_detect_tininess = float_tininess_before_rounding )
  2554. or ( zExp < -1 )
  2555. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2556. shift64RightJamming( zSig, - zExp, zSig );
  2557. zExp := 0;
  2558. roundBits := zSig and $3FF;
  2559. if ( isTiny and roundBits )<>0 then
  2560. float_raise( float_flag_underflow );
  2561. end
  2562. end;
  2563. if ( roundBits<>0 ) then
  2564. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2565. zSig := ( zSig + roundIncrement ) shr 10;
  2566. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2567. if ( zSig = 0 ) then
  2568. zExp := 0;
  2569. result:=packFloat64( zSign, zExp, zSig );
  2570. end;
  2571. {*
  2572. -------------------------------------------------------------------------------
  2573. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2574. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2575. returns the proper double-precision floating-point value corresponding
  2576. to the abstract input. This routine is just like `roundAndPackFloat64'
  2577. except that the input significand has fewer bits and does not have to be
  2578. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2579. point exponent.
  2580. -------------------------------------------------------------------------------
  2581. *}
  2582. Procedure
  2583. normalizeRoundAndPackFloat64(
  2584. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2585. Var
  2586. shiftCount : int8;
  2587. zSig2 : bits32;
  2588. Begin
  2589. if ( zSig0 = 0 ) then
  2590. Begin
  2591. zSig0 := zSig1;
  2592. zSig1 := 0;
  2593. zExp := zExp -32;
  2594. End;
  2595. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2596. if ( 0 <= shiftCount ) then
  2597. Begin
  2598. zSig2 := 0;
  2599. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2600. End
  2601. else
  2602. Begin
  2603. shift64ExtraRightJamming
  2604. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2605. End;
  2606. zExp := zExp - shiftCount;
  2607. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2608. End;
  2609. {*
  2610. -------------------------------------------------------------------------------
  2611. Returns the result of converting the 32-bit two's complement integer `a' to
  2612. the single-precision floating-point format. The conversion is performed
  2613. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2614. -------------------------------------------------------------------------------
  2615. *}
  2616. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2617. Var
  2618. zSign : Flag;
  2619. Begin
  2620. if ( a = 0 ) then
  2621. Begin
  2622. int32_to_float32.float32 := 0;
  2623. exit;
  2624. End;
  2625. if ( a = sbits32 ($80000000) ) then
  2626. Begin
  2627. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2628. exit;
  2629. end;
  2630. zSign := flag( a < 0 );
  2631. If zSign<>0 then
  2632. a := -a;
  2633. int32_to_float32.float32:=
  2634. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2635. End;
  2636. {*
  2637. -------------------------------------------------------------------------------
  2638. Returns the result of converting the 32-bit two's complement integer `a' to
  2639. the double-precision floating-point format. The conversion is performed
  2640. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2641. -------------------------------------------------------------------------------
  2642. *}
  2643. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2644. var
  2645. zSign : flag;
  2646. absA : bits32;
  2647. shiftCount : int8;
  2648. zSig0, zSig1 : bits32;
  2649. Begin
  2650. if ( a = 0 ) then
  2651. Begin
  2652. packFloat64( 0, 0, 0, 0, result );
  2653. exit;
  2654. end;
  2655. zSign := flag( a < 0 );
  2656. if ZSign<>0 then
  2657. AbsA := -a
  2658. else
  2659. AbsA := a;
  2660. shiftCount := countLeadingZeros32( absA ) - 11;
  2661. if ( 0 <= shiftCount ) then
  2662. Begin
  2663. zSig0 := absA shl shiftCount;
  2664. zSig1 := 0;
  2665. End
  2666. else
  2667. Begin
  2668. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2669. End;
  2670. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2671. End;
  2672. {*
  2673. -------------------------------------------------------------------------------
  2674. Returns the result of converting the single-precision floating-point value
  2675. `a' to the 32-bit two's complement integer format. The conversion is
  2676. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2677. Arithmetic---which means in particular that the conversion is rounded
  2678. according to the current rounding mode. If `a' is a NaN, the largest
  2679. positive integer is returned. Otherwise, if the conversion overflows, the
  2680. largest integer with the same sign as `a' is returned.
  2681. -------------------------------------------------------------------------------
  2682. *}
  2683. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2684. Var
  2685. aSign: flag;
  2686. aExp, shiftCount: int16;
  2687. aSig, aSigExtra: bits32;
  2688. z: int32;
  2689. roundingMode: int8;
  2690. Begin
  2691. aSig := extractFloat32Frac( a.float32 );
  2692. aExp := extractFloat32Exp( a.float32 );
  2693. aSign := extractFloat32Sign( a.float32 );
  2694. shiftCount := aExp - $96;
  2695. if ( 0 <= shiftCount ) then
  2696. Begin
  2697. if ( $9E <= aExp ) then
  2698. Begin
  2699. if ( a.float32 <> $CF000000 ) then
  2700. Begin
  2701. float_raise( float_flag_invalid );
  2702. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2703. Begin
  2704. float32_to_int32 := $7FFFFFFF;
  2705. exit;
  2706. End;
  2707. End;
  2708. float32_to_int32 := sbits32 ($80000000);
  2709. exit;
  2710. End;
  2711. z := ( aSig or $00800000 ) shl shiftCount;
  2712. if ( aSign<>0 ) then z := - z;
  2713. End
  2714. else
  2715. Begin
  2716. if ( aExp < $7E ) then
  2717. Begin
  2718. aSigExtra := aExp OR aSig;
  2719. z := 0;
  2720. End
  2721. else
  2722. Begin
  2723. aSig := aSig OR $00800000;
  2724. aSigExtra := aSig shl ( shiftCount and 31 );
  2725. z := aSig shr ( - shiftCount );
  2726. End;
  2727. if ( aSigExtra<>0 ) then
  2728. softfloat_exception_flags := softfloat_exception_flags
  2729. or float_flag_inexact;
  2730. roundingMode := softfloat_rounding_mode;
  2731. if ( roundingMode = float_round_nearest_even ) then
  2732. Begin
  2733. if ( sbits32 (aSigExtra) < 0 ) then
  2734. Begin
  2735. Inc(z);
  2736. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2737. z := z and not 1;
  2738. End;
  2739. if ( aSign<>0 ) then
  2740. z := - z;
  2741. End
  2742. else
  2743. Begin
  2744. aSigExtra := flag( aSigExtra <> 0 );
  2745. if ( aSign<>0 ) then
  2746. Begin
  2747. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2748. z := - z;
  2749. End
  2750. else
  2751. Begin
  2752. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2753. End
  2754. End;
  2755. End;
  2756. float32_to_int32 := z;
  2757. End;
  2758. {*
  2759. -------------------------------------------------------------------------------
  2760. Returns the result of converting the single-precision floating-point value
  2761. `a' to the 32-bit two's complement integer format. The conversion is
  2762. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2763. Arithmetic, except that the conversion is always rounded toward zero.
  2764. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2765. the conversion overflows, the largest integer with the same sign as `a' is
  2766. returned.
  2767. -------------------------------------------------------------------------------
  2768. *}
  2769. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2770. Var
  2771. aSign : flag;
  2772. aExp, shiftCount : int16;
  2773. aSig : bits32;
  2774. z : int32;
  2775. Begin
  2776. aSig := extractFloat32Frac( a.float32 );
  2777. aExp := extractFloat32Exp( a.float32 );
  2778. aSign := extractFloat32Sign( a.float32 );
  2779. shiftCount := aExp - $9E;
  2780. if ( 0 <= shiftCount ) then
  2781. Begin
  2782. if ( a.float32 <> $CF000000 ) then
  2783. Begin
  2784. float_raise( float_flag_invalid );
  2785. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2786. Begin
  2787. float32_to_int32_round_to_zero := $7FFFFFFF;
  2788. exit;
  2789. end;
  2790. End;
  2791. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2792. exit;
  2793. End
  2794. else
  2795. if ( aExp <= $7E ) then
  2796. Begin
  2797. if ( aExp or aSig )<>0 then
  2798. softfloat_exception_flags :=
  2799. softfloat_exception_flags or float_flag_inexact;
  2800. float32_to_int32_round_to_zero := 0;
  2801. exit;
  2802. End;
  2803. aSig := ( aSig or $00800000 ) shl 8;
  2804. z := aSig shr ( - shiftCount );
  2805. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2806. Begin
  2807. softfloat_exception_flags :=
  2808. softfloat_exception_flags or float_flag_inexact;
  2809. End;
  2810. if ( aSign<>0 ) then z := - z;
  2811. float32_to_int32_round_to_zero := z;
  2812. End;
  2813. {*
  2814. -------------------------------------------------------------------------------
  2815. Returns the result of converting the single-precision floating-point value
  2816. `a' to the double-precision floating-point format. The conversion is
  2817. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2818. Arithmetic.
  2819. -------------------------------------------------------------------------------
  2820. *}
  2821. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2822. Var
  2823. aSign : flag;
  2824. aExp : int16;
  2825. aSig, zSig0, zSig1: bits32;
  2826. tmp : CommonNanT;
  2827. Begin
  2828. aSig := extractFloat32Frac( a.float32 );
  2829. aExp := extractFloat32Exp( a.float32 );
  2830. aSign := extractFloat32Sign( a.float32 );
  2831. if ( aExp = $FF ) then
  2832. Begin
  2833. if ( aSig<>0 ) then
  2834. Begin
  2835. float32ToCommonNaN(a.float32, tmp);
  2836. commonNaNToFloat64(tmp , result);
  2837. exit;
  2838. End;
  2839. packFloat64( aSign, $7FF, 0, 0, result);
  2840. exit;
  2841. End;
  2842. if ( aExp = 0 ) then
  2843. Begin
  2844. if ( aSig = 0 ) then
  2845. Begin
  2846. packFloat64( aSign, 0, 0, 0, result );
  2847. exit;
  2848. end;
  2849. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2850. Dec(aExp);
  2851. End;
  2852. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2853. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2854. End;
  2855. {*
  2856. -------------------------------------------------------------------------------
  2857. Rounds the single-precision floating-point value `a' to an integer,
  2858. and returns the result as a single-precision floating-point value. The
  2859. operation is performed according to the IEC/IEEE Standard for Binary
  2860. Floating-Point Arithmetic.
  2861. -------------------------------------------------------------------------------
  2862. *}
  2863. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2864. Var
  2865. aSign: flag;
  2866. aExp: int16;
  2867. lastBitMask, roundBitsMask: bits32;
  2868. roundingMode: int8;
  2869. z: float32;
  2870. Begin
  2871. aExp := extractFloat32Exp( a.float32 );
  2872. if ( $96 <= aExp ) then
  2873. Begin
  2874. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2875. Begin
  2876. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2877. exit;
  2878. End;
  2879. float32_round_to_int:=a;
  2880. exit;
  2881. End;
  2882. if ( aExp <= $7E ) then
  2883. Begin
  2884. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2885. Begin
  2886. float32_round_to_int:=a;
  2887. exit;
  2888. end;
  2889. softfloat_exception_flags
  2890. := softfloat_exception_flags OR float_flag_inexact;
  2891. aSign := extractFloat32Sign( a.float32 );
  2892. case ( softfloat_rounding_mode ) of
  2893. float_round_nearest_even:
  2894. Begin
  2895. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2896. Begin
  2897. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2898. exit;
  2899. End;
  2900. End;
  2901. float_round_down:
  2902. Begin
  2903. if aSign <> 0 then
  2904. float32_round_to_int.float32 := $BF800000
  2905. else
  2906. float32_round_to_int.float32 := 0;
  2907. exit;
  2908. End;
  2909. float_round_up:
  2910. Begin
  2911. if aSign <> 0 then
  2912. float32_round_to_int.float32 := $80000000
  2913. else
  2914. float32_round_to_int.float32 := $3F800000;
  2915. exit;
  2916. End;
  2917. end;
  2918. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2919. End;
  2920. lastBitMask := 1;
  2921. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2922. lastBitMask := lastBitMask shl ($96 - aExp);
  2923. roundBitsMask := lastBitMask - 1;
  2924. z := a.float32;
  2925. roundingMode := softfloat_rounding_mode;
  2926. if ( roundingMode = float_round_nearest_even ) then
  2927. Begin
  2928. z := z + (lastBitMask shr 1);
  2929. if ( ( z and roundBitsMask ) = 0 ) then
  2930. z := z and not lastBitMask;
  2931. End
  2932. else if ( roundingMode <> float_round_to_zero ) then
  2933. Begin
  2934. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2935. Begin
  2936. z := z + roundBitsMask;
  2937. End;
  2938. End;
  2939. z := z and not roundBitsMask;
  2940. if ( z <> a.float32 ) then
  2941. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2942. float32_round_to_int.float32 := z;
  2943. End;
  2944. {*
  2945. -------------------------------------------------------------------------------
  2946. Returns the result of adding the absolute values of the single-precision
  2947. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2948. before being returned. `zSign' is ignored if the result is a NaN.
  2949. The addition is performed according to the IEC/IEEE Standard for Binary
  2950. Floating-Point Arithmetic.
  2951. -------------------------------------------------------------------------------
  2952. *}
  2953. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2954. Var
  2955. aExp, bExp, zExp: int16;
  2956. aSig, bSig, zSig: bits32;
  2957. expDiff: int16;
  2958. label roundAndPack;
  2959. Begin
  2960. aSig:=extractFloat32Frac( a );
  2961. aExp:=extractFloat32Exp( a );
  2962. bSig:=extractFloat32Frac( b );
  2963. bExp := extractFloat32Exp( b );
  2964. expDiff := aExp - bExp;
  2965. aSig := aSig shl 6;
  2966. bSig := bSig shl 6;
  2967. if ( 0 < expDiff ) then
  2968. Begin
  2969. if ( aExp = $FF ) then
  2970. Begin
  2971. if ( aSig <> 0) then
  2972. Begin
  2973. addFloat32Sigs := propagateFloat32NaN( a, b );
  2974. exit;
  2975. End;
  2976. addFloat32Sigs := a;
  2977. exit;
  2978. End;
  2979. if ( bExp = 0 ) then
  2980. Begin
  2981. Dec(expDiff);
  2982. End
  2983. else
  2984. Begin
  2985. bSig := bSig or $20000000;
  2986. End;
  2987. shift32RightJamming( bSig, expDiff, bSig );
  2988. zExp := aExp;
  2989. End
  2990. else
  2991. If ( expDiff < 0 ) then
  2992. Begin
  2993. if ( bExp = $FF ) then
  2994. Begin
  2995. if ( bSig<>0 ) then
  2996. Begin
  2997. addFloat32Sigs := propagateFloat32NaN( a, b );
  2998. exit;
  2999. end;
  3000. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3001. exit;
  3002. End;
  3003. if ( aExp = 0 ) then
  3004. Begin
  3005. Inc(expDiff);
  3006. End
  3007. else
  3008. Begin
  3009. aSig := aSig OR $20000000;
  3010. End;
  3011. shift32RightJamming( aSig, - expDiff, aSig );
  3012. zExp := bExp;
  3013. End
  3014. else
  3015. Begin
  3016. if ( aExp = $FF ) then
  3017. Begin
  3018. if ( aSig OR bSig )<> 0 then
  3019. Begin
  3020. addFloat32Sigs := propagateFloat32NaN( a, b );
  3021. exit;
  3022. end;
  3023. addFloat32Sigs := a;
  3024. exit;
  3025. End;
  3026. if ( aExp = 0 ) then
  3027. Begin
  3028. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3029. exit;
  3030. end;
  3031. zSig := $40000000 + aSig + bSig;
  3032. zExp := aExp;
  3033. goto roundAndPack;
  3034. End;
  3035. aSig := aSig OR $20000000;
  3036. zSig := ( aSig + bSig ) shl 1;
  3037. Dec(zExp);
  3038. if ( sbits32 (zSig) < 0 ) then
  3039. Begin
  3040. zSig := aSig + bSig;
  3041. Inc(zExp);
  3042. End;
  3043. roundAndPack:
  3044. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3045. End;
  3046. {*
  3047. -------------------------------------------------------------------------------
  3048. Returns the result of subtracting the absolute values of the single-
  3049. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3050. difference is negated before being returned. `zSign' is ignored if the
  3051. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3052. Standard for Binary Floating-Point Arithmetic.
  3053. -------------------------------------------------------------------------------
  3054. *}
  3055. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3056. Var
  3057. aExp, bExp, zExp: int16;
  3058. aSig, bSig, zSig: bits32;
  3059. expDiff : int16;
  3060. label aExpBigger;
  3061. label bExpBigger;
  3062. label aBigger;
  3063. label bBigger;
  3064. label normalizeRoundAndPack;
  3065. Begin
  3066. aSig := extractFloat32Frac( a );
  3067. aExp := extractFloat32Exp( a );
  3068. bSig := extractFloat32Frac( b );
  3069. bExp := extractFloat32Exp( b );
  3070. expDiff := aExp - bExp;
  3071. aSig := aSig shl 7;
  3072. bSig := bSig shl 7;
  3073. if ( 0 < expDiff ) then goto aExpBigger;
  3074. if ( expDiff < 0 ) then goto bExpBigger;
  3075. if ( aExp = $FF ) then
  3076. Begin
  3077. if ( aSig OR bSig )<> 0 then
  3078. Begin
  3079. subFloat32Sigs := propagateFloat32NaN( a, b );
  3080. exit;
  3081. End;
  3082. float_raise( float_flag_invalid );
  3083. subFloat32Sigs := float32_default_nan;
  3084. exit;
  3085. End;
  3086. if ( aExp = 0 ) then
  3087. Begin
  3088. aExp := 1;
  3089. bExp := 1;
  3090. End;
  3091. if ( bSig < aSig ) Then goto aBigger;
  3092. if ( aSig < bSig ) Then goto bBigger;
  3093. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3094. exit;
  3095. bExpBigger:
  3096. if ( bExp = $FF ) then
  3097. Begin
  3098. if ( bSig<>0 ) then
  3099. Begin
  3100. subFloat32Sigs := propagateFloat32NaN( a, b );
  3101. exit;
  3102. End;
  3103. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3104. exit;
  3105. End;
  3106. if ( aExp = 0 ) then
  3107. Begin
  3108. Inc(expDiff);
  3109. End
  3110. else
  3111. Begin
  3112. aSig := aSig OR $40000000;
  3113. End;
  3114. shift32RightJamming( aSig, - expDiff, aSig );
  3115. bSig := bSig OR $40000000;
  3116. bBigger:
  3117. zSig := bSig - aSig;
  3118. zExp := bExp;
  3119. zSign := zSign xor 1;
  3120. goto normalizeRoundAndPack;
  3121. aExpBigger:
  3122. if ( aExp = $FF ) then
  3123. Begin
  3124. if ( aSig <> 0) then
  3125. Begin
  3126. subFloat32Sigs := propagateFloat32NaN( a, b );
  3127. exit;
  3128. End;
  3129. subFloat32Sigs := a;
  3130. exit;
  3131. End;
  3132. if ( bExp = 0 ) then
  3133. Begin
  3134. Dec(expDiff);
  3135. End
  3136. else
  3137. Begin
  3138. bSig := bSig OR $40000000;
  3139. End;
  3140. shift32RightJamming( bSig, expDiff, bSig );
  3141. aSig := aSig OR $40000000;
  3142. aBigger:
  3143. zSig := aSig - bSig;
  3144. zExp := aExp;
  3145. normalizeRoundAndPack:
  3146. Dec(zExp);
  3147. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3148. End;
  3149. {*
  3150. -------------------------------------------------------------------------------
  3151. Returns the result of adding the single-precision floating-point values `a'
  3152. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3153. Binary Floating-Point Arithmetic.
  3154. -------------------------------------------------------------------------------
  3155. *}
  3156. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3157. Var
  3158. aSign, bSign: Flag;
  3159. Begin
  3160. aSign := extractFloat32Sign( a.float32 );
  3161. bSign := extractFloat32Sign( b.float32 );
  3162. if ( aSign = bSign ) then
  3163. Begin
  3164. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3165. End
  3166. else
  3167. Begin
  3168. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3169. End;
  3170. End;
  3171. {*
  3172. -------------------------------------------------------------------------------
  3173. Returns the result of subtracting the single-precision floating-point values
  3174. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3175. for Binary Floating-Point Arithmetic.
  3176. -------------------------------------------------------------------------------
  3177. *}
  3178. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3179. Var
  3180. aSign, bSign: flag;
  3181. Begin
  3182. aSign := extractFloat32Sign( a.float32 );
  3183. bSign := extractFloat32Sign( b.float32 );
  3184. if ( aSign = bSign ) then
  3185. Begin
  3186. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3187. End
  3188. else
  3189. Begin
  3190. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3191. End;
  3192. End;
  3193. {*
  3194. -------------------------------------------------------------------------------
  3195. Returns the result of multiplying the single-precision floating-point values
  3196. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3197. for Binary Floating-Point Arithmetic.
  3198. -------------------------------------------------------------------------------
  3199. *}
  3200. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3201. Var
  3202. aSign, bSign, zSign: flag;
  3203. aExp, bExp, zExp : int16;
  3204. aSig, bSig, zSig0, zSig1: bits32;
  3205. Begin
  3206. aSig := extractFloat32Frac( a.float32 );
  3207. aExp := extractFloat32Exp( a.float32 );
  3208. aSign := extractFloat32Sign( a.float32 );
  3209. bSig := extractFloat32Frac( b.float32 );
  3210. bExp := extractFloat32Exp( b.float32 );
  3211. bSign := extractFloat32Sign( b.float32 );
  3212. zSign := aSign xor bSign;
  3213. if ( aExp = $FF ) then
  3214. Begin
  3215. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3216. Begin
  3217. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3218. End;
  3219. if ( ( bExp OR bSig ) = 0 ) then
  3220. Begin
  3221. float_raise( float_flag_invalid );
  3222. float32_mul.float32 := float32_default_nan;
  3223. exit;
  3224. End;
  3225. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3226. exit;
  3227. End;
  3228. if ( bExp = $FF ) then
  3229. Begin
  3230. if ( bSig <> 0 ) then
  3231. Begin
  3232. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3233. exit;
  3234. End;
  3235. if ( ( aExp OR aSig ) = 0 ) then
  3236. Begin
  3237. float_raise( float_flag_invalid );
  3238. float32_mul.float32 := float32_default_nan;
  3239. exit;
  3240. End;
  3241. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3242. exit;
  3243. End;
  3244. if ( aExp = 0 ) then
  3245. Begin
  3246. if ( aSig = 0 ) then
  3247. Begin
  3248. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3249. exit;
  3250. End;
  3251. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3252. End;
  3253. if ( bExp = 0 ) then
  3254. Begin
  3255. if ( bSig = 0 ) then
  3256. Begin
  3257. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3258. exit;
  3259. End;
  3260. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3261. End;
  3262. zExp := aExp + bExp - $7F;
  3263. aSig := ( aSig OR $00800000 ) shl 7;
  3264. bSig := ( bSig OR $00800000 ) shl 8;
  3265. mul32To64( aSig, bSig, zSig0, zSig1 );
  3266. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3267. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3268. Begin
  3269. zSig0 := zSig0 shl 1;
  3270. Dec(zExp);
  3271. End;
  3272. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3273. End;
  3274. {*
  3275. -------------------------------------------------------------------------------
  3276. Returns the result of dividing the single-precision floating-point value `a'
  3277. by the corresponding value `b'. The operation is performed according to the
  3278. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3279. -------------------------------------------------------------------------------
  3280. *}
  3281. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3282. Var
  3283. aSign, bSign, zSign: flag;
  3284. aExp, bExp, zExp: int16;
  3285. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3286. Begin
  3287. aSig := extractFloat32Frac( a.float32 );
  3288. aExp := extractFloat32Exp( a.float32 );
  3289. aSign := extractFloat32Sign( a.float32 );
  3290. bSig := extractFloat32Frac( b.float32 );
  3291. bExp := extractFloat32Exp( b.float32 );
  3292. bSign := extractFloat32Sign( b.float32 );
  3293. zSign := aSign xor bSign;
  3294. if ( aExp = $FF ) then
  3295. Begin
  3296. if ( aSig <> 0 ) then
  3297. Begin
  3298. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3299. exit;
  3300. End;
  3301. if ( bExp = $FF ) then
  3302. Begin
  3303. if ( bSig <> 0) then
  3304. Begin
  3305. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3306. End;
  3307. float_raise( float_flag_invalid );
  3308. float32_div.float32 := float32_default_nan;
  3309. exit;
  3310. End;
  3311. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3312. exit;
  3313. End;
  3314. if ( bExp = $FF ) then
  3315. Begin
  3316. if ( bSig <> 0) then
  3317. Begin
  3318. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3319. exit;
  3320. End;
  3321. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3322. exit;
  3323. End;
  3324. if ( bExp = 0 ) Then
  3325. Begin
  3326. if ( bSig = 0 ) Then
  3327. Begin
  3328. if ( ( aExp OR aSig ) = 0 ) then
  3329. Begin
  3330. float_raise( float_flag_invalid );
  3331. float32_div.float32 := float32_default_nan;
  3332. exit;
  3333. End;
  3334. float_raise( float_flag_divbyzero );
  3335. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3336. exit;
  3337. End;
  3338. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3339. End;
  3340. if ( aExp = 0 ) Then
  3341. Begin
  3342. if ( aSig = 0 ) Then
  3343. Begin
  3344. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3345. exit;
  3346. End;
  3347. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3348. End;
  3349. zExp := aExp - bExp + $7D;
  3350. aSig := ( aSig OR $00800000 ) shl 7;
  3351. bSig := ( bSig OR $00800000 ) shl 8;
  3352. if ( bSig <= ( aSig + aSig ) ) then
  3353. Begin
  3354. aSig := aSig shr 1;
  3355. Inc(zExp);
  3356. End;
  3357. zSig := estimateDiv64To32( aSig, 0, bSig );
  3358. if ( ( zSig and $3F ) <= 2 ) then
  3359. Begin
  3360. mul32To64( bSig, zSig, term0, term1 );
  3361. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3362. while ( sbits32 (rem0) < 0 ) do
  3363. Begin
  3364. Dec(zSig);
  3365. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3366. End;
  3367. zSig := zSig or bits32( rem1 <> 0 );
  3368. End;
  3369. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3370. End;
  3371. {*
  3372. -------------------------------------------------------------------------------
  3373. Returns the remainder of the single-precision floating-point value `a'
  3374. with respect to the corresponding value `b'. The operation is performed
  3375. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3376. -------------------------------------------------------------------------------
  3377. *}
  3378. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3379. Var
  3380. aSign, bSign, zSign: flag;
  3381. aExp, bExp, expDiff: int16;
  3382. aSig, bSig, q, allZero, alternateASig: bits32;
  3383. sigMean: sbits32;
  3384. Begin
  3385. aSig := extractFloat32Frac( a.float32 );
  3386. aExp := extractFloat32Exp( a.float32 );
  3387. aSign := extractFloat32Sign( a.float32 );
  3388. bSig := extractFloat32Frac( b.float32 );
  3389. bExp := extractFloat32Exp( b.float32 );
  3390. bSign := extractFloat32Sign( b.float32 );
  3391. if ( aExp = $FF ) then
  3392. Begin
  3393. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3394. Begin
  3395. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3396. exit;
  3397. End;
  3398. float_raise( float_flag_invalid );
  3399. float32_rem.float32 := float32_default_nan;
  3400. exit;
  3401. End;
  3402. if ( bExp = $FF ) then
  3403. Begin
  3404. if ( bSig <> 0 ) then
  3405. Begin
  3406. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3407. exit;
  3408. End;
  3409. float32_rem := a;
  3410. exit;
  3411. End;
  3412. if ( bExp = 0 ) then
  3413. Begin
  3414. if ( bSig = 0 ) then
  3415. Begin
  3416. float_raise( float_flag_invalid );
  3417. float32_rem.float32 := float32_default_nan;
  3418. exit;
  3419. End;
  3420. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3421. End;
  3422. if ( aExp = 0 ) then
  3423. Begin
  3424. if ( aSig = 0 ) then
  3425. Begin
  3426. float32_rem := a;
  3427. exit;
  3428. End;
  3429. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3430. End;
  3431. expDiff := aExp - bExp;
  3432. aSig := ( aSig OR $00800000 ) shl 8;
  3433. bSig := ( bSig OR $00800000 ) shl 8;
  3434. if ( expDiff < 0 ) then
  3435. Begin
  3436. if ( expDiff < -1 ) then
  3437. Begin
  3438. float32_rem := a;
  3439. exit;
  3440. End;
  3441. aSig := aSig shr 1;
  3442. End;
  3443. q := bits32( bSig <= aSig );
  3444. if ( q <> 0) then
  3445. aSig := aSig - bSig;
  3446. expDiff := expDiff - 32;
  3447. while ( 0 < expDiff ) do
  3448. Begin
  3449. q := estimateDiv64To32( aSig, 0, bSig );
  3450. if (2 < q) then
  3451. q := q - 2
  3452. else
  3453. q := 0;
  3454. aSig := - ( ( bSig shr 2 ) * q );
  3455. expDiff := expDiff - 30;
  3456. End;
  3457. expDiff := expDiff + 32;
  3458. if ( 0 < expDiff ) then
  3459. Begin
  3460. q := estimateDiv64To32( aSig, 0, bSig );
  3461. if (2 < q) then
  3462. q := q - 2
  3463. else
  3464. q := 0;
  3465. q := q shr (32 - expDiff);
  3466. bSig := bSig shr 2;
  3467. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3468. End
  3469. else
  3470. Begin
  3471. aSig := aSig shr 2;
  3472. bSig := bSig shr 2;
  3473. End;
  3474. Repeat
  3475. alternateASig := aSig;
  3476. Inc(q);
  3477. aSig := aSig - bSig;
  3478. Until not ( 0 <= sbits32 (aSig) );
  3479. sigMean := aSig + alternateASig;
  3480. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3481. Begin
  3482. aSig := alternateASig;
  3483. End;
  3484. zSign := flag( sbits32 (aSig) < 0 );
  3485. if ( zSign<>0 ) then
  3486. aSig := - aSig;
  3487. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3488. End;
  3489. {*
  3490. -------------------------------------------------------------------------------
  3491. Returns the square root of the single-precision floating-point value `a'.
  3492. The operation is performed according to the IEC/IEEE Standard for Binary
  3493. Floating-Point Arithmetic.
  3494. -------------------------------------------------------------------------------
  3495. *}
  3496. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3497. Var
  3498. aSign : flag;
  3499. aExp, zExp : int16;
  3500. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3501. label roundAndPack;
  3502. Begin
  3503. aSig := extractFloat32Frac( a.float32 );
  3504. aExp := extractFloat32Exp( a.float32 );
  3505. aSign := extractFloat32Sign( a.float32 );
  3506. if ( aExp = $FF ) then
  3507. Begin
  3508. if ( aSig <> 0) then
  3509. Begin
  3510. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3511. exit;
  3512. End;
  3513. if ( aSign = 0) then
  3514. Begin
  3515. float32_sqrt := a;
  3516. exit;
  3517. End;
  3518. float_raise( float_flag_invalid );
  3519. float32_sqrt.float32 := float32_default_nan;
  3520. exit;
  3521. End;
  3522. if ( aSign <> 0) then
  3523. Begin
  3524. if ( ( aExp OR aSig ) = 0 ) then
  3525. Begin
  3526. float32_sqrt := a;
  3527. exit;
  3528. End;
  3529. float_raise( float_flag_invalid );
  3530. float32_sqrt.float32 := float32_default_nan;
  3531. exit;
  3532. End;
  3533. if ( aExp = 0 ) then
  3534. Begin
  3535. if ( aSig = 0 ) then
  3536. Begin
  3537. float32_sqrt.float32 := 0;
  3538. exit;
  3539. End;
  3540. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3541. End;
  3542. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3543. aSig := ( aSig OR $00800000 ) shl 8;
  3544. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3545. if ( ( zSig and $7F ) <= 5 ) then
  3546. Begin
  3547. if ( zSig < 2 ) then
  3548. Begin
  3549. zSig := $7FFFFFFF;
  3550. goto roundAndPack;
  3551. End
  3552. else
  3553. Begin
  3554. aSig := aSig shr (aExp and 1);
  3555. mul32To64( zSig, zSig, term0, term1 );
  3556. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3557. while ( sbits32 (rem0) < 0 ) do
  3558. Begin
  3559. Dec(zSig);
  3560. shortShift64Left( 0, zSig, 1, term0, term1 );
  3561. term1 := term1 or 1;
  3562. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3563. End;
  3564. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3565. End;
  3566. End;
  3567. shift32RightJamming( zSig, 1, zSig );
  3568. roundAndPack:
  3569. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3570. End;
  3571. {*
  3572. -------------------------------------------------------------------------------
  3573. Returns 1 if the single-precision floating-point value `a' is equal to
  3574. the corresponding value `b', and 0 otherwise. The comparison is performed
  3575. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3576. -------------------------------------------------------------------------------
  3577. *}
  3578. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3579. Begin
  3580. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3581. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3582. ) then
  3583. Begin
  3584. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3585. Begin
  3586. float_raise( float_flag_invalid );
  3587. End;
  3588. float32_eq := 0;
  3589. exit;
  3590. End;
  3591. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3592. End;
  3593. {*
  3594. -------------------------------------------------------------------------------
  3595. Returns 1 if the single-precision floating-point value `a' is less than
  3596. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3597. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3598. Arithmetic.
  3599. -------------------------------------------------------------------------------
  3600. *}
  3601. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3602. var
  3603. aSign, bSign: flag;
  3604. Begin
  3605. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3606. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3607. ) then
  3608. Begin
  3609. float_raise( float_flag_invalid );
  3610. float32_le := 0;
  3611. exit;
  3612. End;
  3613. aSign := extractFloat32Sign( a.float32 );
  3614. bSign := extractFloat32Sign( b.float32 );
  3615. if ( aSign <> bSign ) then
  3616. Begin
  3617. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3618. exit;
  3619. End;
  3620. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3621. End;
  3622. {*
  3623. -------------------------------------------------------------------------------
  3624. Returns 1 if the single-precision floating-point value `a' is less than
  3625. the corresponding value `b', and 0 otherwise. The comparison is performed
  3626. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3627. -------------------------------------------------------------------------------
  3628. *}
  3629. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3630. var
  3631. aSign, bSign: flag;
  3632. Begin
  3633. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3634. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3635. ) then
  3636. Begin
  3637. float_raise( float_flag_invalid );
  3638. float32_lt :=0;
  3639. exit;
  3640. End;
  3641. aSign := extractFloat32Sign( a.float32 );
  3642. bSign := extractFloat32Sign( b.float32 );
  3643. if ( aSign <> bSign ) then
  3644. Begin
  3645. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3646. exit;
  3647. End;
  3648. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3649. End;
  3650. {*
  3651. -------------------------------------------------------------------------------
  3652. Returns 1 if the single-precision floating-point value `a' is equal to
  3653. the corresponding value `b', and 0 otherwise. The invalid exception is
  3654. raised if either operand is a NaN. Otherwise, the comparison is performed
  3655. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3656. -------------------------------------------------------------------------------
  3657. *}
  3658. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3659. Begin
  3660. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3661. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3662. ) then
  3663. Begin
  3664. float_raise( float_flag_invalid );
  3665. float32_eq_signaling := 0;
  3666. exit;
  3667. End;
  3668. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3669. End;
  3670. {*
  3671. -------------------------------------------------------------------------------
  3672. Returns 1 if the single-precision floating-point value `a' is less than or
  3673. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3674. cause an exception. Otherwise, the comparison is performed according to the
  3675. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3676. -------------------------------------------------------------------------------
  3677. *}
  3678. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3679. Var
  3680. aSign, bSign: flag;
  3681. aExp, bExp: int16;
  3682. Begin
  3683. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3684. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3685. ) then
  3686. Begin
  3687. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3688. Begin
  3689. float_raise( float_flag_invalid );
  3690. End;
  3691. float32_le_quiet := 0;
  3692. exit;
  3693. End;
  3694. aSign := extractFloat32Sign( a );
  3695. bSign := extractFloat32Sign( b );
  3696. if ( aSign <> bSign ) then
  3697. Begin
  3698. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3699. exit;
  3700. End;
  3701. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3702. End;
  3703. {*
  3704. -------------------------------------------------------------------------------
  3705. Returns 1 if the single-precision floating-point value `a' is less than
  3706. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3707. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3708. Standard for Binary Floating-Point Arithmetic.
  3709. -------------------------------------------------------------------------------
  3710. *}
  3711. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3712. Var
  3713. aSign, bSign: flag;
  3714. Begin
  3715. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3716. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3717. ) then
  3718. Begin
  3719. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3720. Begin
  3721. float_raise( float_flag_invalid );
  3722. End;
  3723. float32_lt_quiet := 0;
  3724. exit;
  3725. End;
  3726. aSign := extractFloat32Sign( a );
  3727. bSign := extractFloat32Sign( b );
  3728. if ( aSign <> bSign ) then
  3729. Begin
  3730. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3731. exit;
  3732. End;
  3733. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3734. End;
  3735. {*
  3736. -------------------------------------------------------------------------------
  3737. Returns the result of converting the double-precision floating-point value
  3738. `a' to the 32-bit two's complement integer format. The conversion is
  3739. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3740. Arithmetic---which means in particular that the conversion is rounded
  3741. according to the current rounding mode. If `a' is a NaN, the largest
  3742. positive integer is returned. Otherwise, if the conversion overflows, the
  3743. largest integer with the same sign as `a' is returned.
  3744. -------------------------------------------------------------------------------
  3745. *}
  3746. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3747. var
  3748. aSign: flag;
  3749. aExp, shiftCount: int16;
  3750. aSig0, aSig1, absZ, aSigExtra: bits32;
  3751. z: int32;
  3752. roundingMode: int8;
  3753. label invalid;
  3754. Begin
  3755. aSig1 := extractFloat64Frac1( a );
  3756. aSig0 := extractFloat64Frac0( a );
  3757. aExp := extractFloat64Exp( a );
  3758. aSign := extractFloat64Sign( a );
  3759. shiftCount := aExp - $413;
  3760. if ( 0 <= shiftCount ) then
  3761. Begin
  3762. if ( $41E < aExp ) then
  3763. Begin
  3764. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3765. aSign := 0;
  3766. goto invalid;
  3767. End;
  3768. shortShift64Left(
  3769. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3770. if ( $80000000 < absZ ) then
  3771. goto invalid;
  3772. End
  3773. else
  3774. Begin
  3775. aSig1 := flag( aSig1 <> 0 );
  3776. if ( aExp < $3FE ) then
  3777. Begin
  3778. aSigExtra := aExp OR aSig0 OR aSig1;
  3779. absZ := 0;
  3780. End
  3781. else
  3782. Begin
  3783. aSig0 := aSig0 OR $00100000;
  3784. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3785. absZ := aSig0 shr ( - shiftCount );
  3786. End;
  3787. End;
  3788. roundingMode := softfloat_rounding_mode;
  3789. if ( roundingMode = float_round_nearest_even ) then
  3790. Begin
  3791. if ( sbits32(aSigExtra) < 0 ) then
  3792. Begin
  3793. Inc(absZ);
  3794. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3795. absZ := absZ and not 1;
  3796. End;
  3797. if aSign <> 0 then
  3798. z := - absZ
  3799. else
  3800. z := absZ;
  3801. End
  3802. else
  3803. Begin
  3804. aSigExtra := bits32( aSigExtra <> 0 );
  3805. if ( aSign <> 0) then
  3806. Begin
  3807. z := - ( absZ
  3808. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3809. End
  3810. else
  3811. Begin
  3812. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3813. End
  3814. End;
  3815. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3816. Begin
  3817. invalid:
  3818. float_raise( float_flag_invalid );
  3819. if (aSign <> 0 ) then
  3820. float64_to_int32 := sbits32 ($80000000)
  3821. else
  3822. float64_to_int32 := $7FFFFFFF;
  3823. exit;
  3824. End;
  3825. if ( aSigExtra <> 0) then
  3826. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3827. float64_to_int32 := z;
  3828. End;
  3829. {*
  3830. -------------------------------------------------------------------------------
  3831. Returns the result of converting the double-precision floating-point value
  3832. `a' to the 32-bit two's complement integer format. The conversion is
  3833. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3834. Arithmetic, except that the conversion is always rounded toward zero.
  3835. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3836. the conversion overflows, the largest integer with the same sign as `a' is
  3837. returned.
  3838. -------------------------------------------------------------------------------
  3839. *}
  3840. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3841. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3842. Var
  3843. aSign: flag;
  3844. aExp, shiftCount: int16;
  3845. aSig0, aSig1, absZ, aSigExtra: bits32;
  3846. z: int32;
  3847. label invalid;
  3848. Begin
  3849. aSig1 := extractFloat64Frac1( a );
  3850. aSig0 := extractFloat64Frac0( a );
  3851. aExp := extractFloat64Exp( a );
  3852. aSign := extractFloat64Sign( a );
  3853. shiftCount := aExp - $413;
  3854. if ( 0 <= shiftCount ) then
  3855. Begin
  3856. if ( $41E < aExp ) then
  3857. Begin
  3858. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3859. aSign := 0;
  3860. goto invalid;
  3861. End;
  3862. shortShift64Left(
  3863. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3864. End
  3865. else
  3866. Begin
  3867. if ( aExp < $3FF ) then
  3868. Begin
  3869. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3870. Begin
  3871. softfloat_exception_flags :=
  3872. softfloat_exception_flags or float_flag_inexact;
  3873. End;
  3874. float64_to_int32_round_to_zero := 0;
  3875. exit;
  3876. End;
  3877. aSig0 := aSig0 or $00100000;
  3878. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3879. absZ := aSig0 shr ( - shiftCount );
  3880. End;
  3881. if aSign <> 0 then
  3882. z := - absZ
  3883. else
  3884. z := absZ;
  3885. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3886. Begin
  3887. invalid:
  3888. float_raise( float_flag_invalid );
  3889. if (aSign <> 0) then
  3890. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3891. else
  3892. float64_to_int32_round_to_zero := $7FFFFFFF;
  3893. exit;
  3894. End;
  3895. if ( aSigExtra <> 0) then
  3896. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3897. float64_to_int32_round_to_zero := z;
  3898. End;
  3899. {*
  3900. -------------------------------------------------------------------------------
  3901. Returns the result of converting the double-precision floating-point value
  3902. `a' to the single-precision floating-point format. The conversion is
  3903. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3904. Arithmetic.
  3905. -------------------------------------------------------------------------------
  3906. *}
  3907. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3908. Var
  3909. aSign: flag;
  3910. aExp: int16;
  3911. aSig0, aSig1, zSig: bits32;
  3912. allZero: bits32;
  3913. tmp : CommonNanT;
  3914. Begin
  3915. aSig1 := extractFloat64Frac1( a );
  3916. aSig0 := extractFloat64Frac0( a );
  3917. aExp := extractFloat64Exp( a );
  3918. aSign := extractFloat64Sign( a );
  3919. if ( aExp = $7FF ) then
  3920. Begin
  3921. if ( aSig0 OR aSig1 ) <> 0 then
  3922. Begin
  3923. float64ToCommonNaN( a, tmp );
  3924. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3925. exit;
  3926. End;
  3927. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3928. exit;
  3929. End;
  3930. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3931. if ( aExp <> 0) then
  3932. zSig := zSig OR $40000000;
  3933. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3934. End;
  3935. {*
  3936. -------------------------------------------------------------------------------
  3937. Rounds the double-precision floating-point value `a' to an integer,
  3938. and returns the result as a double-precision floating-point value. The
  3939. operation is performed according to the IEC/IEEE Standard for Binary
  3940. Floating-Point Arithmetic.
  3941. -------------------------------------------------------------------------------
  3942. *}
  3943. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3944. Var
  3945. aSign: flag;
  3946. aExp: int16;
  3947. lastBitMask, roundBitsMask: bits32;
  3948. roundingMode: int8;
  3949. z: float64;
  3950. Begin
  3951. aExp := extractFloat64Exp( a );
  3952. if ( $413 <= aExp ) then
  3953. Begin
  3954. if ( $433 <= aExp ) then
  3955. Begin
  3956. if ( ( aExp = $7FF )
  3957. AND
  3958. (
  3959. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3960. ) <>0)
  3961. ) then
  3962. Begin
  3963. propagateFloat64NaN( a, a, result );
  3964. exit;
  3965. End;
  3966. result := a;
  3967. exit;
  3968. End;
  3969. lastBitMask := 1;
  3970. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3971. roundBitsMask := lastBitMask - 1;
  3972. z := a;
  3973. roundingMode := softfloat_rounding_mode;
  3974. if ( roundingMode = float_round_nearest_even ) then
  3975. Begin
  3976. if ( lastBitMask <> 0) then
  3977. Begin
  3978. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3979. if ( ( z.low and roundBitsMask ) = 0 ) then
  3980. z.low := z.low and not lastBitMask;
  3981. End
  3982. else
  3983. Begin
  3984. if ( sbits32 (z.low) < 0 ) then
  3985. Begin
  3986. Inc(z.high);
  3987. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3988. z.high := z.high and not 1;
  3989. End;
  3990. End;
  3991. End
  3992. else if ( roundingMode <> float_round_to_zero ) then
  3993. Begin
  3994. if ( extractFloat64Sign( z )
  3995. xor flag( roundingMode = float_round_up ) )<> 0 then
  3996. Begin
  3997. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3998. End;
  3999. End;
  4000. z.low := z.low and not roundBitsMask;
  4001. End
  4002. else
  4003. Begin
  4004. if ( aExp <= $3FE ) then
  4005. Begin
  4006. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4007. Begin
  4008. result := a;
  4009. exit;
  4010. End;
  4011. softfloat_exception_flags := softfloat_exception_flags or
  4012. float_flag_inexact;
  4013. aSign := extractFloat64Sign( a );
  4014. case ( softfloat_rounding_mode ) of
  4015. float_round_nearest_even:
  4016. Begin
  4017. if ( ( aExp = $3FE )
  4018. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4019. ) then
  4020. Begin
  4021. packFloat64( aSign, $3FF, 0, 0, result );
  4022. exit;
  4023. End;
  4024. End;
  4025. float_round_down:
  4026. Begin
  4027. if aSign<>0 then
  4028. packFloat64( 1, $3FF, 0, 0, result )
  4029. else
  4030. packFloat64( 0, 0, 0, 0, result );
  4031. exit;
  4032. End;
  4033. float_round_up:
  4034. Begin
  4035. if aSign <> 0 then
  4036. packFloat64( 1, 0, 0, 0, result )
  4037. else
  4038. packFloat64( 0, $3FF, 0, 0, result );
  4039. exit;
  4040. End;
  4041. end;
  4042. packFloat64( aSign, 0, 0, 0, result );
  4043. exit;
  4044. End;
  4045. lastBitMask := 1;
  4046. lastBitMask := lastBitMask shl ($413 - aExp);
  4047. roundBitsMask := lastBitMask - 1;
  4048. z.low := 0;
  4049. z.high := a.high;
  4050. roundingMode := softfloat_rounding_mode;
  4051. if ( roundingMode = float_round_nearest_even ) then
  4052. Begin
  4053. z.high := z.high + lastBitMask shr 1;
  4054. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4055. Begin
  4056. z.high := z.high and not lastBitMask;
  4057. End;
  4058. End
  4059. else if ( roundingMode <> float_round_to_zero ) then
  4060. Begin
  4061. if ( extractFloat64Sign( z )
  4062. xor flag( roundingMode = float_round_up ) )<> 0 then
  4063. Begin
  4064. z.high := z.high or bits32( a.low <> 0 );
  4065. z.high := z.high + roundBitsMask;
  4066. End;
  4067. End;
  4068. z.high := z.high and not roundBitsMask;
  4069. End;
  4070. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4071. Begin
  4072. softfloat_exception_flags :=
  4073. softfloat_exception_flags or float_flag_inexact;
  4074. End;
  4075. result := z;
  4076. End;
  4077. {*
  4078. -------------------------------------------------------------------------------
  4079. Returns the result of adding the absolute values of the double-precision
  4080. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4081. before being returned. `zSign' is ignored if the result is a NaN.
  4082. The addition is performed according to the IEC/IEEE Standard for Binary
  4083. Floating-Point Arithmetic.
  4084. -------------------------------------------------------------------------------
  4085. *}
  4086. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4087. Var
  4088. aExp, bExp, zExp: int16;
  4089. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4090. expDiff: int16;
  4091. label shiftRight1;
  4092. label roundAndPack;
  4093. Begin
  4094. aSig1 := extractFloat64Frac1( a );
  4095. aSig0 := extractFloat64Frac0( a );
  4096. aExp := extractFloat64Exp( a );
  4097. bSig1 := extractFloat64Frac1( b );
  4098. bSig0 := extractFloat64Frac0( b );
  4099. bExp := extractFloat64Exp( b );
  4100. expDiff := aExp - bExp;
  4101. if ( 0 < expDiff ) then
  4102. Begin
  4103. if ( aExp = $7FF ) then
  4104. Begin
  4105. if ( aSig0 OR aSig1 ) <> 0 then
  4106. Begin
  4107. propagateFloat64NaN( a, b, out );
  4108. exit;
  4109. end;
  4110. out := a;
  4111. exit;
  4112. End;
  4113. if ( bExp = 0 ) then
  4114. Begin
  4115. Dec(expDiff);
  4116. End
  4117. else
  4118. Begin
  4119. bSig0 := bSig0 or $00100000;
  4120. End;
  4121. shift64ExtraRightJamming(
  4122. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4123. zExp := aExp;
  4124. End
  4125. else if ( expDiff < 0 ) then
  4126. Begin
  4127. if ( bExp = $7FF ) then
  4128. Begin
  4129. if ( bSig0 OR bSig1 ) <> 0 then
  4130. Begin
  4131. propagateFloat64NaN( a, b, out );
  4132. exit;
  4133. End;
  4134. packFloat64( zSign, $7FF, 0, 0, out );
  4135. End;
  4136. if ( aExp = 0 ) then
  4137. Begin
  4138. Inc(expDiff);
  4139. End
  4140. else
  4141. Begin
  4142. aSig0 := aSig0 or $00100000;
  4143. End;
  4144. shift64ExtraRightJamming(
  4145. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4146. zExp := bExp;
  4147. End
  4148. else
  4149. Begin
  4150. if ( aExp = $7FF ) then
  4151. Begin
  4152. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4153. Begin
  4154. propagateFloat64NaN( a, b, out );
  4155. exit;
  4156. End;
  4157. out := a;
  4158. exit;
  4159. End;
  4160. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4161. if ( aExp = 0 ) then
  4162. Begin
  4163. packFloat64( zSign, 0, zSig0, zSig1, out );
  4164. exit;
  4165. End;
  4166. zSig2 := 0;
  4167. zSig0 := zSig0 or $00200000;
  4168. zExp := aExp;
  4169. goto shiftRight1;
  4170. End;
  4171. aSig0 := aSig0 or $00100000;
  4172. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4173. Dec(zExp);
  4174. if ( zSig0 < $00200000 ) then
  4175. goto roundAndPack;
  4176. Inc(zExp);
  4177. shiftRight1:
  4178. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4179. roundAndPack:
  4180. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4181. End;
  4182. {*
  4183. -------------------------------------------------------------------------------
  4184. Returns the result of subtracting the absolute values of the double-
  4185. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4186. difference is negated before being returned. `zSign' is ignored if the
  4187. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4188. Standard for Binary Floating-Point Arithmetic.
  4189. -------------------------------------------------------------------------------
  4190. *}
  4191. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4192. Var
  4193. aExp, bExp, zExp: int16;
  4194. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4195. expDiff: int16;
  4196. z: float64;
  4197. label aExpBigger;
  4198. label bExpBigger;
  4199. label aBigger;
  4200. label bBigger;
  4201. label normalizeRoundAndPack;
  4202. Begin
  4203. aSig1 := extractFloat64Frac1( a );
  4204. aSig0 := extractFloat64Frac0( a );
  4205. aExp := extractFloat64Exp( a );
  4206. bSig1 := extractFloat64Frac1( b );
  4207. bSig0 := extractFloat64Frac0( b );
  4208. bExp := extractFloat64Exp( b );
  4209. expDiff := aExp - bExp;
  4210. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4211. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4212. if ( 0 < expDiff ) then goto aExpBigger;
  4213. if ( expDiff < 0 ) then goto bExpBigger;
  4214. if ( aExp = $7FF ) then
  4215. Begin
  4216. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4217. Begin
  4218. propagateFloat64NaN( a, b, out );
  4219. exit;
  4220. End;
  4221. float_raise( float_flag_invalid );
  4222. z.low := float64_default_nan_low;
  4223. z.high := float64_default_nan_high;
  4224. out := z;
  4225. exit;
  4226. End;
  4227. if ( aExp = 0 ) then
  4228. Begin
  4229. aExp := 1;
  4230. bExp := 1;
  4231. End;
  4232. if ( bSig0 < aSig0 ) then goto aBigger;
  4233. if ( aSig0 < bSig0 ) then goto bBigger;
  4234. if ( bSig1 < aSig1 ) then goto aBigger;
  4235. if ( aSig1 < bSig1 ) then goto bBigger;
  4236. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4237. exit;
  4238. bExpBigger:
  4239. if ( bExp = $7FF ) then
  4240. Begin
  4241. if ( bSig0 OR bSig1 ) <> 0 then
  4242. Begin
  4243. propagateFloat64NaN( a, b, out );
  4244. exit;
  4245. End;
  4246. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4247. exit;
  4248. End;
  4249. if ( aExp = 0 ) then
  4250. Begin
  4251. Inc(expDiff);
  4252. End
  4253. else
  4254. Begin
  4255. aSig0 := aSig0 or $40000000;
  4256. End;
  4257. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4258. bSig0 := bSig0 or $40000000;
  4259. bBigger:
  4260. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4261. zExp := bExp;
  4262. zSign := zSign xor 1;
  4263. goto normalizeRoundAndPack;
  4264. aExpBigger:
  4265. if ( aExp = $7FF ) then
  4266. Begin
  4267. if ( aSig0 OR aSig1 ) <> 0 then
  4268. Begin
  4269. propagateFloat64NaN( a, b, out );
  4270. exit;
  4271. End;
  4272. out := a;
  4273. exit;
  4274. End;
  4275. if ( bExp = 0 ) then
  4276. Begin
  4277. Dec(expDiff);
  4278. End
  4279. else
  4280. Begin
  4281. bSig0 := bSig0 or $40000000;
  4282. End;
  4283. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4284. aSig0 := aSig0 or $40000000;
  4285. aBigger:
  4286. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4287. zExp := aExp;
  4288. normalizeRoundAndPack:
  4289. Dec(zExp);
  4290. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4291. End;
  4292. {*
  4293. -------------------------------------------------------------------------------
  4294. Returns the result of adding the double-precision floating-point values `a'
  4295. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4296. Binary Floating-Point Arithmetic.
  4297. -------------------------------------------------------------------------------
  4298. *}
  4299. Function float64_add( a: float64; b : float64) : Float64;
  4300. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4301. Var
  4302. aSign, bSign: flag;
  4303. Begin
  4304. aSign := extractFloat64Sign( a );
  4305. bSign := extractFloat64Sign( b );
  4306. if ( aSign = bSign ) then
  4307. Begin
  4308. addFloat64Sigs( a, b, aSign, result );
  4309. End
  4310. else
  4311. Begin
  4312. subFloat64Sigs( a, b, aSign, result );
  4313. End;
  4314. End;
  4315. {*
  4316. -------------------------------------------------------------------------------
  4317. Returns the result of subtracting the double-precision floating-point values
  4318. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4319. for Binary Floating-Point Arithmetic.
  4320. -------------------------------------------------------------------------------
  4321. *}
  4322. Function float64_sub(a: float64; b : float64) : Float64;
  4323. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4324. Var
  4325. aSign, bSign: flag;
  4326. Begin
  4327. aSign := extractFloat64Sign( a );
  4328. bSign := extractFloat64Sign( b );
  4329. if ( aSign = bSign ) then
  4330. Begin
  4331. subFloat64Sigs( a, b, aSign, result );
  4332. End
  4333. else
  4334. Begin
  4335. addFloat64Sigs( a, b, aSign, result );
  4336. End;
  4337. End;
  4338. {*
  4339. -------------------------------------------------------------------------------
  4340. Returns the result of multiplying the double-precision floating-point values
  4341. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4342. for Binary Floating-Point Arithmetic.
  4343. -------------------------------------------------------------------------------
  4344. *}
  4345. Function float64_mul( a: float64; b:float64) : Float64;
  4346. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4347. Var
  4348. aSign, bSign, zSign: flag;
  4349. aExp, bExp, zExp: int16;
  4350. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4351. z: float64;
  4352. label invalid;
  4353. Begin
  4354. aSig1 := extractFloat64Frac1( a );
  4355. aSig0 := extractFloat64Frac0( a );
  4356. aExp := extractFloat64Exp( a );
  4357. aSign := extractFloat64Sign( a );
  4358. bSig1 := extractFloat64Frac1( b );
  4359. bSig0 := extractFloat64Frac0( b );
  4360. bExp := extractFloat64Exp( b );
  4361. bSign := extractFloat64Sign( b );
  4362. zSign := aSign xor bSign;
  4363. if ( aExp = $7FF ) then
  4364. Begin
  4365. if ( (( aSig0 OR aSig1 ) <>0)
  4366. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4367. Begin
  4368. propagateFloat64NaN( a, b, result );
  4369. exit;
  4370. End;
  4371. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4372. packFloat64( zSign, $7FF, 0, 0, result );
  4373. exit;
  4374. End;
  4375. if ( bExp = $7FF ) then
  4376. Begin
  4377. if ( bSig0 OR bSig1 )<> 0 then
  4378. Begin
  4379. propagateFloat64NaN( a, b, result );
  4380. exit;
  4381. End;
  4382. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4383. Begin
  4384. invalid:
  4385. float_raise( float_flag_invalid );
  4386. z.low := float64_default_nan_low;
  4387. z.high := float64_default_nan_high;
  4388. result := z;
  4389. exit;
  4390. End;
  4391. packFloat64( zSign, $7FF, 0, 0, result );
  4392. exit;
  4393. End;
  4394. if ( aExp = 0 ) then
  4395. Begin
  4396. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4397. Begin
  4398. packFloat64( zSign, 0, 0, 0, result );
  4399. exit;
  4400. End;
  4401. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4402. End;
  4403. if ( bExp = 0 ) then
  4404. Begin
  4405. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4406. Begin
  4407. packFloat64( zSign, 0, 0, 0, result );
  4408. exit;
  4409. End;
  4410. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4411. End;
  4412. zExp := aExp + bExp - $400;
  4413. aSig0 := aSig0 or $00100000;
  4414. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4415. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4416. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4417. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4418. if ( $00200000 <= zSig0 ) then
  4419. Begin
  4420. shift64ExtraRightJamming(
  4421. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4422. Inc(zExp);
  4423. End;
  4424. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4425. End;
  4426. {*
  4427. -------------------------------------------------------------------------------
  4428. Returns the result of dividing the double-precision floating-point value `a'
  4429. by the corresponding value `b'. The operation is performed according to the
  4430. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4431. -------------------------------------------------------------------------------
  4432. *}
  4433. Function float64_div(a: float64; b : float64) : Float64;
  4434. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4435. Var
  4436. aSign, bSign, zSign: flag;
  4437. aExp, bExp, zExp: int16;
  4438. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4439. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4440. z: float64;
  4441. label invalid;
  4442. Begin
  4443. aSig1 := extractFloat64Frac1( a );
  4444. aSig0 := extractFloat64Frac0( a );
  4445. aExp := extractFloat64Exp( a );
  4446. aSign := extractFloat64Sign( a );
  4447. bSig1 := extractFloat64Frac1( b );
  4448. bSig0 := extractFloat64Frac0( b );
  4449. bExp := extractFloat64Exp( b );
  4450. bSign := extractFloat64Sign( b );
  4451. zSign := aSign xor bSign;
  4452. if ( aExp = $7FF ) then
  4453. Begin
  4454. if ( aSig0 OR aSig1 )<> 0 then
  4455. Begin
  4456. propagateFloat64NaN( a, b, result );
  4457. exit;
  4458. end;
  4459. if ( bExp = $7FF ) then
  4460. Begin
  4461. if ( bSig0 OR bSig1 )<>0 then
  4462. Begin
  4463. propagateFloat64NaN( a, b, result );
  4464. exit;
  4465. End;
  4466. goto invalid;
  4467. End;
  4468. packFloat64( zSign, $7FF, 0, 0, result );
  4469. exit;
  4470. End;
  4471. if ( bExp = $7FF ) then
  4472. Begin
  4473. if ( bSig0 OR bSig1 )<> 0 then
  4474. Begin
  4475. propagateFloat64NaN( a, b, result );
  4476. exit;
  4477. End;
  4478. packFloat64( zSign, 0, 0, 0, result );
  4479. exit;
  4480. End;
  4481. if ( bExp = 0 ) then
  4482. Begin
  4483. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4484. Begin
  4485. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4486. Begin
  4487. invalid:
  4488. float_raise( float_flag_invalid );
  4489. z.low := float64_default_nan_low;
  4490. z.high := float64_default_nan_high;
  4491. result := z;
  4492. exit;
  4493. End;
  4494. float_raise( float_flag_divbyzero );
  4495. packFloat64( zSign, $7FF, 0, 0, result );
  4496. exit;
  4497. End;
  4498. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4499. End;
  4500. if ( aExp = 0 ) then
  4501. Begin
  4502. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4503. Begin
  4504. packFloat64( zSign, 0, 0, 0, result );
  4505. exit;
  4506. End;
  4507. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4508. End;
  4509. zExp := aExp - bExp + $3FD;
  4510. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4511. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4512. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4513. Begin
  4514. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4515. Inc(zExp);
  4516. End;
  4517. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4518. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4519. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4520. while ( sbits32 (rem0) < 0 ) do
  4521. Begin
  4522. Dec(zSig0);
  4523. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4524. End;
  4525. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4526. if ( ( zSig1 and $3FF ) <= 4 ) then
  4527. Begin
  4528. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4529. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4530. while ( sbits32 (rem1) < 0 ) do
  4531. Begin
  4532. Dec(zSig1);
  4533. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4534. End;
  4535. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4536. End;
  4537. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4538. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4539. End;
  4540. {*
  4541. -------------------------------------------------------------------------------
  4542. Returns the remainder of the double-precision floating-point value `a'
  4543. with respect to the corresponding value `b'. The operation is performed
  4544. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4545. -------------------------------------------------------------------------------
  4546. *}
  4547. Function float64_rem(a: float64; b : float64) : float64;
  4548. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4549. Var
  4550. aSign, bSign, zSign: flag;
  4551. aExp, bExp, expDiff: int16;
  4552. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4553. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4554. sigMean0: sbits32;
  4555. z: float64;
  4556. label invalid;
  4557. Begin
  4558. aSig1 := extractFloat64Frac1( a );
  4559. aSig0 := extractFloat64Frac0( a );
  4560. aExp := extractFloat64Exp( a );
  4561. aSign := extractFloat64Sign( a );
  4562. bSig1 := extractFloat64Frac1( b );
  4563. bSig0 := extractFloat64Frac0( b );
  4564. bExp := extractFloat64Exp( b );
  4565. bSign := extractFloat64Sign( b );
  4566. if ( aExp = $7FF ) then
  4567. Begin
  4568. if ((( aSig0 OR aSig1 )<>0)
  4569. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4570. Begin
  4571. propagateFloat64NaN( a, b, result );
  4572. exit;
  4573. End;
  4574. goto invalid;
  4575. End;
  4576. if ( bExp = $7FF ) then
  4577. Begin
  4578. if ( bSig0 OR bSig1 ) <> 0 then
  4579. Begin
  4580. propagateFloat64NaN( a, b, result );
  4581. exit;
  4582. End;
  4583. result := a;
  4584. exit;
  4585. End;
  4586. if ( bExp = 0 ) then
  4587. Begin
  4588. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4589. Begin
  4590. invalid:
  4591. float_raise( float_flag_invalid );
  4592. z.low := float64_default_nan_low;
  4593. z.high := float64_default_nan_high;
  4594. result := z;
  4595. exit;
  4596. End;
  4597. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4598. End;
  4599. if ( aExp = 0 ) then
  4600. Begin
  4601. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4602. Begin
  4603. result := a;
  4604. exit;
  4605. End;
  4606. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4607. End;
  4608. expDiff := aExp - bExp;
  4609. if ( expDiff < -1 ) then
  4610. Begin
  4611. result := a;
  4612. exit;
  4613. End;
  4614. shortShift64Left(
  4615. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4616. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4617. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4618. if ( q )<>0 then
  4619. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4620. expDiff := expDiff - 32;
  4621. while ( 0 < expDiff ) do
  4622. Begin
  4623. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4624. if 4 < q then
  4625. q:= q - 4
  4626. else
  4627. q := 0;
  4628. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4629. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4630. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4631. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4632. expDiff := expDiff - 29;
  4633. End;
  4634. if ( -32 < expDiff ) then
  4635. Begin
  4636. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4637. if 4 < q then
  4638. q := q - 4
  4639. else
  4640. q := 0;
  4641. q := q shr (- expDiff);
  4642. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4643. expDiff := expDiff + 24;
  4644. if ( expDiff < 0 ) then
  4645. Begin
  4646. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4647. End
  4648. else
  4649. Begin
  4650. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4651. End;
  4652. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4653. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4654. End
  4655. else
  4656. Begin
  4657. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4658. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4659. End;
  4660. Repeat
  4661. alternateASig0 := aSig0;
  4662. alternateASig1 := aSig1;
  4663. Inc(q);
  4664. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4665. Until not ( 0 <= sbits32 (aSig0) );
  4666. add64(
  4667. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4668. if ( ( sigMean0 < 0 )
  4669. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4670. Begin
  4671. aSig0 := alternateASig0;
  4672. aSig1 := alternateASig1;
  4673. End;
  4674. zSign := flag( sbits32 (aSig0) < 0 );
  4675. if ( zSign <> 0 ) then
  4676. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4677. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4678. End;
  4679. {*
  4680. -------------------------------------------------------------------------------
  4681. Returns the square root of the double-precision floating-point value `a'.
  4682. The operation is performed according to the IEC/IEEE Standard for Binary
  4683. Floating-Point Arithmetic.
  4684. -------------------------------------------------------------------------------
  4685. *}
  4686. Procedure float64_sqrt( a: float64; var out: float64 );
  4687. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4688. Var
  4689. aSign: flag;
  4690. aExp, zExp: int16;
  4691. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4692. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4693. z: float64;
  4694. label invalid;
  4695. Begin
  4696. aSig1 := extractFloat64Frac1( a );
  4697. aSig0 := extractFloat64Frac0( a );
  4698. aExp := extractFloat64Exp( a );
  4699. aSign := extractFloat64Sign( a );
  4700. if ( aExp = $7FF ) then
  4701. Begin
  4702. if ( aSig0 OR aSig1 ) <> 0 then
  4703. Begin
  4704. propagateFloat64NaN( a, a, out );
  4705. exit;
  4706. End;
  4707. if ( aSign = 0) then
  4708. Begin
  4709. out := a;
  4710. exit;
  4711. End;
  4712. goto invalid;
  4713. End;
  4714. if ( aSign <> 0 ) then
  4715. Begin
  4716. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4717. Begin
  4718. out := a;
  4719. exit;
  4720. End;
  4721. invalid:
  4722. float_raise( float_flag_invalid );
  4723. z.low := float64_default_nan_low;
  4724. z.high := float64_default_nan_high;
  4725. out := z;
  4726. exit;
  4727. End;
  4728. if ( aExp = 0 ) then
  4729. Begin
  4730. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4731. Begin
  4732. packFloat64( 0, 0, 0, 0, out );
  4733. exit;
  4734. End;
  4735. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4736. End;
  4737. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4738. aSig0 := aSig0 or $00100000;
  4739. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4740. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4741. if ( zSig0 = 0 ) then
  4742. zSig0 := $7FFFFFFF;
  4743. doubleZSig0 := zSig0 + zSig0;
  4744. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4745. mul32To64( zSig0, zSig0, term0, term1 );
  4746. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4747. while ( sbits32 (rem0) < 0 ) do
  4748. Begin
  4749. Dec(zSig0);
  4750. doubleZSig0 := doubleZSig0 - 2;
  4751. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4752. End;
  4753. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4754. if ( ( zSig1 and $1FF ) <= 5 ) then
  4755. Begin
  4756. if ( zSig1 = 0 ) then
  4757. zSig1 := 1;
  4758. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4759. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4760. mul32To64( zSig1, zSig1, term2, term3 );
  4761. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4762. while ( sbits32 (rem1) < 0 ) do
  4763. Begin
  4764. Dec(zSig1);
  4765. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4766. term3 := term3 or 1;
  4767. term2 := term2 or doubleZSig0;
  4768. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4769. End;
  4770. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4771. End;
  4772. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4773. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4774. End;
  4775. {*
  4776. -------------------------------------------------------------------------------
  4777. Returns 1 if the double-precision floating-point value `a' is equal to
  4778. the corresponding value `b', and 0 otherwise. The comparison is performed
  4779. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4780. -------------------------------------------------------------------------------
  4781. *}
  4782. Function float64_eq(a: float64; b: float64): flag;
  4783. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4784. Begin
  4785. if
  4786. (
  4787. ( extractFloat64Exp( a ) = $7FF )
  4788. AND
  4789. (
  4790. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4791. )
  4792. )
  4793. OR (
  4794. ( extractFloat64Exp( b ) = $7FF )
  4795. AND (
  4796. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4797. )
  4798. )
  4799. ) then
  4800. Begin
  4801. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4802. float_raise( float_flag_invalid );
  4803. float64_eq := 0;
  4804. exit;
  4805. End;
  4806. float64_eq := flag(
  4807. ( a.low = b.low )
  4808. AND ( ( a.high = b.high )
  4809. OR ( ( a.low = 0 )
  4810. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4811. ));
  4812. End;
  4813. {*
  4814. -------------------------------------------------------------------------------
  4815. Returns 1 if the double-precision floating-point value `a' is less than
  4816. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4817. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4818. Arithmetic.
  4819. -------------------------------------------------------------------------------
  4820. *}
  4821. Function float64_le(a: float64;b: float64): flag;
  4822. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4823. Var
  4824. aSign, bSign: flag;
  4825. Begin
  4826. if
  4827. (
  4828. ( extractFloat64Exp( a ) = $7FF )
  4829. AND
  4830. (
  4831. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4832. )
  4833. )
  4834. OR (
  4835. ( extractFloat64Exp( b ) = $7FF )
  4836. AND (
  4837. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4838. )
  4839. )
  4840. ) then
  4841. Begin
  4842. float_raise( float_flag_invalid );
  4843. float64_le := 0;
  4844. exit;
  4845. End;
  4846. aSign := extractFloat64Sign( a );
  4847. bSign := extractFloat64Sign( b );
  4848. if ( aSign <> bSign ) then
  4849. Begin
  4850. float64_le := flag(
  4851. (aSign <> 0)
  4852. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4853. = 0 ));
  4854. exit;
  4855. End;
  4856. if aSign <> 0 then
  4857. float64_le := le64( b.high, b.low, a.high, a.low )
  4858. else
  4859. float64_le := le64( a.high, a.low, b.high, b.low );
  4860. End;
  4861. {*
  4862. -------------------------------------------------------------------------------
  4863. Returns 1 if the double-precision floating-point value `a' is less than
  4864. the corresponding value `b', and 0 otherwise. The comparison is performed
  4865. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4866. -------------------------------------------------------------------------------
  4867. *}
  4868. Function float64_lt(a: float64;b: float64): flag;
  4869. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4870. Var
  4871. aSign, bSign: flag;
  4872. Begin
  4873. if
  4874. (
  4875. ( extractFloat64Exp( a ) = $7FF )
  4876. AND
  4877. (
  4878. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4879. )
  4880. )
  4881. OR (
  4882. ( extractFloat64Exp( b ) = $7FF )
  4883. AND (
  4884. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4885. )
  4886. )
  4887. ) then
  4888. Begin
  4889. float_raise( float_flag_invalid );
  4890. float64_lt := 0;
  4891. exit;
  4892. End;
  4893. aSign := extractFloat64Sign( a );
  4894. bSign := extractFloat64Sign( b );
  4895. if ( aSign <> bSign ) then
  4896. Begin
  4897. float64_lt := flag(
  4898. (aSign <> 0)
  4899. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4900. <> 0 ));
  4901. exit;
  4902. End;
  4903. if aSign <> 0 then
  4904. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4905. else
  4906. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4907. End;
  4908. {*
  4909. -------------------------------------------------------------------------------
  4910. Returns 1 if the double-precision floating-point value `a' is equal to
  4911. the corresponding value `b', and 0 otherwise. The invalid exception is
  4912. raised if either operand is a NaN. Otherwise, the comparison is performed
  4913. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4914. -------------------------------------------------------------------------------
  4915. *}
  4916. Function float64_eq_signaling( a: float64; b: float64): flag;
  4917. Begin
  4918. if
  4919. (
  4920. ( extractFloat64Exp( a ) = $7FF )
  4921. AND
  4922. (
  4923. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4924. )
  4925. )
  4926. OR (
  4927. ( extractFloat64Exp( b ) = $7FF )
  4928. AND (
  4929. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4930. )
  4931. )
  4932. ) then
  4933. Begin
  4934. float_raise( float_flag_invalid );
  4935. float64_eq_signaling := 0;
  4936. exit;
  4937. End;
  4938. float64_eq_signaling := flag(
  4939. ( a.low = b.low )
  4940. AND ( ( a.high = b.high )
  4941. OR ( ( a.low = 0 )
  4942. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4943. ));
  4944. End;
  4945. {*
  4946. -------------------------------------------------------------------------------
  4947. Returns 1 if the double-precision floating-point value `a' is less than or
  4948. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4949. cause an exception. Otherwise, the comparison is performed according to the
  4950. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4951. -------------------------------------------------------------------------------
  4952. *}
  4953. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4954. Var
  4955. aSign, bSign : flag;
  4956. Begin
  4957. if
  4958. (
  4959. ( extractFloat64Exp( a ) = $7FF )
  4960. AND
  4961. (
  4962. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4963. )
  4964. )
  4965. OR (
  4966. ( extractFloat64Exp( b ) = $7FF )
  4967. AND (
  4968. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4969. )
  4970. )
  4971. ) then
  4972. Begin
  4973. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4974. float_raise( float_flag_invalid );
  4975. float64_le_quiet := 0;
  4976. exit;
  4977. End;
  4978. aSign := extractFloat64Sign( a );
  4979. bSign := extractFloat64Sign( b );
  4980. if ( aSign <> bSign ) then
  4981. Begin
  4982. float64_le_quiet := flag
  4983. ((aSign <> 0)
  4984. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4985. = 0 ));
  4986. exit;
  4987. End;
  4988. if aSign <> 0 then
  4989. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4990. else
  4991. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4992. End;
  4993. {*
  4994. -------------------------------------------------------------------------------
  4995. Returns 1 if the double-precision floating-point value `a' is less than
  4996. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4997. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4998. Standard for Binary Floating-Point Arithmetic.
  4999. -------------------------------------------------------------------------------
  5000. *}
  5001. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5002. Var
  5003. aSign, bSign: flag;
  5004. Begin
  5005. if
  5006. (
  5007. ( extractFloat64Exp( a ) = $7FF )
  5008. AND
  5009. (
  5010. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5011. )
  5012. )
  5013. OR (
  5014. ( extractFloat64Exp( b ) = $7FF )
  5015. AND (
  5016. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5017. )
  5018. )
  5019. ) then
  5020. Begin
  5021. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5022. float_raise( float_flag_invalid );
  5023. float64_lt_quiet := 0;
  5024. exit;
  5025. End;
  5026. aSign := extractFloat64Sign( a );
  5027. bSign := extractFloat64Sign( b );
  5028. if ( aSign <> bSign ) then
  5029. Begin
  5030. float64_lt_quiet := flag(
  5031. (aSign<>0)
  5032. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5033. <> 0 ));
  5034. exit;
  5035. End;
  5036. If aSign <> 0 then
  5037. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5038. else
  5039. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5040. End;
  5041. {*----------------------------------------------------------------------------
  5042. | Returns the result of converting the 64-bit two's complement integer `a'
  5043. | to the single-precision floating-point format. The conversion is performed
  5044. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5045. *----------------------------------------------------------------------------*}
  5046. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5047. var
  5048. zSign : flag;
  5049. absA : uint64;
  5050. shiftCount: int8;
  5051. zSig : bits32;
  5052. intval : int64rec;
  5053. Begin
  5054. if ( a = 0 ) then
  5055. begin
  5056. int64_to_float32.float32 := 0;
  5057. exit;
  5058. end;
  5059. if a < 0 then
  5060. zSign := flag(TRUE)
  5061. else
  5062. zSign := flag(FALSE);
  5063. if zSign<>0 then
  5064. absA := -a
  5065. else
  5066. absA := a;
  5067. shiftCount := countLeadingZeros64( absA ) - 40;
  5068. if ( 0 <= shiftCount ) then
  5069. begin
  5070. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5071. end
  5072. else
  5073. begin
  5074. shiftCount := shiftCount + 7;
  5075. if ( shiftCount < 0 ) then
  5076. begin
  5077. intval.low := int64rec(AbsA).low;
  5078. intval.high := int64rec(AbsA).high;
  5079. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5080. intval.low, intval.high);
  5081. int64rec(absA).low := intval.low;
  5082. int64rec(absA).high := intval.high;
  5083. end
  5084. else
  5085. absA := absA shl shiftCount;
  5086. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5087. end;
  5088. End;
  5089. {*----------------------------------------------------------------------------
  5090. | Returns the result of converting the 64-bit two's complement integer `a'
  5091. | to the single-precision floating-point format. The conversion is performed
  5092. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5093. | Unisgned version.
  5094. *----------------------------------------------------------------------------*}
  5095. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5096. var
  5097. zSign : flag;
  5098. absA : uint64;
  5099. shiftCount: int8;
  5100. zSig : bits32;
  5101. intval : int64rec;
  5102. Begin
  5103. if ( a = 0 ) then
  5104. begin
  5105. qword_to_float32.float32 := 0;
  5106. exit;
  5107. end;
  5108. zSign := flag(FALSE);
  5109. absA := a;
  5110. shiftCount := countLeadingZeros64( absA ) - 40;
  5111. if ( 0 <= shiftCount ) then
  5112. begin
  5113. qword_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5114. end
  5115. else
  5116. begin
  5117. shiftCount := shiftCount + 7;
  5118. if ( shiftCount < 0 ) then
  5119. begin
  5120. intval.low := int64rec(AbsA).low;
  5121. intval.high := int64rec(AbsA).high;
  5122. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5123. intval.low, intval.high);
  5124. int64rec(absA).low := intval.low;
  5125. int64rec(absA).high := intval.high;
  5126. end
  5127. else
  5128. absA := absA shl shiftCount;
  5129. qword_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5130. end;
  5131. End;
  5132. {*----------------------------------------------------------------------------
  5133. | Returns the result of converting the 64-bit two's complement integer `a'
  5134. | to the double-precision floating-point format. The conversion is performed
  5135. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5136. *----------------------------------------------------------------------------*}
  5137. function qword_to_float64( a: qword ): float64;
  5138. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5139. var
  5140. zSign : flag;
  5141. float_result : float64;
  5142. intval : int64rec;
  5143. AbsA : bits64;
  5144. shiftcount : int8;
  5145. zSig0, zSig1 : bits32;
  5146. Begin
  5147. if ( a = 0 ) then
  5148. Begin
  5149. packFloat64( 0, 0, 0, 0, result );
  5150. exit;
  5151. end;
  5152. zSign := flag(FALSE);
  5153. AbsA := a;
  5154. shiftCount := countLeadingZeros64( absA ) - 11;
  5155. if ( 0 <= shiftCount ) then
  5156. Begin
  5157. absA := absA shl shiftcount;
  5158. zSig0:=int64rec(absA).high;
  5159. zSig1:=int64rec(absA).low;
  5160. End
  5161. else
  5162. Begin
  5163. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5164. - shiftCount, zSig0, zSig1 );
  5165. End;
  5166. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5167. qword_to_float64:= float_result;
  5168. End;
  5169. {*----------------------------------------------------------------------------
  5170. | Returns the result of converting the 64-bit two's complement integer `a'
  5171. | to the double-precision floating-point format. The conversion is performed
  5172. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5173. *----------------------------------------------------------------------------*}
  5174. function int64_to_float64( a: int64 ): float64;
  5175. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5176. var
  5177. zSign : flag;
  5178. float_result : float64;
  5179. intval : int64rec;
  5180. AbsA : bits64;
  5181. shiftcount : int8;
  5182. zSig0, zSig1 : bits32;
  5183. Begin
  5184. if ( a = 0 ) then
  5185. Begin
  5186. packFloat64( 0, 0, 0, 0, result );
  5187. exit;
  5188. end;
  5189. zSign := flag( a < 0 );
  5190. if ZSign<>0 then
  5191. AbsA := -a
  5192. else
  5193. AbsA := a;
  5194. shiftCount := countLeadingZeros64( absA ) - 11;
  5195. if ( 0 <= shiftCount ) then
  5196. Begin
  5197. absA := absA shl shiftcount;
  5198. zSig0:=int64rec(absA).high;
  5199. zSig1:=int64rec(absA).low;
  5200. End
  5201. else
  5202. Begin
  5203. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5204. - shiftCount, zSig0, zSig1 );
  5205. End;
  5206. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5207. int64_to_float64:= float_result;
  5208. End;
  5209. {*----------------------------------------------------------------------------
  5210. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5211. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5212. | Otherwise, returns 0.
  5213. *----------------------------------------------------------------------------*}
  5214. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5215. begin
  5216. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5217. end;
  5218. {*----------------------------------------------------------------------------
  5219. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5220. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5221. | Otherwise, returns 0.
  5222. *----------------------------------------------------------------------------*}
  5223. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5224. begin
  5225. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5226. end;
  5227. {*----------------------------------------------------------------------------
  5228. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5229. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5230. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5231. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5232. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5233. | the most-significant bit of the extra result, and the other 63 bits of the
  5234. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5235. | were all zero. This extra result is stored in the location pointed to by
  5236. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5237. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5238. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5239. | fixed-point value is shifted right by the number of bits given in `count',
  5240. | and the integer part of the result is returned at the locations pointed to
  5241. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5242. | corrupted as described above, and is returned at the location pointed to by
  5243. | `z2Ptr'.)
  5244. *----------------------------------------------------------------------------*}
  5245. procedure shift128ExtraRightJamming(
  5246. a0: bits64;
  5247. a1: bits64;
  5248. a2: bits64;
  5249. count: int16;
  5250. var z0Ptr: bits64;
  5251. var z1Ptr: bits64;
  5252. var z2Ptr: bits64);
  5253. var
  5254. z0, z1, z2: bits64;
  5255. negCount: int8;
  5256. begin
  5257. negCount := ( - count ) and 63;
  5258. if ( count = 0 ) then
  5259. begin
  5260. z2 := a2;
  5261. z1 := a1;
  5262. z0 := a0;
  5263. end
  5264. else begin
  5265. if ( count < 64 ) then
  5266. begin
  5267. z2 := a1 shr negCount;
  5268. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5269. z0 := a0 shr count;
  5270. end
  5271. else begin
  5272. if ( count = 64 ) then
  5273. begin
  5274. z2 := a1;
  5275. z1 := a0;
  5276. end
  5277. else begin
  5278. a2 := a2 or a1;
  5279. if ( count < 128 ) then
  5280. begin
  5281. z2 := a0 shl negCount;
  5282. z1 := a0 shr ( count and 63 );
  5283. end
  5284. else begin
  5285. if ( count = 128 ) then
  5286. z2 := a0
  5287. else
  5288. z2 := ord( a0 <> 0 );
  5289. z1 := 0;
  5290. end;
  5291. end;
  5292. z0 := 0;
  5293. end;
  5294. z2 := z2 or ord( a2 <> 0 );
  5295. end;
  5296. z2Ptr := z2;
  5297. z1Ptr := z1;
  5298. z0Ptr := z0;
  5299. end;
  5300. {*----------------------------------------------------------------------------
  5301. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5302. | _plus_ the number of bits given in `count'. The shifted result is at most
  5303. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5304. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5305. | shifted off is the most-significant bit of the extra result, and the other
  5306. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5307. | bits shifted off were all zero. This extra result is stored in the location
  5308. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5309. | (This routine makes more sense if `a0' and `a1' are considered to form
  5310. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5311. | point value is shifted right by the number of bits given in `count', and
  5312. | the integer part of the result is returned at the location pointed to by
  5313. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5314. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5315. *----------------------------------------------------------------------------*}
  5316. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5317. var
  5318. z0, z1: bits64;
  5319. negCount: int8;
  5320. begin
  5321. negCount := ( - count ) and 63;
  5322. if ( count = 0 ) then
  5323. begin
  5324. z1 := a1;
  5325. z0 := a0;
  5326. end
  5327. else if ( count < 64 ) then
  5328. begin
  5329. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5330. z0 := a0 shr count;
  5331. end
  5332. else begin
  5333. if ( count = 64 ) then
  5334. begin
  5335. z1 := a0 or ord( a1 <> 0 );
  5336. end
  5337. else begin
  5338. z1 := ord( ( a0 or a1 ) <> 0 );
  5339. end;
  5340. z0 := 0;
  5341. end;
  5342. z1Ptr := z1;
  5343. z0Ptr := z0;
  5344. end;
  5345. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5346. {*----------------------------------------------------------------------------
  5347. | Returns the fraction bits of the extended double-precision floating-point
  5348. | value `a'.
  5349. *----------------------------------------------------------------------------*}
  5350. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5351. begin
  5352. result:=a.low;
  5353. end;
  5354. {*----------------------------------------------------------------------------
  5355. | Returns the exponent bits of the extended double-precision floating-point
  5356. | value `a'.
  5357. *----------------------------------------------------------------------------*}
  5358. function extractFloatx80Exp(a : floatx80): int32;inline;
  5359. begin
  5360. result:=a.high and $7FFF;
  5361. end;
  5362. {*----------------------------------------------------------------------------
  5363. | Returns the sign bit of the extended double-precision floating-point value
  5364. | `a'.
  5365. *----------------------------------------------------------------------------*}
  5366. function extractFloatx80Sign(a : floatx80): flag;inline;
  5367. begin
  5368. result:=a.high shr 15;
  5369. end;
  5370. {*----------------------------------------------------------------------------
  5371. | Normalizes the subnormal extended double-precision floating-point value
  5372. | represented by the denormalized significand `aSig'. The normalized exponent
  5373. | and significand are stored at the locations pointed to by `zExpPtr' and
  5374. | `zSigPtr', respectively.
  5375. *----------------------------------------------------------------------------*}
  5376. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5377. var
  5378. shiftCount: int8;
  5379. begin
  5380. shiftCount := countLeadingZeros64( aSig );
  5381. zSigPtr := aSig shl shiftCount;
  5382. zExpPtr := 1 - shiftCount;
  5383. end;
  5384. {*----------------------------------------------------------------------------
  5385. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5386. | extended double-precision floating-point value, returning the result.
  5387. *----------------------------------------------------------------------------*}
  5388. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5389. var
  5390. z: floatx80;
  5391. begin
  5392. z.low := zSig;
  5393. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5394. result:=z;
  5395. end;
  5396. {*----------------------------------------------------------------------------
  5397. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5398. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5399. | and returns the proper extended double-precision floating-point value
  5400. | corresponding to the abstract input. Ordinarily, the abstract value is
  5401. | rounded and packed into the extended double-precision format, with the
  5402. | inexact exception raised if the abstract input cannot be represented
  5403. | exactly. However, if the abstract value is too large, the overflow and
  5404. | inexact exceptions are raised and an infinity or maximal finite value is
  5405. | returned. If the abstract value is too small, the input value is rounded to
  5406. | a subnormal number, and the underflow and inexact exceptions are raised if
  5407. | the abstract input cannot be represented exactly as a subnormal extended
  5408. | double-precision floating-point number.
  5409. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5410. | number of bits as single or double precision, respectively. Otherwise, the
  5411. | result is rounded to the full precision of the extended double-precision
  5412. | format.
  5413. | The input significand must be normalized or smaller. If the input
  5414. | significand is not normalized, `zExp' must be 0; in that case, the result
  5415. | returned is a subnormal number, and it must not require rounding. The
  5416. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5417. | Floating-Point Arithmetic.
  5418. *----------------------------------------------------------------------------*}
  5419. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5420. var
  5421. roundingMode: int8;
  5422. roundNearestEven, increment, isTiny: flag;
  5423. roundIncrement, roundMask, roundBits: int64;
  5424. label
  5425. precision80;
  5426. begin
  5427. roundingMode := softfloat_rounding_mode;
  5428. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5429. if ( roundingPrecision = 80 ) then
  5430. goto precision80;
  5431. if ( roundingPrecision = 64 ) then
  5432. begin
  5433. roundIncrement := int64( $0000000000000400 );
  5434. roundMask := int64( $00000000000007FF );
  5435. end
  5436. else if ( roundingPrecision = 32 ) then
  5437. begin
  5438. roundIncrement := int64( $0000008000000000 );
  5439. roundMask := int64( $000000FFFFFFFFFF );
  5440. end
  5441. else begin
  5442. goto precision80;
  5443. end;
  5444. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5445. if ( not (roundNearestEven<>0) ) then
  5446. begin
  5447. if ( roundingMode = float_round_to_zero ) then
  5448. begin
  5449. roundIncrement := 0;
  5450. end
  5451. else begin
  5452. roundIncrement := roundMask;
  5453. if ( zSign<>0 ) then
  5454. begin
  5455. if ( roundingMode = float_round_up ) then
  5456. roundIncrement := 0;
  5457. end
  5458. else begin
  5459. if ( roundingMode = float_round_down ) then
  5460. roundIncrement := 0;
  5461. end;
  5462. end;
  5463. end;
  5464. roundBits := zSig0 and roundMask;
  5465. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5466. if ( ( $7FFE < zExp )
  5467. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5468. ) begin
  5469. goto overflow;
  5470. end;
  5471. if ( zExp <= 0 ) begin
  5472. isTiny =
  5473. ( float_detect_tininess = float_tininess_before_rounding )
  5474. or ( zExp < 0 )
  5475. or ( zSig0 <= zSig0 + roundIncrement );
  5476. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5477. zExp := 0;
  5478. roundBits := zSig0 and roundMask;
  5479. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5480. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5481. zSig0 += roundIncrement;
  5482. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5483. roundIncrement := roundMask + 1;
  5484. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5485. roundMask |= roundIncrement;
  5486. end;
  5487. zSig0 = ~ roundMask;
  5488. result:=packFloatx80( zSign, zExp, zSig0 );
  5489. end;
  5490. end;
  5491. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5492. zSig0 += roundIncrement;
  5493. if ( zSig0 < roundIncrement ) begin
  5494. ++zExp;
  5495. zSig0 := LIT64( $8000000000000000 );
  5496. end;
  5497. roundIncrement := roundMask + 1;
  5498. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5499. roundMask |= roundIncrement;
  5500. end;
  5501. zSig0 = ~ roundMask;
  5502. if ( zSig0 = 0 ) zExp := 0;
  5503. result:=packFloatx80( zSign, zExp, zSig0 );
  5504. precision80:
  5505. increment := ( (sbits64) zSig1 < 0 );
  5506. if ( ! roundNearestEven ) begin
  5507. if ( roundingMode = float_round_to_zero ) begin
  5508. increment := 0;
  5509. end;
  5510. else begin
  5511. if ( zSign ) begin
  5512. increment := ( roundingMode = float_round_down ) and zSig1;
  5513. end;
  5514. else begin
  5515. increment := ( roundingMode = float_round_up ) and zSig1;
  5516. end;
  5517. end;
  5518. end;
  5519. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5520. if ( ( $7FFE < zExp )
  5521. or ( ( zExp = $7FFE )
  5522. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5523. and increment
  5524. )
  5525. ) begin
  5526. roundMask := 0;
  5527. overflow:
  5528. float_raise( float_flag_overflow or float_flag_inexact );
  5529. if ( ( roundingMode = float_round_to_zero )
  5530. or ( zSign and ( roundingMode = float_round_up ) )
  5531. or ( ! zSign and ( roundingMode = float_round_down ) )
  5532. ) begin
  5533. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5534. end;
  5535. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5536. end;
  5537. if ( zExp <= 0 ) begin
  5538. isTiny =
  5539. ( float_detect_tininess = float_tininess_before_rounding )
  5540. or ( zExp < 0 )
  5541. or ! increment
  5542. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5543. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  5544. zExp := 0;
  5545. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5546. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5547. if ( roundNearestEven ) begin
  5548. increment := ( (sbits64) zSig1 < 0 );
  5549. end;
  5550. else begin
  5551. if ( zSign ) begin
  5552. increment := ( roundingMode = float_round_down ) and zSig1;
  5553. end;
  5554. else begin
  5555. increment := ( roundingMode = float_round_up ) and zSig1;
  5556. end;
  5557. end;
  5558. if ( increment ) begin
  5559. ++zSig0;
  5560. zSig0 =
  5561. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5562. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5563. end;
  5564. result:=packFloatx80( zSign, zExp, zSig0 );
  5565. end;
  5566. end;
  5567. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5568. if ( increment ) begin
  5569. ++zSig0;
  5570. if ( zSig0 = 0 ) begin
  5571. ++zExp;
  5572. zSig0 := LIT64( $8000000000000000 );
  5573. end;
  5574. else begin
  5575. zSig0 = ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5576. end;
  5577. end;
  5578. else begin
  5579. if ( zSig0 = 0 ) zExp := 0;
  5580. end;
  5581. result:=packFloatx80( zSign, zExp, zSig0 );
  5582. end;
  5583. {*----------------------------------------------------------------------------
  5584. | Takes an abstract floating-point value having sign `zSign', exponent
  5585. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5586. | and returns the proper extended double-precision floating-point value
  5587. | corresponding to the abstract input. This routine is just like
  5588. | `roundAndPackFloatx80' except that the input significand does not have to be
  5589. | normalized.
  5590. *----------------------------------------------------------------------------*}
  5591. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5592. var
  5593. shiftCount: int8;
  5594. begin
  5595. if ( zSig0 = 0 ) begin
  5596. zSig0 := zSig1;
  5597. zSig1 := 0;
  5598. zExp -= 64;
  5599. end;
  5600. shiftCount := countLeadingZeros64( zSig0 );
  5601. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5602. zExp := eExp - shiftCount;
  5603. return
  5604. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5605. end;
  5606. {*----------------------------------------------------------------------------
  5607. | Returns the result of converting the extended double-precision floating-
  5608. | point value `a' to the 32-bit two's complement integer format. The
  5609. | conversion is performed according to the IEC/IEEE Standard for Binary
  5610. | Floating-Point Arithmetic---which means in particular that the conversion
  5611. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5612. | largest positive integer is returned. Otherwise, if the conversion
  5613. | overflows, the largest integer with the same sign as `a' is returned.
  5614. *----------------------------------------------------------------------------*}
  5615. function floatx80_to_int32(a: floatx80): int32;
  5616. var
  5617. aSign: flag;
  5618. aExp, shiftCount: int32;
  5619. aSig: bits64;
  5620. begin
  5621. aSig := extractFloatx80Frac( a );
  5622. aExp := extractFloatx80Exp( a );
  5623. aSign := extractFloatx80Sign( a );
  5624. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5625. shiftCount := $4037 - aExp;
  5626. if ( shiftCount <= 0 ) shiftCount := 1;
  5627. shift64RightJamming( aSig, shiftCount, aSig );
  5628. result := roundAndPackInt32( aSign, aSig );
  5629. end;
  5630. {*----------------------------------------------------------------------------
  5631. | Returns the result of converting the extended double-precision floating-
  5632. | point value `a' to the 32-bit two's complement integer format. The
  5633. | conversion is performed according to the IEC/IEEE Standard for Binary
  5634. | Floating-Point Arithmetic, except that the conversion is always rounded
  5635. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5636. | Otherwise, if the conversion overflows, the largest integer with the same
  5637. | sign as `a' is returned.
  5638. *----------------------------------------------------------------------------*}
  5639. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5640. var
  5641. aSign: flag;
  5642. aExp, shiftCount: int32;
  5643. aSig, savedASig: bits64;
  5644. z: int32;
  5645. begin
  5646. aSig := extractFloatx80Frac( a );
  5647. aExp := extractFloatx80Exp( a );
  5648. aSign := extractFloatx80Sign( a );
  5649. if ( $401E < aExp ) begin
  5650. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5651. goto invalid;
  5652. end;
  5653. else if ( aExp < $3FFF ) begin
  5654. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5655. result := 0;
  5656. end;
  5657. shiftCount := $403E - aExp;
  5658. savedASig := aSig;
  5659. aSig >>= shiftCount;
  5660. z := aSig;
  5661. if ( aSign ) z := - z;
  5662. if ( ( z < 0 ) xor aSign ) begin
  5663. invalid:
  5664. float_raise( float_flag_invalid );
  5665. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5666. end;
  5667. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5668. softfloat_exception_flags or= float_flag_inexact;
  5669. end;
  5670. result := z;
  5671. end;
  5672. {*----------------------------------------------------------------------------
  5673. | Returns the result of converting the extended double-precision floating-
  5674. | point value `a' to the 64-bit two's complement integer format. The
  5675. | conversion is performed according to the IEC/IEEE Standard for Binary
  5676. | Floating-Point Arithmetic---which means in particular that the conversion
  5677. | is rounded according to the current rounding mode. If `a' is a NaN,
  5678. | the largest positive integer is returned. Otherwise, if the conversion
  5679. | overflows, the largest integer with the same sign as `a' is returned.
  5680. *----------------------------------------------------------------------------*}
  5681. function floatx80_to_int64(a: floatx80): int64;
  5682. var
  5683. aSign: flag;
  5684. aExp, shiftCount: int32;
  5685. aSig, aSigExtra: bits64;
  5686. begin
  5687. aSig := extractFloatx80Frac( a );
  5688. aExp := extractFloatx80Exp( a );
  5689. aSign := extractFloatx80Sign( a );
  5690. shiftCount := $403E - aExp;
  5691. if ( shiftCount <= 0 ) begin
  5692. if ( shiftCount ) begin
  5693. float_raise( float_flag_invalid );
  5694. if ( ! aSign
  5695. or ( ( aExp = $7FFF )
  5696. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5697. ) begin
  5698. result := LIT64( $7FFFFFFFFFFFFFFF );
  5699. end;
  5700. result := (sbits64) LIT64( $8000000000000000 );
  5701. end;
  5702. aSigExtra := 0;
  5703. end;
  5704. else begin
  5705. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  5706. end;
  5707. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5708. end;
  5709. {*----------------------------------------------------------------------------
  5710. | Returns the result of converting the extended double-precision floating-
  5711. | point value `a' to the 64-bit two's complement integer format. The
  5712. | conversion is performed according to the IEC/IEEE Standard for Binary
  5713. | Floating-Point Arithmetic, except that the conversion is always rounded
  5714. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5715. | Otherwise, if the conversion overflows, the largest integer with the same
  5716. | sign as `a' is returned.
  5717. *----------------------------------------------------------------------------*}
  5718. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5719. var
  5720. aSign: flag;
  5721. aExp, shiftCount: int32;
  5722. aSig: bits64;
  5723. z: int64;
  5724. begin
  5725. aSig := extractFloatx80Frac( a );
  5726. aExp := extractFloatx80Exp( a );
  5727. aSign := extractFloatx80Sign( a );
  5728. shiftCount := aExp - $403E;
  5729. if ( 0 <= shiftCount ) begin
  5730. aSig = LIT64( $7FFFFFFFFFFFFFFF );
  5731. if ( ( a.high <> $C03E ) or aSig ) begin
  5732. float_raise( float_flag_invalid );
  5733. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5734. result := LIT64( $7FFFFFFFFFFFFFFF );
  5735. end;
  5736. end;
  5737. result := (sbits64) LIT64( $8000000000000000 );
  5738. end;
  5739. else if ( aExp < $3FFF ) begin
  5740. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5741. result := 0;
  5742. end;
  5743. z := aSig>>( - shiftCount );
  5744. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5745. softfloat_exception_flags or= float_flag_inexact;
  5746. end;
  5747. if ( aSign ) z := - z;
  5748. result := z;
  5749. end;
  5750. {*----------------------------------------------------------------------------
  5751. | Returns the result of converting the extended double-precision floating-
  5752. | point value `a' to the single-precision floating-point format. The
  5753. | conversion is performed according to the IEC/IEEE Standard for Binary
  5754. | Floating-Point Arithmetic.
  5755. *----------------------------------------------------------------------------*}
  5756. function floatx80_to_float32(a: floatx80): float32;
  5757. var
  5758. aSign: flag;
  5759. aExp: int32;
  5760. aSig: bits64;
  5761. begin
  5762. aSig := extractFloatx80Frac( a );
  5763. aExp := extractFloatx80Exp( a );
  5764. aSign := extractFloatx80Sign( a );
  5765. if ( aExp = $7FFF ) begin
  5766. if ( (bits64) ( aSig shl 1 ) ) begin
  5767. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5768. end;
  5769. result := packFloat32( aSign, $FF, 0 );
  5770. end;
  5771. shift64RightJamming( aSig, 33, aSig );
  5772. if ( aExp or aSig ) aExp -= $3F81;
  5773. result := roundAndPackFloat32( aSign, aExp, aSig );
  5774. end;
  5775. {*----------------------------------------------------------------------------
  5776. | Returns the result of converting the extended double-precision floating-
  5777. | point value `a' to the double-precision floating-point format. The
  5778. | conversion is performed according to the IEC/IEEE Standard for Binary
  5779. | Floating-Point Arithmetic.
  5780. *----------------------------------------------------------------------------*}
  5781. function floatx80_to_float64(a: floatx80): float64;
  5782. var
  5783. aSign: flag;
  5784. aExp: int32;
  5785. aSig, zSig: bits64;
  5786. begin
  5787. aSig := extractFloatx80Frac( a );
  5788. aExp := extractFloatx80Exp( a );
  5789. aSign := extractFloatx80Sign( a );
  5790. if ( aExp = $7FFF ) begin
  5791. if ( (bits64) ( aSig shl 1 ) ) begin
  5792. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5793. end;
  5794. result := packFloat64( aSign, $7FF, 0 );
  5795. end;
  5796. shift64RightJamming( aSig, 1, zSig );
  5797. if ( aExp or aSig ) aExp -= $3C01;
  5798. result := roundAndPackFloat64( aSign, aExp, zSig );
  5799. end;
  5800. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5801. {*----------------------------------------------------------------------------
  5802. | Returns the result of converting the extended double-precision floating-
  5803. | point value `a' to the quadruple-precision floating-point format. The
  5804. | conversion is performed according to the IEC/IEEE Standard for Binary
  5805. | Floating-Point Arithmetic.
  5806. *----------------------------------------------------------------------------*}
  5807. function floatx80_to_float128(a: floatx80): float128;
  5808. var
  5809. aSign: flag;
  5810. aExp: int16;
  5811. aSig, zSig0, zSig1: bits64;
  5812. begin
  5813. aSig := extractFloatx80Frac( a );
  5814. aExp := extractFloatx80Exp( a );
  5815. aSign := extractFloatx80Sign( a );
  5816. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5817. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5818. end;
  5819. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  5820. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5821. end;
  5822. {$endif FPC_SOFTFLOAT_FLOAT128}
  5823. {*----------------------------------------------------------------------------
  5824. | Rounds the extended double-precision floating-point value `a' to an integer,
  5825. | and Returns the result as an extended quadruple-precision floating-point
  5826. | value. The operation is performed according to the IEC/IEEE Standard for
  5827. | Binary Floating-Point Arithmetic.
  5828. *----------------------------------------------------------------------------*}
  5829. function floatx80_round_to_int(a: floatx80): floatx80;
  5830. var
  5831. aSign: flag;
  5832. aExp: int32;
  5833. lastBitMask, roundBitsMask: bits64;
  5834. roundingMode: int8;
  5835. z: floatx80;
  5836. begin
  5837. aExp := extractFloatx80Exp( a );
  5838. if ( $403E <= aExp ) begin
  5839. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5840. result := propagateFloatx80NaN( a, a );
  5841. end;
  5842. result := a;
  5843. end;
  5844. if ( aExp < $3FFF ) begin
  5845. if ( ( aExp = 0 )
  5846. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5847. result := a;
  5848. end;
  5849. softfloat_exception_flags or= float_flag_inexact;
  5850. aSign := extractFloatx80Sign( a );
  5851. switch ( softfloat_rounding_mode ) begin
  5852. case float_round_nearest_even:
  5853. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5854. ) begin
  5855. result :=
  5856. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5857. end;
  5858. break;
  5859. case float_round_down:
  5860. result :=
  5861. aSign ?
  5862. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5863. : packFloatx80( 0, 0, 0 );
  5864. case float_round_up:
  5865. result :=
  5866. aSign ? packFloatx80( 1, 0, 0 )
  5867. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5868. end;
  5869. result := packFloatx80( aSign, 0, 0 );
  5870. end;
  5871. lastBitMask := 1;
  5872. lastBitMask shl = $403E - aExp;
  5873. roundBitsMask := lastBitMask - 1;
  5874. z := a;
  5875. roundingMode := softfloat_rounding_mode;
  5876. if ( roundingMode = float_round_nearest_even ) begin
  5877. z.low += lastBitMask>>1;
  5878. if ( ( z.low and roundBitsMask ) = 0 ) z.low = ~ lastBitMask;
  5879. end;
  5880. else if ( roundingMode <> float_round_to_zero ) begin
  5881. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5882. z.low += roundBitsMask;
  5883. end;
  5884. end;
  5885. z.low = ~ roundBitsMask;
  5886. if ( z.low = 0 ) begin
  5887. ++z.high;
  5888. z.low := LIT64( $8000000000000000 );
  5889. end;
  5890. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5891. result := z;
  5892. end;
  5893. {*----------------------------------------------------------------------------
  5894. | Returns the result of adding the absolute values of the extended double-
  5895. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5896. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5897. | The addition is performed according to the IEC/IEEE Standard for Binary
  5898. | Floating-Point Arithmetic.
  5899. *----------------------------------------------------------------------------*}
  5900. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5901. var
  5902. aExp, bExp, zExp: int32;
  5903. aSig, bSig, zSig0, zSig1: bits64;
  5904. expDiff: int32;
  5905. begin
  5906. aSig := extractFloatx80Frac( a );
  5907. aExp := extractFloatx80Exp( a );
  5908. bSig := extractFloatx80Frac( b );
  5909. bExp := extractFloatx80Exp( b );
  5910. expDiff := aExp - bExp;
  5911. if ( 0 < expDiff ) begin
  5912. if ( aExp = $7FFF ) begin
  5913. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5914. result := a;
  5915. end;
  5916. if ( bExp = 0 ) --expDiff;
  5917. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5918. zExp := aExp;
  5919. end;
  5920. else if ( expDiff < 0 ) begin
  5921. if ( bExp = $7FFF ) begin
  5922. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5923. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5924. end;
  5925. if ( aExp = 0 ) ++expDiff;
  5926. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5927. zExp := bExp;
  5928. end;
  5929. else begin
  5930. if ( aExp = $7FFF ) begin
  5931. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5932. result := propagateFloatx80NaN( a, b );
  5933. end;
  5934. result := a;
  5935. end;
  5936. zSig1 := 0;
  5937. zSig0 := aSig + bSig;
  5938. if ( aExp = 0 ) begin
  5939. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  5940. goto roundAndPack;
  5941. end;
  5942. zExp := aExp;
  5943. goto shiftRight1;
  5944. end;
  5945. zSig0 := aSig + bSig;
  5946. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5947. shiftRight1:
  5948. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  5949. zSig0 or= LIT64( $8000000000000000 );
  5950. ++zExp;
  5951. roundAndPack:
  5952. result :=
  5953. roundAndPackFloatx80(
  5954. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5955. end;
  5956. {*----------------------------------------------------------------------------
  5957. | Returns the result of subtracting the absolute values of the extended
  5958. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5959. | difference is negated before being returned. `zSign' is ignored if the
  5960. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5961. | Standard for Binary Floating-Point Arithmetic.
  5962. *----------------------------------------------------------------------------*}
  5963. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5964. var
  5965. aExp, bExp, zExp: int32;
  5966. aSig, bSig, zSig0, zSig1: bits64;
  5967. expDiff: int32;
  5968. z: floatx80;
  5969. begin
  5970. aSig := extractFloatx80Frac( a );
  5971. aExp := extractFloatx80Exp( a );
  5972. bSig := extractFloatx80Frac( b );
  5973. bExp := extractFloatx80Exp( b );
  5974. expDiff := aExp - bExp;
  5975. if ( 0 < expDiff ) goto aExpBigger;
  5976. if ( expDiff < 0 ) goto bExpBigger;
  5977. if ( aExp = $7FFF ) begin
  5978. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5979. result := propagateFloatx80NaN( a, b );
  5980. end;
  5981. float_raise( float_flag_invalid );
  5982. z.low := floatx80_default_nan_low;
  5983. z.high := floatx80_default_nan_high;
  5984. result := z;
  5985. end;
  5986. if ( aExp = 0 ) begin
  5987. aExp := 1;
  5988. bExp := 1;
  5989. end;
  5990. zSig1 := 0;
  5991. if ( bSig < aSig ) goto aBigger;
  5992. if ( aSig < bSig ) goto bBigger;
  5993. result := packFloatx80( softfloat_rounding_mode = float_round_down, 0, 0 );
  5994. bExpBigger:
  5995. if ( bExp = $7FFF ) begin
  5996. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5997. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5998. end;
  5999. if ( aExp = 0 ) ++expDiff;
  6000. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6001. bBigger:
  6002. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6003. zExp := bExp;
  6004. zSign xor = 1;
  6005. goto normalizeRoundAndPack;
  6006. aExpBigger:
  6007. if ( aExp = $7FFF ) begin
  6008. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6009. result := a;
  6010. end;
  6011. if ( bExp = 0 ) --expDiff;
  6012. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6013. aBigger:
  6014. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6015. zExp := aExp;
  6016. normalizeRoundAndPack:
  6017. result :=
  6018. normalizeRoundAndPackFloatx80(
  6019. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6020. end;
  6021. {*----------------------------------------------------------------------------
  6022. | Returns the result of adding the extended double-precision floating-point
  6023. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6024. | Standard for Binary Floating-Point Arithmetic.
  6025. *----------------------------------------------------------------------------*}
  6026. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6027. var
  6028. aSign, bSign: flag;
  6029. begin
  6030. aSign := extractFloatx80Sign( a );
  6031. bSign := extractFloatx80Sign( b );
  6032. if ( aSign = bSign ) begin
  6033. result := addFloatx80Sigs( a, b, aSign );
  6034. end;
  6035. else begin
  6036. result := subFloatx80Sigs( a, b, aSign );
  6037. end;
  6038. end;
  6039. {*----------------------------------------------------------------------------
  6040. | Returns the result of subtracting the extended double-precision floating-
  6041. | point values `a' and `b'. The operation is performed according to the
  6042. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6043. *----------------------------------------------------------------------------*}
  6044. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6045. var
  6046. aSign, bSign: flag;
  6047. begin
  6048. aSign := extractFloatx80Sign( a );
  6049. bSign := extractFloatx80Sign( b );
  6050. if ( aSign = bSign ) begin
  6051. result := subFloatx80Sigs( a, b, aSign );
  6052. end;
  6053. else begin
  6054. result := addFloatx80Sigs( a, b, aSign );
  6055. end;
  6056. end;
  6057. {*----------------------------------------------------------------------------
  6058. | Returns the result of multiplying the extended double-precision floating-
  6059. | point values `a' and `b'. The operation is performed according to the
  6060. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6061. *----------------------------------------------------------------------------*}
  6062. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6063. var
  6064. aSign, bSign, zSign: flag;
  6065. aExp, bExp, zExp: int32;
  6066. aSig, bSig, zSig0, zSig1: bits64;
  6067. z: floatx80;
  6068. begin
  6069. aSig := extractFloatx80Frac( a );
  6070. aExp := extractFloatx80Exp( a );
  6071. aSign := extractFloatx80Sign( a );
  6072. bSig := extractFloatx80Frac( b );
  6073. bExp := extractFloatx80Exp( b );
  6074. bSign := extractFloatx80Sign( b );
  6075. zSign := aSign xor bSign;
  6076. if ( aExp = $7FFF ) begin
  6077. if ( (bits64) ( aSig shl 1 )
  6078. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6079. result := propagateFloatx80NaN( a, b );
  6080. end;
  6081. if ( ( bExp or bSig ) = 0 ) goto invalid;
  6082. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6083. end;
  6084. if ( bExp = $7FFF ) begin
  6085. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6086. if ( ( aExp or aSig ) = 0 ) begin
  6087. invalid:
  6088. float_raise( float_flag_invalid );
  6089. z.low := floatx80_default_nan_low;
  6090. z.high := floatx80_default_nan_high;
  6091. result := z;
  6092. end;
  6093. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6094. end;
  6095. if ( aExp = 0 ) begin
  6096. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6097. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6098. end;
  6099. if ( bExp = 0 ) begin
  6100. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6101. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6102. end;
  6103. zExp := aExp + bExp - $3FFE;
  6104. mul64To128( aSig, bSig, zSig0, zSig1 );
  6105. if ( 0 < (sbits64) zSig0 ) begin
  6106. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6107. --zExp;
  6108. end;
  6109. result :=
  6110. roundAndPackFloatx80(
  6111. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6112. end;
  6113. {*----------------------------------------------------------------------------
  6114. | Returns the result of dividing the extended double-precision floating-point
  6115. | value `a' by the corresponding value `b'. The operation is performed
  6116. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6117. *----------------------------------------------------------------------------*}
  6118. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6119. var
  6120. aSign, bSign, zSign: flag;
  6121. aExp, bExp, zExp: int32;
  6122. aSig, bSig, zSig0, zSig1: bits64;
  6123. rem0, rem1, rem2, term0, term1, term2: bits64;
  6124. z: floatx80;
  6125. begin
  6126. aSig := extractFloatx80Frac( a );
  6127. aExp := extractFloatx80Exp( a );
  6128. aSign := extractFloatx80Sign( a );
  6129. bSig := extractFloatx80Frac( b );
  6130. bExp := extractFloatx80Exp( b );
  6131. bSign := extractFloatx80Sign( b );
  6132. zSign := aSign xor bSign;
  6133. if ( aExp = $7FFF ) begin
  6134. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6135. if ( bExp = $7FFF ) begin
  6136. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6137. goto invalid;
  6138. end;
  6139. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6140. end;
  6141. if ( bExp = $7FFF ) begin
  6142. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6143. result := packFloatx80( zSign, 0, 0 );
  6144. end;
  6145. if ( bExp = 0 ) begin
  6146. if ( bSig = 0 ) begin
  6147. if ( ( aExp or aSig ) = 0 ) begin
  6148. invalid:
  6149. float_raise( float_flag_invalid );
  6150. z.low := floatx80_default_nan_low;
  6151. z.high := floatx80_default_nan_high;
  6152. result := z;
  6153. end;
  6154. float_raise( float_flag_divbyzero );
  6155. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6156. end;
  6157. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6158. end;
  6159. if ( aExp = 0 ) begin
  6160. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6161. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6162. end;
  6163. zExp := aExp - bExp + $3FFE;
  6164. rem1 := 0;
  6165. if ( bSig <= aSig ) begin
  6166. shift128Right( aSig, 0, 1, aSig, rem1 );
  6167. ++zExp;
  6168. end;
  6169. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6170. mul64To128( bSig, zSig0, term0, term1 );
  6171. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6172. while ( (sbits64) rem0 < 0 ) begin
  6173. --zSig0;
  6174. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6175. end;
  6176. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6177. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  6178. mul64To128( bSig, zSig1, term1, term2 );
  6179. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6180. while ( (sbits64) rem1 < 0 ) begin
  6181. --zSig1;
  6182. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6183. end;
  6184. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  6185. end;
  6186. result :=
  6187. roundAndPackFloatx80(
  6188. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6189. end;
  6190. {*----------------------------------------------------------------------------
  6191. | Returns the remainder of the extended double-precision floating-point value
  6192. | `a' with respect to the corresponding value `b'. The operation is performed
  6193. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6194. *----------------------------------------------------------------------------*}
  6195. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6196. var
  6197. aSign, bSign, zSign: flag;
  6198. aExp, bExp, expDiff: int32;
  6199. aSig0, aSig1, bSig: bits64;
  6200. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6201. z: floatx80;
  6202. begin
  6203. aSig0 := extractFloatx80Frac( a );
  6204. aExp := extractFloatx80Exp( a );
  6205. aSign := extractFloatx80Sign( a );
  6206. bSig := extractFloatx80Frac( b );
  6207. bExp := extractFloatx80Exp( b );
  6208. bSign := extractFloatx80Sign( b );
  6209. if ( aExp = $7FFF ) begin
  6210. if ( (bits64) ( aSig0 shl 1 )
  6211. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6212. result := propagateFloatx80NaN( a, b );
  6213. end;
  6214. goto invalid;
  6215. end;
  6216. if ( bExp = $7FFF ) begin
  6217. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6218. result := a;
  6219. end;
  6220. if ( bExp = 0 ) begin
  6221. if ( bSig = 0 ) begin
  6222. invalid:
  6223. float_raise( float_flag_invalid );
  6224. z.low := floatx80_default_nan_low;
  6225. z.high := floatx80_default_nan_high;
  6226. result := z;
  6227. end;
  6228. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6229. end;
  6230. if ( aExp = 0 ) begin
  6231. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  6232. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6233. end;
  6234. bSig or= LIT64( $8000000000000000 );
  6235. zSign := aSign;
  6236. expDiff := aExp - bExp;
  6237. aSig1 := 0;
  6238. if ( expDiff < 0 ) begin
  6239. if ( expDiff < -1 ) result := a;
  6240. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6241. expDiff := 0;
  6242. end;
  6243. q := ( bSig <= aSig0 );
  6244. if ( q ) aSig0 -= bSig;
  6245. expDiff -= 64;
  6246. while ( 0 < expDiff ) begin
  6247. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6248. q := ( 2 < q ) ? q - 2 : 0;
  6249. mul64To128( bSig, q, term0, term1 );
  6250. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6251. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6252. expDiff -= 62;
  6253. end;
  6254. expDiff += 64;
  6255. if ( 0 < expDiff ) begin
  6256. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6257. q := ( 2 < q ) ? q - 2 : 0;
  6258. q >>= 64 - expDiff;
  6259. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6260. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6261. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6262. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  6263. ++q;
  6264. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6265. end;
  6266. end;
  6267. else begin
  6268. term1 := 0;
  6269. term0 := bSig;
  6270. end;
  6271. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6272. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6273. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6274. and ( q and 1 ) )
  6275. ) begin
  6276. aSig0 := alternateASig0;
  6277. aSig1 := alternateASig1;
  6278. zSign := ! zSign;
  6279. end;
  6280. result :=
  6281. normalizeRoundAndPackFloatx80(
  6282. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6283. end;
  6284. {*----------------------------------------------------------------------------
  6285. | Returns the square root of the extended double-precision floating-point
  6286. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6287. | for Binary Floating-Point Arithmetic.
  6288. *----------------------------------------------------------------------------*}
  6289. function floatx80_sqrt(a: floatx80): floatx80;
  6290. var
  6291. aSign: flag;
  6292. aExp, zExp: int32;
  6293. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6294. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6295. z: floatx80;
  6296. label
  6297. invalid;
  6298. begin
  6299. aSig0 := extractFloatx80Frac( a );
  6300. aExp := extractFloatx80Exp( a );
  6301. aSign := extractFloatx80Sign( a );
  6302. if ( aExp = $7FFF ) begin
  6303. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  6304. if ( ! aSign ) result := a;
  6305. goto invalid;
  6306. end;
  6307. if ( aSign ) begin
  6308. if ( ( aExp or aSig0 ) = 0 ) result := a;
  6309. invalid:
  6310. float_raise( float_flag_invalid );
  6311. z.low := floatx80_default_nan_low;
  6312. z.high := floatx80_default_nan_high;
  6313. result := z;
  6314. end;
  6315. if ( aExp = 0 ) begin
  6316. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  6317. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6318. end;
  6319. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  6320. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  6321. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6322. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6323. doubleZSig0 := zSig0 shl 1;
  6324. mul64To128( zSig0, zSig0, term0, term1 );
  6325. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6326. while ( (sbits64) rem0 < 0 ) begin
  6327. --zSig0;
  6328. doubleZSig0 -= 2;
  6329. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  6330. end;
  6331. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6332. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  6333. if ( zSig1 = 0 ) zSig1 := 1;
  6334. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6335. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6336. mul64To128( zSig1, zSig1, term2, term3 );
  6337. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6338. while ( (sbits64) rem1 < 0 ) begin
  6339. --zSig1;
  6340. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6341. term3 or= 1;
  6342. term2 or= doubleZSig0;
  6343. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  6344. end;
  6345. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6346. end;
  6347. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  6348. zSig0 or= doubleZSig0;
  6349. result :=
  6350. roundAndPackFloatx80(
  6351. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6352. end;
  6353. {*----------------------------------------------------------------------------
  6354. | Returns 1 if the extended double-precision floating-point value `a' is
  6355. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6356. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6357. | Arithmetic.
  6358. *----------------------------------------------------------------------------*}
  6359. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6360. begin
  6361. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6362. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6363. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6364. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6365. ) begin
  6366. if ( floatx80_is_signaling_nan( a )
  6367. or floatx80_is_signaling_nan( b ) ) begin
  6368. float_raise( float_flag_invalid );
  6369. end;
  6370. result := 0;
  6371. end;
  6372. result :=
  6373. ( a.low = b.low )
  6374. and ( ( a.high = b.high )
  6375. or ( ( a.low = 0 )
  6376. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6377. );
  6378. end;
  6379. {*----------------------------------------------------------------------------
  6380. | Returns 1 if the extended double-precision floating-point value `a' is
  6381. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6382. | comparison is performed according to the IEC/IEEE Standard for Binary
  6383. | Floating-Point Arithmetic.
  6384. *----------------------------------------------------------------------------*}
  6385. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6386. var
  6387. aSign, bSign: flag;
  6388. begin
  6389. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6390. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6391. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6392. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6393. ) begin
  6394. float_raise( float_flag_invalid );
  6395. result := 0;
  6396. end;
  6397. aSign := extractFloatx80Sign( a );
  6398. bSign := extractFloatx80Sign( b );
  6399. if ( aSign <> bSign ) begin
  6400. result :=
  6401. aSign
  6402. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6403. = 0 );
  6404. end;
  6405. result :=
  6406. aSign ? le128( b.high, b.low, a.high, a.low )
  6407. : le128( a.high, a.low, b.high, b.low );
  6408. end;
  6409. {*----------------------------------------------------------------------------
  6410. | Returns 1 if the extended double-precision floating-point value `a' is
  6411. | less than the corresponding value `b', and 0 otherwise. The comparison
  6412. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6413. | Arithmetic.
  6414. *----------------------------------------------------------------------------*}
  6415. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6416. var
  6417. aSign, bSign: flag;
  6418. begin
  6419. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6420. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6421. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6422. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6423. ) begin
  6424. float_raise( float_flag_invalid );
  6425. result := 0;
  6426. end;
  6427. aSign := extractFloatx80Sign( a );
  6428. bSign := extractFloatx80Sign( b );
  6429. if ( aSign <> bSign ) begin
  6430. result :=
  6431. aSign
  6432. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6433. <> 0 );
  6434. end;
  6435. result :=
  6436. aSign ? lt128( b.high, b.low, a.high, a.low )
  6437. : lt128( a.high, a.low, b.high, b.low );
  6438. end;
  6439. {*----------------------------------------------------------------------------
  6440. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6441. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6442. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6443. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6444. *----------------------------------------------------------------------------*}
  6445. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6446. begin
  6447. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6448. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6449. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6450. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6451. ) begin
  6452. float_raise( float_flag_invalid );
  6453. result := 0;
  6454. end;
  6455. result :=
  6456. ( a.low = b.low )
  6457. and ( ( a.high = b.high )
  6458. or ( ( a.low = 0 )
  6459. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6460. );
  6461. end;
  6462. {*----------------------------------------------------------------------------
  6463. | Returns 1 if the extended double-precision floating-point value `a' is less
  6464. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6465. | do not cause an exception. Otherwise, the comparison is performed according
  6466. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6467. *----------------------------------------------------------------------------*}
  6468. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6469. var
  6470. aSign, bSign: flag;
  6471. begin
  6472. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6473. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6474. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6475. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6476. ) begin
  6477. if ( floatx80_is_signaling_nan( a )
  6478. or floatx80_is_signaling_nan( b ) ) begin
  6479. float_raise( float_flag_invalid );
  6480. end;
  6481. result := 0;
  6482. end;
  6483. aSign := extractFloatx80Sign( a );
  6484. bSign := extractFloatx80Sign( b );
  6485. if ( aSign <> bSign ) begin
  6486. result :=
  6487. aSign
  6488. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6489. = 0 );
  6490. end;
  6491. result :=
  6492. aSign ? le128( b.high, b.low, a.high, a.low )
  6493. : le128( a.high, a.low, b.high, b.low );
  6494. end;
  6495. {*----------------------------------------------------------------------------
  6496. | Returns 1 if the extended double-precision floating-point value `a' is less
  6497. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6498. | an exception. Otherwise, the comparison is performed according to the
  6499. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6500. *----------------------------------------------------------------------------*}
  6501. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6502. var
  6503. aSign, bSign: flag;
  6504. begin
  6505. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6506. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6507. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6508. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6509. ) begin
  6510. if ( floatx80_is_signaling_nan( a )
  6511. or floatx80_is_signaling_nan( b ) ) begin
  6512. float_raise( float_flag_invalid );
  6513. end;
  6514. result := 0;
  6515. end;
  6516. aSign := extractFloatx80Sign( a );
  6517. bSign := extractFloatx80Sign( b );
  6518. if ( aSign <> bSign ) begin
  6519. result :=
  6520. aSign
  6521. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6522. <> 0 );
  6523. end;
  6524. result :=
  6525. aSign ? lt128( b.high, b.low, a.high, a.low )
  6526. : lt128( a.high, a.low, b.high, b.low );
  6527. end;
  6528. {$endif FPC_SOFTFLOAT_FLOATX80}
  6529. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6530. {*----------------------------------------------------------------------------
  6531. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6532. | floating-point value `a'.
  6533. *----------------------------------------------------------------------------*}
  6534. function extractFloat128Frac1(a : float128): bits64;
  6535. begin
  6536. result:=a.low;
  6537. end;
  6538. {*----------------------------------------------------------------------------
  6539. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6540. | floating-point value `a'.
  6541. *----------------------------------------------------------------------------*}
  6542. function extractFloat128Frac0(a : float128): bits64;
  6543. begin
  6544. result:=a.high and int64($0000FFFFFFFFFFFF);
  6545. end;
  6546. {*----------------------------------------------------------------------------
  6547. | Returns the exponent bits of the quadruple-precision floating-point value
  6548. | `a'.
  6549. *----------------------------------------------------------------------------*}
  6550. function extractFloat128Exp(a : float128): int32;
  6551. begin
  6552. result:=( a.high shr 48 ) and $7FFF;
  6553. end;
  6554. {*----------------------------------------------------------------------------
  6555. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6556. *----------------------------------------------------------------------------*}
  6557. function extractFloat128Sign(a : float128): flag;
  6558. begin
  6559. result:=a.high shr 63;
  6560. end;
  6561. {*----------------------------------------------------------------------------
  6562. | Normalizes the subnormal quadruple-precision floating-point value
  6563. | represented by the denormalized significand formed by the concatenation of
  6564. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6565. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6566. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6567. | least significant 64 bits of the normalized significand are stored at the
  6568. | location pointed to by `zSig1Ptr'.
  6569. *----------------------------------------------------------------------------*}
  6570. procedure normalizeFloat128Subnormal(
  6571. aSig0: bits64;
  6572. aSig1: bits64;
  6573. var zExpPtr: int32;
  6574. var zSig0Ptr: bits64;
  6575. var zSig1Ptr: bits64);
  6576. var
  6577. shiftCount: int8;
  6578. begin
  6579. if ( aSig0 = 0 ) then
  6580. begin
  6581. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6582. if ( shiftCount < 0 ) then
  6583. begin
  6584. zSig0Ptr := aSig1 shr ( - shiftCount );
  6585. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6586. end
  6587. else begin
  6588. zSig0Ptr := aSig1 shl shiftCount;
  6589. zSig1Ptr := 0;
  6590. end;
  6591. zExpPtr := - shiftCount - 63;
  6592. end
  6593. else begin
  6594. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6595. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6596. zExpPtr := 1 - shiftCount;
  6597. end;
  6598. end;
  6599. {*----------------------------------------------------------------------------
  6600. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6601. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6602. | floating-point value, returning the result. After being shifted into the
  6603. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6604. | added together to form the most significant 32 bits of the result. This
  6605. | means that any integer portion of `zSig0' will be added into the exponent.
  6606. | Since a properly normalized significand will have an integer portion equal
  6607. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6608. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6609. | significand.
  6610. *----------------------------------------------------------------------------*}
  6611. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6612. var
  6613. z: float128;
  6614. begin
  6615. z.low := zSig1;
  6616. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6617. result:=z;
  6618. end;
  6619. {*----------------------------------------------------------------------------
  6620. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6621. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6622. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6623. | corresponding to the abstract input. Ordinarily, the abstract value is
  6624. | simply rounded and packed into the quadruple-precision format, with the
  6625. | inexact exception raised if the abstract input cannot be represented
  6626. | exactly. However, if the abstract value is too large, the overflow and
  6627. | inexact exceptions are raised and an infinity or maximal finite value is
  6628. | returned. If the abstract value is too small, the input value is rounded to
  6629. | a subnormal number, and the underflow and inexact exceptions are raised if
  6630. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6631. | precision floating-point number.
  6632. | The input significand must be normalized or smaller. If the input
  6633. | significand is not normalized, `zExp' must be 0; in that case, the result
  6634. | returned is a subnormal number, and it must not require rounding. In the
  6635. | usual case that the input significand is normalized, `zExp' must be 1 less
  6636. | than the ``true'' floating-point exponent. The handling of underflow and
  6637. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6638. *----------------------------------------------------------------------------*}
  6639. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6640. var
  6641. roundingMode: int8;
  6642. roundNearestEven, increment, isTiny: flag;
  6643. begin
  6644. roundingMode := softfloat_rounding_mode;
  6645. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6646. increment := ord( sbits64(zSig2) < 0 );
  6647. if ( roundNearestEven=0 ) then
  6648. begin
  6649. if ( roundingMode = float_round_to_zero ) then
  6650. begin
  6651. increment := 0;
  6652. end
  6653. else begin
  6654. if ( zSign<>0 ) then
  6655. begin
  6656. increment := ord( roundingMode = float_round_down ) and zSig2;
  6657. end
  6658. else begin
  6659. increment := ord( roundingMode = float_round_up ) and zSig2;
  6660. end;
  6661. end;
  6662. end;
  6663. if ( $7FFD <= bits32(zExp) ) then
  6664. begin
  6665. if ( ord( $7FFD < zExp )
  6666. or ( ord( zExp = $7FFD )
  6667. and eq128(
  6668. int64( $0001FFFFFFFFFFFF ),
  6669. int64( $FFFFFFFFFFFFFFFF ),
  6670. zSig0,
  6671. zSig1
  6672. )
  6673. and increment
  6674. )
  6675. )<>0 then
  6676. begin
  6677. float_raise( float_flag_overflow or float_flag_inexact );
  6678. if ( ord( roundingMode = float_round_to_zero )
  6679. or ( zSign and ord( roundingMode = float_round_up ) )
  6680. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6681. )<>0 then
  6682. begin
  6683. result :=
  6684. packFloat128(
  6685. zSign,
  6686. $7FFE,
  6687. int64( $0000FFFFFFFFFFFF ),
  6688. int64( $FFFFFFFFFFFFFFFF )
  6689. );
  6690. end;
  6691. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6692. end;
  6693. if ( zExp < 0 ) then
  6694. begin
  6695. isTiny :=
  6696. ord(( float_detect_tininess = float_tininess_before_rounding )
  6697. or ( zExp < -1 )
  6698. or not( increment<>0 )
  6699. or boolean(lt128(
  6700. zSig0,
  6701. zSig1,
  6702. int64( $0001FFFFFFFFFFFF ),
  6703. int64( $FFFFFFFFFFFFFFFF )
  6704. )));
  6705. shift128ExtraRightJamming(
  6706. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6707. zExp := 0;
  6708. if ( isTiny and zSig2 )<>0 then
  6709. float_raise( float_flag_underflow );
  6710. if ( roundNearestEven<>0 ) then
  6711. begin
  6712. increment := ord( sbits64(zSig2) < 0 );
  6713. end
  6714. else begin
  6715. if ( zSign<>0 ) then
  6716. begin
  6717. increment := ord( roundingMode = float_round_down ) and zSig2;
  6718. end
  6719. else begin
  6720. increment := ord( roundingMode = float_round_up ) and zSig2;
  6721. end;
  6722. end;
  6723. end;
  6724. end;
  6725. if ( zSig2<>0 ) then
  6726. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6727. if ( increment<>0 ) then
  6728. begin
  6729. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6730. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6731. end
  6732. else begin
  6733. if ( ( zSig0 or zSig1 ) = 0 ) then
  6734. zExp := 0;
  6735. end;
  6736. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6737. end;
  6738. {*----------------------------------------------------------------------------
  6739. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6740. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6741. | returns the proper quadruple-precision floating-point value corresponding
  6742. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6743. | except that the input significand has fewer bits and does not have to be
  6744. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6745. | point exponent.
  6746. *----------------------------------------------------------------------------*}
  6747. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6748. var
  6749. shiftCount: int8;
  6750. zSig2: bits64;
  6751. begin
  6752. if ( zSig0 = 0 ) then
  6753. begin
  6754. zSig0 := zSig1;
  6755. zSig1 := 0;
  6756. dec(zExp, 64);
  6757. end;
  6758. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6759. if ( 0 <= shiftCount ) then
  6760. begin
  6761. zSig2 := 0;
  6762. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6763. end
  6764. else begin
  6765. shift128ExtraRightJamming(
  6766. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6767. end;
  6768. dec(zExp, shiftCount);
  6769. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6770. end;
  6771. {*----------------------------------------------------------------------------
  6772. | Returns the result of converting the quadruple-precision floating-point
  6773. | value `a' to the 32-bit two's complement integer format. The conversion
  6774. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6775. | Arithmetic---which means in particular that the conversion is rounded
  6776. | according to the current rounding mode. If `a' is a NaN, the largest
  6777. | positive integer is returned. Otherwise, if the conversion overflows, the
  6778. | largest integer with the same sign as `a' is returned.
  6779. *----------------------------------------------------------------------------*}
  6780. function float128_to_int32(a: float128): int32;
  6781. var
  6782. aSign: flag;
  6783. aExp, shiftCount: int32;
  6784. aSig0, aSig1: bits64;
  6785. begin
  6786. aSig1 := extractFloat128Frac1( a );
  6787. aSig0 := extractFloat128Frac0( a );
  6788. aExp := extractFloat128Exp( a );
  6789. aSign := extractFloat128Sign( a );
  6790. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6791. aSign := 0;
  6792. if ( aExp<>0 ) then
  6793. aSig0 := aSig0 or int64( $0001000000000000 );
  6794. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6795. shiftCount := $4028 - aExp;
  6796. if ( 0 < shiftCount ) then
  6797. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6798. result := roundAndPackInt32( aSign, aSig0 );
  6799. end;
  6800. {*----------------------------------------------------------------------------
  6801. | Returns the result of converting the quadruple-precision floating-point
  6802. | value `a' to the 32-bit two's complement integer format. The conversion
  6803. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6804. | Arithmetic, except that the conversion is always rounded toward zero. If
  6805. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6806. | conversion overflows, the largest integer with the same sign as `a' is
  6807. | returned.
  6808. *----------------------------------------------------------------------------*}
  6809. function float128_to_int32_round_to_zero(a: float128): int32;
  6810. var
  6811. aSign: flag;
  6812. aExp, shiftCount: int32;
  6813. aSig0, aSig1, savedASig: bits64;
  6814. z: int32;
  6815. label
  6816. invalid;
  6817. begin
  6818. aSig1 := extractFloat128Frac1( a );
  6819. aSig0 := extractFloat128Frac0( a );
  6820. aExp := extractFloat128Exp( a );
  6821. aSign := extractFloat128Sign( a );
  6822. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6823. if ( $401E < aExp ) then
  6824. begin
  6825. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6826. aSign := 0;
  6827. goto invalid;
  6828. end
  6829. else if ( aExp < $3FFF ) then
  6830. begin
  6831. if ( aExp or aSig0 )<>0 then
  6832. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6833. result := 0;
  6834. exit;
  6835. end;
  6836. aSig0 := aSig0 or int64( $0001000000000000 );
  6837. shiftCount := $402F - aExp;
  6838. savedASig := aSig0;
  6839. aSig0 := aSig0 shr shiftCount;
  6840. z := aSig0;
  6841. if ( aSign )<>0 then
  6842. z := - z;
  6843. if ( ord( z < 0 ) xor aSign )<>0 then
  6844. begin
  6845. invalid:
  6846. float_raise( float_flag_invalid );
  6847. if aSign<>0 then
  6848. result:=$80000000
  6849. else
  6850. result:=$7FFFFFFF;
  6851. exit;
  6852. end;
  6853. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6854. begin
  6855. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6856. end;
  6857. result := z;
  6858. end;
  6859. {*----------------------------------------------------------------------------
  6860. | Returns the result of converting the quadruple-precision floating-point
  6861. | value `a' to the 64-bit two's complement integer format. The conversion
  6862. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6863. | Arithmetic---which means in particular that the conversion is rounded
  6864. | according to the current rounding mode. If `a' is a NaN, the largest
  6865. | positive integer is returned. Otherwise, if the conversion overflows, the
  6866. | largest integer with the same sign as `a' is returned.
  6867. *----------------------------------------------------------------------------*}
  6868. function float128_to_int64(a: float128): int64;
  6869. var
  6870. aSign: flag;
  6871. aExp, shiftCount: int32;
  6872. aSig0, aSig1: bits64;
  6873. begin
  6874. aSig1 := extractFloat128Frac1( a );
  6875. aSig0 := extractFloat128Frac0( a );
  6876. aExp := extractFloat128Exp( a );
  6877. aSign := extractFloat128Sign( a );
  6878. if ( aExp<>0 ) then
  6879. aSig0 := aSig0 or int64( $0001000000000000 );
  6880. shiftCount := $402F - aExp;
  6881. if ( shiftCount <= 0 ) then
  6882. begin
  6883. if ( $403E < aExp ) then
  6884. begin
  6885. float_raise( float_flag_invalid );
  6886. if ( (aSign=0)
  6887. or ( ( aExp = $7FFF )
  6888. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6889. )
  6890. ) then
  6891. begin
  6892. result := int64( $7FFFFFFFFFFFFFFF );
  6893. end;
  6894. result := int64( $8000000000000000 );
  6895. end;
  6896. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6897. end
  6898. else begin
  6899. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6900. end;
  6901. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6902. end;
  6903. {*----------------------------------------------------------------------------
  6904. | Returns the result of converting the quadruple-precision floating-point
  6905. | value `a' to the 64-bit two's complement integer format. The conversion
  6906. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6907. | Arithmetic, except that the conversion is always rounded toward zero.
  6908. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6909. | the conversion overflows, the largest integer with the same sign as `a' is
  6910. | returned.
  6911. *----------------------------------------------------------------------------*}
  6912. function float128_to_int64_round_to_zero(a: float128): int64;
  6913. var
  6914. aSign: flag;
  6915. aExp, shiftCount: int32;
  6916. aSig0, aSig1: bits64;
  6917. z: int64;
  6918. begin
  6919. aSig1 := extractFloat128Frac1( a );
  6920. aSig0 := extractFloat128Frac0( a );
  6921. aExp := extractFloat128Exp( a );
  6922. aSign := extractFloat128Sign( a );
  6923. if ( aExp<>0 ) then
  6924. aSig0 := aSig0 or int64( $0001000000000000 );
  6925. shiftCount := aExp - $402F;
  6926. if ( 0 < shiftCount ) then
  6927. begin
  6928. if ( $403E <= aExp ) then
  6929. begin
  6930. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6931. if ( ( a.high = int64( $C03E000000000000 ) )
  6932. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6933. begin
  6934. if ( aSig1<>0 ) then
  6935. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6936. end
  6937. else begin
  6938. float_raise( float_flag_invalid );
  6939. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6940. begin
  6941. result := int64( $7FFFFFFFFFFFFFFF );
  6942. exit;
  6943. end;
  6944. end;
  6945. result := int64( $8000000000000000 );
  6946. exit;
  6947. end;
  6948. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6949. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6950. begin
  6951. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6952. end;
  6953. end
  6954. else begin
  6955. if ( aExp < $3FFF ) then
  6956. begin
  6957. if ( aExp or aSig0 or aSig1 )<>0 then
  6958. begin
  6959. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6960. end;
  6961. result := 0;
  6962. exit;
  6963. end;
  6964. z := aSig0 shr ( - shiftCount );
  6965. if ( (aSig1<>0)
  6966. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6967. begin
  6968. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6969. end;
  6970. end;
  6971. if ( aSign<>0 ) then
  6972. z := - z;
  6973. result := z;
  6974. end;
  6975. {*----------------------------------------------------------------------------
  6976. | Returns the result of converting the quadruple-precision floating-point
  6977. | value `a' to the single-precision floating-point format. The conversion
  6978. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6979. | Arithmetic.
  6980. *----------------------------------------------------------------------------*}
  6981. function float128_to_float32(a: float128): float32;
  6982. var
  6983. aSign: flag;
  6984. aExp: int32;
  6985. aSig0, aSig1: bits64;
  6986. zSig: bits32;
  6987. begin
  6988. aSig1 := extractFloat128Frac1( a );
  6989. aSig0 := extractFloat128Frac0( a );
  6990. aExp := extractFloat128Exp( a );
  6991. aSign := extractFloat128Sign( a );
  6992. if ( aExp = $7FFF ) then
  6993. begin
  6994. if ( aSig0 or aSig1 )<>0 then
  6995. begin
  6996. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6997. exit;
  6998. end;
  6999. result := packFloat32( aSign, $FF, 0 );
  7000. exit;
  7001. end;
  7002. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7003. shift64RightJamming( aSig0, 18, aSig0 );
  7004. zSig := aSig0;
  7005. if ( aExp or zSig )<>0 then
  7006. begin
  7007. zSig := zSig or $40000000;
  7008. dec(aExp,$3F81);
  7009. end;
  7010. result := roundAndPackFloat32( aSign, aExp, zSig );
  7011. end;
  7012. {*----------------------------------------------------------------------------
  7013. | Returns the result of converting the quadruple-precision floating-point
  7014. | value `a' to the double-precision floating-point format. The conversion
  7015. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7016. | Arithmetic.
  7017. *----------------------------------------------------------------------------*}
  7018. function float128_to_float64(a: float128): float64;
  7019. var
  7020. aSign: flag;
  7021. aExp: int32;
  7022. aSig0, aSig1: bits64;
  7023. begin
  7024. aSig1 := extractFloat128Frac1( a );
  7025. aSig0 := extractFloat128Frac0( a );
  7026. aExp := extractFloat128Exp( a );
  7027. aSign := extractFloat128Sign( a );
  7028. if ( aExp = $7FFF ) then
  7029. begin
  7030. if ( aSig0 or aSig1 )<>0 then
  7031. begin
  7032. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7033. exit;
  7034. end;
  7035. result:=packFloat64( aSign, $7FF, 0);
  7036. exit;
  7037. end;
  7038. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7039. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7040. if ( aExp or aSig0 )<>0 then
  7041. begin
  7042. aSig0 := aSig0 or int64( $4000000000000000 );
  7043. dec(aExp,$3C01);
  7044. end;
  7045. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7046. end;
  7047. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7048. {*----------------------------------------------------------------------------
  7049. | Returns the result of converting the quadruple-precision floating-point
  7050. | value `a' to the extended double-precision floating-point format. The
  7051. | conversion is performed according to the IEC/IEEE Standard for Binary
  7052. | Floating-Point Arithmetic.
  7053. *----------------------------------------------------------------------------*}
  7054. function float128_to_floatx80(a: float128): floatx80;
  7055. var
  7056. aSign: flag;
  7057. aExp: int32;
  7058. aSig0, aSig1: bits64;
  7059. begin
  7060. aSig1 := extractFloat128Frac1( a );
  7061. aSig0 := extractFloat128Frac0( a );
  7062. aExp := extractFloat128Exp( a );
  7063. aSign := extractFloat128Sign( a );
  7064. if ( aExp = $7FFF ) begin
  7065. if ( aSig0 or aSig1 ) begin
  7066. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7067. exit;
  7068. end;
  7069. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  7070. exit;
  7071. end;
  7072. if ( aExp = 0 ) begin
  7073. if ( ( aSig0 or aSig1 ) = 0 ) then
  7074. begin
  7075. result := packFloatx80( aSign, 0, 0 );
  7076. exit;
  7077. end;
  7078. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7079. end;
  7080. else begin
  7081. aSig0 or= int64( $0001000000000000 );
  7082. end;
  7083. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7084. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7085. end;
  7086. {$endif FPC_SOFTFLOAT_FLOATX80}
  7087. {*----------------------------------------------------------------------------
  7088. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7089. | Returns the result as a quadruple-precision floating-point value. The
  7090. | operation is performed according to the IEC/IEEE Standard for Binary
  7091. | Floating-Point Arithmetic.
  7092. *----------------------------------------------------------------------------*}
  7093. function float128_round_to_int(a: float128): float128;
  7094. var
  7095. aSign: flag;
  7096. aExp: int32;
  7097. lastBitMask, roundBitsMask: bits64;
  7098. roundingMode: int8;
  7099. z: float128;
  7100. begin
  7101. aExp := extractFloat128Exp( a );
  7102. if ( $402F <= aExp ) then
  7103. begin
  7104. if ( $406F <= aExp ) then
  7105. begin
  7106. if ( ( aExp = $7FFF )
  7107. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7108. ) then
  7109. begin
  7110. result := propagateFloat128NaN( a, a );
  7111. exit;
  7112. end;
  7113. result := a;
  7114. exit;
  7115. end;
  7116. lastBitMask := 1;
  7117. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7118. roundBitsMask := lastBitMask - 1;
  7119. z := a;
  7120. roundingMode := softfloat_rounding_mode;
  7121. if ( roundingMode = float_round_nearest_even ) then
  7122. begin
  7123. if ( lastBitMask )<>0 then
  7124. begin
  7125. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7126. if ( ( z.low and roundBitsMask ) = 0 ) then
  7127. z.low := z.low and not(lastBitMask);
  7128. end
  7129. else begin
  7130. if ( sbits64(z.low) < 0 ) then
  7131. begin
  7132. inc(z.high);
  7133. if ( bits64( z.low shl 1 ) = 0 ) then
  7134. z.high := z.high and not(1);
  7135. end;
  7136. end;
  7137. end
  7138. else if ( roundingMode <> float_round_to_zero ) then
  7139. begin
  7140. if ( extractFloat128Sign( z )
  7141. xor ord( roundingMode = float_round_up ) )<>0 then
  7142. begin
  7143. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7144. end;
  7145. end;
  7146. z.low := z.low and not(roundBitsMask);
  7147. end
  7148. else begin
  7149. if ( aExp < $3FFF ) then
  7150. begin
  7151. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7152. begin
  7153. result := a;
  7154. exit;
  7155. end;
  7156. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7157. aSign := extractFloat128Sign( a );
  7158. case softfloat_rounding_mode of
  7159. float_round_nearest_even:
  7160. if ( ( aExp = $3FFE )
  7161. and ( (extractFloat128Frac0( a )<>0)
  7162. or (extractFloat128Frac1( a )<>0) )
  7163. ) then begin
  7164. begin
  7165. result := packFloat128( aSign, $3FFF, 0, 0 );
  7166. exit;
  7167. end;
  7168. end;
  7169. float_round_down:
  7170. begin
  7171. if aSign<>0 then
  7172. result:=packFloat128( 1, $3FFF, 0, 0 )
  7173. else
  7174. result:=packFloat128( 0, 0, 0, 0 );
  7175. exit;
  7176. end;
  7177. float_round_up:
  7178. begin
  7179. if aSign<>0 then
  7180. result := packFloat128( 1, 0, 0, 0 )
  7181. else
  7182. result:=packFloat128( 0, $3FFF, 0, 0 );
  7183. exit;
  7184. end;
  7185. end;
  7186. result := packFloat128( aSign, 0, 0, 0 );
  7187. exit;
  7188. end;
  7189. lastBitMask := 1;
  7190. lastBitMask := lastBitMask shl ($402F - aExp);
  7191. roundBitsMask := lastBitMask - 1;
  7192. z.low := 0;
  7193. z.high := a.high;
  7194. roundingMode := softfloat_rounding_mode;
  7195. if ( roundingMode = float_round_nearest_even ) then begin
  7196. inc(z.high,lastBitMask shr 1);
  7197. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7198. z.high := z.high and not(lastBitMask);
  7199. end;
  7200. end
  7201. else if ( roundingMode <> float_round_to_zero ) then begin
  7202. if ( (extractFloat128Sign( z )<>0)
  7203. xor ( roundingMode = float_round_up ) ) then begin
  7204. z.high := z.high or ord( a.low <> 0 );
  7205. z.high := z.high+roundBitsMask;
  7206. end;
  7207. end;
  7208. z.high := z.high and not(roundBitsMask);
  7209. end;
  7210. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7211. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7212. end;
  7213. result := z;
  7214. end;
  7215. {*----------------------------------------------------------------------------
  7216. | Returns the result of adding the absolute values of the quadruple-precision
  7217. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7218. | before being returned. `zSign' is ignored if the result is a NaN.
  7219. | The addition is performed according to the IEC/IEEE Standard for Binary
  7220. | Floating-Point Arithmetic.
  7221. *----------------------------------------------------------------------------*}
  7222. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7223. var
  7224. aExp, bExp, zExp: int32;
  7225. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7226. expDiff: int32;
  7227. label
  7228. shiftRight1,roundAndPack;
  7229. begin
  7230. aSig1 := extractFloat128Frac1( a );
  7231. aSig0 := extractFloat128Frac0( a );
  7232. aExp := extractFloat128Exp( a );
  7233. bSig1 := extractFloat128Frac1( b );
  7234. bSig0 := extractFloat128Frac0( b );
  7235. bExp := extractFloat128Exp( b );
  7236. expDiff := aExp - bExp;
  7237. if ( 0 < expDiff ) then begin
  7238. if ( aExp = $7FFF ) then begin
  7239. if ( aSig0 or aSig1 )<>0 then
  7240. begin
  7241. result := propagateFloat128NaN( a, b );
  7242. exit;
  7243. end;
  7244. result := a;
  7245. exit;
  7246. end;
  7247. if ( bExp = 0 ) then begin
  7248. dec(expDiff);
  7249. end
  7250. else begin
  7251. bSig0 := bSig0 or int64( $0001000000000000 );
  7252. end;
  7253. shift128ExtraRightJamming(
  7254. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7255. zExp := aExp;
  7256. end
  7257. else if ( expDiff < 0 ) then begin
  7258. if ( bExp = $7FFF ) then begin
  7259. if ( bSig0 or bSig1 )<>0 then
  7260. begin
  7261. result := propagateFloat128NaN( a, b );
  7262. exit;
  7263. end;
  7264. result := packFloat128( zSign, $7FFF, 0, 0 );
  7265. exit;
  7266. end;
  7267. if ( aExp = 0 ) then begin
  7268. inc(expDiff);
  7269. end
  7270. else begin
  7271. aSig0 := aSig0 or int64( $0001000000000000 );
  7272. end;
  7273. shift128ExtraRightJamming(
  7274. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7275. zExp := bExp;
  7276. end
  7277. else begin
  7278. if ( aExp = $7FFF ) then begin
  7279. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7280. result := propagateFloat128NaN( a, b );
  7281. exit;
  7282. end;
  7283. result := a;
  7284. exit;
  7285. end;
  7286. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7287. if ( aExp = 0 ) then
  7288. begin
  7289. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7290. exit;
  7291. end;
  7292. zSig2 := 0;
  7293. zSig0 := zSig0 or int64( $0002000000000000 );
  7294. zExp := aExp;
  7295. goto shiftRight1;
  7296. end;
  7297. aSig0 := aSig0 or int64( $0001000000000000 );
  7298. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7299. dec(zExp);
  7300. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7301. inc(zExp);
  7302. shiftRight1:
  7303. shift128ExtraRightJamming(
  7304. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7305. roundAndPack:
  7306. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7307. end;
  7308. {*----------------------------------------------------------------------------
  7309. | Returns the result of subtracting the absolute values of the quadruple-
  7310. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7311. | difference is negated before being returned. `zSign' is ignored if the
  7312. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7313. | Standard for Binary Floating-Point Arithmetic.
  7314. *----------------------------------------------------------------------------*}
  7315. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7316. var
  7317. aExp, bExp, zExp: int32;
  7318. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7319. expDiff: int32;
  7320. z: float128;
  7321. label
  7322. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7323. begin
  7324. aSig1 := extractFloat128Frac1( a );
  7325. aSig0 := extractFloat128Frac0( a );
  7326. aExp := extractFloat128Exp( a );
  7327. bSig1 := extractFloat128Frac1( b );
  7328. bSig0 := extractFloat128Frac0( b );
  7329. bExp := extractFloat128Exp( b );
  7330. expDiff := aExp - bExp;
  7331. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7332. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  7333. if ( 0 < expDiff ) then goto aExpBigger;
  7334. if ( expDiff < 0 ) then goto bExpBigger;
  7335. if ( aExp = $7FFF ) then begin
  7336. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7337. result := propagateFloat128NaN( a, b );
  7338. exit;
  7339. end;
  7340. float_raise( float_flag_invalid );
  7341. z.low := float128_default_nan_low;
  7342. z.high := float128_default_nan_high;
  7343. result := z;
  7344. exit;
  7345. end;
  7346. if ( aExp = 0 ) then begin
  7347. aExp := 1;
  7348. bExp := 1;
  7349. end;
  7350. if ( bSig0 < aSig0 ) then goto aBigger;
  7351. if ( aSig0 < bSig0 ) then goto bBigger;
  7352. if ( bSig1 < aSig1 ) then goto aBigger;
  7353. if ( aSig1 < bSig1 ) then goto bBigger;
  7354. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  7355. exit;
  7356. bExpBigger:
  7357. if ( bExp = $7FFF ) then begin
  7358. if ( bSig0 or bSig1 )<>0 then
  7359. begin
  7360. result := propagateFloat128NaN( a, b );
  7361. exit;
  7362. end;
  7363. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  7364. exit;
  7365. end;
  7366. if ( aExp = 0 ) then begin
  7367. inc(expDiff);
  7368. end
  7369. else begin
  7370. aSig0 := aSig0 or int64( $4000000000000000 );
  7371. end;
  7372. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7373. bSig0 := bSig0 or int64( $4000000000000000 );
  7374. bBigger:
  7375. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  7376. zExp := bExp;
  7377. zSign := zSign xor 1;
  7378. goto normalizeRoundAndPack;
  7379. aExpBigger:
  7380. if ( aExp = $7FFF ) then begin
  7381. if ( aSig0 or aSig1 )<>0 then
  7382. begin
  7383. result := propagateFloat128NaN( a, b );
  7384. exit;
  7385. end;
  7386. result := a;
  7387. exit;
  7388. end;
  7389. if ( bExp = 0 ) then begin
  7390. dec(expDiff);
  7391. end
  7392. else begin
  7393. bSig0 := bSig0 or int64( $4000000000000000 );
  7394. end;
  7395. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  7396. aSig0 := aSig0 or int64( $4000000000000000 );
  7397. aBigger:
  7398. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7399. zExp := aExp;
  7400. normalizeRoundAndPack:
  7401. dec(zExp);
  7402. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7403. end;
  7404. {*----------------------------------------------------------------------------
  7405. | Returns the result of adding the quadruple-precision floating-point values
  7406. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7407. | for Binary Floating-Point Arithmetic.
  7408. *----------------------------------------------------------------------------*}
  7409. function float128_add(a: float128; b: float128): float128;
  7410. var
  7411. aSign, bSign: flag;
  7412. begin
  7413. aSign := extractFloat128Sign( a );
  7414. bSign := extractFloat128Sign( b );
  7415. if ( aSign = bSign ) then begin
  7416. result := addFloat128Sigs( a, b, aSign );
  7417. end
  7418. else begin
  7419. result := subFloat128Sigs( a, b, aSign );
  7420. end;
  7421. end;
  7422. {*----------------------------------------------------------------------------
  7423. | Returns the result of subtracting the quadruple-precision floating-point
  7424. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7425. | Standard for Binary Floating-Point Arithmetic.
  7426. *----------------------------------------------------------------------------*}
  7427. function float128_sub(a: float128; b: float128): float128;
  7428. var
  7429. aSign, bSign: flag;
  7430. begin
  7431. aSign := extractFloat128Sign( a );
  7432. bSign := extractFloat128Sign( b );
  7433. if ( aSign = bSign ) then begin
  7434. result := subFloat128Sigs( a, b, aSign );
  7435. end
  7436. else begin
  7437. result := addFloat128Sigs( a, b, aSign );
  7438. end;
  7439. end;
  7440. {*----------------------------------------------------------------------------
  7441. | Returns the result of multiplying the quadruple-precision floating-point
  7442. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7443. | Standard for Binary Floating-Point Arithmetic.
  7444. *----------------------------------------------------------------------------*}
  7445. function float128_mul(a: float128; b: float128): float128;
  7446. var
  7447. aSign, bSign, zSign: flag;
  7448. aExp, bExp, zExp: int32;
  7449. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7450. z: float128;
  7451. label
  7452. invalid;
  7453. begin
  7454. aSig1 := extractFloat128Frac1( a );
  7455. aSig0 := extractFloat128Frac0( a );
  7456. aExp := extractFloat128Exp( a );
  7457. aSign := extractFloat128Sign( a );
  7458. bSig1 := extractFloat128Frac1( b );
  7459. bSig0 := extractFloat128Frac0( b );
  7460. bExp := extractFloat128Exp( b );
  7461. bSign := extractFloat128Sign( b );
  7462. zSign := aSign xor bSign;
  7463. if ( aExp = $7FFF ) then begin
  7464. if ( (( aSig0 or aSig1 )<>0)
  7465. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7466. result := propagateFloat128NaN( a, b );
  7467. exit;
  7468. end;
  7469. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  7470. result := packFloat128( zSign, $7FFF, 0, 0 );
  7471. exit;
  7472. end;
  7473. if ( bExp = $7FFF ) then begin
  7474. if ( bSig0 or bSig1 )<>0 then
  7475. begin
  7476. result := propagateFloat128NaN( a, b );
  7477. exit;
  7478. end;
  7479. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7480. invalid:
  7481. float_raise( float_flag_invalid );
  7482. z.low := float128_default_nan_low;
  7483. z.high := float128_default_nan_high;
  7484. result := z;
  7485. exit;
  7486. end;
  7487. result := packFloat128( zSign, $7FFF, 0, 0 );
  7488. exit;
  7489. end;
  7490. if ( aExp = 0 ) then begin
  7491. if ( ( aSig0 or aSig1 ) = 0 ) then
  7492. begin
  7493. result := packFloat128( zSign, 0, 0, 0 );
  7494. exit;
  7495. end;
  7496. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7497. end;
  7498. if ( bExp = 0 ) then begin
  7499. if ( ( bSig0 or bSig1 ) = 0 ) then
  7500. begin
  7501. result := packFloat128( zSign, 0, 0, 0 );
  7502. exit;
  7503. end;
  7504. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7505. end;
  7506. zExp := aExp + bExp - $4000;
  7507. aSig0 := aSig0 or int64( $0001000000000000 );
  7508. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  7509. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  7510. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  7511. zSig2 := zSig2 or ord( zSig3 <> 0 );
  7512. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  7513. shift128ExtraRightJamming(
  7514. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7515. inc(zExp);
  7516. end;
  7517. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7518. end;
  7519. {*----------------------------------------------------------------------------
  7520. | Returns the result of dividing the quadruple-precision floating-point value
  7521. | `a' by the corresponding value `b'. The operation is performed according to
  7522. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7523. *----------------------------------------------------------------------------*}
  7524. function float128_div(a: float128; b: float128): float128;
  7525. var
  7526. aSign, bSign, zSign: flag;
  7527. aExp, bExp, zExp: int32;
  7528. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7529. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7530. z: float128;
  7531. label
  7532. invalid;
  7533. begin
  7534. aSig1 := extractFloat128Frac1( a );
  7535. aSig0 := extractFloat128Frac0( a );
  7536. aExp := extractFloat128Exp( a );
  7537. aSign := extractFloat128Sign( a );
  7538. bSig1 := extractFloat128Frac1( b );
  7539. bSig0 := extractFloat128Frac0( b );
  7540. bExp := extractFloat128Exp( b );
  7541. bSign := extractFloat128Sign( b );
  7542. zSign := aSign xor bSign;
  7543. if ( aExp = $7FFF ) then begin
  7544. if ( aSig0 or aSig1 )<>0 then
  7545. begin
  7546. result := propagateFloat128NaN( a, b );
  7547. exit;
  7548. end;
  7549. if ( bExp = $7FFF ) then begin
  7550. if ( bSig0 or bSig1 )<>0 then
  7551. begin
  7552. result := propagateFloat128NaN( a, b );
  7553. exit;
  7554. end;
  7555. goto invalid;
  7556. end;
  7557. result := packFloat128( zSign, $7FFF, 0, 0 );
  7558. exit;
  7559. end;
  7560. if ( bExp = $7FFF ) then begin
  7561. if ( bSig0 or bSig1 )<>0 then
  7562. begin
  7563. result := propagateFloat128NaN( a, b );
  7564. exit;
  7565. end;
  7566. result := packFloat128( zSign, 0, 0, 0 );
  7567. exit;
  7568. end;
  7569. if ( bExp = 0 ) then begin
  7570. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7571. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7572. invalid:
  7573. float_raise( float_flag_invalid );
  7574. z.low := float128_default_nan_low;
  7575. z.high := float128_default_nan_high;
  7576. result := z;
  7577. exit;
  7578. end;
  7579. float_raise( float_flag_divbyzero );
  7580. result := packFloat128( zSign, $7FFF, 0, 0 );
  7581. exit;
  7582. end;
  7583. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7584. end;
  7585. if ( aExp = 0 ) then begin
  7586. if ( ( aSig0 or aSig1 ) = 0 ) then
  7587. begin
  7588. result := packFloat128( zSign, 0, 0, 0 );
  7589. exit;
  7590. end;
  7591. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7592. end;
  7593. zExp := aExp - bExp + $3FFD;
  7594. shortShift128Left(
  7595. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  7596. shortShift128Left(
  7597. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7598. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  7599. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  7600. inc(zExp);
  7601. end;
  7602. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7603. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  7604. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  7605. while ( sbits64(rem0) < 0 ) do begin
  7606. dec(zSig0);
  7607. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  7608. end;
  7609. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7610. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  7611. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  7612. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  7613. while ( sbits64(rem1) < 0 ) do begin
  7614. dec(zSig1);
  7615. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  7616. end;
  7617. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7618. end;
  7619. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  7620. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7621. end;
  7622. {*----------------------------------------------------------------------------
  7623. | Returns the remainder of the quadruple-precision floating-point value `a'
  7624. | with respect to the corresponding value `b'. The operation is performed
  7625. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7626. *----------------------------------------------------------------------------*}
  7627. function float128_rem(a: float128; b: float128): float128;
  7628. var
  7629. aSign, bSign, zSign: flag;
  7630. aExp, bExp, expDiff: int32;
  7631. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7632. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7633. sigMean0: sbits64;
  7634. z: float128;
  7635. label
  7636. invalid;
  7637. begin
  7638. aSig1 := extractFloat128Frac1( a );
  7639. aSig0 := extractFloat128Frac0( a );
  7640. aExp := extractFloat128Exp( a );
  7641. aSign := extractFloat128Sign( a );
  7642. bSig1 := extractFloat128Frac1( b );
  7643. bSig0 := extractFloat128Frac0( b );
  7644. bExp := extractFloat128Exp( b );
  7645. bSign := extractFloat128Sign( b );
  7646. if ( aExp = $7FFF ) then begin
  7647. if ( (( aSig0 or aSig1 )<>0)
  7648. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7649. result := propagateFloat128NaN( a, b );
  7650. exit;
  7651. end;
  7652. goto invalid;
  7653. end;
  7654. if ( bExp = $7FFF ) then begin
  7655. if ( bSig0 or bSig1 )<>0 then
  7656. begin
  7657. result := propagateFloat128NaN( a, b );
  7658. exit;
  7659. end;
  7660. result := a;
  7661. exit;
  7662. end;
  7663. if ( bExp = 0 ) then begin
  7664. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7665. invalid:
  7666. float_raise( float_flag_invalid );
  7667. z.low := float128_default_nan_low;
  7668. z.high := float128_default_nan_high;
  7669. result := z;
  7670. exit;
  7671. end;
  7672. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7673. end;
  7674. if ( aExp = 0 ) then begin
  7675. if ( ( aSig0 or aSig1 ) = 0 ) then
  7676. begin
  7677. result := a;
  7678. exit;
  7679. end;
  7680. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7681. end;
  7682. expDiff := aExp - bExp;
  7683. if ( expDiff < -1 ) then
  7684. begin
  7685. result := a;
  7686. exit;
  7687. end;
  7688. shortShift128Left(
  7689. aSig0 or int64( $0001000000000000 ),
  7690. aSig1,
  7691. 15 - ord( expDiff < 0 ),
  7692. aSig0,
  7693. aSig1
  7694. );
  7695. shortShift128Left(
  7696. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7697. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7698. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7699. dec(expDiff,64);
  7700. while ( 0 < expDiff ) do begin
  7701. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7702. if ( 4 < q ) then
  7703. q := q - 4
  7704. else
  7705. q := 0;
  7706. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7707. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  7708. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  7709. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  7710. dec(expDiff,61);
  7711. end;
  7712. if ( -64 < expDiff ) then begin
  7713. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7714. if ( 4 < q ) then
  7715. q := q - 4
  7716. else
  7717. q := 0;
  7718. q := q shr (- expDiff);
  7719. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7720. inc(expDiff,52);
  7721. if ( expDiff < 0 ) then begin
  7722. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7723. end
  7724. else begin
  7725. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  7726. end;
  7727. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7728. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  7729. end
  7730. else begin
  7731. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  7732. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7733. end;
  7734. repeat
  7735. alternateASig0 := aSig0;
  7736. alternateASig1 := aSig1;
  7737. inc(q);
  7738. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7739. until not( 0 <= sbits64(aSig0) );
  7740. add128(
  7741. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  7742. if ( ( sigMean0 < 0 )
  7743. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  7744. aSig0 := alternateASig0;
  7745. aSig1 := alternateASig1;
  7746. end;
  7747. zSign := ord( sbits64(aSig0) < 0 );
  7748. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  7749. result :=
  7750. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7751. end;
  7752. {*----------------------------------------------------------------------------
  7753. | Returns the square root of the quadruple-precision floating-point value `a'.
  7754. | The operation is performed according to the IEC/IEEE Standard for Binary
  7755. | Floating-Point Arithmetic.
  7756. *----------------------------------------------------------------------------*}
  7757. function float128_sqrt(a: float128): float128;
  7758. var
  7759. aSign: flag;
  7760. aExp, zExp: int32;
  7761. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7762. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7763. z: float128;
  7764. label
  7765. invalid;
  7766. begin
  7767. aSig1 := extractFloat128Frac1( a );
  7768. aSig0 := extractFloat128Frac0( a );
  7769. aExp := extractFloat128Exp( a );
  7770. aSign := extractFloat128Sign( a );
  7771. if ( aExp = $7FFF ) then begin
  7772. if ( aSig0 or aSig1 )<>0 then
  7773. begin
  7774. result := propagateFloat128NaN( a, a );
  7775. exit;
  7776. end;
  7777. if ( aSign=0 ) then
  7778. begin
  7779. result := a;
  7780. exit;
  7781. end;
  7782. goto invalid;
  7783. end;
  7784. if ( aSign<>0 ) then begin
  7785. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  7786. begin
  7787. result := a;
  7788. exit;
  7789. end;
  7790. invalid:
  7791. float_raise( float_flag_invalid );
  7792. z.low := float128_default_nan_low;
  7793. z.high := float128_default_nan_high;
  7794. result := z;
  7795. exit;
  7796. end;
  7797. if ( aExp = 0 ) then begin
  7798. if ( ( aSig0 or aSig1 ) = 0 ) then
  7799. begin
  7800. result := packFloat128( 0, 0, 0, 0 );
  7801. exit;
  7802. end;
  7803. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7804. end;
  7805. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7806. aSig0 := aSig0 or int64( $0001000000000000 );
  7807. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7808. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  7809. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7810. doubleZSig0 := zSig0 shl 1;
  7811. mul64To128( zSig0, zSig0, term0, term1 );
  7812. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7813. while ( sbits64(rem0) < 0 ) do begin
  7814. dec(zSig0);
  7815. dec(doubleZSig0,2);
  7816. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7817. end;
  7818. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7819. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  7820. if ( zSig1 = 0 ) then zSig1 := 1;
  7821. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7822. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7823. mul64To128( zSig1, zSig1, term2, term3 );
  7824. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7825. while ( sbits64(rem1) < 0 ) do begin
  7826. dec(zSig1);
  7827. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7828. term3 := term3 or 1;
  7829. term2 := term2 or doubleZSig0;
  7830. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7831. end;
  7832. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7833. end;
  7834. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  7835. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7836. end;
  7837. {*----------------------------------------------------------------------------
  7838. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7839. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7840. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7841. *----------------------------------------------------------------------------*}
  7842. function float128_eq(a: float128; b: float128): flag;
  7843. begin
  7844. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7845. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7846. or ( ( extractFloat128Exp( b ) = $7FFF )
  7847. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7848. ) then begin
  7849. if ( (float128_is_signaling_nan( a )<>0)
  7850. or (float128_is_signaling_nan( b )<>0) ) then begin
  7851. float_raise( float_flag_invalid );
  7852. end;
  7853. result := 0;
  7854. exit;
  7855. end;
  7856. result := ord(
  7857. ( a.low = b.low )
  7858. and ( ( a.high = b.high )
  7859. or ( ( a.low = 0 )
  7860. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  7861. ));
  7862. end;
  7863. {*----------------------------------------------------------------------------
  7864. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7865. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7866. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7867. | Arithmetic.
  7868. *----------------------------------------------------------------------------*}
  7869. function float128_le(a: float128; b: float128): flag;
  7870. var
  7871. aSign, bSign: flag;
  7872. begin
  7873. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7874. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7875. or ( ( extractFloat128Exp( b ) = $7FFF )
  7876. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7877. ) then begin
  7878. float_raise( float_flag_invalid );
  7879. result := 0;
  7880. exit;
  7881. end;
  7882. aSign := extractFloat128Sign( a );
  7883. bSign := extractFloat128Sign( b );
  7884. if ( aSign <> bSign ) then begin
  7885. result := ord(
  7886. (aSign<>0)
  7887. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7888. = 0 ));
  7889. exit;
  7890. end;
  7891. if aSign<>0 then
  7892. result := le128( b.high, b.low, a.high, a.low )
  7893. else
  7894. result := le128( a.high, a.low, b.high, b.low );
  7895. end;
  7896. {*----------------------------------------------------------------------------
  7897. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7898. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7899. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7900. *----------------------------------------------------------------------------*}
  7901. function float128_lt(a: float128; b: float128): flag;
  7902. var
  7903. aSign, bSign: flag;
  7904. begin
  7905. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7906. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7907. or ( ( extractFloat128Exp( b ) = $7FFF )
  7908. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7909. ) then begin
  7910. float_raise( float_flag_invalid );
  7911. result := 0;
  7912. exit;
  7913. end;
  7914. aSign := extractFloat128Sign( a );
  7915. bSign := extractFloat128Sign( b );
  7916. if ( aSign <> bSign ) then begin
  7917. result := ord(
  7918. (aSign<>0)
  7919. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7920. <> 0 ));
  7921. exit;
  7922. end;
  7923. if aSign<>0 then
  7924. result := lt128( b.high, b.low, a.high, a.low )
  7925. else
  7926. result := lt128( a.high, a.low, b.high, b.low );
  7927. end;
  7928. {*----------------------------------------------------------------------------
  7929. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7930. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7931. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7932. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7933. *----------------------------------------------------------------------------*}
  7934. function float128_eq_signaling(a: float128; b: float128): flag;
  7935. begin
  7936. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7937. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7938. or ( ( extractFloat128Exp( b ) = $7FFF )
  7939. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7940. ) then begin
  7941. float_raise( float_flag_invalid );
  7942. result := 0;
  7943. exit;
  7944. end;
  7945. result := ord(
  7946. ( a.low = b.low )
  7947. and ( ( a.high = b.high )
  7948. or ( ( a.low = 0 )
  7949. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7950. ));
  7951. end;
  7952. {*----------------------------------------------------------------------------
  7953. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7954. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7955. | cause an exception. Otherwise, the comparison is performed according to the
  7956. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7957. *----------------------------------------------------------------------------*}
  7958. function float128_le_quiet(a: float128; b: float128): flag;
  7959. var
  7960. aSign, bSign: flag;
  7961. begin
  7962. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7963. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7964. or ( ( extractFloat128Exp( b ) = $7FFF )
  7965. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7966. ) then begin
  7967. if ( (float128_is_signaling_nan( a )<>0)
  7968. or (float128_is_signaling_nan( b )<>0) ) then begin
  7969. float_raise( float_flag_invalid );
  7970. end;
  7971. result := 0;
  7972. exit;
  7973. end;
  7974. aSign := extractFloat128Sign( a );
  7975. bSign := extractFloat128Sign( b );
  7976. if ( aSign <> bSign ) then begin
  7977. result := ord(
  7978. (aSign<>0)
  7979. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7980. = 0 ));
  7981. exit;
  7982. end;
  7983. if aSign<>0 then
  7984. result := le128( b.high, b.low, a.high, a.low )
  7985. else
  7986. result := le128( a.high, a.low, b.high, b.low );
  7987. end;
  7988. {*----------------------------------------------------------------------------
  7989. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7990. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7991. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7992. | Standard for Binary Floating-Point Arithmetic.
  7993. *----------------------------------------------------------------------------*}
  7994. function float128_lt_quiet(a: float128; b: float128): flag;
  7995. var
  7996. aSign, bSign: flag;
  7997. begin
  7998. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7999. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8000. or ( ( extractFloat128Exp( b ) = $7FFF )
  8001. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8002. ) then begin
  8003. if ( (float128_is_signaling_nan( a )<>0)
  8004. or (float128_is_signaling_nan( b )<>0) ) then begin
  8005. float_raise( float_flag_invalid );
  8006. end;
  8007. result := 0;
  8008. exit;
  8009. end;
  8010. aSign := extractFloat128Sign( a );
  8011. bSign := extractFloat128Sign( b );
  8012. if ( aSign <> bSign ) then begin
  8013. result := ord(
  8014. (aSign<>0)
  8015. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8016. <> 0 ));
  8017. exit;
  8018. end;
  8019. if aSign<>0 then
  8020. result:=lt128( b.high, b.low, a.high, a.low )
  8021. else
  8022. result:=lt128( a.high, a.low, b.high, b.low );
  8023. end;
  8024. {----------------------------------------------------------------------------
  8025. | Returns the result of converting the double-precision floating-point value
  8026. | `a' to the quadruple-precision floating-point format. The conversion is
  8027. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8028. | Arithmetic.
  8029. *----------------------------------------------------------------------------}
  8030. function float64_to_float128( a : float64) : float128;
  8031. var
  8032. aSign : flag;
  8033. aExp : int16;
  8034. aSig, zSig0, zSig1 : bits64;
  8035. begin
  8036. aSig := extractFloat64Frac( a );
  8037. aExp := extractFloat64Exp( a );
  8038. aSign := extractFloat64Sign( a );
  8039. if ( aExp = $7FF ) then begin
  8040. if ( aSig<>0 ) then
  8041. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8042. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8043. exit;
  8044. end;
  8045. if ( aExp = 0 ) then begin
  8046. if ( aSig = 0 ) then
  8047. begin
  8048. result:=packFloat128( aSign, 0, 0, 0 );
  8049. exit;
  8050. end;
  8051. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8052. dec(aExp);
  8053. end;
  8054. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8055. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8056. end;
  8057. {$endif FPC_SOFTFLOAT_FLOAT128}
  8058. {$endif not(defined(fpc_softfpu_interface))}
  8059. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8060. end.
  8061. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}