softfpu.pp 320 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768277827882798280828182828283828482858286828782888289829082918292829382948295829682978298829983008301830283038304830583068307830883098310831183128313831483158316831783188319832083218322832383248325832683278328832983308331833283338334833583368337833883398340834183428343834483458346834783488349835083518352835383548355835683578358835983608361836283638364836583668367836883698370837183728373837483758376837783788379838083818382838383848385838683878388838983908391839283938394839583968397839883998400840184028403840484058406840784088409841084118412841384148415841684178418841984208421842284238424842584268427842884298430843184328433843484358436843784388439844084418442844384448445844684478448844984508451845284538454845584568457845884598460846184628463846484658466846784688469847084718472847384748475847684778478847984808481848284838484848584868487848884898490849184928493849484958496849784988499850085018502850385048505850685078508850985108511851285138514851585168517851885198520852185228523852485258526852785288529853085318532853385348535853685378538853985408541854285438544854585468547854885498550855185528553855485558556855785588559856085618562856385648565856685678568856985708571857285738574857585768577857885798580858185828583858485858586858785888589859085918592859385948595859685978598859986008601860286038604860586068607860886098610861186128613861486158616861786188619862086218622862386248625862686278628862986308631863286338634863586368637863886398640864186428643864486458646864786488649865086518652865386548655865686578658865986608661866286638664866586668667866886698670867186728673867486758676867786788679868086818682868386848685868686878688868986908691869286938694869586968697869886998700870187028703870487058706870787088709871087118712871387148715871687178718871987208721872287238724872587268727872887298730873187328733873487358736873787388739874087418742874387448745874687478748874987508751875287538754875587568757875887598760876187628763876487658766876787688769877087718772877387748775877687778778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. bits8 = byte;
  88. sbits8 = shortint;
  89. bits16 = word;
  90. sbits16 = smallint;
  91. sbits32 = longint;
  92. bits32 = longword;
  93. {$ifndef fpc}
  94. qword = int64;
  95. {$endif}
  96. { now part of the system unit
  97. uint64 = qword;
  98. }
  99. bits64 = qword;
  100. sbits64 = int64;
  101. {$ifdef ENDIAN_LITTLE}
  102. float64 = record
  103. case byte of
  104. 1: (low,high : bits32);
  105. // force the record to be aligned like a double
  106. // else *_to_double will fail for cpus like sparc
  107. // and avoid expensive unpacking/packing operations
  108. 2: (dummy : double);
  109. end;
  110. floatx80 = record
  111. case byte of
  112. 1: (low : qword;high : word);
  113. // force the record to be aligned like a double
  114. // else *_to_double will fail for cpus like sparc
  115. // and avoid expensive unpacking/packing operations
  116. 2: (dummy : extended);
  117. end;
  118. float128 = record
  119. case byte of
  120. 1: (low,high : qword);
  121. // force the record to be aligned like a double
  122. // else *_to_double will fail for cpus like sparc
  123. // and avoid expensive unpacking/packing operations
  124. 2: (dummy : qword);
  125. end;
  126. {$else}
  127. float64 = record
  128. case byte of
  129. 1: (high,low : bits32);
  130. // force the record to be aligned like a double
  131. // else *_to_double will fail for cpus like sparc
  132. 2: (dummy : double);
  133. end;
  134. floatx80 = record
  135. case byte of
  136. 1: (high : word;low : qword);
  137. // force the record to be aligned like a double
  138. // else *_to_double will fail for cpus like sparc
  139. // and avoid expensive unpacking/packing operations
  140. 2: (dummy : qword);
  141. end;
  142. float128 = record
  143. case byte of
  144. 1: (high : qword;low : qword);
  145. // force the record to be aligned like a double
  146. // else *_to_double will fail for cpus like sparc
  147. // and avoid expensive unpacking/packing operations
  148. 2: (dummy : qword);
  149. end;
  150. {$endif}
  151. {$define FPC_SYSTEM_HAS_float64}
  152. {*
  153. -------------------------------------------------------------------------------
  154. Returns 1 if the double-precision floating-point value `a' is less than
  155. the corresponding value `b', and 0 otherwise. The comparison is performed
  156. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  157. -------------------------------------------------------------------------------
  158. *}
  159. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  160. {*
  161. -------------------------------------------------------------------------------
  162. Returns 1 if the double-precision floating-point value `a' is less than
  163. or equal to the corresponding value `b', and 0 otherwise. The comparison
  164. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  165. Arithmetic.
  166. -------------------------------------------------------------------------------
  167. *}
  168. Function float64_le(a: float64;b: float64): flag; compilerproc;
  169. {*
  170. -------------------------------------------------------------------------------
  171. Returns 1 if the double-precision floating-point value `a' is equal to
  172. the corresponding value `b', and 0 otherwise. The comparison is performed
  173. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  174. -------------------------------------------------------------------------------
  175. *}
  176. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  177. {*
  178. -------------------------------------------------------------------------------
  179. Returns the square root of the double-precision floating-point value `a'.
  180. The operation is performed according to the IEC/IEEE Standard for Binary
  181. Floating-Point Arithmetic.
  182. -------------------------------------------------------------------------------
  183. *}
  184. function float64_sqrt( a: float64 ): float64; compilerproc;
  185. {*
  186. -------------------------------------------------------------------------------
  187. Returns the remainder of the double-precision floating-point value `a'
  188. with respect to the corresponding value `b'. The operation is performed
  189. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  190. -------------------------------------------------------------------------------
  191. *}
  192. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  193. {*
  194. -------------------------------------------------------------------------------
  195. Returns the result of dividing the double-precision floating-point value `a'
  196. by the corresponding value `b'. The operation is performed according to the
  197. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  198. -------------------------------------------------------------------------------
  199. *}
  200. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  201. {*
  202. -------------------------------------------------------------------------------
  203. Returns the result of multiplying the double-precision floating-point values
  204. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  205. for Binary Floating-Point Arithmetic.
  206. -------------------------------------------------------------------------------
  207. *}
  208. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  209. {*
  210. -------------------------------------------------------------------------------
  211. Returns the result of subtracting the double-precision floating-point values
  212. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  213. for Binary Floating-Point Arithmetic.
  214. -------------------------------------------------------------------------------
  215. *}
  216. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  217. {*
  218. -------------------------------------------------------------------------------
  219. Returns the result of adding the double-precision floating-point values `a'
  220. and `b'. The operation is performed according to the IEC/IEEE Standard for
  221. Binary Floating-Point Arithmetic.
  222. -------------------------------------------------------------------------------
  223. *}
  224. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  225. {*
  226. -------------------------------------------------------------------------------
  227. Rounds the double-precision floating-point value `a' to an integer,
  228. and returns the result as a double-precision floating-point value. The
  229. operation is performed according to the IEC/IEEE Standard for Binary
  230. Floating-Point Arithmetic.
  231. -------------------------------------------------------------------------------
  232. *}
  233. Function float64_round_to_int(a: float64) : float64; compilerproc;
  234. {*
  235. -------------------------------------------------------------------------------
  236. Returns the result of converting the double-precision floating-point value
  237. `a' to the single-precision floating-point format. The conversion is
  238. performed according to the IEC/IEEE Standard for Binary Floating-Point
  239. Arithmetic.
  240. -------------------------------------------------------------------------------
  241. *}
  242. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  243. {*
  244. -------------------------------------------------------------------------------
  245. Returns the result of converting the double-precision floating-point value
  246. `a' to the 32-bit two's complement integer format. The conversion is
  247. performed according to the IEC/IEEE Standard for Binary Floating-Point
  248. Arithmetic, except that the conversion is always rounded toward zero.
  249. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  250. the conversion overflows, the largest integer with the same sign as `a' is
  251. returned.
  252. -------------------------------------------------------------------------------
  253. *}
  254. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  255. {*
  256. -------------------------------------------------------------------------------
  257. Returns the result of converting the double-precision floating-point value
  258. `a' to the 32-bit two's complement integer format. The conversion is
  259. performed according to the IEC/IEEE Standard for Binary Floating-Point
  260. Arithmetic---which means in particular that the conversion is rounded
  261. according to the current rounding mode. If `a' is a NaN, the largest
  262. positive integer is returned. Otherwise, if the conversion overflows, the
  263. largest integer with the same sign as `a' is returned.
  264. -------------------------------------------------------------------------------
  265. *}
  266. Function float64_to_int32(a: float64): int32; compilerproc;
  267. {*
  268. -------------------------------------------------------------------------------
  269. Returns 1 if the single-precision floating-point value `a' is less than
  270. the corresponding value `b', and 0 otherwise. The comparison is performed
  271. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  272. -------------------------------------------------------------------------------
  273. *}
  274. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  275. {*
  276. -------------------------------------------------------------------------------
  277. Returns 1 if the single-precision floating-point value `a' is less than
  278. or equal to the corresponding value `b', and 0 otherwise. The comparison
  279. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  280. Arithmetic.
  281. -------------------------------------------------------------------------------
  282. *}
  283. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  284. {*
  285. -------------------------------------------------------------------------------
  286. Returns 1 if the single-precision floating-point value `a' is equal to
  287. the corresponding value `b', and 0 otherwise. The comparison is performed
  288. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  289. -------------------------------------------------------------------------------
  290. *}
  291. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  292. {*
  293. -------------------------------------------------------------------------------
  294. Returns the square root of the single-precision floating-point value `a'.
  295. The operation is performed according to the IEC/IEEE Standard for Binary
  296. Floating-Point Arithmetic.
  297. -------------------------------------------------------------------------------
  298. *}
  299. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  300. {*
  301. -------------------------------------------------------------------------------
  302. Returns the remainder of the single-precision floating-point value `a'
  303. with respect to the corresponding value `b'. The operation is performed
  304. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  305. -------------------------------------------------------------------------------
  306. *}
  307. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  308. {*
  309. -------------------------------------------------------------------------------
  310. Returns the result of dividing the single-precision floating-point value `a'
  311. by the corresponding value `b'. The operation is performed according to the
  312. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  313. -------------------------------------------------------------------------------
  314. *}
  315. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  316. {*
  317. -------------------------------------------------------------------------------
  318. Returns the result of multiplying the single-precision floating-point values
  319. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  320. for Binary Floating-Point Arithmetic.
  321. -------------------------------------------------------------------------------
  322. *}
  323. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  324. {*
  325. -------------------------------------------------------------------------------
  326. Returns the result of subtracting the single-precision floating-point values
  327. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  328. for Binary Floating-Point Arithmetic.
  329. -------------------------------------------------------------------------------
  330. *}
  331. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  332. {*
  333. -------------------------------------------------------------------------------
  334. Returns the result of adding the single-precision floating-point values `a'
  335. and `b'. The operation is performed according to the IEC/IEEE Standard for
  336. Binary Floating-Point Arithmetic.
  337. -------------------------------------------------------------------------------
  338. *}
  339. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  340. {*
  341. -------------------------------------------------------------------------------
  342. Rounds the single-precision floating-point value `a' to an integer,
  343. and returns the result as a single-precision floating-point value. The
  344. operation is performed according to the IEC/IEEE Standard for Binary
  345. Floating-Point Arithmetic.
  346. -------------------------------------------------------------------------------
  347. *}
  348. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  349. {*
  350. -------------------------------------------------------------------------------
  351. Returns the result of converting the single-precision floating-point value
  352. `a' to the double-precision floating-point format. The conversion is
  353. performed according to the IEC/IEEE Standard for Binary Floating-Point
  354. Arithmetic.
  355. -------------------------------------------------------------------------------
  356. *}
  357. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  358. {*
  359. -------------------------------------------------------------------------------
  360. Returns the result of converting the single-precision floating-point value
  361. `a' to the 32-bit two's complement integer format. The conversion is
  362. performed according to the IEC/IEEE Standard for Binary Floating-Point
  363. Arithmetic, except that the conversion is always rounded toward zero.
  364. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  365. the conversion overflows, the largest integer with the same sign as `a' is
  366. returned.
  367. -------------------------------------------------------------------------------
  368. *}
  369. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  370. {*
  371. -------------------------------------------------------------------------------
  372. Returns the result of converting the single-precision floating-point value
  373. `a' to the 32-bit two's complement integer format. The conversion is
  374. performed according to the IEC/IEEE Standard for Binary Floating-Point
  375. Arithmetic---which means in particular that the conversion is rounded
  376. according to the current rounding mode. If `a' is a NaN, the largest
  377. positive integer is returned. Otherwise, if the conversion overflows, the
  378. largest integer with the same sign as `a' is returned.
  379. -------------------------------------------------------------------------------
  380. *}
  381. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  382. {*
  383. -------------------------------------------------------------------------------
  384. Returns the result of converting the 32-bit two's complement integer `a' to
  385. the double-precision floating-point format. The conversion is performed
  386. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. -------------------------------------------------------------------------------
  388. *}
  389. Function int32_to_float64( a: int32) : float64; compilerproc;
  390. {*
  391. -------------------------------------------------------------------------------
  392. Returns the result of converting the 32-bit two's complement integer `a' to
  393. the single-precision floating-point format. The conversion is performed
  394. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  395. -------------------------------------------------------------------------------
  396. *}
  397. Function int32_to_float32( a: int32): float32rec; compilerproc;
  398. {*----------------------------------------------------------------------------
  399. | Returns the result of converting the 64-bit two's complement integer `a'
  400. | to the double-precision floating-point format. The conversion is performed
  401. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  402. *----------------------------------------------------------------------------*}
  403. Function int64_to_float64( a: int64 ): float64; compilerproc;
  404. Function qword_to_float64( a: qword ): float64; compilerproc;
  405. {*----------------------------------------------------------------------------
  406. | Returns the result of converting the 64-bit two's complement integer `a'
  407. | to the single-precision floating-point format. The conversion is performed
  408. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  409. *----------------------------------------------------------------------------*}
  410. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  411. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  412. // +++
  413. function float32_to_int64( a: float32 ): int64;
  414. function float32_to_int64_round_to_zero( a: float32 ): int64;
  415. function float32_eq_signaling( a: float32; b: float32) : flag;
  416. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  417. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  418. function float32_is_signaling_nan( a : float32 ): flag;
  419. function float32_is_nan( a : float32 ): flag;
  420. function float64_to_int64( a: float64 ): int64;
  421. function float64_to_int64_round_to_zero( a: float64 ): int64;
  422. function float64_eq_signaling( a: float64; b: float64): flag;
  423. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  424. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  425. function float64_is_signaling_nan( a : float64 ): flag;
  426. function float64_is_nan( a : float64 ): flag;
  427. // ===
  428. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  429. {*----------------------------------------------------------------------------
  430. | Extended double-precision rounding precision
  431. *----------------------------------------------------------------------------*}
  432. var // threadvar!?
  433. floatx80_rounding_precision : int8 = 80;
  434. function int32_to_floatx80( a: int32 ): floatx80;
  435. function int64_to_floatx80( a: int64 ): floatx80;
  436. function qword_to_floatx80( a: qword ): floatx80;
  437. function float32_to_floatx80( a: float32 ): floatx80;
  438. function float64_to_floatx80( a: float64 ): floatx80;
  439. function floatx80_to_int32( a: floatx80 ): int32;
  440. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  441. function floatx80_to_int64( a: floatx80 ): int64;
  442. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  443. function floatx80_to_float32( a: floatx80 ): float32;
  444. function floatx80_to_float64( a: floatx80 ): float64;
  445. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  446. function floatx80_to_float128( a: floatx80 ): float128;
  447. {$endif FPC_SOFTFLOAT_FLOAT128}
  448. function floatx80_round_to_int( a: floatx80 ): floatx80;
  449. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  450. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  451. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  452. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  453. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  454. function floatx80_sqrt( a: floatx80 ): floatx80;
  455. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  456. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  457. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  458. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  459. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  460. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  461. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  462. function floatx80_is_nan(a : floatx80 ): flag;
  463. {$endif FPC_SOFTFLOAT_FLOATX80}
  464. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  465. function int32_to_float128( a: int32 ): float128;
  466. function int64_to_float128( a: int64 ): float128;
  467. function qword_to_float128( a: qword ): float128;
  468. function float32_to_float128( a: float32 ): float128;
  469. function float128_is_nan( a : float128): flag;
  470. function float128_is_signaling_nan( a : float128): flag;
  471. function float128_to_int32(a: float128): int32;
  472. function float128_to_int32_round_to_zero(a: float128): int32;
  473. function float128_to_int64(a: float128): int64;
  474. function float128_to_int64_round_to_zero(a: float128): int64;
  475. function float128_to_float32(a: float128): float32;
  476. function float128_to_float64(a: float128): float64;
  477. function float64_to_float128( a : float64) : float128;
  478. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  479. function float128_to_floatx80(a: float128): floatx80;
  480. {$endif FPC_SOFTFLOAT_FLOATX80}
  481. function float128_round_to_int(a: float128): float128;
  482. function float128_add(a: float128; b: float128): float128;
  483. function float128_sub(a: float128; b: float128): float128;
  484. function float128_mul(a: float128; b: float128): float128;
  485. function float128_div(a: float128; b: float128): float128;
  486. function float128_rem(a: float128; b: float128): float128;
  487. function float128_sqrt(a: float128): float128;
  488. function float128_eq(a: float128; b: float128): flag;
  489. function float128_le(a: float128; b: float128): flag;
  490. function float128_lt(a: float128; b: float128): flag;
  491. function float128_eq_signaling(a: float128; b: float128): flag;
  492. function float128_le_quiet(a: float128; b: float128): flag;
  493. function float128_lt_quiet(a: float128; b: float128): flag;
  494. {$endif FPC_SOFTFLOAT_FLOAT128}
  495. CONST
  496. {-------------------------------------------------------------------------------
  497. Software IEC/IEEE floating-point underflow tininess-detection mode.
  498. -------------------------------------------------------------------------------
  499. *}
  500. float_tininess_after_rounding = 0;
  501. float_tininess_before_rounding = 1;
  502. {*
  503. -------------------------------------------------------------------------------
  504. Underflow tininess-detection mode, statically initialized to default value.
  505. (The declaration in `softfloat.h' must match the `int8' type here.)
  506. -------------------------------------------------------------------------------
  507. *}
  508. var // threadvar!?
  509. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  510. {$endif not(defined(fpc_softfpu_implementation))}
  511. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  512. implementation
  513. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  514. {$if not(defined(fpc_softfpu_interface))}
  515. (*****************************************************************************)
  516. (*----------------------------------------------------------------------------*)
  517. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  518. (* division and square root approximations. (Can be specialized to target if *)
  519. (* desired.) *)
  520. (* ---------------------------------------------------------------------------*)
  521. (*****************************************************************************)
  522. { This procedure serves as a single access point to softfloat_exception_flags.
  523. It also helps to reduce code size a bit because softfloat_exception_flags is
  524. a threadvar. }
  525. procedure set_inexact_flag;
  526. begin
  527. include(softfloat_exception_flags,float_flag_inexact);
  528. end;
  529. {*----------------------------------------------------------------------------
  530. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  531. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  532. | input. If `zSign' is 1, the input is negated before being converted to an
  533. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  534. | is simply rounded to an integer, with the inexact exception raised if the
  535. | input cannot be represented exactly as an integer. However, if the fixed-
  536. | point input is too large, the invalid exception is raised and the largest
  537. | positive or negative integer is returned.
  538. *----------------------------------------------------------------------------*}
  539. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  540. var
  541. roundingMode: TFPURoundingMode;
  542. roundNearestEven: boolean;
  543. roundIncrement, roundBits: int8;
  544. z: int32;
  545. begin
  546. roundingMode := softfloat_rounding_mode;
  547. roundNearestEven := (roundingMode = float_round_nearest_even);
  548. roundIncrement := $40;
  549. if not roundNearestEven then
  550. begin
  551. if ( roundingMode = float_round_to_zero ) then
  552. begin
  553. roundIncrement := 0;
  554. end
  555. else begin
  556. roundIncrement := $7F;
  557. if ( zSign<>0 ) then
  558. begin
  559. if ( roundingMode = float_round_up ) then
  560. roundIncrement := 0;
  561. end
  562. else begin
  563. if ( roundingMode = float_round_down ) then
  564. roundIncrement := 0;
  565. end;
  566. end;
  567. end;
  568. roundBits := lo(absZ) and $7F;
  569. absZ := ( absZ + roundIncrement ) shr 7;
  570. absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
  571. z := absZ;
  572. if ( zSign<>0 ) then
  573. z := - z;
  574. if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  575. begin
  576. float_raise( float_flag_invalid );
  577. if zSign<>0 then
  578. result:=sbits32($80000000)
  579. else
  580. result:=$7FFFFFFF;
  581. exit;
  582. end;
  583. if ( roundBits<>0 ) then
  584. set_inexact_flag;
  585. result:=z;
  586. end;
  587. {*----------------------------------------------------------------------------
  588. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  589. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  590. | and returns the properly rounded 64-bit integer corresponding to the input.
  591. | If `zSign' is 1, the input is negated before being converted to an integer.
  592. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  593. | the inexact exception raised if the input cannot be represented exactly as
  594. | an integer. However, if the fixed-point input is too large, the invalid
  595. | exception is raised and the largest positive or negative integer is
  596. | returned.
  597. *----------------------------------------------------------------------------*}
  598. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  599. var
  600. roundingMode: TFPURoundingMode;
  601. roundNearestEven, increment: flag;
  602. z: int64;
  603. label
  604. overflow;
  605. begin
  606. roundingMode := softfloat_rounding_mode;
  607. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  608. increment := ord( sbits64(absZ1) < 0 );
  609. if ( roundNearestEven=0 ) then
  610. begin
  611. if ( roundingMode = float_round_to_zero ) then
  612. begin
  613. increment := 0;
  614. end
  615. else begin
  616. if ( zSign<>0 ) then
  617. begin
  618. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  619. end
  620. else begin
  621. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  622. end;
  623. end;
  624. end;
  625. if ( increment<>0 ) then
  626. begin
  627. inc(absZ0);
  628. if ( absZ0 = 0 ) then
  629. goto overflow;
  630. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  631. end;
  632. z := absZ0;
  633. if ( zSign<>0 ) then
  634. z := - z;
  635. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  636. begin
  637. overflow:
  638. float_raise( float_flag_invalid );
  639. if zSign<>0 then
  640. result:=int64($8000000000000000)
  641. else
  642. result:=int64($7FFFFFFFFFFFFFFF);
  643. exit;
  644. end;
  645. if ( absZ1<>0 ) then
  646. set_inexact_flag;
  647. result:=z;
  648. end;
  649. {*
  650. -------------------------------------------------------------------------------
  651. Shifts `a' right by the number of bits given in `count'. If any nonzero
  652. bits are shifted off, they are ``jammed'' into the least significant bit of
  653. the result by setting the least significant bit to 1. The value of `count'
  654. can be arbitrarily large; in particular, if `count' is greater than 32, the
  655. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  656. The result is stored in the location pointed to by `zPtr'.
  657. -------------------------------------------------------------------------------
  658. *}
  659. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  660. var
  661. z: Bits32;
  662. Begin
  663. if ( count = 0 ) then
  664. z := a
  665. else
  666. if ( count < 32 ) then
  667. Begin
  668. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  669. End
  670. else
  671. Begin
  672. z := bits32( a <> 0 );
  673. End;
  674. zPtr := z;
  675. End;
  676. {*----------------------------------------------------------------------------
  677. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  678. | number of bits given in `count'. Any bits shifted off are lost. The value
  679. | of `count' can be arbitrarily large; in particular, if `count' is greater
  680. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  681. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  682. *----------------------------------------------------------------------------*}
  683. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  684. var
  685. z0, z1: bits64;
  686. negCount: int8;
  687. begin
  688. negCount := ( - count ) and 63;
  689. if ( count = 0 ) then
  690. begin
  691. z1 := a1;
  692. z0 := a0;
  693. end
  694. else if ( count < 64 ) then
  695. begin
  696. z1 := ( a0 shl negCount ) or ( a1 shr count );
  697. z0 := a0 shr count;
  698. end
  699. else
  700. begin
  701. if ( count < 128 ) then
  702. z1 := a0 shr ( count and 63 )
  703. else
  704. z1 := 0;
  705. z0 := 0;
  706. end;
  707. z1Ptr := z1;
  708. z0Ptr := z0;
  709. end;
  710. {*----------------------------------------------------------------------------
  711. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  712. | number of bits given in `count'. If any nonzero bits are shifted off, they
  713. | are ``jammed'' into the least significant bit of the result by setting the
  714. | least significant bit to 1. The value of `count' can be arbitrarily large;
  715. | in particular, if `count' is greater than 128, the result will be either
  716. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  717. | nonzero. The result is broken into two 64-bit pieces which are stored at
  718. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  719. *----------------------------------------------------------------------------*}
  720. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  721. var
  722. z0,z1 : bits64;
  723. negCount : int8;
  724. begin
  725. negCount := ( - count ) and 63;
  726. if ( count = 0 ) then begin
  727. z1 := a1;
  728. z0 := a0;
  729. end
  730. else if ( count < 64 ) then begin
  731. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  732. z0 := a0 shr count;
  733. end
  734. else begin
  735. if ( count = 64 ) then begin
  736. z1 := a0 or ord( a1 <> 0 );
  737. end
  738. else if ( count < 128 ) then begin
  739. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  740. end
  741. else begin
  742. z1 := ord( ( a0 or a1 ) <> 0 );
  743. end;
  744. z0 := 0;
  745. end;
  746. z1Ptr := z1;
  747. z0Ptr := z0;
  748. end;
  749. {*
  750. -------------------------------------------------------------------------------
  751. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  752. number of bits given in `count'. Any bits shifted off are lost. The value
  753. of `count' can be arbitrarily large; in particular, if `count' is greater
  754. than 64, the result will be 0. The result is broken into two 32-bit pieces
  755. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  756. -------------------------------------------------------------------------------
  757. *}
  758. Procedure
  759. shift64Right(
  760. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  761. Var
  762. z0, z1: bits32;
  763. negCount : int8;
  764. Begin
  765. negCount := ( - count ) AND 31;
  766. if ( count = 0 ) then
  767. Begin
  768. z1 := a1;
  769. z0 := a0;
  770. End
  771. else if ( count < 32 ) then
  772. Begin
  773. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  774. z0 := a0 shr count;
  775. End
  776. else
  777. Begin
  778. if (count < 64) then
  779. z1 := ( a0 shr ( count AND 31 ) )
  780. else
  781. z1 := 0;
  782. z0 := 0;
  783. End;
  784. z1Ptr := z1;
  785. z0Ptr := z0;
  786. End;
  787. {*
  788. -------------------------------------------------------------------------------
  789. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  790. number of bits given in `count'. If any nonzero bits are shifted off, they
  791. are ``jammed'' into the least significant bit of the result by setting the
  792. least significant bit to 1. The value of `count' can be arbitrarily large;
  793. in particular, if `count' is greater than 64, the result will be either 0
  794. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  795. nonzero. The result is broken into two 32-bit pieces which are stored at
  796. the locations pointed to by `z0Ptr' and `z1Ptr'.
  797. -------------------------------------------------------------------------------
  798. *}
  799. Procedure
  800. shift64RightJamming(
  801. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  802. VAR
  803. z0, z1 : bits32;
  804. negCount : int8;
  805. Begin
  806. negCount := ( - count ) AND 31;
  807. if ( count = 0 ) then
  808. Begin
  809. z1 := a1;
  810. z0 := a0;
  811. End
  812. else
  813. if ( count < 32 ) then
  814. Begin
  815. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  816. z0 := a0 shr count;
  817. End
  818. else
  819. Begin
  820. if ( count = 32 ) then
  821. Begin
  822. z1 := a0 OR bits32( a1 <> 0 );
  823. End
  824. else
  825. if ( count < 64 ) Then
  826. Begin
  827. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  828. End
  829. else
  830. Begin
  831. z1 := bits32( ( a0 OR a1 ) <> 0 );
  832. End;
  833. z0 := 0;
  834. End;
  835. z1Ptr := z1;
  836. z0Ptr := z0;
  837. End;
  838. {*----------------------------------------------------------------------------
  839. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  840. | bits are shifted off, they are ``jammed'' into the least significant bit of
  841. | the result by setting the least significant bit to 1. The value of `count'
  842. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  843. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  844. | The result is stored in the location pointed to by `zPtr'.
  845. *----------------------------------------------------------------------------*}
  846. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  847. var
  848. z: bits64;
  849. begin
  850. if ( count = 0 ) then
  851. begin
  852. z := a;
  853. end
  854. else if ( count < 64 ) then
  855. begin
  856. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  857. end
  858. else
  859. begin
  860. z := ord( a <> 0 );
  861. end;
  862. zPtr := z;
  863. end;
  864. {$if not defined(shift64ExtraRightJamming)}
  865. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  866. overload;
  867. forward;
  868. {$endif}
  869. {*
  870. -------------------------------------------------------------------------------
  871. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  872. by 32 _plus_ the number of bits given in `count'. The shifted result is
  873. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  874. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  875. off form a third 32-bit result as follows: The _last_ bit shifted off is
  876. the most-significant bit of the extra result, and the other 31 bits of the
  877. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  878. were all zero. This extra result is stored in the location pointed to by
  879. `z2Ptr'. The value of `count' can be arbitrarily large.
  880. (This routine makes more sense if `a0', `a1', and `a2' are considered
  881. to form a fixed-point value with binary point between `a1' and `a2'. This
  882. fixed-point value is shifted right by the number of bits given in `count',
  883. and the integer part of the result is returned at the locations pointed to
  884. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  885. corrupted as described above, and is returned at the location pointed to by
  886. `z2Ptr'.)
  887. -------------------------------------------------------------------------------
  888. }
  889. Procedure
  890. shift64ExtraRightJamming(
  891. a0: bits32;
  892. a1: bits32;
  893. a2: bits32;
  894. count: int16;
  895. VAR z0Ptr: bits32;
  896. VAR z1Ptr: bits32;
  897. VAR z2Ptr: bits32
  898. ); overload;
  899. Var
  900. z0, z1, z2: bits32;
  901. negCount : int8;
  902. Begin
  903. negCount := ( - count ) AND 31;
  904. if ( count = 0 ) then
  905. Begin
  906. z2 := a2;
  907. z1 := a1;
  908. z0 := a0;
  909. End
  910. else
  911. Begin
  912. if ( count < 32 ) Then
  913. Begin
  914. z2 := a1 shl negCount;
  915. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  916. z0 := a0 shr count;
  917. End
  918. else
  919. Begin
  920. if ( count = 32 ) then
  921. Begin
  922. z2 := a1;
  923. z1 := a0;
  924. End
  925. else
  926. Begin
  927. a2 := a2 or a1;
  928. if ( count < 64 ) then
  929. Begin
  930. z2 := a0 shl negCount;
  931. z1 := a0 shr ( count AND 31 );
  932. End
  933. else
  934. Begin
  935. if count = 64 then
  936. z2 := a0
  937. else
  938. z2 := bits32(a0 <> 0);
  939. z1 := 0;
  940. End;
  941. End;
  942. z0 := 0;
  943. End;
  944. z2 := z2 or bits32( a2 <> 0 );
  945. End;
  946. z2Ptr := z2;
  947. z1Ptr := z1;
  948. z0Ptr := z0;
  949. End;
  950. {*
  951. -------------------------------------------------------------------------------
  952. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  953. number of bits given in `count'. Any bits shifted off are lost. The value
  954. of `count' must be less than 32. The result is broken into two 32-bit
  955. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  956. -------------------------------------------------------------------------------
  957. *}
  958. Procedure
  959. shortShift64Left(
  960. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  961. Begin
  962. z1Ptr := a1 shl count;
  963. if count = 0 then
  964. z0Ptr := a0
  965. else
  966. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  967. End;
  968. {*
  969. -------------------------------------------------------------------------------
  970. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  971. by the number of bits given in `count'. Any bits shifted off are lost.
  972. The value of `count' must be less than 32. The result is broken into three
  973. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  974. `z1Ptr', and `z2Ptr'.
  975. -------------------------------------------------------------------------------
  976. *}
  977. Procedure
  978. shortShift96Left(
  979. a0: bits32;
  980. a1: bits32;
  981. a2: bits32;
  982. count: int16;
  983. VAR z0Ptr: bits32;
  984. VAR z1Ptr: bits32;
  985. VAR z2Ptr: bits32
  986. );
  987. Var
  988. z0, z1, z2: bits32;
  989. negCount: int8;
  990. Begin
  991. z2 := a2 shl count;
  992. z1 := a1 shl count;
  993. z0 := a0 shl count;
  994. if ( 0 < count ) then
  995. Begin
  996. negCount := ( ( - count ) AND 31 );
  997. z1 := z1 or (a2 shr negCount);
  998. z0 := z0 or (a1 shr negCount);
  999. End;
  1000. z2Ptr := z2;
  1001. z1Ptr := z1;
  1002. z0Ptr := z0;
  1003. End;
  1004. {*----------------------------------------------------------------------------
  1005. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1006. | number of bits given in `count'. Any bits shifted off are lost. The value
  1007. | of `count' must be less than 64. The result is broken into two 64-bit
  1008. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1009. *----------------------------------------------------------------------------*}
  1010. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1011. begin
  1012. z1Ptr := a1 shl count;
  1013. if count=0 then
  1014. z0Ptr:=a0
  1015. else
  1016. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1017. end;
  1018. {*
  1019. -------------------------------------------------------------------------------
  1020. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1021. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1022. any carry out is lost. The result is broken into two 32-bit pieces which
  1023. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1024. -------------------------------------------------------------------------------
  1025. *}
  1026. Procedure
  1027. add64(
  1028. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  1029. Var
  1030. z1: bits32;
  1031. Begin
  1032. z1 := a1 + b1;
  1033. z1Ptr := z1;
  1034. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1035. End;
  1036. {*
  1037. -------------------------------------------------------------------------------
  1038. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1039. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1040. modulo 2^96, so any carry out is lost. The result is broken into three
  1041. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1042. `z1Ptr', and `z2Ptr'.
  1043. -------------------------------------------------------------------------------
  1044. *}
  1045. Procedure
  1046. add96(
  1047. a0: bits32;
  1048. a1: bits32;
  1049. a2: bits32;
  1050. b0: bits32;
  1051. b1: bits32;
  1052. b2: bits32;
  1053. VAR z0Ptr: bits32;
  1054. VAR z1Ptr: bits32;
  1055. VAR z2Ptr: bits32
  1056. );
  1057. var
  1058. z0, z1, z2: bits32;
  1059. carry0, carry1: int8;
  1060. Begin
  1061. z2 := a2 + b2;
  1062. carry1 := int8( z2 < a2 );
  1063. z1 := a1 + b1;
  1064. carry0 := int8( z1 < a1 );
  1065. z0 := a0 + b0;
  1066. z1 := z1 + carry1;
  1067. z0 := z0 + bits32( z1 < carry1 );
  1068. z0 := z0 + carry0;
  1069. z2Ptr := z2;
  1070. z1Ptr := z1;
  1071. z0Ptr := z0;
  1072. End;
  1073. {*----------------------------------------------------------------------------
  1074. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1075. | by the number of bits given in `count'. Any bits shifted off are lost.
  1076. | The value of `count' must be less than 64. The result is broken into three
  1077. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1078. | `z1Ptr', and `z2Ptr'.
  1079. *----------------------------------------------------------------------------*}
  1080. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1081. var
  1082. z0, z1, z2 : bits64;
  1083. negCount : int8;
  1084. begin
  1085. z2 := a2 shl count;
  1086. z1 := a1 shl count;
  1087. z0 := a0 shl count;
  1088. if ( 0 < count ) then
  1089. begin
  1090. negCount := ( ( - count ) and 63 );
  1091. z1 := z1 or (a2 shr negCount);
  1092. z0 := z0 or (a1 shr negCount);
  1093. end;
  1094. z2Ptr := z2;
  1095. z1Ptr := z1;
  1096. z0Ptr := z0;
  1097. end;
  1098. {*----------------------------------------------------------------------------
  1099. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1100. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1101. | any carry out is lost. The result is broken into two 64-bit pieces which
  1102. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1103. *----------------------------------------------------------------------------*}
  1104. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1105. var
  1106. z1 : bits64;
  1107. begin
  1108. z1 := a1 + b1;
  1109. z1Ptr := z1;
  1110. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1111. end;
  1112. {*----------------------------------------------------------------------------
  1113. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1114. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1115. | modulo 2^192, so any carry out is lost. The result is broken into three
  1116. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1117. | `z1Ptr', and `z2Ptr'.
  1118. *----------------------------------------------------------------------------*}
  1119. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1120. var
  1121. z0, z1, z2 : bits64;
  1122. carry0, carry1 : int8;
  1123. begin
  1124. z2 := a2 + b2;
  1125. carry1 := ord( z2 < a2 );
  1126. z1 := a1 + b1;
  1127. carry0 := ord( z1 < a1 );
  1128. z0 := a0 + b0;
  1129. inc(z1, carry1);
  1130. inc(z0, ord( z1 < carry1 ));
  1131. inc(z0, carry0);
  1132. z2Ptr := z2;
  1133. z1Ptr := z1;
  1134. z0Ptr := z0;
  1135. end;
  1136. {*
  1137. -------------------------------------------------------------------------------
  1138. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1139. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1140. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1141. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1142. `z1Ptr'.
  1143. -------------------------------------------------------------------------------
  1144. *}
  1145. Procedure
  1146. sub64(
  1147. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1148. Begin
  1149. z1Ptr := a1 - b1;
  1150. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1151. End;
  1152. {*
  1153. -------------------------------------------------------------------------------
  1154. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1155. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1156. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1157. into three 32-bit pieces which are stored at the locations pointed to by
  1158. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1159. -------------------------------------------------------------------------------
  1160. *}
  1161. Procedure
  1162. sub96(
  1163. a0:bits32;
  1164. a1:bits32;
  1165. a2:bits32;
  1166. b0:bits32;
  1167. b1:bits32;
  1168. b2:bits32;
  1169. VAR z0Ptr:bits32;
  1170. VAR z1Ptr:bits32;
  1171. VAR z2Ptr:bits32
  1172. );
  1173. Var
  1174. z0, z1, z2: bits32;
  1175. borrow0, borrow1: int8;
  1176. Begin
  1177. z2 := a2 - b2;
  1178. borrow1 := int8( a2 < b2 );
  1179. z1 := a1 - b1;
  1180. borrow0 := int8( a1 < b1 );
  1181. z0 := a0 - b0;
  1182. z0 := z0 - bits32( z1 < borrow1 );
  1183. z1 := z1 - borrow1;
  1184. z0 := z0 -borrow0;
  1185. z2Ptr := z2;
  1186. z1Ptr := z1;
  1187. z0Ptr := z0;
  1188. End;
  1189. {*----------------------------------------------------------------------------
  1190. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1191. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1192. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1193. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1194. | `z1Ptr'.
  1195. *----------------------------------------------------------------------------*}
  1196. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1197. begin
  1198. z1Ptr := a1 - b1;
  1199. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1200. end;
  1201. {*----------------------------------------------------------------------------
  1202. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1203. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1204. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1205. | result is broken into three 64-bit pieces which are stored at the locations
  1206. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1207. *----------------------------------------------------------------------------*}
  1208. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1209. var
  1210. z0, z1, z2 : bits64;
  1211. borrow0, borrow1 : int8;
  1212. begin
  1213. z2 := a2 - b2;
  1214. borrow1 := ord( a2 < b2 );
  1215. z1 := a1 - b1;
  1216. borrow0 := ord( a1 < b1 );
  1217. z0 := a0 - b0;
  1218. dec(z0, ord( z1 < borrow1 ));
  1219. dec(z1, borrow1);
  1220. dec(z0, borrow0);
  1221. z2Ptr := z2;
  1222. z1Ptr := z1;
  1223. z0Ptr := z0;
  1224. end;
  1225. {*
  1226. -------------------------------------------------------------------------------
  1227. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1228. into two 32-bit pieces which are stored at the locations pointed to by
  1229. `z0Ptr' and `z1Ptr'.
  1230. -------------------------------------------------------------------------------
  1231. *}
  1232. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1233. :bits32 );
  1234. Var
  1235. aHigh, aLow, bHigh, bLow: bits16;
  1236. z0, zMiddleA, zMiddleB, z1: bits32;
  1237. Begin
  1238. aLow := bits16(a);
  1239. aHigh := a shr 16;
  1240. bLow := bits16(b);
  1241. bHigh := b shr 16;
  1242. z1 := ( bits32( aLow) ) * bLow;
  1243. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1244. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1245. z0 := ( bits32 (aHigh) ) * bHigh;
  1246. zMiddleA := zMiddleA + zMiddleB;
  1247. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1248. zMiddleA := zmiddleA shl 16;
  1249. z1 := z1 + zMiddleA;
  1250. z0 := z0 + bits32( z1 < zMiddleA );
  1251. z1Ptr := z1;
  1252. z0Ptr := z0;
  1253. End;
  1254. {*
  1255. -------------------------------------------------------------------------------
  1256. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1257. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1258. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1259. `z2Ptr'.
  1260. -------------------------------------------------------------------------------
  1261. *}
  1262. Procedure
  1263. mul64By32To96(
  1264. a0:bits32;
  1265. a1:bits32;
  1266. b:bits32;
  1267. VAR z0Ptr:bits32;
  1268. VAR z1Ptr:bits32;
  1269. VAR z2Ptr:bits32
  1270. );
  1271. Var
  1272. z0, z1, z2, more1: bits32;
  1273. Begin
  1274. mul32To64( a1, b, z1, z2 );
  1275. mul32To64( a0, b, z0, more1 );
  1276. add64( z0, more1, 0, z1, z0, z1 );
  1277. z2Ptr := z2;
  1278. z1Ptr := z1;
  1279. z0Ptr := z0;
  1280. End;
  1281. {*
  1282. -------------------------------------------------------------------------------
  1283. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1284. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1285. product. The product is broken into four 32-bit pieces which are stored at
  1286. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1287. -------------------------------------------------------------------------------
  1288. *}
  1289. Procedure
  1290. mul64To128(
  1291. a0:bits32;
  1292. a1:bits32;
  1293. b0:bits32;
  1294. b1:bits32;
  1295. VAR z0Ptr:bits32;
  1296. VAR z1Ptr:bits32;
  1297. VAR z2Ptr:bits32;
  1298. VAR z3Ptr:bits32
  1299. );
  1300. Var
  1301. z0, z1, z2, z3: bits32;
  1302. more1, more2: bits32;
  1303. Begin
  1304. mul32To64( a1, b1, z2, z3 );
  1305. mul32To64( a1, b0, z1, more2 );
  1306. add64( z1, more2, 0, z2, z1, z2 );
  1307. mul32To64( a0, b0, z0, more1 );
  1308. add64( z0, more1, 0, z1, z0, z1 );
  1309. mul32To64( a0, b1, more1, more2 );
  1310. add64( more1, more2, 0, z2, more1, z2 );
  1311. add64( z0, z1, 0, more1, z0, z1 );
  1312. z3Ptr := z3;
  1313. z2Ptr := z2;
  1314. z1Ptr := z1;
  1315. z0Ptr := z0;
  1316. End;
  1317. {*----------------------------------------------------------------------------
  1318. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1319. | into two 64-bit pieces which are stored at the locations pointed to by
  1320. | `z0Ptr' and `z1Ptr'.
  1321. *----------------------------------------------------------------------------*}
  1322. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1323. var
  1324. aHigh, aLow, bHigh, bLow : bits32;
  1325. z0, zMiddleA, zMiddleB, z1 : bits64;
  1326. begin
  1327. aLow := a;
  1328. aHigh := a shr 32;
  1329. bLow := b;
  1330. bHigh := b shr 32;
  1331. z1 := ( bits64(aLow) ) * bLow;
  1332. zMiddleA := ( bits64( aLow )) * bHigh;
  1333. zMiddleB := ( bits64( aHigh )) * bLow;
  1334. z0 := ( bits64(aHigh) ) * bHigh;
  1335. inc(zMiddleA, zMiddleB);
  1336. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1337. zMiddleA := zMiddleA shl 32;
  1338. inc(z1, zMiddleA);
  1339. inc(z0, ord( z1 < zMiddleA ));
  1340. z1Ptr := z1;
  1341. z0Ptr := z0;
  1342. end;
  1343. {*----------------------------------------------------------------------------
  1344. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1345. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1346. | product. The product is broken into four 64-bit pieces which are stored at
  1347. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1348. *----------------------------------------------------------------------------*}
  1349. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1350. var
  1351. z0,z1,z2,z3,more1,more2 : bits64;
  1352. begin
  1353. mul64To128( a1, b1, z2, z3 );
  1354. mul64To128( a1, b0, z1, more2 );
  1355. add128( z1, more2, 0, z2, z1, z2 );
  1356. mul64To128( a0, b0, z0, more1 );
  1357. add128( z0, more1, 0, z1, z0, z1 );
  1358. mul64To128( a0, b1, more1, more2 );
  1359. add128( more1, more2, 0, z2, more1, z2 );
  1360. add128( z0, z1, 0, more1, z0, z1 );
  1361. z3Ptr := z3;
  1362. z2Ptr := z2;
  1363. z1Ptr := z1;
  1364. z0Ptr := z0;
  1365. end;
  1366. {*----------------------------------------------------------------------------
  1367. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1368. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1369. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1370. | `z2Ptr'.
  1371. *----------------------------------------------------------------------------*}
  1372. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1373. var
  1374. z0, z1, z2, more1 : bits64;
  1375. begin
  1376. mul64To128( a1, b, z1, z2 );
  1377. mul64To128( a0, b, z0, more1 );
  1378. add128( z0, more1, 0, z1, z0, z1 );
  1379. z2Ptr := z2;
  1380. z1Ptr := z1;
  1381. z0Ptr := z0;
  1382. end;
  1383. {*----------------------------------------------------------------------------
  1384. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1385. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1386. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1387. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1388. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1389. | unsigned integer is returned.
  1390. *----------------------------------------------------------------------------*}
  1391. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1392. var
  1393. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1394. begin
  1395. if ( b <= a0 ) then
  1396. begin
  1397. result:=qword( $FFFFFFFFFFFFFFFF );
  1398. exit;
  1399. end;
  1400. b0 := b shr 32;
  1401. if ( b0 shl 32 <= a0 ) then
  1402. z:=qword( $FFFFFFFF00000000 )
  1403. else
  1404. z:=( a0 div b0 ) shl 32;
  1405. mul64To128( b, z, term0, term1 );
  1406. sub128( a0, a1, term0, term1, rem0, rem1 );
  1407. while ( ( sbits64(rem0) ) < 0 ) do begin
  1408. dec(z,qword( $100000000 ));
  1409. b1 := b shl 32;
  1410. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1411. end;
  1412. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1413. if ( b0 shl 32 <= rem0 ) then
  1414. z:=z or $FFFFFFFF
  1415. else
  1416. z:=z or rem0 div b0;
  1417. result:=z;
  1418. end;
  1419. {*
  1420. -------------------------------------------------------------------------------
  1421. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1422. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1423. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1424. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1425. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1426. unsigned integer is returned.
  1427. -------------------------------------------------------------------------------
  1428. *}
  1429. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1430. Var
  1431. b0, b1: bits32;
  1432. rem0, rem1, term0, term1: bits32;
  1433. z: bits32;
  1434. Begin
  1435. if ( b <= a0 ) then
  1436. Begin
  1437. estimateDiv64To32 := $FFFFFFFF;
  1438. exit;
  1439. End;
  1440. b0 := b shr 16;
  1441. if ( b0 shl 16 <= a0 ) then
  1442. z:= $FFFF0000
  1443. else
  1444. z:= ( a0 div b0 ) shl 16;
  1445. mul32To64( b, z, term0, term1 );
  1446. sub64( a0, a1, term0, term1, rem0, rem1 );
  1447. while ( ( sbits32 (rem0) ) < 0 ) do
  1448. Begin
  1449. z := z - $10000;
  1450. b1 := b shl 16;
  1451. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1452. End;
  1453. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1454. if ( b0 shl 16 <= rem0 ) then
  1455. z := z or $FFFF
  1456. else
  1457. z := z or (rem0 div b0);
  1458. estimateDiv64To32 := z;
  1459. End;
  1460. {*
  1461. -------------------------------------------------------------------------------
  1462. Returns an approximation to the square root of the 32-bit significand given
  1463. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1464. `aExp' (the least significant bit) is 1, the integer returned approximates
  1465. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1466. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1467. case, the approximation returned lies strictly within +/-2 of the exact
  1468. value.
  1469. -------------------------------------------------------------------------------
  1470. *}
  1471. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1472. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1473. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1474. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1475. );
  1476. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1477. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1478. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1479. );
  1480. Var
  1481. index: int8;
  1482. z: bits32;
  1483. Begin
  1484. index := ( a shr 27 ) AND 15;
  1485. if ( aExp AND 1 ) <> 0 then
  1486. Begin
  1487. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1488. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1489. a := a shr 1;
  1490. End
  1491. else
  1492. Begin
  1493. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1494. z := a div z + z;
  1495. if ( $20000 <= z ) then
  1496. z := $FFFF8000
  1497. else
  1498. z := ( z shl 15 );
  1499. if ( z <= a ) then
  1500. Begin
  1501. estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
  1502. exit;
  1503. End;
  1504. End;
  1505. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1506. End;
  1507. {*
  1508. -------------------------------------------------------------------------------
  1509. Returns the number of leading 0 bits before the most-significant 1 bit of
  1510. `a'. If `a' is zero, 32 is returned.
  1511. -------------------------------------------------------------------------------
  1512. *}
  1513. Function countLeadingZeros32( a:bits32 ): int8;
  1514. const countLeadingZerosHigh:array[0..255] of int8 = (
  1515. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1516. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1517. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1518. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1519. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1520. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1521. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1522. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1523. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1524. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1525. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1526. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1527. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1528. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1529. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1530. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1531. );
  1532. Var
  1533. shiftCount: int8;
  1534. Begin
  1535. shiftCount := 0;
  1536. if ( a < $10000 ) then
  1537. Begin
  1538. shiftCount := shiftcount + 16;
  1539. a := a shl 16;
  1540. End;
  1541. if ( a < $1000000 ) then
  1542. Begin
  1543. shiftCount := shiftcount + 8;
  1544. a := a shl 8;
  1545. end;
  1546. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1547. countLeadingZeros32:= shiftCount;
  1548. End;
  1549. {*----------------------------------------------------------------------------
  1550. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1551. | `a'. If `a' is zero, 64 is returned.
  1552. *----------------------------------------------------------------------------*}
  1553. function countLeadingZeros64( a : bits64): int8;
  1554. var
  1555. shiftcount : int8;
  1556. Begin
  1557. shiftCount := 0;
  1558. if ( a < bits64(bits64(1) shl 32 )) then
  1559. shiftCount := shiftcount + 32
  1560. else
  1561. a := a shr 32;
  1562. shiftCount := shiftCount + countLeadingZeros32( a );
  1563. countLeadingZeros64:= shiftCount;
  1564. End;
  1565. {*
  1566. -------------------------------------------------------------------------------
  1567. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1568. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1569. Otherwise, returns 0.
  1570. -------------------------------------------------------------------------------
  1571. *}
  1572. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1573. Begin
  1574. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1575. End;
  1576. {*
  1577. -------------------------------------------------------------------------------
  1578. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1579. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1580. returns 0.
  1581. -------------------------------------------------------------------------------
  1582. *}
  1583. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1584. Begin
  1585. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1586. End;
  1587. const
  1588. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1589. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1590. (*****************************************************************************)
  1591. (* End Low-Level arithmetic *)
  1592. (*****************************************************************************)
  1593. {*
  1594. -------------------------------------------------------------------------------
  1595. Functions and definitions to determine: (1) whether tininess for underflow
  1596. is detected before or after rounding by default, (2) what (if anything)
  1597. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1598. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1599. are propagated from function inputs to output. These details are ENDIAN
  1600. specific
  1601. -------------------------------------------------------------------------------
  1602. *}
  1603. {$IFDEF ENDIAN_LITTLE}
  1604. {*
  1605. -------------------------------------------------------------------------------
  1606. Internal canonical NaN format.
  1607. -------------------------------------------------------------------------------
  1608. *}
  1609. TYPE
  1610. commonNaNT = record
  1611. high, low : bits32;
  1612. sign: flag;
  1613. end;
  1614. {*
  1615. -------------------------------------------------------------------------------
  1616. The pattern for a default generated single-precision NaN.
  1617. -------------------------------------------------------------------------------
  1618. *}
  1619. const float32_default_nan = $FFC00000;
  1620. {*
  1621. -------------------------------------------------------------------------------
  1622. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1623. otherwise returns 0.
  1624. -------------------------------------------------------------------------------
  1625. *}
  1626. Function float32_is_nan( a : float32 ): flag;
  1627. Begin
  1628. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1629. End;
  1630. {*
  1631. -------------------------------------------------------------------------------
  1632. Returns 1 if the single-precision floating-point value `a' is a signaling
  1633. NaN; otherwise returns 0.
  1634. -------------------------------------------------------------------------------
  1635. *}
  1636. Function float32_is_signaling_nan( a : float32 ): flag;
  1637. Begin
  1638. float32_is_signaling_nan := flag
  1639. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1640. End;
  1641. {*
  1642. -------------------------------------------------------------------------------
  1643. Returns the result of converting the single-precision floating-point NaN
  1644. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1645. exception is raised.
  1646. -------------------------------------------------------------------------------
  1647. *}
  1648. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1649. var
  1650. z : commonNaNT ;
  1651. Begin
  1652. if ( float32_is_signaling_nan( a ) <> 0) then
  1653. float_raise( float_flag_invalid );
  1654. z.sign := a shr 31;
  1655. z.low := 0;
  1656. z.high := a shl 9;
  1657. c := z;
  1658. End;
  1659. {*
  1660. -------------------------------------------------------------------------------
  1661. Returns the result of converting the canonical NaN `a' to the single-
  1662. precision floating-point format.
  1663. -------------------------------------------------------------------------------
  1664. *}
  1665. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1666. Begin
  1667. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1668. End;
  1669. {*
  1670. -------------------------------------------------------------------------------
  1671. Takes two single-precision floating-point values `a' and `b', one of which
  1672. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1673. signaling NaN, the invalid exception is raised.
  1674. -------------------------------------------------------------------------------
  1675. *}
  1676. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1677. Var
  1678. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1679. label returnLargerSignificand;
  1680. Begin
  1681. aIsNaN := float32_is_nan( a );
  1682. aIsSignalingNaN := float32_is_signaling_nan( a );
  1683. bIsNaN := float32_is_nan( b );
  1684. bIsSignalingNaN := float32_is_signaling_nan( b );
  1685. a := a or $00400000;
  1686. b := b or $00400000;
  1687. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1688. float_raise( float_flag_invalid );
  1689. if ( aIsSignalingNaN )<> 0 then
  1690. Begin
  1691. if ( bIsSignalingNaN ) <> 0 then
  1692. goto returnLargerSignificand;
  1693. if bIsNan <> 0 then
  1694. propagateFloat32NaN := b
  1695. else
  1696. propagateFloat32NaN := a;
  1697. exit;
  1698. End
  1699. else if ( aIsNaN <> 0) then
  1700. Begin
  1701. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1702. Begin
  1703. propagateFloat32NaN := a;
  1704. exit;
  1705. End;
  1706. returnLargerSignificand:
  1707. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1708. Begin
  1709. propagateFloat32NaN := b;
  1710. exit;
  1711. End;
  1712. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1713. Begin
  1714. propagateFloat32NaN := a;
  1715. End;
  1716. if a < b then
  1717. propagateFloat32NaN := a
  1718. else
  1719. propagateFloat32NaN := b;
  1720. exit;
  1721. End
  1722. else
  1723. Begin
  1724. propagateFloat32NaN := b;
  1725. exit;
  1726. End;
  1727. End;
  1728. {*
  1729. -------------------------------------------------------------------------------
  1730. The pattern for a default generated double-precision NaN. The `high' and
  1731. `low' values hold the most- and least-significant bits, respectively.
  1732. -------------------------------------------------------------------------------
  1733. *}
  1734. const
  1735. float64_default_nan_high = $FFF80000;
  1736. float64_default_nan_low = $00000000;
  1737. {*
  1738. -------------------------------------------------------------------------------
  1739. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1740. otherwise returns 0.
  1741. -------------------------------------------------------------------------------
  1742. *}
  1743. Function float64_is_nan( a : float64 ) : flag;
  1744. Begin
  1745. float64_is_nan :=
  1746. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1747. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1748. End;
  1749. {*
  1750. -------------------------------------------------------------------------------
  1751. Returns 1 if the double-precision floating-point value `a' is a signaling
  1752. NaN; otherwise returns 0.
  1753. -------------------------------------------------------------------------------
  1754. *}
  1755. Function float64_is_signaling_nan( a : float64 ): flag;
  1756. Begin
  1757. float64_is_signaling_nan :=
  1758. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1759. and ( a.low or ( a.high and $0007FFFF ) );
  1760. End;
  1761. {*
  1762. -------------------------------------------------------------------------------
  1763. Returns the result of converting the double-precision floating-point NaN
  1764. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1765. exception is raised.
  1766. -------------------------------------------------------------------------------
  1767. *}
  1768. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1769. Var
  1770. z : commonNaNT;
  1771. Begin
  1772. if ( float64_is_signaling_nan( a )<>0 ) then
  1773. float_raise( float_flag_invalid );
  1774. z.sign := a.high shr 31;
  1775. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1776. c := z;
  1777. End;
  1778. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1779. Var
  1780. z : commonNaNT;
  1781. Begin
  1782. if ( float64_is_signaling_nan( a )<>0 ) then
  1783. float_raise( float_flag_invalid );
  1784. z.sign := a.high shr 31;
  1785. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1786. result := z;
  1787. End;
  1788. {*
  1789. -------------------------------------------------------------------------------
  1790. Returns the result of converting the canonical NaN `a' to the double-
  1791. precision floating-point format.
  1792. -------------------------------------------------------------------------------
  1793. *}
  1794. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1795. Var
  1796. z: float64;
  1797. Begin
  1798. shift64Right( a.high, a.low, 12, z.high, z.low );
  1799. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1800. c := z;
  1801. End;
  1802. {*
  1803. -------------------------------------------------------------------------------
  1804. Takes two double-precision floating-point values `a' and `b', one of which
  1805. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1806. signaling NaN, the invalid exception is raised.
  1807. -------------------------------------------------------------------------------
  1808. *}
  1809. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1810. Var
  1811. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1812. label returnLargerSignificand;
  1813. Begin
  1814. aIsNaN := float64_is_nan( a );
  1815. aIsSignalingNaN := float64_is_signaling_nan( a );
  1816. bIsNaN := float64_is_nan( b );
  1817. bIsSignalingNaN := float64_is_signaling_nan( b );
  1818. a.high := a.high or $00080000;
  1819. b.high := b.high or $00080000;
  1820. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1821. float_raise( float_flag_invalid );
  1822. if ( aIsSignalingNaN )<>0 then
  1823. Begin
  1824. if ( bIsSignalingNaN )<>0 then
  1825. goto returnLargerSignificand;
  1826. if bIsNan <> 0 then
  1827. c := b
  1828. else
  1829. c := a;
  1830. exit;
  1831. End
  1832. else if ( aIsNaN )<> 0 then
  1833. Begin
  1834. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1835. Begin
  1836. c := a;
  1837. exit;
  1838. End;
  1839. returnLargerSignificand:
  1840. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1841. Begin
  1842. c := b;
  1843. exit;
  1844. End;
  1845. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1846. Begin
  1847. c := a;
  1848. exit;
  1849. End;
  1850. if a.high < b.high then
  1851. c := a
  1852. else
  1853. c := b;
  1854. exit;
  1855. End
  1856. else
  1857. Begin
  1858. c := b;
  1859. exit;
  1860. End;
  1861. End;
  1862. {*----------------------------------------------------------------------------
  1863. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1864. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1865. | returns 0.
  1866. *----------------------------------------------------------------------------*}
  1867. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1868. begin
  1869. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1870. end;
  1871. {*----------------------------------------------------------------------------
  1872. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1873. | otherwise returns 0.
  1874. *----------------------------------------------------------------------------*}
  1875. function float128_is_nan( a : float128): flag;
  1876. begin
  1877. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1878. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1879. end;
  1880. {*----------------------------------------------------------------------------
  1881. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1882. | signaling NaN; otherwise returns 0.
  1883. *----------------------------------------------------------------------------*}
  1884. function float128_is_signaling_nan( a : float128): flag;
  1885. begin
  1886. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1887. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1888. end;
  1889. {*----------------------------------------------------------------------------
  1890. | Returns the result of converting the quadruple-precision floating-point NaN
  1891. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1892. | exception is raised.
  1893. *----------------------------------------------------------------------------*}
  1894. function float128ToCommonNaN( a : float128): commonNaNT;
  1895. var
  1896. z: commonNaNT;
  1897. qhigh,qlow : qword;
  1898. begin
  1899. if ( float128_is_signaling_nan( a )<>0) then
  1900. float_raise( float_flag_invalid );
  1901. z.sign := a.high shr 63;
  1902. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1903. z.high:=qhigh shr 32;
  1904. z.low:=qhigh and $ffffffff;
  1905. result:=z;
  1906. end;
  1907. {*----------------------------------------------------------------------------
  1908. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1909. | precision floating-point format.
  1910. *----------------------------------------------------------------------------*}
  1911. function commonNaNToFloat128( a : commonNaNT): float128;
  1912. var
  1913. z: float128;
  1914. begin
  1915. shift128Right( a.high, a.low, 16, z.high, z.low );
  1916. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1917. result:=z;
  1918. end;
  1919. {*----------------------------------------------------------------------------
  1920. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1921. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1922. | `b' is a signaling NaN, the invalid exception is raised.
  1923. *----------------------------------------------------------------------------*}
  1924. function propagateFloat128NaN( a: float128; b : float128): float128;
  1925. var
  1926. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1927. label
  1928. returnLargerSignificand;
  1929. begin
  1930. aIsNaN := float128_is_nan( a );
  1931. aIsSignalingNaN := float128_is_signaling_nan( a );
  1932. bIsNaN := float128_is_nan( b );
  1933. bIsSignalingNaN := float128_is_signaling_nan( b );
  1934. a.high := a.high or int64( $0000800000000000 );
  1935. b.high := b.high or int64( $0000800000000000 );
  1936. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1937. float_raise( float_flag_invalid );
  1938. if ( aIsSignalingNaN )<>0 then
  1939. begin
  1940. if ( bIsSignalingNaN )<>0 then
  1941. goto returnLargerSignificand;
  1942. if bIsNaN<>0 then
  1943. result := b
  1944. else
  1945. result := a;
  1946. exit;
  1947. end
  1948. else if ( aIsNaN )<>0 then
  1949. begin
  1950. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1951. begin
  1952. result := a;
  1953. exit;
  1954. end;
  1955. returnLargerSignificand:
  1956. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1957. begin
  1958. result := b;
  1959. exit;
  1960. end;
  1961. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1962. begin
  1963. result := a;
  1964. exit
  1965. end;
  1966. if ( a.high < b.high ) then
  1967. result := a
  1968. else
  1969. result := b;
  1970. exit;
  1971. end
  1972. else
  1973. result:=b;
  1974. end;
  1975. {$ELSE}
  1976. { Big endian code }
  1977. (*----------------------------------------------------------------------------
  1978. | Internal canonical NaN format.
  1979. *----------------------------------------------------------------------------*)
  1980. type
  1981. commonNANT = record
  1982. high, low : bits32;
  1983. sign : flag;
  1984. end;
  1985. (*----------------------------------------------------------------------------
  1986. | The pattern for a default generated single-precision NaN.
  1987. *----------------------------------------------------------------------------*)
  1988. const float32_default_nan = $7FFFFFFF;
  1989. (*----------------------------------------------------------------------------
  1990. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1991. | otherwise returns 0.
  1992. *----------------------------------------------------------------------------*)
  1993. function float32_is_nan(a: float32): flag;
  1994. begin
  1995. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1996. end;
  1997. (*----------------------------------------------------------------------------
  1998. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1999. | NaN; otherwise returns 0.
  2000. *----------------------------------------------------------------------------*)
  2001. function float32_is_signaling_nan(a: float32):flag;
  2002. begin
  2003. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2004. end;
  2005. (*----------------------------------------------------------------------------
  2006. | Returns the result of converting the single-precision floating-point NaN
  2007. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2008. | exception is raised.
  2009. *----------------------------------------------------------------------------*)
  2010. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  2011. var
  2012. z: commonNANT;
  2013. begin
  2014. if float32_is_signaling_nan(a)<>0 then
  2015. float_raise(float_flag_invalid);
  2016. z.sign := a shr 31;
  2017. z.low := 0;
  2018. z.high := a shl 9;
  2019. c:=z;
  2020. end;
  2021. (*----------------------------------------------------------------------------
  2022. | Returns the result of converting the canonical NaN `a' to the single-
  2023. | precision floating-point format.
  2024. *----------------------------------------------------------------------------*)
  2025. function CommonNanToFloat32(a : CommonNaNT): float32;
  2026. begin
  2027. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2028. end;
  2029. (*----------------------------------------------------------------------------
  2030. | Takes two single-precision floating-point values `a' and `b', one of which
  2031. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2032. | signaling NaN, the invalid exception is raised.
  2033. *----------------------------------------------------------------------------*)
  2034. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2035. var
  2036. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2037. begin
  2038. aIsNaN := float32_is_nan( a );
  2039. aIsSignalingNaN := float32_is_signaling_nan( a );
  2040. bIsNaN := float32_is_nan( b );
  2041. bIsSignalingNaN := float32_is_signaling_nan( b );
  2042. a := a or $00400000;
  2043. b := b or $00400000;
  2044. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2045. float_raise( float_flag_invalid );
  2046. if bIsSignalingNaN<>0 then
  2047. propagateFloat32Nan := b
  2048. else if aIsSignalingNan<>0 then
  2049. propagateFloat32Nan := a
  2050. else if bIsNan<>0 then
  2051. propagateFloat32Nan := b
  2052. else
  2053. propagateFloat32Nan := a;
  2054. end;
  2055. (*----------------------------------------------------------------------------
  2056. | The pattern for a default generated double-precision NaN. The `high' and
  2057. | `low' values hold the most- and least-significant bits, respectively.
  2058. *----------------------------------------------------------------------------*)
  2059. const
  2060. float64_default_nan_high = $7FFFFFFF;
  2061. float64_default_nan_low = $FFFFFFFF;
  2062. (*----------------------------------------------------------------------------
  2063. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2064. | otherwise returns 0.
  2065. *----------------------------------------------------------------------------*)
  2066. function float64_is_nan(a: float64): flag;
  2067. begin
  2068. float64_is_nan := flag (
  2069. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2070. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2071. end;
  2072. (*----------------------------------------------------------------------------
  2073. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2074. | NaN; otherwise returns 0.
  2075. *----------------------------------------------------------------------------*)
  2076. function float64_is_signaling_nan( a:float64): flag;
  2077. begin
  2078. float64_is_signaling_nan := flag(
  2079. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2080. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2081. end;
  2082. (*----------------------------------------------------------------------------
  2083. | Returns the result of converting the double-precision floating-point NaN
  2084. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2085. | exception is raised.
  2086. *----------------------------------------------------------------------------*)
  2087. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2088. var
  2089. z : commonNaNT;
  2090. begin
  2091. if ( float64_is_signaling_nan( a )<>0 ) then
  2092. float_raise( float_flag_invalid );
  2093. z.sign := a.high shr 31;
  2094. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2095. c:=z;
  2096. end;
  2097. (*----------------------------------------------------------------------------
  2098. | Returns the result of converting the canonical NaN `a' to the double-
  2099. | precision floating-point format.
  2100. *----------------------------------------------------------------------------*)
  2101. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2102. var
  2103. z: float64;
  2104. begin
  2105. shift64Right( a.high, a.low, 12, z.high, z.low );
  2106. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2107. c:=z;
  2108. end;
  2109. (*----------------------------------------------------------------------------
  2110. | Takes two double-precision floating-point values `a' and `b', one of which
  2111. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2112. | signaling NaN, the invalid exception is raised.
  2113. *----------------------------------------------------------------------------*)
  2114. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2115. var
  2116. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2117. begin
  2118. aIsNaN := float64_is_nan( a );
  2119. aIsSignalingNaN := float64_is_signaling_nan( a );
  2120. bIsNaN := float64_is_nan( b );
  2121. bIsSignalingNaN := float64_is_signaling_nan( b );
  2122. a.high := a.high or $00080000;
  2123. b.high := b.high or $00080000;
  2124. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2125. float_raise( float_flag_invalid );
  2126. if bIsSignalingNaN<>0 then
  2127. c := b
  2128. else if aIsSignalingNan<>0 then
  2129. c := a
  2130. else if bIsNan<>0 then
  2131. c := b
  2132. else
  2133. c := a;
  2134. end;
  2135. {$ENDIF}
  2136. (****************************************************************************)
  2137. (* END ENDIAN SPECIFIC CODE *)
  2138. (****************************************************************************)
  2139. {*
  2140. -------------------------------------------------------------------------------
  2141. Returns the fraction bits of the single-precision floating-point value `a'.
  2142. -------------------------------------------------------------------------------
  2143. *}
  2144. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2145. Begin
  2146. ExtractFloat32Frac := A AND $007FFFFF;
  2147. End;
  2148. {*
  2149. -------------------------------------------------------------------------------
  2150. Returns the exponent bits of the single-precision floating-point value `a'.
  2151. -------------------------------------------------------------------------------
  2152. *}
  2153. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2154. Begin
  2155. extractFloat32Exp := (a shr 23) AND $FF;
  2156. End;
  2157. {*
  2158. -------------------------------------------------------------------------------
  2159. Returns the sign bit of the single-precision floating-point value `a'.
  2160. -------------------------------------------------------------------------------
  2161. *}
  2162. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2163. Begin
  2164. extractFloat32Sign := a shr 31;
  2165. End;
  2166. {*
  2167. -------------------------------------------------------------------------------
  2168. Normalizes the subnormal single-precision floating-point value represented
  2169. by the denormalized significand `aSig'. The normalized exponent and
  2170. significand are stored at the locations pointed to by `zExpPtr' and
  2171. `zSigPtr', respectively.
  2172. -------------------------------------------------------------------------------
  2173. *}
  2174. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2175. Var
  2176. ShiftCount : BYTE;
  2177. Begin
  2178. shiftCount := countLeadingZeros32( aSig ) - 8;
  2179. zSigPtr := aSig shl shiftCount;
  2180. zExpPtr := 1 - shiftCount;
  2181. End;
  2182. {*
  2183. -------------------------------------------------------------------------------
  2184. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2185. single-precision floating-point value, returning the result. After being
  2186. shifted into the proper positions, the three fields are simply added
  2187. together to form the result. This means that any integer portion of `zSig'
  2188. will be added into the exponent. Since a properly normalized significand
  2189. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2190. than the desired result exponent whenever `zSig' is a complete, normalized
  2191. significand.
  2192. -------------------------------------------------------------------------------
  2193. *}
  2194. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2195. Begin
  2196. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2197. + zSig;
  2198. End;
  2199. {*
  2200. -------------------------------------------------------------------------------
  2201. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2202. and significand `zSig', and returns the proper single-precision floating-
  2203. point value corresponding to the abstract input. Ordinarily, the abstract
  2204. value is simply rounded and packed into the single-precision format, with
  2205. the inexact exception raised if the abstract input cannot be represented
  2206. exactly. However, if the abstract value is too large, the overflow and
  2207. inexact exceptions are raised and an infinity or maximal finite value is
  2208. returned. If the abstract value is too small, the input value is rounded to
  2209. a subnormal number, and the underflow and inexact exceptions are raised if
  2210. the abstract input cannot be represented exactly as a subnormal single-
  2211. precision floating-point number.
  2212. The input significand `zSig' has its binary point between bits 30
  2213. and 29, which is 7 bits to the left of the usual location. This shifted
  2214. significand must be normalized or smaller. If `zSig' is not normalized,
  2215. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2216. and it must not require rounding. In the usual case that `zSig' is
  2217. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2218. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2219. Binary Floating-Point Arithmetic.
  2220. -------------------------------------------------------------------------------
  2221. *}
  2222. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2223. Var
  2224. roundingMode : TFPURoundingMode;
  2225. roundNearestEven : boolean;
  2226. roundIncrement, roundBits : BYTE;
  2227. IsTiny : boolean;
  2228. Begin
  2229. roundingMode := softfloat_rounding_mode;
  2230. roundNearestEven := (roundingMode = float_round_nearest_even);
  2231. roundIncrement := $40;
  2232. if not roundNearestEven then
  2233. Begin
  2234. if ( roundingMode = float_round_to_zero ) Then
  2235. Begin
  2236. roundIncrement := 0;
  2237. End
  2238. else
  2239. Begin
  2240. roundIncrement := $7F;
  2241. if ( zSign <> 0 ) then
  2242. Begin
  2243. if roundingMode = float_round_up then roundIncrement := 0;
  2244. End
  2245. else
  2246. Begin
  2247. if roundingMode = float_round_down then roundIncrement := 0;
  2248. End;
  2249. End
  2250. End;
  2251. roundBits := zSig AND $7F;
  2252. if ($FD <= bits16 (zExp) ) then
  2253. Begin
  2254. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2255. Begin
  2256. float_raise( [float_flag_overflow,float_flag_inexact] );
  2257. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2258. exit;
  2259. End;
  2260. if ( zExp < 0 ) then
  2261. Begin
  2262. isTiny :=
  2263. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2264. OR ( zExp < -1 )
  2265. OR ( (zSig + roundIncrement) < $80000000 );
  2266. shift32RightJamming( zSig, - zExp, zSig );
  2267. zExp := 0;
  2268. roundBits := zSig AND $7F;
  2269. if ( isTiny and (roundBits<>0) ) then
  2270. float_raise( float_flag_underflow );
  2271. End;
  2272. End;
  2273. if ( roundBits )<> 0 then
  2274. set_inexact_flag;
  2275. zSig := ( zSig + roundIncrement ) shr 7;
  2276. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2277. if ( zSig = 0 ) then zExp := 0;
  2278. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2279. End;
  2280. {*
  2281. -------------------------------------------------------------------------------
  2282. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2283. and significand `zSig', and returns the proper single-precision floating-
  2284. point value corresponding to the abstract input. This routine is just like
  2285. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2286. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2287. floating-point exponent.
  2288. -------------------------------------------------------------------------------
  2289. *}
  2290. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2291. Var
  2292. ShiftCount : int8;
  2293. Begin
  2294. shiftCount := countLeadingZeros32( zSig ) - 1;
  2295. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2296. End;
  2297. {*
  2298. -------------------------------------------------------------------------------
  2299. Returns the most-significant 20 fraction bits of the double-precision
  2300. floating-point value `a'.
  2301. -------------------------------------------------------------------------------
  2302. *}
  2303. Function extractFloat64Frac0(a: float64): bits32; inline;
  2304. Begin
  2305. extractFloat64Frac0 := a.high and $000FFFFF;
  2306. End;
  2307. {*
  2308. -------------------------------------------------------------------------------
  2309. Returns the least-significant 32 fraction bits of the double-precision
  2310. floating-point value `a'.
  2311. -------------------------------------------------------------------------------
  2312. *}
  2313. Function extractFloat64Frac1(a: float64): bits32; inline;
  2314. Begin
  2315. extractFloat64Frac1 := a.low;
  2316. End;
  2317. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2318. Function extractFloat64Frac(a: float64): bits64; inline;
  2319. Begin
  2320. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2321. End;
  2322. {*
  2323. -------------------------------------------------------------------------------
  2324. Returns the exponent bits of the double-precision floating-point value `a'.
  2325. -------------------------------------------------------------------------------
  2326. *}
  2327. Function extractFloat64Exp(a: float64): int16; inline;
  2328. Begin
  2329. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2330. End;
  2331. {*
  2332. -------------------------------------------------------------------------------
  2333. Returns the sign bit of the double-precision floating-point value `a'.
  2334. -------------------------------------------------------------------------------
  2335. *}
  2336. Function extractFloat64Sign(a: float64) : flag; inline;
  2337. Begin
  2338. extractFloat64Sign := a.high shr 31;
  2339. End;
  2340. {*
  2341. -------------------------------------------------------------------------------
  2342. Normalizes the subnormal double-precision floating-point value represented
  2343. by the denormalized significand formed by the concatenation of `aSig0' and
  2344. `aSig1'. The normalized exponent is stored at the location pointed to by
  2345. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2346. stored at the location pointed to by `zSig0Ptr', and the least significant
  2347. 32 bits of the normalized significand are stored at the location pointed to
  2348. by `zSig1Ptr'.
  2349. -------------------------------------------------------------------------------
  2350. *}
  2351. Procedure normalizeFloat64Subnormal(
  2352. aSig0: bits32;
  2353. aSig1: bits32;
  2354. VAR zExpPtr : Int16;
  2355. VAR zSig0Ptr : Bits32;
  2356. VAR zSig1Ptr : Bits32
  2357. );
  2358. Var
  2359. ShiftCount : Int8;
  2360. Begin
  2361. if ( aSig0 = 0 ) then
  2362. Begin
  2363. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2364. if ( shiftCount < 0 ) then
  2365. Begin
  2366. zSig0Ptr := aSig1 shr ( - shiftCount );
  2367. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2368. End
  2369. else
  2370. Begin
  2371. zSig0Ptr := aSig1 shl shiftCount;
  2372. zSig1Ptr := 0;
  2373. End;
  2374. zExpPtr := - shiftCount - 31;
  2375. End
  2376. else
  2377. Begin
  2378. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2379. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2380. zExpPtr := 1 - shiftCount;
  2381. End;
  2382. End;
  2383. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2384. var
  2385. shiftCount : int8;
  2386. begin
  2387. shiftCount := countLeadingZeros64( aSig ) - 11;
  2388. zSigPtr := aSig shl shiftCount;
  2389. zExpPtr := 1 - shiftCount;
  2390. end;
  2391. {*
  2392. -------------------------------------------------------------------------------
  2393. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2394. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2395. point value, returning the result. After being shifted into the proper
  2396. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2397. together to form the most significant 32 bits of the result. This means
  2398. that any integer portion of `zSig0' will be added into the exponent. Since
  2399. a properly normalized significand will have an integer portion equal to 1,
  2400. the `zExp' input should be 1 less than the desired result exponent whenever
  2401. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2402. -------------------------------------------------------------------------------
  2403. *}
  2404. Procedure
  2405. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2406. var
  2407. z: Float64;
  2408. Begin
  2409. z.low := zSig1;
  2410. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2411. c := z;
  2412. End;
  2413. {*----------------------------------------------------------------------------
  2414. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2415. | double-precision floating-point value, returning the result. After being
  2416. | shifted into the proper positions, the three fields are simply added
  2417. | together to form the result. This means that any integer portion of `zSig'
  2418. | will be added into the exponent. Since a properly normalized significand
  2419. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2420. | than the desired result exponent whenever `zSig' is a complete, normalized
  2421. | significand.
  2422. *----------------------------------------------------------------------------*}
  2423. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2424. begin
  2425. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2426. end;
  2427. {*
  2428. -------------------------------------------------------------------------------
  2429. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2430. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2431. and `zSig2', and returns the proper double-precision floating-point value
  2432. corresponding to the abstract input. Ordinarily, the abstract value is
  2433. simply rounded and packed into the double-precision format, with the inexact
  2434. exception raised if the abstract input cannot be represented exactly.
  2435. However, if the abstract value is too large, the overflow and inexact
  2436. exceptions are raised and an infinity or maximal finite value is returned.
  2437. If the abstract value is too small, the input value is rounded to a
  2438. subnormal number, and the underflow and inexact exceptions are raised if the
  2439. abstract input cannot be represented exactly as a subnormal double-precision
  2440. floating-point number.
  2441. The input significand must be normalized or smaller. If the input
  2442. significand is not normalized, `zExp' must be 0; in that case, the result
  2443. returned is a subnormal number, and it must not require rounding. In the
  2444. usual case that the input significand is normalized, `zExp' must be 1 less
  2445. than the ``true'' floating-point exponent. The handling of underflow and
  2446. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2447. -------------------------------------------------------------------------------
  2448. *}
  2449. Procedure
  2450. roundAndPackFloat64(
  2451. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2452. Var
  2453. roundingMode : TFPURoundingMode;
  2454. roundNearestEven, increment, isTiny : Flag;
  2455. Begin
  2456. roundingMode := softfloat_rounding_mode;
  2457. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2458. increment := flag( sbits32 (zSig2) < 0 );
  2459. if ( roundNearestEven = flag(FALSE) ) then
  2460. Begin
  2461. if ( roundingMode = float_round_to_zero ) then
  2462. increment := 0
  2463. else
  2464. Begin
  2465. if ( zSign )<> 0 then
  2466. Begin
  2467. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2468. End
  2469. else
  2470. Begin
  2471. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2472. End
  2473. End
  2474. End;
  2475. if ( $7FD <= bits16 (zExp) ) then
  2476. Begin
  2477. if (( $7FD < zExp )
  2478. or (( zExp = $7FD )
  2479. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2480. and (increment<>0)
  2481. )
  2482. ) then
  2483. Begin
  2484. float_raise( [float_flag_overflow,float_flag_inexact] );
  2485. if (( roundingMode = float_round_to_zero )
  2486. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2487. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2488. ) then
  2489. Begin
  2490. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2491. exit;
  2492. End;
  2493. packFloat64( zSign, $7FF, 0, 0, c );
  2494. exit;
  2495. End;
  2496. if ( zExp < 0 ) then
  2497. Begin
  2498. isTiny :=
  2499. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2500. or flag( zExp < -1 )
  2501. or flag(increment = 0)
  2502. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2503. shift64ExtraRightJamming(
  2504. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2505. zExp := 0;
  2506. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2507. if ( roundNearestEven )<>0 then
  2508. Begin
  2509. increment := flag( sbits32 (zSig2) < 0 );
  2510. End
  2511. else
  2512. Begin
  2513. if ( zSign )<>0 then
  2514. Begin
  2515. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2516. End
  2517. else
  2518. Begin
  2519. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2520. End
  2521. End;
  2522. End;
  2523. End;
  2524. if ( zSig2 )<>0 then
  2525. set_inexact_flag;
  2526. if ( increment )<>0 then
  2527. Begin
  2528. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2529. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2530. End
  2531. else
  2532. Begin
  2533. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2534. End;
  2535. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2536. End;
  2537. {*----------------------------------------------------------------------------
  2538. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2539. | and significand `zSig', and returns the proper double-precision floating-
  2540. | point value corresponding to the abstract input. Ordinarily, the abstract
  2541. | value is simply rounded and packed into the double-precision format, with
  2542. | the inexact exception raised if the abstract input cannot be represented
  2543. | exactly. However, if the abstract value is too large, the overflow and
  2544. | inexact exceptions are raised and an infinity or maximal finite value is
  2545. | returned. If the abstract value is too small, the input value is rounded
  2546. | to a subnormal number, and the underflow and inexact exceptions are raised
  2547. | if the abstract input cannot be represented exactly as a subnormal double-
  2548. | precision floating-point number.
  2549. | The input significand `zSig' has its binary point between bits 62
  2550. | and 61, which is 10 bits to the left of the usual location. This shifted
  2551. | significand must be normalized or smaller. If `zSig' is not normalized,
  2552. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2553. | and it must not require rounding. In the usual case that `zSig' is
  2554. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2555. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2556. | Binary Floating-Point Arithmetic.
  2557. *----------------------------------------------------------------------------*}
  2558. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2559. var
  2560. roundingMode: TFPURoundingMode;
  2561. roundNearestEven: flag;
  2562. roundIncrement, roundBits: int16;
  2563. isTiny: flag;
  2564. begin
  2565. roundingMode := softfloat_rounding_mode;
  2566. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2567. roundIncrement := $200;
  2568. if ( roundNearestEven=0 ) then
  2569. begin
  2570. if ( roundingMode = float_round_to_zero ) then
  2571. begin
  2572. roundIncrement := 0;
  2573. end
  2574. else begin
  2575. roundIncrement := $3FF;
  2576. if ( zSign<>0 ) then
  2577. begin
  2578. if ( roundingMode = float_round_up ) then
  2579. roundIncrement := 0;
  2580. end
  2581. else begin
  2582. if ( roundingMode = float_round_down ) then
  2583. roundIncrement := 0;
  2584. end
  2585. end
  2586. end;
  2587. roundBits := zSig and $3FF;
  2588. if ( $7FD <= bits16(zExp) ) then
  2589. begin
  2590. if ( ( $7FD < zExp )
  2591. or ( ( zExp = $7FD )
  2592. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2593. ) then
  2594. begin
  2595. float_raise( [float_flag_overflow,float_flag_inexact] );
  2596. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2597. exit;
  2598. end;
  2599. if ( zExp < 0 ) then
  2600. begin
  2601. isTiny := ord(
  2602. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2603. or ( zExp < -1 )
  2604. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2605. shift64RightJamming( zSig, - zExp, zSig );
  2606. zExp := 0;
  2607. roundBits := zSig and $3FF;
  2608. if ( isTiny and roundBits )<>0 then
  2609. float_raise( float_flag_underflow );
  2610. end
  2611. end;
  2612. if ( roundBits<>0 ) then
  2613. set_inexact_flag;
  2614. zSig := ( zSig + roundIncrement ) shr 10;
  2615. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2616. if ( zSig = 0 ) then
  2617. zExp := 0;
  2618. result:=packFloat64( zSign, zExp, zSig );
  2619. end;
  2620. {*
  2621. -------------------------------------------------------------------------------
  2622. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2623. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2624. returns the proper double-precision floating-point value corresponding
  2625. to the abstract input. This routine is just like `roundAndPackFloat64'
  2626. except that the input significand has fewer bits and does not have to be
  2627. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2628. point exponent.
  2629. -------------------------------------------------------------------------------
  2630. *}
  2631. Procedure
  2632. normalizeRoundAndPackFloat64(
  2633. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2634. Var
  2635. shiftCount : int8;
  2636. zSig2 : bits32;
  2637. Begin
  2638. if ( zSig0 = 0 ) then
  2639. Begin
  2640. zSig0 := zSig1;
  2641. zSig1 := 0;
  2642. zExp := zExp -32;
  2643. End;
  2644. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2645. if ( 0 <= shiftCount ) then
  2646. Begin
  2647. zSig2 := 0;
  2648. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2649. End
  2650. else
  2651. Begin
  2652. shift64ExtraRightJamming
  2653. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2654. End;
  2655. zExp := zExp - shiftCount;
  2656. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2657. End;
  2658. {*
  2659. ----------------------------------------------------------------------------
  2660. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2661. and significand `zSig', and returns the proper double-precision floating-
  2662. point value corresponding to the abstract input. This routine is just like
  2663. `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  2664. Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2665. floating-point exponent.
  2666. ----------------------------------------------------------------------------
  2667. *}
  2668. function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  2669. var
  2670. shiftCount: int8;
  2671. begin
  2672. shiftCount := countLeadingZeros64( zSig ) - 1;
  2673. result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  2674. end;
  2675. {*
  2676. -------------------------------------------------------------------------------
  2677. Returns the result of converting the 32-bit two's complement integer `a' to
  2678. the single-precision floating-point format. The conversion is performed
  2679. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2680. -------------------------------------------------------------------------------
  2681. *}
  2682. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2683. Var
  2684. zSign : Flag;
  2685. Begin
  2686. if ( a = 0 ) then
  2687. Begin
  2688. int32_to_float32.float32 := 0;
  2689. exit;
  2690. End;
  2691. if ( a = sbits32 ($80000000) ) then
  2692. Begin
  2693. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2694. exit;
  2695. end;
  2696. zSign := flag( a < 0 );
  2697. If zSign<>0 then
  2698. a := -a;
  2699. int32_to_float32.float32:=
  2700. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2701. End;
  2702. {*
  2703. -------------------------------------------------------------------------------
  2704. Returns the result of converting the 32-bit two's complement integer `a' to
  2705. the double-precision floating-point format. The conversion is performed
  2706. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2707. -------------------------------------------------------------------------------
  2708. *}
  2709. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2710. var
  2711. zSign : flag;
  2712. absA : bits32;
  2713. shiftCount : int8;
  2714. zSig0, zSig1 : bits32;
  2715. Begin
  2716. if ( a = 0 ) then
  2717. Begin
  2718. packFloat64( 0, 0, 0, 0, result );
  2719. exit;
  2720. end;
  2721. zSign := flag( a < 0 );
  2722. if ZSign<>0 then
  2723. AbsA := -a
  2724. else
  2725. AbsA := a;
  2726. shiftCount := countLeadingZeros32( absA ) - 11;
  2727. if ( 0 <= shiftCount ) then
  2728. Begin
  2729. zSig0 := absA shl shiftCount;
  2730. zSig1 := 0;
  2731. End
  2732. else
  2733. Begin
  2734. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2735. End;
  2736. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2737. End;
  2738. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2739. {$if not defined(packFloatx80)}
  2740. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2741. forward;
  2742. {$endif}
  2743. {*----------------------------------------------------------------------------
  2744. | Returns the result of converting the 32-bit two's complement integer `a'
  2745. | to the extended double-precision floating-point format. The conversion
  2746. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2747. | Arithmetic.
  2748. *----------------------------------------------------------------------------*}
  2749. function int32_to_floatx80( a: int32 ): floatx80;
  2750. var
  2751. zSign: flag;
  2752. absA: uint32;
  2753. shiftCount: int8;
  2754. zSig: bits64;
  2755. begin
  2756. if ( a = 0 ) then begin
  2757. result := packFloatx80( 0, 0, 0 );
  2758. exit;
  2759. end;
  2760. zSign := ord( a < 0 );
  2761. if zSign <> 0 then absA := - a else absA := a;
  2762. shiftCount := countLeadingZeros32( absA ) + 32;
  2763. zSig := absA;
  2764. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2765. end;
  2766. {$endif FPC_SOFTFLOAT_FLOATX80}
  2767. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2768. {$if not defined(packFloat128)}
  2769. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2770. forward;
  2771. {$endif}
  2772. {*----------------------------------------------------------------------------
  2773. | Returns the result of converting the 32-bit two's complement integer `a' to
  2774. | the quadruple-precision floating-point format. The conversion is performed
  2775. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2776. *----------------------------------------------------------------------------*}
  2777. function int32_to_float128( a: int32 ): float128;
  2778. var
  2779. zSign: flag;
  2780. absA: uint32;
  2781. shiftCount: int8;
  2782. zSig0: bits64;
  2783. begin
  2784. if ( a = 0 ) then begin
  2785. result := packFloat128( 0, 0, 0, 0 );
  2786. exit;
  2787. end;
  2788. zSign := ord( a < 0 );
  2789. if zSign <> 0 then absA := - a else absA := a;
  2790. shiftCount := countLeadingZeros32( absA ) + 17;
  2791. zSig0 := absA;
  2792. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2793. end;
  2794. {$endif FPC_SOFTFLOAT_FLOAT128}
  2795. {*
  2796. -------------------------------------------------------------------------------
  2797. Returns the result of converting the single-precision floating-point value
  2798. `a' to the 32-bit two's complement integer format. The conversion is
  2799. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2800. Arithmetic---which means in particular that the conversion is rounded
  2801. according to the current rounding mode. If `a' is a NaN, the largest
  2802. positive integer is returned. Otherwise, if the conversion overflows, the
  2803. largest integer with the same sign as `a' is returned.
  2804. -------------------------------------------------------------------------------
  2805. *}
  2806. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2807. Var
  2808. aSign: flag;
  2809. aExp, shiftCount: int16;
  2810. aSig, aSigExtra: bits32;
  2811. z: int32;
  2812. roundingMode: TFPURoundingMode;
  2813. Begin
  2814. aSig := extractFloat32Frac( a.float32 );
  2815. aExp := extractFloat32Exp( a.float32 );
  2816. aSign := extractFloat32Sign( a.float32 );
  2817. shiftCount := aExp - $96;
  2818. if ( 0 <= shiftCount ) then
  2819. Begin
  2820. if ( $9E <= aExp ) then
  2821. Begin
  2822. if ( a.float32 <> $CF000000 ) then
  2823. Begin
  2824. float_raise( float_flag_invalid );
  2825. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2826. Begin
  2827. float32_to_int32 := $7FFFFFFF;
  2828. exit;
  2829. End;
  2830. End;
  2831. float32_to_int32 := sbits32 ($80000000);
  2832. exit;
  2833. End;
  2834. z := ( aSig or $00800000 ) shl shiftCount;
  2835. if ( aSign<>0 ) then z := - z;
  2836. End
  2837. else
  2838. Begin
  2839. if ( aExp < $7E ) then
  2840. Begin
  2841. aSigExtra := aExp OR aSig;
  2842. z := 0;
  2843. End
  2844. else
  2845. Begin
  2846. aSig := aSig OR $00800000;
  2847. aSigExtra := aSig shl ( shiftCount and 31 );
  2848. z := aSig shr ( - shiftCount );
  2849. End;
  2850. if ( aSigExtra<>0 ) then
  2851. set_inexact_flag;
  2852. roundingMode := softfloat_rounding_mode;
  2853. if ( roundingMode = float_round_nearest_even ) then
  2854. Begin
  2855. if ( sbits32 (aSigExtra) < 0 ) then
  2856. Begin
  2857. Inc(z);
  2858. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2859. z := z and not 1;
  2860. End;
  2861. if ( aSign<>0 ) then
  2862. z := - z;
  2863. End
  2864. else
  2865. Begin
  2866. aSigExtra := flag( aSigExtra <> 0 );
  2867. if ( aSign<>0 ) then
  2868. Begin
  2869. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2870. z := - z;
  2871. End
  2872. else
  2873. Begin
  2874. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2875. End
  2876. End;
  2877. End;
  2878. float32_to_int32 := z;
  2879. End;
  2880. {*
  2881. -------------------------------------------------------------------------------
  2882. Returns the result of converting the single-precision floating-point value
  2883. `a' to the 32-bit two's complement integer format. The conversion is
  2884. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2885. Arithmetic, except that the conversion is always rounded toward zero.
  2886. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2887. the conversion overflows, the largest integer with the same sign as `a' is
  2888. returned.
  2889. -------------------------------------------------------------------------------
  2890. *}
  2891. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2892. Var
  2893. aSign : flag;
  2894. aExp, shiftCount : int16;
  2895. aSig : bits32;
  2896. z : int32;
  2897. Begin
  2898. aSig := extractFloat32Frac( a.float32 );
  2899. aExp := extractFloat32Exp( a.float32 );
  2900. aSign := extractFloat32Sign( a.float32 );
  2901. shiftCount := aExp - $9E;
  2902. if ( 0 <= shiftCount ) then
  2903. Begin
  2904. if ( a.float32 <> $CF000000 ) then
  2905. Begin
  2906. float_raise( float_flag_invalid );
  2907. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2908. Begin
  2909. float32_to_int32_round_to_zero := $7FFFFFFF;
  2910. exit;
  2911. end;
  2912. End;
  2913. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2914. exit;
  2915. End
  2916. else
  2917. if ( aExp <= $7E ) then
  2918. Begin
  2919. if ( aExp or aSig )<>0 then
  2920. set_inexact_flag;
  2921. float32_to_int32_round_to_zero := 0;
  2922. exit;
  2923. End;
  2924. aSig := ( aSig or $00800000 ) shl 8;
  2925. z := aSig shr ( - shiftCount );
  2926. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2927. Begin
  2928. set_inexact_flag;
  2929. End;
  2930. if ( aSign<>0 ) then z := - z;
  2931. float32_to_int32_round_to_zero := z;
  2932. End;
  2933. {*----------------------------------------------------------------------------
  2934. | Returns the result of converting the single-precision floating-point value
  2935. | `a' to the 64-bit two's complement integer format. The conversion is
  2936. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2937. | Arithmetic---which means in particular that the conversion is rounded
  2938. | according to the current rounding mode. If `a' is a NaN, the largest
  2939. | positive integer is returned. Otherwise, if the conversion overflows, the
  2940. | largest integer with the same sign as `a' is returned.
  2941. *----------------------------------------------------------------------------*}
  2942. function float32_to_int64( a: float32 ): int64;
  2943. var
  2944. aSign: flag;
  2945. aExp, shiftCount: int16;
  2946. aSig: bits32;
  2947. aSig64, aSigExtra: bits64;
  2948. begin
  2949. aSig := extractFloat32Frac( a );
  2950. aExp := extractFloat32Exp( a );
  2951. aSign := extractFloat32Sign( a );
  2952. shiftCount := $BE - aExp;
  2953. if ( shiftCount < 0 ) then begin
  2954. float_raise( float_flag_invalid );
  2955. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2956. result := $7FFFFFFFFFFFFFFF;
  2957. exit;
  2958. end;
  2959. result := $8000000000000000;
  2960. exit;
  2961. end;
  2962. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  2963. aSig64 := aSig;
  2964. aSig64 := aSig64 shl 40;
  2965. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  2966. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  2967. end;
  2968. {*----------------------------------------------------------------------------
  2969. | Returns the result of converting the single-precision floating-point value
  2970. | `a' to the 64-bit two's complement integer format. The conversion is
  2971. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2972. | Arithmetic, except that the conversion is always rounded toward zero. If
  2973. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  2974. | conversion overflows, the largest integer with the same sign as `a' is
  2975. | returned.
  2976. *----------------------------------------------------------------------------*}
  2977. function float32_to_int64_round_to_zero( a: float32 ): int64;
  2978. var
  2979. aSign: flag;
  2980. aExp, shiftCount: int16;
  2981. aSig: bits32;
  2982. aSig64: bits64;
  2983. z: int64;
  2984. begin
  2985. aSig := extractFloat32Frac( a );
  2986. aExp := extractFloat32Exp( a );
  2987. aSign := extractFloat32Sign( a );
  2988. shiftCount := aExp - $BE;
  2989. if ( 0 <= shiftCount ) then begin
  2990. if ( a <> $DF000000 ) then begin
  2991. float_raise( float_flag_invalid );
  2992. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2993. result := $7FFFFFFFFFFFFFFF;
  2994. exit;
  2995. end;
  2996. end;
  2997. result := $8000000000000000;
  2998. exit;
  2999. end
  3000. else if ( aExp <= $7E ) then begin
  3001. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3002. result := 0;
  3003. exit;
  3004. end;
  3005. aSig64 := aSig or $00800000;
  3006. aSig64 := aSig64 shl 40;
  3007. z := aSig64 shr ( - shiftCount );
  3008. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3009. set_inexact_flag;
  3010. if ( aSign <> 0 ) then z := - z;
  3011. result := z;
  3012. end;
  3013. {*
  3014. -------------------------------------------------------------------------------
  3015. Returns the result of converting the single-precision floating-point value
  3016. `a' to the double-precision floating-point format. The conversion is
  3017. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3018. Arithmetic.
  3019. -------------------------------------------------------------------------------
  3020. *}
  3021. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  3022. Var
  3023. aSign : flag;
  3024. aExp : int16;
  3025. aSig, zSig0, zSig1: bits32;
  3026. tmp : CommonNanT;
  3027. Begin
  3028. aSig := extractFloat32Frac( a.float32 );
  3029. aExp := extractFloat32Exp( a.float32 );
  3030. aSign := extractFloat32Sign( a.float32 );
  3031. if ( aExp = $FF ) then
  3032. Begin
  3033. if ( aSig<>0 ) then
  3034. Begin
  3035. float32ToCommonNaN(a.float32, tmp);
  3036. commonNaNToFloat64(tmp , result);
  3037. exit;
  3038. End;
  3039. packFloat64( aSign, $7FF, 0, 0, result);
  3040. exit;
  3041. End;
  3042. if ( aExp = 0 ) then
  3043. Begin
  3044. if ( aSig = 0 ) then
  3045. Begin
  3046. packFloat64( aSign, 0, 0, 0, result );
  3047. exit;
  3048. end;
  3049. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3050. Dec(aExp);
  3051. End;
  3052. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3053. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3054. End;
  3055. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3056. {*----------------------------------------------------------------------------
  3057. | Returns the result of converting the canonical NaN `a' to the extended
  3058. | double-precision floating-point format.
  3059. *----------------------------------------------------------------------------*}
  3060. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3061. var
  3062. z : floatx80;
  3063. begin
  3064. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3065. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3066. result := z;
  3067. end;
  3068. {*----------------------------------------------------------------------------
  3069. | Returns the result of converting the single-precision floating-point value
  3070. | `a' to the extended double-precision floating-point format. The conversion
  3071. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3072. | Arithmetic.
  3073. *----------------------------------------------------------------------------*}
  3074. function float32_to_floatx80( a: float32 ): floatx80;
  3075. var
  3076. aSign: flag;
  3077. aExp: int16;
  3078. aSig: bits32;
  3079. tmp: commonNaNT;
  3080. begin
  3081. aSig := extractFloat32Frac( a );
  3082. aExp := extractFloat32Exp( a );
  3083. aSign := extractFloat32Sign( a );
  3084. if ( aExp = $FF ) then begin
  3085. if ( aSig <> 0 ) then begin
  3086. float32ToCommonNaN( a, tmp );
  3087. result := commonNaNToFloatx80( tmp );
  3088. exit;
  3089. end;
  3090. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3091. exit;
  3092. end;
  3093. if ( aExp = 0 ) then begin
  3094. if ( aSig = 0 ) then begin
  3095. result := packFloatx80( aSign, 0, 0 );
  3096. exit;
  3097. end;
  3098. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3099. end;
  3100. aSig := aSig or $00800000;
  3101. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3102. end;
  3103. {$endif FPC_SOFTFLOAT_FLOATX80}
  3104. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3105. {*----------------------------------------------------------------------------
  3106. | Returns the result of converting the single-precision floating-point value
  3107. | `a' to the double-precision floating-point format. The conversion is
  3108. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3109. | Arithmetic.
  3110. *----------------------------------------------------------------------------*}
  3111. function float32_to_float128( a: float32 ): float128;
  3112. var
  3113. aSign: flag;
  3114. aExp: int16;
  3115. aSig: bits32;
  3116. tmp: commonNaNT;
  3117. begin
  3118. aSig := extractFloat32Frac( a );
  3119. aExp := extractFloat32Exp( a );
  3120. aSign := extractFloat32Sign( a );
  3121. if ( aExp = $FF ) then begin
  3122. if ( aSig <> 0 ) then begin
  3123. float32ToCommonNaN( a, tmp );
  3124. result := commonNaNToFloat128( tmp );
  3125. exit;
  3126. end;
  3127. result := packFloat128( aSign, $7FFF, 0, 0 );
  3128. exit;
  3129. end;
  3130. if ( aExp = 0 ) then begin
  3131. if ( aSig = 0 ) then begin
  3132. result := packFloat128( aSign, 0, 0, 0 );
  3133. exit;
  3134. end;
  3135. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3136. dec( aExp );
  3137. end;
  3138. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3139. end;
  3140. {$endif FPC_SOFTFLOAT_FLOAT128}
  3141. {*
  3142. -------------------------------------------------------------------------------
  3143. Rounds the single-precision floating-point value `a' to an integer,
  3144. and returns the result as a single-precision floating-point value. The
  3145. operation is performed according to the IEC/IEEE Standard for Binary
  3146. Floating-Point Arithmetic.
  3147. -------------------------------------------------------------------------------
  3148. *}
  3149. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  3150. Var
  3151. aSign: flag;
  3152. aExp: int16;
  3153. lastBitMask, roundBitsMask: bits32;
  3154. roundingMode: TFPURoundingMode;
  3155. z: float32;
  3156. Begin
  3157. aExp := extractFloat32Exp( a.float32 );
  3158. if ( $96 <= aExp ) then
  3159. Begin
  3160. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3161. Begin
  3162. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3163. exit;
  3164. End;
  3165. float32_round_to_int:=a;
  3166. exit;
  3167. End;
  3168. if ( aExp <= $7E ) then
  3169. Begin
  3170. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3171. Begin
  3172. float32_round_to_int:=a;
  3173. exit;
  3174. end;
  3175. set_inexact_flag;
  3176. aSign := extractFloat32Sign( a.float32 );
  3177. case ( softfloat_rounding_mode ) of
  3178. float_round_nearest_even:
  3179. Begin
  3180. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3181. Begin
  3182. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3183. exit;
  3184. End;
  3185. End;
  3186. float_round_down:
  3187. Begin
  3188. if aSign <> 0 then
  3189. float32_round_to_int.float32 := $BF800000
  3190. else
  3191. float32_round_to_int.float32 := 0;
  3192. exit;
  3193. End;
  3194. float_round_up:
  3195. Begin
  3196. if aSign <> 0 then
  3197. float32_round_to_int.float32 := $80000000
  3198. else
  3199. float32_round_to_int.float32 := $3F800000;
  3200. exit;
  3201. End;
  3202. end;
  3203. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3204. exit;
  3205. End;
  3206. lastBitMask := 1;
  3207. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3208. lastBitMask := lastBitMask shl ($96 - aExp);
  3209. roundBitsMask := lastBitMask - 1;
  3210. z := a.float32;
  3211. roundingMode := softfloat_rounding_mode;
  3212. if ( roundingMode = float_round_nearest_even ) then
  3213. Begin
  3214. z := z + (lastBitMask shr 1);
  3215. if ( ( z and roundBitsMask ) = 0 ) then
  3216. z := z and not lastBitMask;
  3217. End
  3218. else if ( roundingMode <> float_round_to_zero ) then
  3219. Begin
  3220. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3221. Begin
  3222. z := z + roundBitsMask;
  3223. End;
  3224. End;
  3225. z := z and not roundBitsMask;
  3226. if ( z <> a.float32 ) then
  3227. set_inexact_flag;
  3228. float32_round_to_int.float32 := z;
  3229. End;
  3230. {*
  3231. -------------------------------------------------------------------------------
  3232. Returns the result of adding the absolute values of the single-precision
  3233. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3234. before being returned. `zSign' is ignored if the result is a NaN.
  3235. The addition is performed according to the IEC/IEEE Standard for Binary
  3236. Floating-Point Arithmetic.
  3237. -------------------------------------------------------------------------------
  3238. *}
  3239. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3240. Var
  3241. aExp, bExp, zExp: int16;
  3242. aSig, bSig, zSig: bits32;
  3243. expDiff: int16;
  3244. label roundAndPack;
  3245. Begin
  3246. aSig:=extractFloat32Frac( a );
  3247. aExp:=extractFloat32Exp( a );
  3248. bSig:=extractFloat32Frac( b );
  3249. bExp := extractFloat32Exp( b );
  3250. expDiff := aExp - bExp;
  3251. aSig := aSig shl 6;
  3252. bSig := bSig shl 6;
  3253. if ( 0 < expDiff ) then
  3254. Begin
  3255. if ( aExp = $FF ) then
  3256. Begin
  3257. if ( aSig <> 0) then
  3258. Begin
  3259. addFloat32Sigs := propagateFloat32NaN( a, b );
  3260. exit;
  3261. End;
  3262. addFloat32Sigs := a;
  3263. exit;
  3264. End;
  3265. if ( bExp = 0 ) then
  3266. Begin
  3267. Dec(expDiff);
  3268. End
  3269. else
  3270. Begin
  3271. bSig := bSig or $20000000;
  3272. End;
  3273. shift32RightJamming( bSig, expDiff, bSig );
  3274. zExp := aExp;
  3275. End
  3276. else
  3277. If ( expDiff < 0 ) then
  3278. Begin
  3279. if ( bExp = $FF ) then
  3280. Begin
  3281. if ( bSig<>0 ) then
  3282. Begin
  3283. addFloat32Sigs := propagateFloat32NaN( a, b );
  3284. exit;
  3285. end;
  3286. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3287. exit;
  3288. End;
  3289. if ( aExp = 0 ) then
  3290. Begin
  3291. Inc(expDiff);
  3292. End
  3293. else
  3294. Begin
  3295. aSig := aSig OR $20000000;
  3296. End;
  3297. shift32RightJamming( aSig, - expDiff, aSig );
  3298. zExp := bExp;
  3299. End
  3300. else
  3301. Begin
  3302. if ( aExp = $FF ) then
  3303. Begin
  3304. if ( aSig OR bSig )<> 0 then
  3305. Begin
  3306. addFloat32Sigs := propagateFloat32NaN( a, b );
  3307. exit;
  3308. end;
  3309. addFloat32Sigs := a;
  3310. exit;
  3311. End;
  3312. if ( aExp = 0 ) then
  3313. Begin
  3314. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3315. exit;
  3316. end;
  3317. zSig := $40000000 + aSig + bSig;
  3318. zExp := aExp;
  3319. goto roundAndPack;
  3320. End;
  3321. aSig := aSig OR $20000000;
  3322. zSig := ( aSig + bSig ) shl 1;
  3323. Dec(zExp);
  3324. if ( sbits32 (zSig) < 0 ) then
  3325. Begin
  3326. zSig := aSig + bSig;
  3327. Inc(zExp);
  3328. End;
  3329. roundAndPack:
  3330. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3331. End;
  3332. {*
  3333. -------------------------------------------------------------------------------
  3334. Returns the result of subtracting the absolute values of the single-
  3335. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3336. difference is negated before being returned. `zSign' is ignored if the
  3337. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3338. Standard for Binary Floating-Point Arithmetic.
  3339. -------------------------------------------------------------------------------
  3340. *}
  3341. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3342. Var
  3343. aExp, bExp, zExp: int16;
  3344. aSig, bSig, zSig: bits32;
  3345. expDiff : int16;
  3346. label aExpBigger;
  3347. label bExpBigger;
  3348. label aBigger;
  3349. label bBigger;
  3350. label normalizeRoundAndPack;
  3351. Begin
  3352. aSig := extractFloat32Frac( a );
  3353. aExp := extractFloat32Exp( a );
  3354. bSig := extractFloat32Frac( b );
  3355. bExp := extractFloat32Exp( b );
  3356. expDiff := aExp - bExp;
  3357. aSig := aSig shl 7;
  3358. bSig := bSig shl 7;
  3359. if ( 0 < expDiff ) then goto aExpBigger;
  3360. if ( expDiff < 0 ) then goto bExpBigger;
  3361. if ( aExp = $FF ) then
  3362. Begin
  3363. if ( aSig OR bSig )<> 0 then
  3364. Begin
  3365. subFloat32Sigs := propagateFloat32NaN( a, b );
  3366. exit;
  3367. End;
  3368. float_raise( float_flag_invalid );
  3369. subFloat32Sigs := float32_default_nan;
  3370. exit;
  3371. End;
  3372. if ( aExp = 0 ) then
  3373. Begin
  3374. aExp := 1;
  3375. bExp := 1;
  3376. End;
  3377. if ( bSig < aSig ) Then goto aBigger;
  3378. if ( aSig < bSig ) Then goto bBigger;
  3379. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3380. exit;
  3381. bExpBigger:
  3382. if ( bExp = $FF ) then
  3383. Begin
  3384. if ( bSig<>0 ) then
  3385. Begin
  3386. subFloat32Sigs := propagateFloat32NaN( a, b );
  3387. exit;
  3388. End;
  3389. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3390. exit;
  3391. End;
  3392. if ( aExp = 0 ) then
  3393. Begin
  3394. Inc(expDiff);
  3395. End
  3396. else
  3397. Begin
  3398. aSig := aSig OR $40000000;
  3399. End;
  3400. shift32RightJamming( aSig, - expDiff, aSig );
  3401. bSig := bSig OR $40000000;
  3402. bBigger:
  3403. zSig := bSig - aSig;
  3404. zExp := bExp;
  3405. zSign := zSign xor 1;
  3406. goto normalizeRoundAndPack;
  3407. aExpBigger:
  3408. if ( aExp = $FF ) then
  3409. Begin
  3410. if ( aSig <> 0) then
  3411. Begin
  3412. subFloat32Sigs := propagateFloat32NaN( a, b );
  3413. exit;
  3414. End;
  3415. subFloat32Sigs := a;
  3416. exit;
  3417. End;
  3418. if ( bExp = 0 ) then
  3419. Begin
  3420. Dec(expDiff);
  3421. End
  3422. else
  3423. Begin
  3424. bSig := bSig OR $40000000;
  3425. End;
  3426. shift32RightJamming( bSig, expDiff, bSig );
  3427. aSig := aSig OR $40000000;
  3428. aBigger:
  3429. zSig := aSig - bSig;
  3430. zExp := aExp;
  3431. normalizeRoundAndPack:
  3432. Dec(zExp);
  3433. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3434. End;
  3435. {*
  3436. -------------------------------------------------------------------------------
  3437. Returns the result of adding the single-precision floating-point values `a'
  3438. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3439. Binary Floating-Point Arithmetic.
  3440. -------------------------------------------------------------------------------
  3441. *}
  3442. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3443. Var
  3444. aSign, bSign: Flag;
  3445. Begin
  3446. aSign := extractFloat32Sign( a.float32 );
  3447. bSign := extractFloat32Sign( b.float32 );
  3448. if ( aSign = bSign ) then
  3449. Begin
  3450. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3451. End
  3452. else
  3453. Begin
  3454. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3455. End;
  3456. End;
  3457. {*
  3458. -------------------------------------------------------------------------------
  3459. Returns the result of subtracting the single-precision floating-point values
  3460. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3461. for Binary Floating-Point Arithmetic.
  3462. -------------------------------------------------------------------------------
  3463. *}
  3464. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3465. Var
  3466. aSign, bSign: flag;
  3467. Begin
  3468. aSign := extractFloat32Sign( a.float32 );
  3469. bSign := extractFloat32Sign( b.float32 );
  3470. if ( aSign = bSign ) then
  3471. Begin
  3472. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3473. End
  3474. else
  3475. Begin
  3476. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3477. End;
  3478. End;
  3479. {*
  3480. -------------------------------------------------------------------------------
  3481. Returns the result of multiplying the single-precision floating-point values
  3482. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3483. for Binary Floating-Point Arithmetic.
  3484. -------------------------------------------------------------------------------
  3485. *}
  3486. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3487. Var
  3488. aSign, bSign, zSign: flag;
  3489. aExp, bExp, zExp : int16;
  3490. aSig, bSig, zSig0, zSig1: bits32;
  3491. Begin
  3492. aSig := extractFloat32Frac( a.float32 );
  3493. aExp := extractFloat32Exp( a.float32 );
  3494. aSign := extractFloat32Sign( a.float32 );
  3495. bSig := extractFloat32Frac( b.float32 );
  3496. bExp := extractFloat32Exp( b.float32 );
  3497. bSign := extractFloat32Sign( b.float32 );
  3498. zSign := aSign xor bSign;
  3499. if ( aExp = $FF ) then
  3500. Begin
  3501. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3502. Begin
  3503. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3504. exit;
  3505. End;
  3506. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3507. Begin
  3508. float_raise( float_flag_invalid );
  3509. float32_mul.float32 := float32_default_nan;
  3510. exit;
  3511. End;
  3512. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3513. exit;
  3514. End;
  3515. if ( bExp = $FF ) then
  3516. Begin
  3517. if ( bSig <> 0 ) then
  3518. Begin
  3519. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3520. exit;
  3521. End;
  3522. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3523. Begin
  3524. float_raise( float_flag_invalid );
  3525. float32_mul.float32 := float32_default_nan;
  3526. exit;
  3527. End;
  3528. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3529. exit;
  3530. End;
  3531. if ( aExp = 0 ) then
  3532. Begin
  3533. if ( aSig = 0 ) then
  3534. Begin
  3535. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3536. exit;
  3537. End;
  3538. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3539. End;
  3540. if ( bExp = 0 ) then
  3541. Begin
  3542. if ( bSig = 0 ) then
  3543. Begin
  3544. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3545. exit;
  3546. End;
  3547. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3548. End;
  3549. zExp := aExp + bExp - $7F;
  3550. aSig := ( aSig OR $00800000 ) shl 7;
  3551. bSig := ( bSig OR $00800000 ) shl 8;
  3552. mul32To64( aSig, bSig, zSig0, zSig1 );
  3553. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3554. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3555. Begin
  3556. zSig0 := zSig0 shl 1;
  3557. Dec(zExp);
  3558. End;
  3559. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3560. End;
  3561. {*
  3562. -------------------------------------------------------------------------------
  3563. Returns the result of dividing the single-precision floating-point value `a'
  3564. by the corresponding value `b'. The operation is performed according to the
  3565. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3566. -------------------------------------------------------------------------------
  3567. *}
  3568. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3569. Var
  3570. aSign, bSign, zSign: flag;
  3571. aExp, bExp, zExp: int16;
  3572. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3573. Begin
  3574. aSig := extractFloat32Frac( a.float32 );
  3575. aExp := extractFloat32Exp( a.float32 );
  3576. aSign := extractFloat32Sign( a.float32 );
  3577. bSig := extractFloat32Frac( b.float32 );
  3578. bExp := extractFloat32Exp( b.float32 );
  3579. bSign := extractFloat32Sign( b.float32 );
  3580. zSign := aSign xor bSign;
  3581. if ( aExp = $FF ) then
  3582. Begin
  3583. if ( aSig <> 0 ) then
  3584. Begin
  3585. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3586. exit;
  3587. End;
  3588. if ( bExp = $FF ) then
  3589. Begin
  3590. if ( bSig <> 0) then
  3591. Begin
  3592. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3593. exit;
  3594. End;
  3595. float_raise( float_flag_invalid );
  3596. float32_div.float32 := float32_default_nan;
  3597. exit;
  3598. End;
  3599. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3600. exit;
  3601. End;
  3602. if ( bExp = $FF ) then
  3603. Begin
  3604. if ( bSig <> 0) then
  3605. Begin
  3606. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3607. exit;
  3608. End;
  3609. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3610. exit;
  3611. End;
  3612. if ( bExp = 0 ) Then
  3613. Begin
  3614. if ( bSig = 0 ) Then
  3615. Begin
  3616. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3617. Begin
  3618. float_raise( float_flag_invalid );
  3619. float32_div.float32 := float32_default_nan;
  3620. exit;
  3621. End;
  3622. float_raise( float_flag_divbyzero );
  3623. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3624. exit;
  3625. End;
  3626. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3627. End;
  3628. if ( aExp = 0 ) Then
  3629. Begin
  3630. if ( aSig = 0 ) Then
  3631. Begin
  3632. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3633. exit;
  3634. End;
  3635. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3636. End;
  3637. zExp := aExp - bExp + $7D;
  3638. aSig := ( aSig OR $00800000 ) shl 7;
  3639. bSig := ( bSig OR $00800000 ) shl 8;
  3640. if ( bSig <= ( aSig + aSig ) ) then
  3641. Begin
  3642. aSig := aSig shr 1;
  3643. Inc(zExp);
  3644. End;
  3645. zSig := estimateDiv64To32( aSig, 0, bSig );
  3646. if ( ( zSig and $3F ) <= 2 ) then
  3647. Begin
  3648. mul32To64( bSig, zSig, term0, term1 );
  3649. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3650. while ( sbits32 (rem0) < 0 ) do
  3651. Begin
  3652. Dec(zSig);
  3653. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3654. End;
  3655. zSig := zSig or bits32( rem1 <> 0 );
  3656. End;
  3657. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3658. End;
  3659. {*
  3660. -------------------------------------------------------------------------------
  3661. Returns the remainder of the single-precision floating-point value `a'
  3662. with respect to the corresponding value `b'. The operation is performed
  3663. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3664. -------------------------------------------------------------------------------
  3665. *}
  3666. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3667. Var
  3668. aSign, zSign: flag;
  3669. aExp, bExp, expDiff: int16;
  3670. aSig, bSig, q, alternateASig: bits32;
  3671. sigMean: sbits32;
  3672. Begin
  3673. aSig := extractFloat32Frac( a.float32 );
  3674. aExp := extractFloat32Exp( a.float32 );
  3675. aSign := extractFloat32Sign( a.float32 );
  3676. bSig := extractFloat32Frac( b.float32 );
  3677. bExp := extractFloat32Exp( b.float32 );
  3678. if ( aExp = $FF ) then
  3679. Begin
  3680. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3681. Begin
  3682. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3683. exit;
  3684. End;
  3685. float_raise( float_flag_invalid );
  3686. float32_rem.float32 := float32_default_nan;
  3687. exit;
  3688. End;
  3689. if ( bExp = $FF ) then
  3690. Begin
  3691. if ( bSig <> 0 ) then
  3692. Begin
  3693. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3694. exit;
  3695. End;
  3696. float32_rem := a;
  3697. exit;
  3698. End;
  3699. if ( bExp = 0 ) then
  3700. Begin
  3701. if ( bSig = 0 ) then
  3702. Begin
  3703. float_raise( float_flag_invalid );
  3704. float32_rem.float32 := float32_default_nan;
  3705. exit;
  3706. End;
  3707. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3708. End;
  3709. if ( aExp = 0 ) then
  3710. Begin
  3711. if ( aSig = 0 ) then
  3712. Begin
  3713. float32_rem := a;
  3714. exit;
  3715. End;
  3716. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3717. End;
  3718. expDiff := aExp - bExp;
  3719. aSig := ( aSig OR $00800000 ) shl 8;
  3720. bSig := ( bSig OR $00800000 ) shl 8;
  3721. if ( expDiff < 0 ) then
  3722. Begin
  3723. if ( expDiff < -1 ) then
  3724. Begin
  3725. float32_rem := a;
  3726. exit;
  3727. End;
  3728. aSig := aSig shr 1;
  3729. End;
  3730. q := bits32( bSig <= aSig );
  3731. if ( q <> 0) then
  3732. aSig := aSig - bSig;
  3733. expDiff := expDiff - 32;
  3734. while ( 0 < expDiff ) do
  3735. Begin
  3736. q := estimateDiv64To32( aSig, 0, bSig );
  3737. if (2 < q) then
  3738. q := q - 2
  3739. else
  3740. q := 0;
  3741. aSig := - ( ( bSig shr 2 ) * q );
  3742. expDiff := expDiff - 30;
  3743. End;
  3744. expDiff := expDiff + 32;
  3745. if ( 0 < expDiff ) then
  3746. Begin
  3747. q := estimateDiv64To32( aSig, 0, bSig );
  3748. if (2 < q) then
  3749. q := q - 2
  3750. else
  3751. q := 0;
  3752. q := q shr (32 - expDiff);
  3753. bSig := bSig shr 2;
  3754. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3755. End
  3756. else
  3757. Begin
  3758. aSig := aSig shr 2;
  3759. bSig := bSig shr 2;
  3760. End;
  3761. Repeat
  3762. alternateASig := aSig;
  3763. Inc(q);
  3764. aSig := aSig - bSig;
  3765. Until not ( 0 <= sbits32 (aSig) );
  3766. sigMean := aSig + alternateASig;
  3767. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3768. Begin
  3769. aSig := alternateASig;
  3770. End;
  3771. zSign := flag( sbits32 (aSig) < 0 );
  3772. if ( zSign<>0 ) then
  3773. aSig := - aSig;
  3774. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3775. End;
  3776. {*
  3777. -------------------------------------------------------------------------------
  3778. Returns the square root of the single-precision floating-point value `a'.
  3779. The operation is performed according to the IEC/IEEE Standard for Binary
  3780. Floating-Point Arithmetic.
  3781. -------------------------------------------------------------------------------
  3782. *}
  3783. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3784. Var
  3785. aSign : flag;
  3786. aExp, zExp : int16;
  3787. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3788. label roundAndPack;
  3789. Begin
  3790. aSig := extractFloat32Frac( a.float32 );
  3791. aExp := extractFloat32Exp( a.float32 );
  3792. aSign := extractFloat32Sign( a.float32 );
  3793. if ( aExp = $FF ) then
  3794. Begin
  3795. if ( aSig <> 0) then
  3796. Begin
  3797. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3798. exit;
  3799. End;
  3800. if ( aSign = 0) then
  3801. Begin
  3802. float32_sqrt := a;
  3803. exit;
  3804. End;
  3805. float_raise( float_flag_invalid );
  3806. float32_sqrt.float32 := float32_default_nan;
  3807. exit;
  3808. End;
  3809. if ( aSign <> 0) then
  3810. Begin
  3811. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3812. Begin
  3813. float32_sqrt := a;
  3814. exit;
  3815. End;
  3816. float_raise( float_flag_invalid );
  3817. float32_sqrt.float32 := float32_default_nan;
  3818. exit;
  3819. End;
  3820. if ( aExp = 0 ) then
  3821. Begin
  3822. if ( aSig = 0 ) then
  3823. Begin
  3824. float32_sqrt.float32 := 0;
  3825. exit;
  3826. End;
  3827. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3828. End;
  3829. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3830. aSig := ( aSig OR $00800000 ) shl 8;
  3831. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3832. if ( ( zSig and $7F ) <= 5 ) then
  3833. Begin
  3834. if ( zSig < 2 ) then
  3835. Begin
  3836. zSig := $7FFFFFFF;
  3837. goto roundAndPack;
  3838. End
  3839. else
  3840. Begin
  3841. aSig := aSig shr (aExp and 1);
  3842. mul32To64( zSig, zSig, term0, term1 );
  3843. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3844. while ( sbits32 (rem0) < 0 ) do
  3845. Begin
  3846. Dec(zSig);
  3847. shortShift64Left( 0, zSig, 1, term0, term1 );
  3848. term1 := term1 or 1;
  3849. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3850. End;
  3851. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3852. End;
  3853. End;
  3854. shift32RightJamming( zSig, 1, zSig );
  3855. roundAndPack:
  3856. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3857. End;
  3858. {*
  3859. -------------------------------------------------------------------------------
  3860. Returns 1 if the single-precision floating-point value `a' is equal to
  3861. the corresponding value `b', and 0 otherwise. The comparison is performed
  3862. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3863. -------------------------------------------------------------------------------
  3864. *}
  3865. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3866. Begin
  3867. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3868. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3869. ) then
  3870. Begin
  3871. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3872. Begin
  3873. float_raise( float_flag_invalid );
  3874. End;
  3875. float32_eq := 0;
  3876. exit;
  3877. End;
  3878. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3879. End;
  3880. {*
  3881. -------------------------------------------------------------------------------
  3882. Returns 1 if the single-precision floating-point value `a' is less than
  3883. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3884. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3885. Arithmetic.
  3886. -------------------------------------------------------------------------------
  3887. *}
  3888. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3889. var
  3890. aSign, bSign: flag;
  3891. Begin
  3892. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3893. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3894. ) then
  3895. Begin
  3896. float_raise( float_flag_invalid );
  3897. float32_le := 0;
  3898. exit;
  3899. End;
  3900. aSign := extractFloat32Sign( a.float32 );
  3901. bSign := extractFloat32Sign( b.float32 );
  3902. if ( aSign <> bSign ) then
  3903. Begin
  3904. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3905. exit;
  3906. End;
  3907. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3908. End;
  3909. {*
  3910. -------------------------------------------------------------------------------
  3911. Returns 1 if the single-precision floating-point value `a' is less than
  3912. the corresponding value `b', and 0 otherwise. The comparison is performed
  3913. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3914. -------------------------------------------------------------------------------
  3915. *}
  3916. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3917. var
  3918. aSign, bSign: flag;
  3919. Begin
  3920. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3921. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3922. ) then
  3923. Begin
  3924. float_raise( float_flag_invalid );
  3925. float32_lt :=0;
  3926. exit;
  3927. End;
  3928. aSign := extractFloat32Sign( a.float32 );
  3929. bSign := extractFloat32Sign( b.float32 );
  3930. if ( aSign <> bSign ) then
  3931. Begin
  3932. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3933. exit;
  3934. End;
  3935. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3936. End;
  3937. {*
  3938. -------------------------------------------------------------------------------
  3939. Returns 1 if the single-precision floating-point value `a' is equal to
  3940. the corresponding value `b', and 0 otherwise. The invalid exception is
  3941. raised if either operand is a NaN. Otherwise, the comparison is performed
  3942. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3943. -------------------------------------------------------------------------------
  3944. *}
  3945. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3946. Begin
  3947. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3948. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3949. ) then
  3950. Begin
  3951. float_raise( float_flag_invalid );
  3952. float32_eq_signaling := 0;
  3953. exit;
  3954. End;
  3955. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3956. End;
  3957. {*
  3958. -------------------------------------------------------------------------------
  3959. Returns 1 if the single-precision floating-point value `a' is less than or
  3960. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3961. cause an exception. Otherwise, the comparison is performed according to the
  3962. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3963. -------------------------------------------------------------------------------
  3964. *}
  3965. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3966. Var
  3967. aSign, bSign: flag;
  3968. Begin
  3969. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3970. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3971. ) then
  3972. Begin
  3973. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3974. Begin
  3975. float_raise( float_flag_invalid );
  3976. End;
  3977. float32_le_quiet := 0;
  3978. exit;
  3979. End;
  3980. aSign := extractFloat32Sign( a );
  3981. bSign := extractFloat32Sign( b );
  3982. if ( aSign <> bSign ) then
  3983. Begin
  3984. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3985. exit;
  3986. End;
  3987. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3988. End;
  3989. {*
  3990. -------------------------------------------------------------------------------
  3991. Returns 1 if the single-precision floating-point value `a' is less than
  3992. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3993. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3994. Standard for Binary Floating-Point Arithmetic.
  3995. -------------------------------------------------------------------------------
  3996. *}
  3997. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3998. Var
  3999. aSign, bSign: flag;
  4000. Begin
  4001. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4002. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4003. ) then
  4004. Begin
  4005. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4006. Begin
  4007. float_raise( float_flag_invalid );
  4008. End;
  4009. float32_lt_quiet := 0;
  4010. exit;
  4011. End;
  4012. aSign := extractFloat32Sign( a );
  4013. bSign := extractFloat32Sign( b );
  4014. if ( aSign <> bSign ) then
  4015. Begin
  4016. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4017. exit;
  4018. End;
  4019. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4020. End;
  4021. {*
  4022. -------------------------------------------------------------------------------
  4023. Returns the result of converting the double-precision floating-point value
  4024. `a' to the 32-bit two's complement integer format. The conversion is
  4025. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4026. Arithmetic---which means in particular that the conversion is rounded
  4027. according to the current rounding mode. If `a' is a NaN, the largest
  4028. positive integer is returned. Otherwise, if the conversion overflows, the
  4029. largest integer with the same sign as `a' is returned.
  4030. -------------------------------------------------------------------------------
  4031. *}
  4032. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4033. var
  4034. aSign: flag;
  4035. aExp, shiftCount: int16;
  4036. aSig0, aSig1, absZ, aSigExtra: bits32;
  4037. z: int32;
  4038. roundingMode: TFPURoundingMode;
  4039. label invalid;
  4040. Begin
  4041. aSig1 := extractFloat64Frac1( a );
  4042. aSig0 := extractFloat64Frac0( a );
  4043. aExp := extractFloat64Exp( a );
  4044. aSign := extractFloat64Sign( a );
  4045. shiftCount := aExp - $413;
  4046. if ( 0 <= shiftCount ) then
  4047. Begin
  4048. if ( $41E < aExp ) then
  4049. Begin
  4050. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4051. aSign := 0;
  4052. goto invalid;
  4053. End;
  4054. shortShift64Left(
  4055. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4056. if ( $80000000 < absZ ) then
  4057. goto invalid;
  4058. End
  4059. else
  4060. Begin
  4061. aSig1 := flag( aSig1 <> 0 );
  4062. if ( aExp < $3FE ) then
  4063. Begin
  4064. aSigExtra := aExp OR aSig0 OR aSig1;
  4065. absZ := 0;
  4066. End
  4067. else
  4068. Begin
  4069. aSig0 := aSig0 OR $00100000;
  4070. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4071. absZ := aSig0 shr ( - shiftCount );
  4072. End;
  4073. End;
  4074. roundingMode := softfloat_rounding_mode;
  4075. if ( roundingMode = float_round_nearest_even ) then
  4076. Begin
  4077. if ( sbits32(aSigExtra) < 0 ) then
  4078. Begin
  4079. Inc(absZ);
  4080. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4081. absZ := absZ and not 1;
  4082. End;
  4083. if aSign <> 0 then
  4084. z := - absZ
  4085. else
  4086. z := absZ;
  4087. End
  4088. else
  4089. Begin
  4090. aSigExtra := bits32( aSigExtra <> 0 );
  4091. if ( aSign <> 0) then
  4092. Begin
  4093. z := - ( absZ
  4094. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4095. End
  4096. else
  4097. Begin
  4098. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4099. End
  4100. End;
  4101. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4102. Begin
  4103. invalid:
  4104. float_raise( float_flag_invalid );
  4105. if (aSign <> 0 ) then
  4106. float64_to_int32 := sbits32 ($80000000)
  4107. else
  4108. float64_to_int32 := $7FFFFFFF;
  4109. exit;
  4110. End;
  4111. if ( aSigExtra <> 0) then
  4112. set_inexact_flag;
  4113. float64_to_int32 := z;
  4114. End;
  4115. {*
  4116. -------------------------------------------------------------------------------
  4117. Returns the result of converting the double-precision floating-point value
  4118. `a' to the 32-bit two's complement integer format. The conversion is
  4119. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4120. Arithmetic, except that the conversion is always rounded toward zero.
  4121. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4122. the conversion overflows, the largest integer with the same sign as `a' is
  4123. returned.
  4124. -------------------------------------------------------------------------------
  4125. *}
  4126. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4127. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4128. Var
  4129. aSign: flag;
  4130. aExp, shiftCount: int16;
  4131. aSig0, aSig1, absZ, aSigExtra: bits32;
  4132. z: int32;
  4133. label invalid;
  4134. Begin
  4135. aSig1 := extractFloat64Frac1( a );
  4136. aSig0 := extractFloat64Frac0( a );
  4137. aExp := extractFloat64Exp( a );
  4138. aSign := extractFloat64Sign( a );
  4139. shiftCount := aExp - $413;
  4140. if ( 0 <= shiftCount ) then
  4141. Begin
  4142. if ( $41E < aExp ) then
  4143. Begin
  4144. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4145. aSign := 0;
  4146. goto invalid;
  4147. End;
  4148. shortShift64Left(
  4149. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4150. End
  4151. else
  4152. Begin
  4153. if ( aExp < $3FF ) then
  4154. Begin
  4155. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4156. Begin
  4157. set_inexact_flag;
  4158. End;
  4159. float64_to_int32_round_to_zero := 0;
  4160. exit;
  4161. End;
  4162. aSig0 := aSig0 or $00100000;
  4163. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4164. absZ := aSig0 shr ( - shiftCount );
  4165. End;
  4166. if aSign <> 0 then
  4167. z := - absZ
  4168. else
  4169. z := absZ;
  4170. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4171. Begin
  4172. invalid:
  4173. float_raise( float_flag_invalid );
  4174. if (aSign <> 0) then
  4175. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4176. else
  4177. float64_to_int32_round_to_zero := $7FFFFFFF;
  4178. exit;
  4179. End;
  4180. if ( aSigExtra <> 0) then
  4181. set_inexact_flag;
  4182. float64_to_int32_round_to_zero := z;
  4183. End;
  4184. {*----------------------------------------------------------------------------
  4185. | Returns the result of converting the double-precision floating-point value
  4186. | `a' to the 64-bit two's complement integer format. The conversion is
  4187. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4188. | Arithmetic---which means in particular that the conversion is rounded
  4189. | according to the current rounding mode. If `a' is a NaN, the largest
  4190. | positive integer is returned. Otherwise, if the conversion overflows, the
  4191. | largest integer with the same sign as `a' is returned.
  4192. *----------------------------------------------------------------------------*}
  4193. function float64_to_int64( a: float64 ): int64;
  4194. var
  4195. aSign: flag;
  4196. aExp, shiftCount: int16;
  4197. aSig, aSigExtra: bits64;
  4198. begin
  4199. aSig := extractFloat64Frac( a );
  4200. aExp := extractFloat64Exp( a );
  4201. aSign := extractFloat64Sign( a );
  4202. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4203. shiftCount := $433 - aExp;
  4204. if ( shiftCount <= 0 ) then begin
  4205. if ( $43E < aExp ) then begin
  4206. float_raise( float_flag_invalid );
  4207. if ( ( aSign = 0 )
  4208. or ( ( aExp = $7FF )
  4209. and ( aSig <> $0010000000000000 ) )
  4210. ) then begin
  4211. result := $7FFFFFFFFFFFFFFF;
  4212. exit;
  4213. end;
  4214. result := $8000000000000000;
  4215. exit;
  4216. end;
  4217. aSigExtra := 0;
  4218. aSig := aSig shl ( - shiftCount );
  4219. end
  4220. else
  4221. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4222. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4223. end;
  4224. {*----------------------------------------------------------------------------
  4225. | Returns the result of converting the double-precision floating-point value
  4226. | `a' to the 64-bit two's complement integer format. The conversion is
  4227. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4228. | Arithmetic, except that the conversion is always rounded toward zero.
  4229. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4230. | the conversion overflows, the largest integer with the same sign as `a' is
  4231. | returned.
  4232. *----------------------------------------------------------------------------*}
  4233. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4234. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4235. var
  4236. aSign: flag;
  4237. aExp, shiftCount: int16;
  4238. aSig: bits64;
  4239. z: int64;
  4240. begin
  4241. aSig := extractFloat64Frac( a );
  4242. aExp := extractFloat64Exp( a );
  4243. aSign := extractFloat64Sign( a );
  4244. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4245. shiftCount := aExp - $433;
  4246. if ( 0 <= shiftCount ) then begin
  4247. if ( $43E <= aExp ) then begin
  4248. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4249. float_raise( float_flag_invalid );
  4250. if ( ( aSign = 0 )
  4251. or ( ( aExp = $7FF )
  4252. and ( aSig <> $0010000000000000 ) )
  4253. ) then begin
  4254. result := $7FFFFFFFFFFFFFFF;
  4255. exit;
  4256. end;
  4257. end;
  4258. result := $8000000000000000;
  4259. exit;
  4260. end;
  4261. z := aSig shl shiftCount;
  4262. end
  4263. else begin
  4264. if ( aExp < $3FE ) then begin
  4265. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4266. result := 0;
  4267. exit;
  4268. end;
  4269. z := aSig shr ( - shiftCount );
  4270. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4271. set_inexact_flag;
  4272. end;
  4273. if ( aSign <> 0 ) then z := - z;
  4274. result := z;
  4275. end;
  4276. {*
  4277. -------------------------------------------------------------------------------
  4278. Returns the result of converting the double-precision floating-point value
  4279. `a' to the single-precision floating-point format. The conversion is
  4280. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4281. Arithmetic.
  4282. -------------------------------------------------------------------------------
  4283. *}
  4284. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  4285. Var
  4286. aSign: flag;
  4287. aExp: int16;
  4288. aSig0, aSig1, zSig: bits32;
  4289. allZero: bits32;
  4290. tmp : CommonNanT;
  4291. Begin
  4292. aSig1 := extractFloat64Frac1( a );
  4293. aSig0 := extractFloat64Frac0( a );
  4294. aExp := extractFloat64Exp( a );
  4295. aSign := extractFloat64Sign( a );
  4296. if ( aExp = $7FF ) then
  4297. Begin
  4298. if ( aSig0 OR aSig1 ) <> 0 then
  4299. Begin
  4300. float64ToCommonNaN( a, tmp );
  4301. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4302. exit;
  4303. End;
  4304. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4305. exit;
  4306. End;
  4307. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4308. if ( aExp <> 0) then
  4309. zSig := zSig OR $40000000;
  4310. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4311. End;
  4312. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4313. {*----------------------------------------------------------------------------
  4314. | Returns the result of converting the double-precision floating-point value
  4315. | `a' to the extended double-precision floating-point format. The conversion
  4316. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4317. | Arithmetic.
  4318. *----------------------------------------------------------------------------*}
  4319. function float64_to_floatx80( a: float64 ): floatx80;
  4320. var
  4321. aSign: flag;
  4322. aExp: int16;
  4323. aSig: bits64;
  4324. begin
  4325. aSig := extractFloat64Frac( a );
  4326. aExp := extractFloat64Exp( a );
  4327. aSign := extractFloat64Sign( a );
  4328. if ( aExp = $7FF ) then begin
  4329. if ( aSig <> 0 ) then begin
  4330. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4331. exit;
  4332. end;
  4333. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4334. exit;
  4335. end;
  4336. if ( aExp = 0 ) then begin
  4337. if ( aSig = 0 ) then begin
  4338. result := packFloatx80( aSign, 0, 0 );
  4339. exit;
  4340. end;
  4341. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4342. end;
  4343. result :=
  4344. packFloatx80(
  4345. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4346. end;
  4347. {$endif FPC_SOFTFLOAT_FLOATX80}
  4348. {*
  4349. -------------------------------------------------------------------------------
  4350. Rounds the double-precision floating-point value `a' to an integer,
  4351. and returns the result as a double-precision floating-point value. The
  4352. operation is performed according to the IEC/IEEE Standard for Binary
  4353. Floating-Point Arithmetic.
  4354. -------------------------------------------------------------------------------
  4355. *}
  4356. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4357. Var
  4358. aSign: flag;
  4359. aExp: int16;
  4360. lastBitMask, roundBitsMask: bits32;
  4361. roundingMode: TFPURoundingMode;
  4362. z: float64;
  4363. Begin
  4364. aExp := extractFloat64Exp( a );
  4365. if ( $413 <= aExp ) then
  4366. Begin
  4367. if ( $433 <= aExp ) then
  4368. Begin
  4369. if ( ( aExp = $7FF )
  4370. AND
  4371. (
  4372. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4373. ) <>0)
  4374. ) then
  4375. Begin
  4376. propagateFloat64NaN( a, a, result );
  4377. exit;
  4378. End;
  4379. result := a;
  4380. exit;
  4381. End;
  4382. lastBitMask := 1;
  4383. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4384. roundBitsMask := lastBitMask - 1;
  4385. z := a;
  4386. roundingMode := softfloat_rounding_mode;
  4387. if ( roundingMode = float_round_nearest_even ) then
  4388. Begin
  4389. if ( lastBitMask <> 0) then
  4390. Begin
  4391. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4392. if ( ( z.low and roundBitsMask ) = 0 ) then
  4393. z.low := z.low and not lastBitMask;
  4394. End
  4395. else
  4396. Begin
  4397. if ( sbits32 (z.low) < 0 ) then
  4398. Begin
  4399. Inc(z.high);
  4400. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4401. z.high := z.high and not 1;
  4402. End;
  4403. End;
  4404. End
  4405. else if ( roundingMode <> float_round_to_zero ) then
  4406. Begin
  4407. if ( extractFloat64Sign( z )
  4408. xor flag( roundingMode = float_round_up ) )<> 0 then
  4409. Begin
  4410. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4411. End;
  4412. End;
  4413. z.low := z.low and not roundBitsMask;
  4414. End
  4415. else
  4416. Begin
  4417. if ( aExp <= $3FE ) then
  4418. Begin
  4419. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4420. Begin
  4421. result := a;
  4422. exit;
  4423. End;
  4424. set_inexact_flag;
  4425. aSign := extractFloat64Sign( a );
  4426. case ( softfloat_rounding_mode ) of
  4427. float_round_nearest_even:
  4428. Begin
  4429. if ( ( aExp = $3FE )
  4430. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4431. ) then
  4432. Begin
  4433. packFloat64( aSign, $3FF, 0, 0, result );
  4434. exit;
  4435. End;
  4436. End;
  4437. float_round_down:
  4438. Begin
  4439. if aSign<>0 then
  4440. packFloat64( 1, $3FF, 0, 0, result )
  4441. else
  4442. packFloat64( 0, 0, 0, 0, result );
  4443. exit;
  4444. End;
  4445. float_round_up:
  4446. Begin
  4447. if aSign <> 0 then
  4448. packFloat64( 1, 0, 0, 0, result )
  4449. else
  4450. packFloat64( 0, $3FF, 0, 0, result );
  4451. exit;
  4452. End;
  4453. end;
  4454. packFloat64( aSign, 0, 0, 0, result );
  4455. exit;
  4456. End;
  4457. lastBitMask := 1;
  4458. lastBitMask := lastBitMask shl ($413 - aExp);
  4459. roundBitsMask := lastBitMask - 1;
  4460. z.low := 0;
  4461. z.high := a.high;
  4462. roundingMode := softfloat_rounding_mode;
  4463. if ( roundingMode = float_round_nearest_even ) then
  4464. Begin
  4465. z.high := z.high + lastBitMask shr 1;
  4466. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4467. Begin
  4468. z.high := z.high and not lastBitMask;
  4469. End;
  4470. End
  4471. else if ( roundingMode <> float_round_to_zero ) then
  4472. Begin
  4473. if ( extractFloat64Sign( z )
  4474. xor flag( roundingMode = float_round_up ) )<> 0 then
  4475. Begin
  4476. z.high := z.high or bits32( a.low <> 0 );
  4477. z.high := z.high + roundBitsMask;
  4478. End;
  4479. End;
  4480. z.high := z.high and not roundBitsMask;
  4481. End;
  4482. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4483. Begin
  4484. set_inexact_flag;
  4485. End;
  4486. result := z;
  4487. End;
  4488. {*
  4489. -------------------------------------------------------------------------------
  4490. Returns the result of adding the absolute values of the double-precision
  4491. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4492. before being returned. `zSign' is ignored if the result is a NaN.
  4493. The addition is performed according to the IEC/IEEE Standard for Binary
  4494. Floating-Point Arithmetic.
  4495. -------------------------------------------------------------------------------
  4496. *}
  4497. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4498. Var
  4499. aExp, bExp, zExp: int16;
  4500. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4501. expDiff: int16;
  4502. label shiftRight1;
  4503. label roundAndPack;
  4504. Begin
  4505. aSig1 := extractFloat64Frac1( a );
  4506. aSig0 := extractFloat64Frac0( a );
  4507. aExp := extractFloat64Exp( a );
  4508. bSig1 := extractFloat64Frac1( b );
  4509. bSig0 := extractFloat64Frac0( b );
  4510. bExp := extractFloat64Exp( b );
  4511. expDiff := aExp - bExp;
  4512. if ( 0 < expDiff ) then
  4513. Begin
  4514. if ( aExp = $7FF ) then
  4515. Begin
  4516. if ( aSig0 OR aSig1 ) <> 0 then
  4517. Begin
  4518. propagateFloat64NaN( a, b, out );
  4519. exit;
  4520. end;
  4521. out := a;
  4522. exit;
  4523. End;
  4524. if ( bExp = 0 ) then
  4525. Begin
  4526. Dec(expDiff);
  4527. End
  4528. else
  4529. Begin
  4530. bSig0 := bSig0 or $00100000;
  4531. End;
  4532. shift64ExtraRightJamming(
  4533. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4534. zExp := aExp;
  4535. End
  4536. else if ( expDiff < 0 ) then
  4537. Begin
  4538. if ( bExp = $7FF ) then
  4539. Begin
  4540. if ( bSig0 OR bSig1 ) <> 0 then
  4541. Begin
  4542. propagateFloat64NaN( a, b, out );
  4543. exit;
  4544. End;
  4545. packFloat64( zSign, $7FF, 0, 0, out );
  4546. exit;
  4547. End;
  4548. if ( aExp = 0 ) then
  4549. Begin
  4550. Inc(expDiff);
  4551. End
  4552. else
  4553. Begin
  4554. aSig0 := aSig0 or $00100000;
  4555. End;
  4556. shift64ExtraRightJamming(
  4557. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4558. zExp := bExp;
  4559. End
  4560. else
  4561. Begin
  4562. if ( aExp = $7FF ) then
  4563. Begin
  4564. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4565. Begin
  4566. propagateFloat64NaN( a, b, out );
  4567. exit;
  4568. End;
  4569. out := a;
  4570. exit;
  4571. End;
  4572. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4573. if ( aExp = 0 ) then
  4574. Begin
  4575. packFloat64( zSign, 0, zSig0, zSig1, out );
  4576. exit;
  4577. End;
  4578. zSig2 := 0;
  4579. zSig0 := zSig0 or $00200000;
  4580. zExp := aExp;
  4581. goto shiftRight1;
  4582. End;
  4583. aSig0 := aSig0 or $00100000;
  4584. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4585. Dec(zExp);
  4586. if ( zSig0 < $00200000 ) then
  4587. goto roundAndPack;
  4588. Inc(zExp);
  4589. shiftRight1:
  4590. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4591. roundAndPack:
  4592. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4593. End;
  4594. {*
  4595. -------------------------------------------------------------------------------
  4596. Returns the result of subtracting the absolute values of the double-
  4597. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4598. difference is negated before being returned. `zSign' is ignored if the
  4599. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4600. Standard for Binary Floating-Point Arithmetic.
  4601. -------------------------------------------------------------------------------
  4602. *}
  4603. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4604. Var
  4605. aExp, bExp, zExp: int16;
  4606. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4607. expDiff: int16;
  4608. z: float64;
  4609. label aExpBigger;
  4610. label bExpBigger;
  4611. label aBigger;
  4612. label bBigger;
  4613. label normalizeRoundAndPack;
  4614. Begin
  4615. aSig1 := extractFloat64Frac1( a );
  4616. aSig0 := extractFloat64Frac0( a );
  4617. aExp := extractFloat64Exp( a );
  4618. bSig1 := extractFloat64Frac1( b );
  4619. bSig0 := extractFloat64Frac0( b );
  4620. bExp := extractFloat64Exp( b );
  4621. expDiff := aExp - bExp;
  4622. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4623. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4624. if ( 0 < expDiff ) then goto aExpBigger;
  4625. if ( expDiff < 0 ) then goto bExpBigger;
  4626. if ( aExp = $7FF ) then
  4627. Begin
  4628. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4629. Begin
  4630. propagateFloat64NaN( a, b, out );
  4631. exit;
  4632. End;
  4633. float_raise( float_flag_invalid );
  4634. z.low := float64_default_nan_low;
  4635. z.high := float64_default_nan_high;
  4636. out := z;
  4637. exit;
  4638. End;
  4639. if ( aExp = 0 ) then
  4640. Begin
  4641. aExp := 1;
  4642. bExp := 1;
  4643. End;
  4644. if ( bSig0 < aSig0 ) then goto aBigger;
  4645. if ( aSig0 < bSig0 ) then goto bBigger;
  4646. if ( bSig1 < aSig1 ) then goto aBigger;
  4647. if ( aSig1 < bSig1 ) then goto bBigger;
  4648. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4649. exit;
  4650. bExpBigger:
  4651. if ( bExp = $7FF ) then
  4652. Begin
  4653. if ( bSig0 OR bSig1 ) <> 0 then
  4654. Begin
  4655. propagateFloat64NaN( a, b, out );
  4656. exit;
  4657. End;
  4658. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4659. exit;
  4660. End;
  4661. if ( aExp = 0 ) then
  4662. Begin
  4663. Inc(expDiff);
  4664. End
  4665. else
  4666. Begin
  4667. aSig0 := aSig0 or $40000000;
  4668. End;
  4669. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4670. bSig0 := bSig0 or $40000000;
  4671. bBigger:
  4672. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4673. zExp := bExp;
  4674. zSign := zSign xor 1;
  4675. goto normalizeRoundAndPack;
  4676. aExpBigger:
  4677. if ( aExp = $7FF ) then
  4678. Begin
  4679. if ( aSig0 OR aSig1 ) <> 0 then
  4680. Begin
  4681. propagateFloat64NaN( a, b, out );
  4682. exit;
  4683. End;
  4684. out := a;
  4685. exit;
  4686. End;
  4687. if ( bExp = 0 ) then
  4688. Begin
  4689. Dec(expDiff);
  4690. End
  4691. else
  4692. Begin
  4693. bSig0 := bSig0 or $40000000;
  4694. End;
  4695. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4696. aSig0 := aSig0 or $40000000;
  4697. aBigger:
  4698. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4699. zExp := aExp;
  4700. normalizeRoundAndPack:
  4701. Dec(zExp);
  4702. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4703. End;
  4704. {*
  4705. -------------------------------------------------------------------------------
  4706. Returns the result of adding the double-precision floating-point values `a'
  4707. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4708. Binary Floating-Point Arithmetic.
  4709. -------------------------------------------------------------------------------
  4710. *}
  4711. Function float64_add( a: float64; b : float64) : Float64;
  4712. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4713. Var
  4714. aSign, bSign: flag;
  4715. Begin
  4716. aSign := extractFloat64Sign( a );
  4717. bSign := extractFloat64Sign( b );
  4718. if ( aSign = bSign ) then
  4719. Begin
  4720. addFloat64Sigs( a, b, aSign, result );
  4721. End
  4722. else
  4723. Begin
  4724. subFloat64Sigs( a, b, aSign, result );
  4725. End;
  4726. End;
  4727. {*
  4728. -------------------------------------------------------------------------------
  4729. Returns the result of subtracting the double-precision floating-point values
  4730. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4731. for Binary Floating-Point Arithmetic.
  4732. -------------------------------------------------------------------------------
  4733. *}
  4734. Function float64_sub(a: float64; b : float64) : Float64;
  4735. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4736. Var
  4737. aSign, bSign: flag;
  4738. Begin
  4739. aSign := extractFloat64Sign( a );
  4740. bSign := extractFloat64Sign( b );
  4741. if ( aSign = bSign ) then
  4742. Begin
  4743. subFloat64Sigs( a, b, aSign, result );
  4744. End
  4745. else
  4746. Begin
  4747. addFloat64Sigs( a, b, aSign, result );
  4748. End;
  4749. End;
  4750. {*
  4751. -------------------------------------------------------------------------------
  4752. Returns the result of multiplying the double-precision floating-point values
  4753. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4754. for Binary Floating-Point Arithmetic.
  4755. -------------------------------------------------------------------------------
  4756. *}
  4757. Function float64_mul( a: float64; b:float64) : Float64;
  4758. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4759. Var
  4760. aSign, bSign, zSign: flag;
  4761. aExp, bExp, zExp: int16;
  4762. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4763. z: float64;
  4764. label invalid;
  4765. Begin
  4766. aSig1 := extractFloat64Frac1( a );
  4767. aSig0 := extractFloat64Frac0( a );
  4768. aExp := extractFloat64Exp( a );
  4769. aSign := extractFloat64Sign( a );
  4770. bSig1 := extractFloat64Frac1( b );
  4771. bSig0 := extractFloat64Frac0( b );
  4772. bExp := extractFloat64Exp( b );
  4773. bSign := extractFloat64Sign( b );
  4774. zSign := aSign xor bSign;
  4775. if ( aExp = $7FF ) then
  4776. Begin
  4777. if ( (( aSig0 OR aSig1 ) <>0)
  4778. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4779. Begin
  4780. propagateFloat64NaN( a, b, result );
  4781. exit;
  4782. End;
  4783. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4784. packFloat64( zSign, $7FF, 0, 0, result );
  4785. exit;
  4786. End;
  4787. if ( bExp = $7FF ) then
  4788. Begin
  4789. if ( bSig0 OR bSig1 )<> 0 then
  4790. Begin
  4791. propagateFloat64NaN( a, b, result );
  4792. exit;
  4793. End;
  4794. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4795. Begin
  4796. invalid:
  4797. float_raise( float_flag_invalid );
  4798. z.low := float64_default_nan_low;
  4799. z.high := float64_default_nan_high;
  4800. result := z;
  4801. exit;
  4802. End;
  4803. packFloat64( zSign, $7FF, 0, 0, result );
  4804. exit;
  4805. End;
  4806. if ( aExp = 0 ) then
  4807. Begin
  4808. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4809. Begin
  4810. packFloat64( zSign, 0, 0, 0, result );
  4811. exit;
  4812. End;
  4813. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4814. End;
  4815. if ( bExp = 0 ) then
  4816. Begin
  4817. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4818. Begin
  4819. packFloat64( zSign, 0, 0, 0, result );
  4820. exit;
  4821. End;
  4822. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4823. End;
  4824. zExp := aExp + bExp - $400;
  4825. aSig0 := aSig0 or $00100000;
  4826. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4827. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4828. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4829. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4830. if ( $00200000 <= zSig0 ) then
  4831. Begin
  4832. shift64ExtraRightJamming(
  4833. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4834. Inc(zExp);
  4835. End;
  4836. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4837. End;
  4838. {*
  4839. -------------------------------------------------------------------------------
  4840. Returns the result of dividing the double-precision floating-point value `a'
  4841. by the corresponding value `b'. The operation is performed according to the
  4842. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4843. -------------------------------------------------------------------------------
  4844. *}
  4845. Function float64_div(a: float64; b : float64) : Float64;
  4846. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4847. Var
  4848. aSign, bSign, zSign: flag;
  4849. aExp, bExp, zExp: int16;
  4850. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4851. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4852. z: float64;
  4853. label invalid;
  4854. Begin
  4855. aSig1 := extractFloat64Frac1( a );
  4856. aSig0 := extractFloat64Frac0( a );
  4857. aExp := extractFloat64Exp( a );
  4858. aSign := extractFloat64Sign( a );
  4859. bSig1 := extractFloat64Frac1( b );
  4860. bSig0 := extractFloat64Frac0( b );
  4861. bExp := extractFloat64Exp( b );
  4862. bSign := extractFloat64Sign( b );
  4863. zSign := aSign xor bSign;
  4864. if ( aExp = $7FF ) then
  4865. Begin
  4866. if ( aSig0 OR aSig1 )<> 0 then
  4867. Begin
  4868. propagateFloat64NaN( a, b, result );
  4869. exit;
  4870. end;
  4871. if ( bExp = $7FF ) then
  4872. Begin
  4873. if ( bSig0 OR bSig1 )<>0 then
  4874. Begin
  4875. propagateFloat64NaN( a, b, result );
  4876. exit;
  4877. End;
  4878. goto invalid;
  4879. End;
  4880. packFloat64( zSign, $7FF, 0, 0, result );
  4881. exit;
  4882. End;
  4883. if ( bExp = $7FF ) then
  4884. Begin
  4885. if ( bSig0 OR bSig1 )<> 0 then
  4886. Begin
  4887. propagateFloat64NaN( a, b, result );
  4888. exit;
  4889. End;
  4890. packFloat64( zSign, 0, 0, 0, result );
  4891. exit;
  4892. End;
  4893. if ( bExp = 0 ) then
  4894. Begin
  4895. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4896. Begin
  4897. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  4898. Begin
  4899. invalid:
  4900. float_raise( float_flag_invalid );
  4901. z.low := float64_default_nan_low;
  4902. z.high := float64_default_nan_high;
  4903. result := z;
  4904. exit;
  4905. End;
  4906. float_raise( float_flag_divbyzero );
  4907. packFloat64( zSign, $7FF, 0, 0, result );
  4908. exit;
  4909. End;
  4910. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4911. End;
  4912. if ( aExp = 0 ) then
  4913. Begin
  4914. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4915. Begin
  4916. packFloat64( zSign, 0, 0, 0, result );
  4917. exit;
  4918. End;
  4919. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4920. End;
  4921. zExp := aExp - bExp + $3FD;
  4922. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4923. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4924. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4925. Begin
  4926. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4927. Inc(zExp);
  4928. End;
  4929. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4930. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4931. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4932. while ( sbits32 (rem0) < 0 ) do
  4933. Begin
  4934. Dec(zSig0);
  4935. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4936. End;
  4937. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4938. if ( ( zSig1 and $3FF ) <= 4 ) then
  4939. Begin
  4940. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4941. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4942. while ( sbits32 (rem1) < 0 ) do
  4943. Begin
  4944. Dec(zSig1);
  4945. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4946. End;
  4947. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4948. End;
  4949. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4950. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4951. End;
  4952. {*
  4953. -------------------------------------------------------------------------------
  4954. Returns the remainder of the double-precision floating-point value `a'
  4955. with respect to the corresponding value `b'. The operation is performed
  4956. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4957. -------------------------------------------------------------------------------
  4958. *}
  4959. Function float64_rem(a: float64; b : float64) : float64;
  4960. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4961. Var
  4962. aSign, zSign: flag;
  4963. aExp, bExp, expDiff: int16;
  4964. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4965. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4966. sigMean0: sbits32;
  4967. z: float64;
  4968. label invalid;
  4969. Begin
  4970. aSig1 := extractFloat64Frac1( a );
  4971. aSig0 := extractFloat64Frac0( a );
  4972. aExp := extractFloat64Exp( a );
  4973. aSign := extractFloat64Sign( a );
  4974. bSig1 := extractFloat64Frac1( b );
  4975. bSig0 := extractFloat64Frac0( b );
  4976. bExp := extractFloat64Exp( b );
  4977. if ( aExp = $7FF ) then
  4978. Begin
  4979. if ((( aSig0 OR aSig1 )<>0)
  4980. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4981. Begin
  4982. propagateFloat64NaN( a, b, result );
  4983. exit;
  4984. End;
  4985. goto invalid;
  4986. End;
  4987. if ( bExp = $7FF ) then
  4988. Begin
  4989. if ( bSig0 OR bSig1 ) <> 0 then
  4990. Begin
  4991. propagateFloat64NaN( a, b, result );
  4992. exit;
  4993. End;
  4994. result := a;
  4995. exit;
  4996. End;
  4997. if ( bExp = 0 ) then
  4998. Begin
  4999. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5000. Begin
  5001. invalid:
  5002. float_raise( float_flag_invalid );
  5003. z.low := float64_default_nan_low;
  5004. z.high := float64_default_nan_high;
  5005. result := z;
  5006. exit;
  5007. End;
  5008. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5009. End;
  5010. if ( aExp = 0 ) then
  5011. Begin
  5012. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5013. Begin
  5014. result := a;
  5015. exit;
  5016. End;
  5017. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5018. End;
  5019. expDiff := aExp - bExp;
  5020. if ( expDiff < -1 ) then
  5021. Begin
  5022. result := a;
  5023. exit;
  5024. End;
  5025. shortShift64Left(
  5026. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5027. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5028. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5029. if ( q )<>0 then
  5030. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5031. expDiff := expDiff - 32;
  5032. while ( 0 < expDiff ) do
  5033. Begin
  5034. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5035. if 4 < q then
  5036. q:= q - 4
  5037. else
  5038. q := 0;
  5039. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5040. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5041. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5042. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5043. expDiff := expDiff - 29;
  5044. End;
  5045. if ( -32 < expDiff ) then
  5046. Begin
  5047. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5048. if 4 < q then
  5049. q := q - 4
  5050. else
  5051. q := 0;
  5052. q := q shr (- expDiff);
  5053. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5054. expDiff := expDiff + 24;
  5055. if ( expDiff < 0 ) then
  5056. Begin
  5057. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5058. End
  5059. else
  5060. Begin
  5061. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5062. End;
  5063. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5064. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5065. End
  5066. else
  5067. Begin
  5068. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5069. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5070. End;
  5071. Repeat
  5072. alternateASig0 := aSig0;
  5073. alternateASig1 := aSig1;
  5074. Inc(q);
  5075. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5076. Until not ( 0 <= sbits32 (aSig0) );
  5077. add64(
  5078. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5079. if ( ( sigMean0 < 0 )
  5080. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5081. Begin
  5082. aSig0 := alternateASig0;
  5083. aSig1 := alternateASig1;
  5084. End;
  5085. zSign := flag( sbits32 (aSig0) < 0 );
  5086. if ( zSign <> 0 ) then
  5087. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5088. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5089. End;
  5090. {*
  5091. -------------------------------------------------------------------------------
  5092. Returns the square root of the double-precision floating-point value `a'.
  5093. The operation is performed according to the IEC/IEEE Standard for Binary
  5094. Floating-Point Arithmetic.
  5095. -------------------------------------------------------------------------------
  5096. *}
  5097. function float64_sqrt( a: float64 ): float64;
  5098. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5099. Var
  5100. aSign: flag;
  5101. aExp, zExp: int16;
  5102. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5103. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5104. label invalid;
  5105. Begin
  5106. aSig1 := extractFloat64Frac1( a );
  5107. aSig0 := extractFloat64Frac0( a );
  5108. aExp := extractFloat64Exp( a );
  5109. aSign := extractFloat64Sign( a );
  5110. if ( aExp = $7FF ) then
  5111. Begin
  5112. if ( aSig0 OR aSig1 ) <> 0 then
  5113. Begin
  5114. propagateFloat64NaN( a, a, result );
  5115. exit;
  5116. End;
  5117. if ( aSign = 0) then
  5118. Begin
  5119. result := a;
  5120. exit;
  5121. End;
  5122. goto invalid;
  5123. End;
  5124. if ( aSign <> 0 ) then
  5125. Begin
  5126. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5127. Begin
  5128. result := a;
  5129. exit;
  5130. End;
  5131. invalid:
  5132. float_raise( float_flag_invalid );
  5133. result.low := float64_default_nan_low;
  5134. result.high := float64_default_nan_high;
  5135. exit;
  5136. End;
  5137. if ( aExp = 0 ) then
  5138. Begin
  5139. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5140. Begin
  5141. packFloat64( 0, 0, 0, 0, result );
  5142. exit;
  5143. End;
  5144. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5145. End;
  5146. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5147. aSig0 := aSig0 or $00100000;
  5148. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5149. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5150. if ( zSig0 = 0 ) then
  5151. zSig0 := $7FFFFFFF;
  5152. doubleZSig0 := zSig0 + zSig0;
  5153. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5154. mul32To64( zSig0, zSig0, term0, term1 );
  5155. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5156. while ( sbits32 (rem0) < 0 ) do
  5157. Begin
  5158. Dec(zSig0);
  5159. doubleZSig0 := doubleZSig0 - 2;
  5160. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5161. End;
  5162. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5163. if ( ( zSig1 and $1FF ) <= 5 ) then
  5164. Begin
  5165. if ( zSig1 = 0 ) then
  5166. zSig1 := 1;
  5167. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5168. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5169. mul32To64( zSig1, zSig1, term2, term3 );
  5170. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5171. while ( sbits32 (rem1) < 0 ) do
  5172. Begin
  5173. Dec(zSig1);
  5174. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5175. term3 := term3 or 1;
  5176. term2 := term2 or doubleZSig0;
  5177. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5178. End;
  5179. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5180. End;
  5181. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5182. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5183. End;
  5184. {*
  5185. -------------------------------------------------------------------------------
  5186. Returns 1 if the double-precision floating-point value `a' is equal to
  5187. the corresponding value `b', and 0 otherwise. The comparison is performed
  5188. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5189. -------------------------------------------------------------------------------
  5190. *}
  5191. Function float64_eq(a: float64; b: float64): flag;
  5192. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5193. Begin
  5194. if
  5195. (
  5196. ( extractFloat64Exp( a ) = $7FF )
  5197. AND
  5198. (
  5199. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5200. )
  5201. )
  5202. OR (
  5203. ( extractFloat64Exp( b ) = $7FF )
  5204. AND (
  5205. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5206. )
  5207. )
  5208. ) then
  5209. Begin
  5210. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5211. float_raise( float_flag_invalid );
  5212. float64_eq := 0;
  5213. exit;
  5214. End;
  5215. float64_eq := flag(
  5216. ( a.low = b.low )
  5217. AND ( ( a.high = b.high )
  5218. OR ( ( a.low = 0 )
  5219. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5220. ));
  5221. End;
  5222. {*
  5223. -------------------------------------------------------------------------------
  5224. Returns 1 if the double-precision floating-point value `a' is less than
  5225. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5226. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5227. Arithmetic.
  5228. -------------------------------------------------------------------------------
  5229. *}
  5230. Function float64_le(a: float64;b: float64): flag;
  5231. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5232. Var
  5233. aSign, bSign: flag;
  5234. Begin
  5235. if
  5236. (
  5237. ( extractFloat64Exp( a ) = $7FF )
  5238. AND
  5239. (
  5240. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5241. )
  5242. )
  5243. OR (
  5244. ( extractFloat64Exp( b ) = $7FF )
  5245. AND (
  5246. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5247. )
  5248. )
  5249. ) then
  5250. Begin
  5251. float_raise( float_flag_invalid );
  5252. float64_le := 0;
  5253. exit;
  5254. End;
  5255. aSign := extractFloat64Sign( a );
  5256. bSign := extractFloat64Sign( b );
  5257. if ( aSign <> bSign ) then
  5258. Begin
  5259. float64_le := flag(
  5260. (aSign <> 0)
  5261. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5262. = 0 ));
  5263. exit;
  5264. End;
  5265. if aSign <> 0 then
  5266. float64_le := le64( b.high, b.low, a.high, a.low )
  5267. else
  5268. float64_le := le64( a.high, a.low, b.high, b.low );
  5269. End;
  5270. {*
  5271. -------------------------------------------------------------------------------
  5272. Returns 1 if the double-precision floating-point value `a' is less than
  5273. the corresponding value `b', and 0 otherwise. The comparison is performed
  5274. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5275. -------------------------------------------------------------------------------
  5276. *}
  5277. Function float64_lt(a: float64;b: float64): flag;
  5278. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5279. Var
  5280. aSign, bSign: flag;
  5281. Begin
  5282. if
  5283. (
  5284. ( extractFloat64Exp( a ) = $7FF )
  5285. AND
  5286. (
  5287. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5288. )
  5289. )
  5290. OR (
  5291. ( extractFloat64Exp( b ) = $7FF )
  5292. AND (
  5293. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5294. )
  5295. )
  5296. ) then
  5297. Begin
  5298. float_raise( float_flag_invalid );
  5299. float64_lt := 0;
  5300. exit;
  5301. End;
  5302. aSign := extractFloat64Sign( a );
  5303. bSign := extractFloat64Sign( b );
  5304. if ( aSign <> bSign ) then
  5305. Begin
  5306. float64_lt := flag(
  5307. (aSign <> 0)
  5308. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5309. <> 0 ));
  5310. exit;
  5311. End;
  5312. if aSign <> 0 then
  5313. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5314. else
  5315. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5316. End;
  5317. {*
  5318. -------------------------------------------------------------------------------
  5319. Returns 1 if the double-precision floating-point value `a' is equal to
  5320. the corresponding value `b', and 0 otherwise. The invalid exception is
  5321. raised if either operand is a NaN. Otherwise, the comparison is performed
  5322. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5323. -------------------------------------------------------------------------------
  5324. *}
  5325. Function float64_eq_signaling( a: float64; b: float64): flag;
  5326. Begin
  5327. if
  5328. (
  5329. ( extractFloat64Exp( a ) = $7FF )
  5330. AND
  5331. (
  5332. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5333. )
  5334. )
  5335. OR (
  5336. ( extractFloat64Exp( b ) = $7FF )
  5337. AND (
  5338. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5339. )
  5340. )
  5341. ) then
  5342. Begin
  5343. float_raise( float_flag_invalid );
  5344. float64_eq_signaling := 0;
  5345. exit;
  5346. End;
  5347. float64_eq_signaling := flag(
  5348. ( a.low = b.low )
  5349. AND ( ( a.high = b.high )
  5350. OR ( ( a.low = 0 )
  5351. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5352. ));
  5353. End;
  5354. {*
  5355. -------------------------------------------------------------------------------
  5356. Returns 1 if the double-precision floating-point value `a' is less than or
  5357. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5358. cause an exception. Otherwise, the comparison is performed according to the
  5359. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5360. -------------------------------------------------------------------------------
  5361. *}
  5362. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5363. Var
  5364. aSign, bSign : flag;
  5365. Begin
  5366. if
  5367. (
  5368. ( extractFloat64Exp( a ) = $7FF )
  5369. AND
  5370. (
  5371. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5372. )
  5373. )
  5374. OR (
  5375. ( extractFloat64Exp( b ) = $7FF )
  5376. AND (
  5377. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5378. )
  5379. )
  5380. ) then
  5381. Begin
  5382. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5383. float_raise( float_flag_invalid );
  5384. float64_le_quiet := 0;
  5385. exit;
  5386. End;
  5387. aSign := extractFloat64Sign( a );
  5388. bSign := extractFloat64Sign( b );
  5389. if ( aSign <> bSign ) then
  5390. Begin
  5391. float64_le_quiet := flag
  5392. ((aSign <> 0)
  5393. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5394. = 0 ));
  5395. exit;
  5396. End;
  5397. if aSign <> 0 then
  5398. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5399. else
  5400. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5401. End;
  5402. {*
  5403. -------------------------------------------------------------------------------
  5404. Returns 1 if the double-precision floating-point value `a' is less than
  5405. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5406. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5407. Standard for Binary Floating-Point Arithmetic.
  5408. -------------------------------------------------------------------------------
  5409. *}
  5410. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5411. Var
  5412. aSign, bSign: flag;
  5413. Begin
  5414. if
  5415. (
  5416. ( extractFloat64Exp( a ) = $7FF )
  5417. AND
  5418. (
  5419. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5420. )
  5421. )
  5422. OR (
  5423. ( extractFloat64Exp( b ) = $7FF )
  5424. AND (
  5425. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5426. )
  5427. )
  5428. ) then
  5429. Begin
  5430. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5431. float_raise( float_flag_invalid );
  5432. float64_lt_quiet := 0;
  5433. exit;
  5434. End;
  5435. aSign := extractFloat64Sign( a );
  5436. bSign := extractFloat64Sign( b );
  5437. if ( aSign <> bSign ) then
  5438. Begin
  5439. float64_lt_quiet := flag(
  5440. (aSign<>0)
  5441. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5442. <> 0 ));
  5443. exit;
  5444. End;
  5445. If aSign <> 0 then
  5446. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5447. else
  5448. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5449. End;
  5450. {*----------------------------------------------------------------------------
  5451. | Returns the result of converting the 64-bit two's complement integer `a'
  5452. | to the single-precision floating-point format. The conversion is performed
  5453. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5454. *----------------------------------------------------------------------------*}
  5455. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5456. var
  5457. zSign : flag;
  5458. absA : uint64;
  5459. shiftCount: int8;
  5460. Begin
  5461. if ( a = 0 ) then
  5462. begin
  5463. int64_to_float32.float32 := 0;
  5464. exit;
  5465. end;
  5466. if a < 0 then
  5467. zSign := flag(TRUE)
  5468. else
  5469. zSign := flag(FALSE);
  5470. if zSign<>0 then
  5471. absA := -a
  5472. else
  5473. absA := a;
  5474. shiftCount := countLeadingZeros64( absA ) - 40;
  5475. if ( 0 <= shiftCount ) then
  5476. begin
  5477. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5478. end
  5479. else
  5480. begin
  5481. shiftCount := shiftCount + 7;
  5482. if ( shiftCount < 0 ) then
  5483. shift64RightJamming( absA, - shiftCount, absA )
  5484. else
  5485. absA := absA shl shiftCount;
  5486. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5487. end;
  5488. End;
  5489. {*----------------------------------------------------------------------------
  5490. | Returns the result of converting the 64-bit two's complement integer `a'
  5491. | to the single-precision floating-point format. The conversion is performed
  5492. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5493. | Unisgned version.
  5494. *----------------------------------------------------------------------------*}
  5495. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5496. var
  5497. absA : uint64;
  5498. shiftCount: int8;
  5499. Begin
  5500. if ( a = 0 ) then
  5501. begin
  5502. qword_to_float32.float32 := 0;
  5503. exit;
  5504. end;
  5505. absA := a;
  5506. shiftCount := countLeadingZeros64( absA ) - 40;
  5507. if ( 0 <= shiftCount ) then
  5508. begin
  5509. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5510. end
  5511. else
  5512. begin
  5513. shiftCount := shiftCount + 7;
  5514. if ( shiftCount < 0 ) then
  5515. shift64RightJamming( absA, - shiftCount, absA )
  5516. else
  5517. absA := absA shl shiftCount;
  5518. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5519. end;
  5520. End;
  5521. {*----------------------------------------------------------------------------
  5522. | Returns the result of converting the 64-bit two's complement integer `a'
  5523. | to the double-precision floating-point format. The conversion is performed
  5524. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5525. *----------------------------------------------------------------------------*}
  5526. function qword_to_float64( a: qword ): float64;
  5527. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5528. var
  5529. shiftCount: int8;
  5530. Begin
  5531. if ( a = 0 ) then
  5532. result := packFloat64( 0, 0, 0 )
  5533. else
  5534. begin
  5535. shiftCount := countLeadingZeros64(a) - 1;
  5536. { numbers with <= 53 significant bits are converted exactly }
  5537. if (shiftCount > 9) then
  5538. result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
  5539. else if (shiftCount>=0) then
  5540. result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
  5541. else
  5542. begin
  5543. { the only possible negative value is -1, in case bit 63 of 'a' is set }
  5544. shift64RightJamming(a, 1, a);
  5545. result := roundAndPackFloat64(0, $43d, a);
  5546. end;
  5547. end;
  5548. End;
  5549. {*----------------------------------------------------------------------------
  5550. | Returns the result of converting the 64-bit two's complement integer `a'
  5551. | to the double-precision floating-point format. The conversion is performed
  5552. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5553. *----------------------------------------------------------------------------*}
  5554. function int64_to_float64( a: int64 ): float64;
  5555. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5556. Begin
  5557. if ( a = 0 ) then
  5558. result := packFloat64( 0, 0, 0 )
  5559. else if (a = int64($8000000000000000)) then
  5560. result := packFloat64( 1, $43e, 0 )
  5561. else if (a < 0) then
  5562. result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  5563. else
  5564. result := normalizeRoundAndPackFloat64( 0, $43c, a );
  5565. End;
  5566. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5567. {*----------------------------------------------------------------------------
  5568. | Returns the result of converting the 64-bit two's complement integer `a'
  5569. | to the extended double-precision floating-point format. The conversion
  5570. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5571. | Arithmetic.
  5572. *----------------------------------------------------------------------------*}
  5573. function int64_to_floatx80( a: int64 ): floatx80;
  5574. var
  5575. zSign: flag;
  5576. absA: uint64;
  5577. shiftCount: int8;
  5578. begin
  5579. if ( a = 0 ) then begin
  5580. result := packFloatx80( 0, 0, 0 );
  5581. exit;
  5582. end;
  5583. zSign := ord( a < 0 );
  5584. if zSign <> 0 then absA := - a else absA := a;
  5585. shiftCount := countLeadingZeros64( absA );
  5586. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5587. end;
  5588. {*----------------------------------------------------------------------------
  5589. | Returns the result of converting the 64-bit two's complement integer `a'
  5590. | to the extended double-precision floating-point format. The conversion
  5591. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5592. | Arithmetic.
  5593. | Unsigned version.
  5594. *----------------------------------------------------------------------------*}
  5595. function qword_to_floatx80( a: qword ): floatx80;
  5596. var
  5597. absA: bits64;
  5598. shiftCount: int8;
  5599. begin
  5600. if ( a = 0 ) then begin
  5601. result := packFloatx80( 0, 0, 0 );
  5602. exit;
  5603. end;
  5604. absA := a;
  5605. shiftCount := countLeadingZeros64( absA );
  5606. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5607. end;
  5608. {$endif FPC_SOFTFLOAT_FLOATX80}
  5609. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5610. {*----------------------------------------------------------------------------
  5611. | Returns the result of converting the 64-bit two's complement integer `a' to
  5612. | the quadruple-precision floating-point format. The conversion is performed
  5613. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5614. *----------------------------------------------------------------------------*}
  5615. function int64_to_float128( a: int64 ): float128;
  5616. var
  5617. zSign: flag;
  5618. absA: uint64;
  5619. shiftCount: int8;
  5620. zExp: int32;
  5621. zSig0, zSig1: bits64;
  5622. begin
  5623. if ( a = 0 ) then begin
  5624. result := packFloat128( 0, 0, 0, 0 );
  5625. exit;
  5626. end;
  5627. zSign := ord( a < 0 );
  5628. if zSign <> 0 then absA := - a else absA := a;
  5629. shiftCount := countLeadingZeros64( absA ) + 49;
  5630. zExp := $406E - shiftCount;
  5631. if ( 64 <= shiftCount ) then begin
  5632. zSig1 := 0;
  5633. zSig0 := absA;
  5634. dec( shiftCount, 64 );
  5635. end
  5636. else begin
  5637. zSig1 := absA;
  5638. zSig0 := 0;
  5639. end;
  5640. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5641. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5642. end;
  5643. {*----------------------------------------------------------------------------
  5644. | Returns the result of converting the 64-bit two's complement integer `a' to
  5645. | the quadruple-precision floating-point format. The conversion is performed
  5646. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5647. | Unsigned version.
  5648. *----------------------------------------------------------------------------*}
  5649. function qword_to_float128( a: qword ): float128;
  5650. var
  5651. absA: bits64;
  5652. shiftCount: int8;
  5653. zExp: int32;
  5654. zSig0, zSig1: bits64;
  5655. begin
  5656. if ( a = 0 ) then begin
  5657. result := packFloat128( 0, 0, 0, 0 );
  5658. exit;
  5659. end;
  5660. absA := a;
  5661. shiftCount := countLeadingZeros64( absA ) + 49;
  5662. zExp := $406E - shiftCount;
  5663. if ( 64 <= shiftCount ) then begin
  5664. zSig1 := 0;
  5665. zSig0 := absA;
  5666. dec( shiftCount, 64 );
  5667. end
  5668. else begin
  5669. zSig1 := absA;
  5670. zSig0 := 0;
  5671. end;
  5672. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5673. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5674. end;
  5675. {$endif FPC_SOFTFLOAT_FLOAT128}
  5676. {*----------------------------------------------------------------------------
  5677. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5678. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5679. | Otherwise, returns 0.
  5680. *----------------------------------------------------------------------------*}
  5681. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5682. begin
  5683. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5684. end;
  5685. {*----------------------------------------------------------------------------
  5686. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5687. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5688. | Otherwise, returns 0.
  5689. *----------------------------------------------------------------------------*}
  5690. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5691. begin
  5692. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5693. end;
  5694. {*----------------------------------------------------------------------------
  5695. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5696. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5697. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5698. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5699. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5700. | the most-significant bit of the extra result, and the other 63 bits of the
  5701. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5702. | were all zero. This extra result is stored in the location pointed to by
  5703. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5704. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5705. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5706. | fixed-point value is shifted right by the number of bits given in `count',
  5707. | and the integer part of the result is returned at the locations pointed to
  5708. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5709. | corrupted as described above, and is returned at the location pointed to by
  5710. | `z2Ptr'.)
  5711. *----------------------------------------------------------------------------*}
  5712. procedure shift128ExtraRightJamming(
  5713. a0: bits64;
  5714. a1: bits64;
  5715. a2: bits64;
  5716. count: int16;
  5717. var z0Ptr: bits64;
  5718. var z1Ptr: bits64;
  5719. var z2Ptr: bits64);
  5720. var
  5721. z0, z1, z2: bits64;
  5722. negCount: int8;
  5723. begin
  5724. negCount := ( - count ) and 63;
  5725. if ( count = 0 ) then
  5726. begin
  5727. z2 := a2;
  5728. z1 := a1;
  5729. z0 := a0;
  5730. end
  5731. else begin
  5732. if ( count < 64 ) then
  5733. begin
  5734. z2 := a1 shl negCount;
  5735. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5736. z0 := a0 shr count;
  5737. end
  5738. else begin
  5739. if ( count = 64 ) then
  5740. begin
  5741. z2 := a1;
  5742. z1 := a0;
  5743. end
  5744. else begin
  5745. a2 := a2 or a1;
  5746. if ( count < 128 ) then
  5747. begin
  5748. z2 := a0 shl negCount;
  5749. z1 := a0 shr ( count and 63 );
  5750. end
  5751. else begin
  5752. if ( count = 128 ) then
  5753. z2 := a0
  5754. else
  5755. z2 := ord( a0 <> 0 );
  5756. z1 := 0;
  5757. end;
  5758. end;
  5759. z0 := 0;
  5760. end;
  5761. z2 := z2 or ord( a2 <> 0 );
  5762. end;
  5763. z2Ptr := z2;
  5764. z1Ptr := z1;
  5765. z0Ptr := z0;
  5766. end;
  5767. {*----------------------------------------------------------------------------
  5768. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5769. | _plus_ the number of bits given in `count'. The shifted result is at most
  5770. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5771. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5772. | shifted off is the most-significant bit of the extra result, and the other
  5773. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5774. | bits shifted off were all zero. This extra result is stored in the location
  5775. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5776. | (This routine makes more sense if `a0' and `a1' are considered to form
  5777. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5778. | point value is shifted right by the number of bits given in `count', and
  5779. | the integer part of the result is returned at the location pointed to by
  5780. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5781. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5782. *----------------------------------------------------------------------------*}
  5783. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5784. var
  5785. z0, z1: bits64;
  5786. negCount: int8;
  5787. begin
  5788. negCount := ( - count ) and 63;
  5789. if ( count = 0 ) then
  5790. begin
  5791. z1 := a1;
  5792. z0 := a0;
  5793. end
  5794. else if ( count < 64 ) then
  5795. begin
  5796. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5797. z0 := a0 shr count;
  5798. end
  5799. else begin
  5800. if ( count = 64 ) then
  5801. begin
  5802. z1 := a0 or ord( a1 <> 0 );
  5803. end
  5804. else begin
  5805. z1 := ord( ( a0 or a1 ) <> 0 );
  5806. end;
  5807. z0 := 0;
  5808. end;
  5809. z1Ptr := z1;
  5810. z0Ptr := z0;
  5811. end;
  5812. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5813. {*----------------------------------------------------------------------------
  5814. | Returns the fraction bits of the extended double-precision floating-point
  5815. | value `a'.
  5816. *----------------------------------------------------------------------------*}
  5817. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5818. begin
  5819. result:=a.low;
  5820. end;
  5821. {*----------------------------------------------------------------------------
  5822. | Returns the exponent bits of the extended double-precision floating-point
  5823. | value `a'.
  5824. *----------------------------------------------------------------------------*}
  5825. function extractFloatx80Exp(a : floatx80): int32;inline;
  5826. begin
  5827. result:=a.high and $7FFF;
  5828. end;
  5829. {*----------------------------------------------------------------------------
  5830. | Returns the sign bit of the extended double-precision floating-point value
  5831. | `a'.
  5832. *----------------------------------------------------------------------------*}
  5833. function extractFloatx80Sign(a : floatx80): flag;inline;
  5834. begin
  5835. result:=a.high shr 15;
  5836. end;
  5837. {*----------------------------------------------------------------------------
  5838. | Normalizes the subnormal extended double-precision floating-point value
  5839. | represented by the denormalized significand `aSig'. The normalized exponent
  5840. | and significand are stored at the locations pointed to by `zExpPtr' and
  5841. | `zSigPtr', respectively.
  5842. *----------------------------------------------------------------------------*}
  5843. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5844. var
  5845. shiftCount: int8;
  5846. begin
  5847. shiftCount := countLeadingZeros64( aSig );
  5848. zSigPtr := aSig shl shiftCount;
  5849. zExpPtr := 1 - shiftCount;
  5850. end;
  5851. {*----------------------------------------------------------------------------
  5852. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5853. | extended double-precision floating-point value, returning the result.
  5854. *----------------------------------------------------------------------------*}
  5855. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5856. var
  5857. z: floatx80;
  5858. begin
  5859. z.low := zSig;
  5860. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5861. result:=z;
  5862. end;
  5863. {*----------------------------------------------------------------------------
  5864. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5865. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5866. | and returns the proper extended double-precision floating-point value
  5867. | corresponding to the abstract input. Ordinarily, the abstract value is
  5868. | rounded and packed into the extended double-precision format, with the
  5869. | inexact exception raised if the abstract input cannot be represented
  5870. | exactly. However, if the abstract value is too large, the overflow and
  5871. | inexact exceptions are raised and an infinity or maximal finite value is
  5872. | returned. If the abstract value is too small, the input value is rounded to
  5873. | a subnormal number, and the underflow and inexact exceptions are raised if
  5874. | the abstract input cannot be represented exactly as a subnormal extended
  5875. | double-precision floating-point number.
  5876. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5877. | number of bits as single or double precision, respectively. Otherwise, the
  5878. | result is rounded to the full precision of the extended double-precision
  5879. | format.
  5880. | The input significand must be normalized or smaller. If the input
  5881. | significand is not normalized, `zExp' must be 0; in that case, the result
  5882. | returned is a subnormal number, and it must not require rounding. The
  5883. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5884. | Floating-Point Arithmetic.
  5885. *----------------------------------------------------------------------------*}
  5886. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5887. var
  5888. roundingMode: int8;
  5889. roundNearestEven, increment, isTiny: flag;
  5890. roundIncrement, roundMask, roundBits: int64;
  5891. label
  5892. precision80, overflow;
  5893. begin
  5894. roundingMode := softfloat_rounding_mode;
  5895. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5896. if ( roundingPrecision = 80 ) then
  5897. goto precision80;
  5898. if ( roundingPrecision = 64 ) then
  5899. begin
  5900. roundIncrement := int64( $0000000000000400 );
  5901. roundMask := int64( $00000000000007FF );
  5902. end
  5903. else if ( roundingPrecision = 32 ) then
  5904. begin
  5905. roundIncrement := int64( $0000008000000000 );
  5906. roundMask := int64( $000000FFFFFFFFFF );
  5907. end
  5908. else begin
  5909. goto precision80;
  5910. end;
  5911. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5912. if ( not (roundNearestEven<>0) ) then
  5913. begin
  5914. if ( roundingMode = float_round_to_zero ) then
  5915. begin
  5916. roundIncrement := 0;
  5917. end
  5918. else begin
  5919. roundIncrement := roundMask;
  5920. if ( zSign<>0 ) then
  5921. begin
  5922. if ( roundingMode = float_round_up ) then
  5923. roundIncrement := 0;
  5924. end
  5925. else begin
  5926. if ( roundingMode = float_round_down ) then
  5927. roundIncrement := 0;
  5928. end;
  5929. end;
  5930. end;
  5931. roundBits := zSig0 and roundMask;
  5932. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  5933. if ( ( $7FFE < zExp )
  5934. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5935. ) then begin
  5936. goto overflow;
  5937. end;
  5938. if ( zExp <= 0 ) then begin
  5939. isTiny := ord (
  5940. ( softfloat_detect_tininess = float_tininess_before_rounding )
  5941. or ( zExp < 0 )
  5942. or ( zSig0 <= zSig0 + roundIncrement ) );
  5943. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5944. zExp := 0;
  5945. roundBits := zSig0 and roundMask;
  5946. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  5947. if ( roundBits <> 0 ) then set_inexact_flag;
  5948. inc( zSig0, roundIncrement );
  5949. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  5950. roundIncrement := roundMask + 1;
  5951. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  5952. roundMask := roundMask or roundIncrement;
  5953. end;
  5954. zSig0 := zSig0 and not roundMask;
  5955. result:=packFloatx80( zSign, zExp, zSig0 );
  5956. exit;
  5957. end;
  5958. end;
  5959. if ( roundBits <> 0 ) then set_inexact_flag;
  5960. inc( zSig0, roundIncrement );
  5961. if ( zSig0 < roundIncrement ) then begin
  5962. inc(zExp);
  5963. zSig0 := bits64( $8000000000000000 );
  5964. end;
  5965. roundIncrement := roundMask + 1;
  5966. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  5967. roundMask := roundMask or roundIncrement;
  5968. end;
  5969. zSig0 := zSig0 and not roundMask;
  5970. if ( zSig0 = 0 ) then zExp := 0;
  5971. result:=packFloatx80( zSign, zExp, zSig0 );
  5972. exit;
  5973. precision80:
  5974. increment := ord ( sbits64( zSig1 ) < 0 );
  5975. if ( roundNearestEven = 0 ) then begin
  5976. if ( roundingMode = float_round_to_zero ) then begin
  5977. increment := 0;
  5978. end
  5979. else begin
  5980. if ( zSign <> 0 ) then begin
  5981. increment := ord ( roundingMode = float_round_down ) and zSig1;
  5982. end
  5983. else begin
  5984. increment := ord ( roundingMode = float_round_up ) and zSig1;
  5985. end;
  5986. end;
  5987. end;
  5988. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  5989. if ( ( $7FFE < zExp )
  5990. or ( ( zExp = $7FFE )
  5991. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  5992. and ( increment <> 0 )
  5993. )
  5994. ) then begin
  5995. roundMask := 0;
  5996. overflow:
  5997. float_raise( [float_flag_overflow,float_flag_inexact] );
  5998. if ( ( roundingMode = float_round_to_zero )
  5999. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6000. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6001. ) then begin
  6002. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6003. exit;
  6004. end;
  6005. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6006. exit;
  6007. end;
  6008. if ( zExp <= 0 ) then begin
  6009. isTiny := ord(
  6010. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6011. or ( zExp < 0 )
  6012. or ( increment = 0 )
  6013. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6014. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6015. zExp := 0;
  6016. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6017. if ( zSig1 <> 0 ) then set_inexact_flag;
  6018. if ( roundNearestEven <> 0 ) then begin
  6019. increment := ord( sbits64( zSig1 ) < 0 );
  6020. end
  6021. else begin
  6022. if ( zSign <> 0 ) then begin
  6023. increment := ord( roundingMode = float_round_down ) and zSig1;
  6024. end
  6025. else begin
  6026. increment := ord( roundingMode = float_round_up ) and zSig1;
  6027. end;
  6028. end;
  6029. if ( increment <> 0 ) then begin
  6030. inc(zSig0);
  6031. zSig0 :=
  6032. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6033. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6034. end;
  6035. result:=packFloatx80( zSign, zExp, zSig0 );
  6036. exit;
  6037. end;
  6038. end;
  6039. if ( zSig1 <> 0 ) then set_inexact_flag;
  6040. if ( increment <> 0 ) then begin
  6041. inc(zSig0);
  6042. if ( zSig0 = 0 ) then begin
  6043. inc(zExp);
  6044. zSig0 := bits64( $8000000000000000 );
  6045. end
  6046. else begin
  6047. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6048. end;
  6049. end
  6050. else begin
  6051. if ( zSig0 = 0 ) then zExp := 0;
  6052. end;
  6053. result:=packFloatx80( zSign, zExp, zSig0 );
  6054. end;
  6055. {*----------------------------------------------------------------------------
  6056. | Takes an abstract floating-point value having sign `zSign', exponent
  6057. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6058. | and returns the proper extended double-precision floating-point value
  6059. | corresponding to the abstract input. This routine is just like
  6060. | `roundAndPackFloatx80' except that the input significand does not have to be
  6061. | normalized.
  6062. *----------------------------------------------------------------------------*}
  6063. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6064. var
  6065. shiftCount: int8;
  6066. begin
  6067. if ( zSig0 = 0 ) then begin
  6068. zSig0 := zSig1;
  6069. zSig1 := 0;
  6070. dec( zExp, 64 );
  6071. end;
  6072. shiftCount := countLeadingZeros64( zSig0 );
  6073. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6074. zExp := zExp - shiftCount;
  6075. result :=
  6076. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6077. end;
  6078. {*----------------------------------------------------------------------------
  6079. | Returns the result of converting the extended double-precision floating-
  6080. | point value `a' to the 32-bit two's complement integer format. The
  6081. | conversion is performed according to the IEC/IEEE Standard for Binary
  6082. | Floating-Point Arithmetic---which means in particular that the conversion
  6083. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6084. | largest positive integer is returned. Otherwise, if the conversion
  6085. | overflows, the largest integer with the same sign as `a' is returned.
  6086. *----------------------------------------------------------------------------*}
  6087. function floatx80_to_int32(a: floatx80): int32;
  6088. var
  6089. aSign: flag;
  6090. aExp, shiftCount: int32;
  6091. aSig: bits64;
  6092. begin
  6093. aSig := extractFloatx80Frac( a );
  6094. aExp := extractFloatx80Exp( a );
  6095. aSign := extractFloatx80Sign( a );
  6096. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6097. shiftCount := $4037 - aExp;
  6098. if ( shiftCount <= 0 ) then shiftCount := 1;
  6099. shift64RightJamming( aSig, shiftCount, aSig );
  6100. result := roundAndPackInt32( aSign, aSig );
  6101. end;
  6102. {*----------------------------------------------------------------------------
  6103. | Returns the result of converting the extended double-precision floating-
  6104. | point value `a' to the 32-bit two's complement integer format. The
  6105. | conversion is performed according to the IEC/IEEE Standard for Binary
  6106. | Floating-Point Arithmetic, except that the conversion is always rounded
  6107. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6108. | Otherwise, if the conversion overflows, the largest integer with the same
  6109. | sign as `a' is returned.
  6110. *----------------------------------------------------------------------------*}
  6111. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6112. var
  6113. aSign: flag;
  6114. aExp, shiftCount: int32;
  6115. aSig, savedASig: bits64;
  6116. z: int32;
  6117. label
  6118. invalid;
  6119. begin
  6120. aSig := extractFloatx80Frac( a );
  6121. aExp := extractFloatx80Exp( a );
  6122. aSign := extractFloatx80Sign( a );
  6123. if ( $401E < aExp ) then begin
  6124. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6125. goto invalid;
  6126. end
  6127. else if ( aExp < $3FFF ) then begin
  6128. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6129. result := 0;
  6130. exit;
  6131. end;
  6132. shiftCount := $403E - aExp;
  6133. savedASig := aSig;
  6134. aSig := aSig shr shiftCount;
  6135. z := aSig;
  6136. if ( aSign <> 0 ) then z := - z;
  6137. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6138. invalid:
  6139. float_raise( float_flag_invalid );
  6140. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6141. exit;
  6142. end;
  6143. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6144. set_inexact_flag;
  6145. end;
  6146. result := z;
  6147. end;
  6148. {*----------------------------------------------------------------------------
  6149. | Returns the result of converting the extended double-precision floating-
  6150. | point value `a' to the 64-bit two's complement integer format. The
  6151. | conversion is performed according to the IEC/IEEE Standard for Binary
  6152. | Floating-Point Arithmetic---which means in particular that the conversion
  6153. | is rounded according to the current rounding mode. If `a' is a NaN,
  6154. | the largest positive integer is returned. Otherwise, if the conversion
  6155. | overflows, the largest integer with the same sign as `a' is returned.
  6156. *----------------------------------------------------------------------------*}
  6157. function floatx80_to_int64(a: floatx80): int64;
  6158. var
  6159. aSign: flag;
  6160. aExp, shiftCount: int32;
  6161. aSig, aSigExtra: bits64;
  6162. begin
  6163. aSig := extractFloatx80Frac( a );
  6164. aExp := extractFloatx80Exp( a );
  6165. aSign := extractFloatx80Sign( a );
  6166. shiftCount := $403E - aExp;
  6167. if ( shiftCount <= 0 ) then begin
  6168. if ( shiftCount <> 0 ) then begin
  6169. float_raise( float_flag_invalid );
  6170. if ( ( aSign = 0 )
  6171. or ( ( aExp = $7FFF )
  6172. and ( aSig <> bits64( $8000000000000000 ) ) )
  6173. ) then begin
  6174. result := $7FFFFFFFFFFFFFFF;
  6175. exit;
  6176. end;
  6177. result := $8000000000000000;
  6178. exit;
  6179. end;
  6180. aSigExtra := 0;
  6181. end
  6182. else begin
  6183. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6184. end;
  6185. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6186. end;
  6187. {*----------------------------------------------------------------------------
  6188. | Returns the result of converting the extended double-precision floating-
  6189. | point value `a' to the 64-bit two's complement integer format. The
  6190. | conversion is performed according to the IEC/IEEE Standard for Binary
  6191. | Floating-Point Arithmetic, except that the conversion is always rounded
  6192. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6193. | Otherwise, if the conversion overflows, the largest integer with the same
  6194. | sign as `a' is returned.
  6195. *----------------------------------------------------------------------------*}
  6196. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6197. var
  6198. aSign: flag;
  6199. aExp, shiftCount: int32;
  6200. aSig: bits64;
  6201. z: int64;
  6202. begin
  6203. aSig := extractFloatx80Frac( a );
  6204. aExp := extractFloatx80Exp( a );
  6205. aSign := extractFloatx80Sign( a );
  6206. shiftCount := aExp - $403E;
  6207. if ( 0 <= shiftCount ) then begin
  6208. aSig := $7FFFFFFFFFFFFFFF;
  6209. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6210. float_raise( float_flag_invalid );
  6211. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6212. result := $7FFFFFFFFFFFFFFF;
  6213. exit;
  6214. end;
  6215. end;
  6216. result := $8000000000000000;
  6217. exit;
  6218. end
  6219. else if ( aExp < $3FFF ) then begin
  6220. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6221. result := 0;
  6222. exit;
  6223. end;
  6224. z := aSig shr ( - shiftCount );
  6225. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6226. set_inexact_flag;
  6227. end;
  6228. if ( aSign <> 0 ) then z := - z;
  6229. result := z;
  6230. end;
  6231. {*----------------------------------------------------------------------------
  6232. | The pattern for a default generated extended double-precision NaN. The
  6233. | `high' and `low' values hold the most- and least-significant bits,
  6234. | respectively.
  6235. *----------------------------------------------------------------------------*}
  6236. const
  6237. floatx80_default_nan_high = $FFFF;
  6238. floatx80_default_nan_low = bits64( $C000000000000000 );
  6239. {*----------------------------------------------------------------------------
  6240. | Returns 1 if the extended double-precision floating-point value `a' is a
  6241. | signaling NaN; otherwise returns 0.
  6242. *----------------------------------------------------------------------------*}
  6243. function floatx80_is_signaling_nan(a : floatx80): flag;
  6244. var
  6245. aLow: bits64;
  6246. begin
  6247. aLow := a.low and not $4000000000000000;
  6248. result := ord(
  6249. ( a.high and $7FFF = $7FFF )
  6250. and ( bits64( aLow shl 1 ) <> 0 )
  6251. and ( a.low = aLow ) );
  6252. end;
  6253. {*----------------------------------------------------------------------------
  6254. | Returns the result of converting the extended double-precision floating-
  6255. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6256. | invalid exception is raised.
  6257. *----------------------------------------------------------------------------*}
  6258. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6259. var
  6260. z: commonNaNT;
  6261. begin
  6262. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6263. z.sign := a.high shr 15;
  6264. z.low := 0;
  6265. z.high := a.low shl 1;
  6266. result := z;
  6267. end;
  6268. {*----------------------------------------------------------------------------
  6269. | Returns 1 if the extended double-precision floating-point value `a' is a
  6270. | NaN; otherwise returns 0.
  6271. *----------------------------------------------------------------------------*}
  6272. function floatx80_is_nan(a : floatx80 ): flag;
  6273. begin
  6274. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6275. end;
  6276. {*----------------------------------------------------------------------------
  6277. | Takes two extended double-precision floating-point values `a' and `b', one
  6278. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6279. | `b' is a signaling NaN, the invalid exception is raised.
  6280. *----------------------------------------------------------------------------*}
  6281. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6282. var
  6283. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6284. label
  6285. returnLargerSignificand;
  6286. begin
  6287. aIsNaN := floatx80_is_nan( a );
  6288. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6289. bIsNaN := floatx80_is_nan( b );
  6290. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6291. a.low := a.low or $C000000000000000;
  6292. b.low := b.low or $C000000000000000;
  6293. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6294. if aIsSignalingNaN <> 0 then begin
  6295. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6296. if bIsNaN <> 0 then result := b else result := a;
  6297. exit;
  6298. end
  6299. else if aIsNaN <>0 then begin
  6300. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6301. result := a;
  6302. exit;
  6303. end;
  6304. returnLargerSignificand:
  6305. if ( a.low < b.low ) then begin
  6306. result := b;
  6307. exit;
  6308. end;
  6309. if ( b.low < a.low ) then begin
  6310. result := a;
  6311. exit;
  6312. end;
  6313. if a.high < b.high then result := a else result := b;
  6314. exit;
  6315. end
  6316. else
  6317. result := b;
  6318. end;
  6319. {*----------------------------------------------------------------------------
  6320. | Returns the result of converting the extended double-precision floating-
  6321. | point value `a' to the single-precision floating-point format. The
  6322. | conversion is performed according to the IEC/IEEE Standard for Binary
  6323. | Floating-Point Arithmetic.
  6324. *----------------------------------------------------------------------------*}
  6325. function floatx80_to_float32(a: floatx80): float32;
  6326. var
  6327. aSign: flag;
  6328. aExp: int32;
  6329. aSig: bits64;
  6330. begin
  6331. aSig := extractFloatx80Frac( a );
  6332. aExp := extractFloatx80Exp( a );
  6333. aSign := extractFloatx80Sign( a );
  6334. if ( aExp = $7FFF ) then begin
  6335. if bits64( aSig shl 1 ) <> 0 then begin
  6336. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6337. exit;
  6338. end;
  6339. result := packFloat32( aSign, $FF, 0 );
  6340. exit;
  6341. end;
  6342. shift64RightJamming( aSig, 33, aSig );
  6343. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6344. result := roundAndPackFloat32( aSign, aExp, aSig );
  6345. end;
  6346. {*----------------------------------------------------------------------------
  6347. | Returns the result of converting the extended double-precision floating-
  6348. | point value `a' to the double-precision floating-point format. The
  6349. | conversion is performed according to the IEC/IEEE Standard for Binary
  6350. | Floating-Point Arithmetic.
  6351. *----------------------------------------------------------------------------*}
  6352. function floatx80_to_float64(a: floatx80): float64;
  6353. var
  6354. aSign: flag;
  6355. aExp: int32;
  6356. aSig, zSig: bits64;
  6357. begin
  6358. aSig := extractFloatx80Frac( a );
  6359. aExp := extractFloatx80Exp( a );
  6360. aSign := extractFloatx80Sign( a );
  6361. if ( aExp = $7FFF ) then begin
  6362. if bits64( aSig shl 1 ) <> 0 then begin
  6363. commonNaNToFloat64( floatx80ToCommonNaN( a ), result );
  6364. exit;
  6365. end;
  6366. result := packFloat64( aSign, $7FF, 0 );
  6367. exit;
  6368. end;
  6369. shift64RightJamming( aSig, 1, zSig );
  6370. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6371. result := roundAndPackFloat64( aSign, aExp, zSig );
  6372. end;
  6373. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6374. {*----------------------------------------------------------------------------
  6375. | Returns the result of converting the extended double-precision floating-
  6376. | point value `a' to the quadruple-precision floating-point format. The
  6377. | conversion is performed according to the IEC/IEEE Standard for Binary
  6378. | Floating-Point Arithmetic.
  6379. *----------------------------------------------------------------------------*}
  6380. function floatx80_to_float128(a: floatx80): float128;
  6381. var
  6382. aSign: flag;
  6383. aExp: int16;
  6384. aSig, zSig0, zSig1: bits64;
  6385. begin
  6386. aSig := extractFloatx80Frac( a );
  6387. aExp := extractFloatx80Exp( a );
  6388. aSign := extractFloatx80Sign( a );
  6389. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6390. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6391. exit;
  6392. end;
  6393. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6394. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6395. end;
  6396. {$endif FPC_SOFTFLOAT_FLOAT128}
  6397. {*----------------------------------------------------------------------------
  6398. | Rounds the extended double-precision floating-point value `a' to an integer,
  6399. | and Returns the result as an extended quadruple-precision floating-point
  6400. | value. The operation is performed according to the IEC/IEEE Standard for
  6401. | Binary Floating-Point Arithmetic.
  6402. *----------------------------------------------------------------------------*}
  6403. function floatx80_round_to_int(a: floatx80): floatx80;
  6404. var
  6405. aSign: flag;
  6406. aExp: int32;
  6407. lastBitMask, roundBitsMask: bits64;
  6408. roundingMode: int8;
  6409. z: floatx80;
  6410. begin
  6411. aExp := extractFloatx80Exp( a );
  6412. if ( $403E <= aExp ) then begin
  6413. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6414. result := propagateFloatx80NaN( a, a );
  6415. exit;
  6416. end;
  6417. result := a;
  6418. exit;
  6419. end;
  6420. if ( aExp < $3FFF ) then begin
  6421. if ( ( aExp = 0 )
  6422. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6423. result := a;
  6424. exit;
  6425. end;
  6426. set_inexact_flag;
  6427. aSign := extractFloatx80Sign( a );
  6428. case softfloat_rounding_mode of
  6429. float_round_nearest_even:
  6430. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6431. ) then begin
  6432. result :=
  6433. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6434. exit;
  6435. end;
  6436. float_round_down: begin
  6437. if aSign <> 0 then
  6438. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6439. else
  6440. result := packFloatx80( 0, 0, 0 );
  6441. exit;
  6442. end;
  6443. float_round_up: begin
  6444. if aSign <> 0 then
  6445. result := packFloatx80( 1, 0, 0 )
  6446. else
  6447. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6448. exit;
  6449. end;
  6450. end;
  6451. result := packFloatx80( aSign, 0, 0 );
  6452. exit;
  6453. end;
  6454. lastBitMask := 1;
  6455. lastBitMask := lastBitMask shl ( $403E - aExp );
  6456. roundBitsMask := lastBitMask - 1;
  6457. z := a;
  6458. roundingMode := softfloat_rounding_mode;
  6459. if ( roundingMode = float_round_nearest_even ) then begin
  6460. inc( z.low, lastBitMask shr 1 );
  6461. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6462. end
  6463. else if ( roundingMode <> float_round_to_zero ) then begin
  6464. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6465. inc( z.low, roundBitsMask );
  6466. end;
  6467. end;
  6468. z.low := z.low and not roundBitsMask;
  6469. if ( z.low = 0 ) then begin
  6470. inc(z.high);
  6471. z.low := bits64( $8000000000000000 );
  6472. end;
  6473. if ( z.low <> a.low ) then set_inexact_flag;
  6474. result := z;
  6475. end;
  6476. {*----------------------------------------------------------------------------
  6477. | Returns the result of adding the absolute values of the extended double-
  6478. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6479. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6480. | The addition is performed according to the IEC/IEEE Standard for Binary
  6481. | Floating-Point Arithmetic.
  6482. *----------------------------------------------------------------------------*}
  6483. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6484. var
  6485. aExp, bExp, zExp: int32;
  6486. aSig, bSig, zSig0, zSig1: bits64;
  6487. expDiff: int32;
  6488. label
  6489. shiftRight1, roundAndPack;
  6490. begin
  6491. aSig := extractFloatx80Frac( a );
  6492. aExp := extractFloatx80Exp( a );
  6493. bSig := extractFloatx80Frac( b );
  6494. bExp := extractFloatx80Exp( b );
  6495. expDiff := aExp - bExp;
  6496. if ( 0 < expDiff ) then begin
  6497. if ( aExp = $7FFF ) then begin
  6498. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6499. result := propagateFloatx80NaN( a, b );
  6500. exit;
  6501. end;
  6502. result := a;
  6503. exit;
  6504. end;
  6505. if ( bExp = 0 ) then dec(expDiff);
  6506. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6507. zExp := aExp;
  6508. end
  6509. else if ( expDiff < 0 ) then begin
  6510. if ( bExp = $7FFF ) then begin
  6511. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6512. result := propagateFloatx80NaN( a, b );
  6513. exit;
  6514. end;
  6515. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6516. exit;
  6517. end;
  6518. if ( aExp = 0 ) then inc(expDiff);
  6519. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6520. zExp := bExp;
  6521. end
  6522. else begin
  6523. if ( aExp = $7FFF ) then begin
  6524. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6525. result := propagateFloatx80NaN( a, b );
  6526. exit;
  6527. end;
  6528. result := a;
  6529. exit;
  6530. end;
  6531. zSig1 := 0;
  6532. zSig0 := aSig + bSig;
  6533. if ( aExp = 0 ) then begin
  6534. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6535. goto roundAndPack;
  6536. end;
  6537. zExp := aExp;
  6538. goto shiftRight1;
  6539. end;
  6540. zSig0 := aSig + bSig;
  6541. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6542. shiftRight1:
  6543. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6544. zSig0 := zSig0 or $8000000000000000;
  6545. inc(zExp);
  6546. roundAndPack:
  6547. result :=
  6548. roundAndPackFloatx80(
  6549. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6550. end;
  6551. {*----------------------------------------------------------------------------
  6552. | Returns the result of subtracting the absolute values of the extended
  6553. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6554. | difference is negated before being returned. `zSign' is ignored if the
  6555. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6556. | Standard for Binary Floating-Point Arithmetic.
  6557. *----------------------------------------------------------------------------*}
  6558. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6559. var
  6560. aExp, bExp, zExp: int32;
  6561. aSig, bSig, zSig0, zSig1: bits64;
  6562. expDiff: int32;
  6563. z: floatx80;
  6564. label
  6565. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6566. begin
  6567. aSig := extractFloatx80Frac( a );
  6568. aExp := extractFloatx80Exp( a );
  6569. bSig := extractFloatx80Frac( b );
  6570. bExp := extractFloatx80Exp( b );
  6571. expDiff := aExp - bExp;
  6572. if ( 0 < expDiff ) then goto aExpBigger;
  6573. if ( expDiff < 0 ) then goto bExpBigger;
  6574. if ( aExp = $7FFF ) then begin
  6575. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6576. result := propagateFloatx80NaN( a, b );
  6577. exit;
  6578. end;
  6579. float_raise( float_flag_invalid );
  6580. z.low := floatx80_default_nan_low;
  6581. z.high := floatx80_default_nan_high;
  6582. result := z;
  6583. exit;
  6584. end;
  6585. if ( aExp = 0 ) then begin
  6586. aExp := 1;
  6587. bExp := 1;
  6588. end;
  6589. zSig1 := 0;
  6590. if ( bSig < aSig ) then goto aBigger;
  6591. if ( aSig < bSig ) then goto bBigger;
  6592. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6593. exit;
  6594. bExpBigger:
  6595. if ( bExp = $7FFF ) then begin
  6596. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6597. result := propagateFloatx80NaN( a, b );
  6598. exit;
  6599. end;
  6600. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6601. exit;
  6602. end;
  6603. if ( aExp = 0 ) then inc(expDiff);
  6604. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6605. bBigger:
  6606. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6607. zExp := bExp;
  6608. zSign := zSign xor 1;
  6609. goto normalizeRoundAndPack;
  6610. aExpBigger:
  6611. if ( aExp = $7FFF ) then begin
  6612. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6613. result := propagateFloatx80NaN( a, b );
  6614. exit;
  6615. end;
  6616. result := a;
  6617. exit;
  6618. end;
  6619. if ( bExp = 0 ) then dec(expDiff);
  6620. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6621. aBigger:
  6622. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6623. zExp := aExp;
  6624. normalizeRoundAndPack:
  6625. result :=
  6626. normalizeRoundAndPackFloatx80(
  6627. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6628. end;
  6629. {*----------------------------------------------------------------------------
  6630. | Returns the result of adding the extended double-precision floating-point
  6631. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6632. | Standard for Binary Floating-Point Arithmetic.
  6633. *----------------------------------------------------------------------------*}
  6634. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6635. var
  6636. aSign, bSign: flag;
  6637. begin
  6638. aSign := extractFloatx80Sign( a );
  6639. bSign := extractFloatx80Sign( b );
  6640. if ( aSign = bSign ) then begin
  6641. result := addFloatx80Sigs( a, b, aSign );
  6642. end
  6643. else begin
  6644. result := subFloatx80Sigs( a, b, aSign );
  6645. end;
  6646. end;
  6647. {*----------------------------------------------------------------------------
  6648. | Returns the result of subtracting the extended double-precision floating-
  6649. | point values `a' and `b'. The operation is performed according to the
  6650. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6651. *----------------------------------------------------------------------------*}
  6652. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6653. var
  6654. aSign, bSign: flag;
  6655. begin
  6656. aSign := extractFloatx80Sign( a );
  6657. bSign := extractFloatx80Sign( b );
  6658. if ( aSign = bSign ) then begin
  6659. result := subFloatx80Sigs( a, b, aSign );
  6660. end
  6661. else begin
  6662. result := addFloatx80Sigs( a, b, aSign );
  6663. end;
  6664. end;
  6665. {*----------------------------------------------------------------------------
  6666. | Returns the result of multiplying the extended double-precision floating-
  6667. | point values `a' and `b'. The operation is performed according to the
  6668. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6669. *----------------------------------------------------------------------------*}
  6670. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6671. var
  6672. aSign, bSign, zSign: flag;
  6673. aExp, bExp, zExp: int32;
  6674. aSig, bSig, zSig0, zSig1: bits64;
  6675. z: floatx80;
  6676. label
  6677. invalid;
  6678. begin
  6679. aSig := extractFloatx80Frac( a );
  6680. aExp := extractFloatx80Exp( a );
  6681. aSign := extractFloatx80Sign( a );
  6682. bSig := extractFloatx80Frac( b );
  6683. bExp := extractFloatx80Exp( b );
  6684. bSign := extractFloatx80Sign( b );
  6685. zSign := aSign xor bSign;
  6686. if ( aExp = $7FFF ) then begin
  6687. if ( bits64( aSig shl 1 ) <> 0 )
  6688. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6689. result := propagateFloatx80NaN( a, b );
  6690. exit;
  6691. end;
  6692. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6693. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6694. exit;
  6695. end;
  6696. if ( bExp = $7FFF ) then begin
  6697. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6698. result := propagateFloatx80NaN( a, b );
  6699. exit;
  6700. end;
  6701. if ( ( aExp or aSig ) = 0 ) then begin
  6702. invalid:
  6703. float_raise( float_flag_invalid );
  6704. z.low := floatx80_default_nan_low;
  6705. z.high := floatx80_default_nan_high;
  6706. result := z;
  6707. exit;
  6708. end;
  6709. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6710. exit;
  6711. end;
  6712. if ( aExp = 0 ) then begin
  6713. if ( aSig = 0 ) then begin
  6714. result := packFloatx80( zSign, 0, 0 );
  6715. exit;
  6716. end;
  6717. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6718. end;
  6719. if ( bExp = 0 ) then begin
  6720. if ( bSig = 0 ) then begin
  6721. result := packFloatx80( zSign, 0, 0 );
  6722. exit;
  6723. end;
  6724. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6725. end;
  6726. zExp := aExp + bExp - $3FFE;
  6727. mul64To128( aSig, bSig, zSig0, zSig1 );
  6728. if 0 < sbits64( zSig0 ) then begin
  6729. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6730. dec(zExp);
  6731. end;
  6732. result :=
  6733. roundAndPackFloatx80(
  6734. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6735. end;
  6736. {*----------------------------------------------------------------------------
  6737. | Returns the result of dividing the extended double-precision floating-point
  6738. | value `a' by the corresponding value `b'. The operation is performed
  6739. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6740. *----------------------------------------------------------------------------*}
  6741. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6742. var
  6743. aSign, bSign, zSign: flag;
  6744. aExp, bExp, zExp: int32;
  6745. aSig, bSig, zSig0, zSig1: bits64;
  6746. rem0, rem1, rem2, term0, term1, term2: bits64;
  6747. z: floatx80;
  6748. label
  6749. invalid;
  6750. begin
  6751. aSig := extractFloatx80Frac( a );
  6752. aExp := extractFloatx80Exp( a );
  6753. aSign := extractFloatx80Sign( a );
  6754. bSig := extractFloatx80Frac( b );
  6755. bExp := extractFloatx80Exp( b );
  6756. bSign := extractFloatx80Sign( b );
  6757. zSign := aSign xor bSign;
  6758. if ( aExp = $7FFF ) then begin
  6759. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6760. result := propagateFloatx80NaN( a, b );
  6761. exit;
  6762. end;
  6763. if ( bExp = $7FFF ) then begin
  6764. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6765. result := propagateFloatx80NaN( a, b );
  6766. exit;
  6767. end;
  6768. goto invalid;
  6769. end;
  6770. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6771. exit;
  6772. end;
  6773. if ( bExp = $7FFF ) then begin
  6774. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6775. result := propagateFloatx80NaN( a, b );
  6776. exit;
  6777. end;
  6778. result := packFloatx80( zSign, 0, 0 );
  6779. exit;
  6780. end;
  6781. if ( bExp = 0 ) then begin
  6782. if ( bSig = 0 ) then begin
  6783. if ( ( aExp or aSig ) = 0 ) then begin
  6784. invalid:
  6785. float_raise( float_flag_invalid );
  6786. z.low := floatx80_default_nan_low;
  6787. z.high := floatx80_default_nan_high;
  6788. result := z;
  6789. exit;
  6790. end;
  6791. float_raise( float_flag_divbyzero );
  6792. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6793. exit;
  6794. end;
  6795. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6796. end;
  6797. if ( aExp = 0 ) then begin
  6798. if ( aSig = 0 ) then begin
  6799. result := packFloatx80( zSign, 0, 0 );
  6800. exit;
  6801. end;
  6802. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6803. end;
  6804. zExp := aExp - bExp + $3FFE;
  6805. rem1 := 0;
  6806. if ( bSig <= aSig ) then begin
  6807. shift128Right( aSig, 0, 1, aSig, rem1 );
  6808. inc(zExp);
  6809. end;
  6810. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6811. mul64To128( bSig, zSig0, term0, term1 );
  6812. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6813. while ( sbits64( rem0 ) < 0 ) do begin
  6814. dec(zSig0);
  6815. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6816. end;
  6817. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6818. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6819. mul64To128( bSig, zSig1, term1, term2 );
  6820. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6821. while ( sbits64( rem1 ) < 0 ) do begin
  6822. dec(zSig1);
  6823. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6824. end;
  6825. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6826. end;
  6827. result :=
  6828. roundAndPackFloatx80(
  6829. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6830. end;
  6831. {*----------------------------------------------------------------------------
  6832. | Returns the remainder of the extended double-precision floating-point value
  6833. | `a' with respect to the corresponding value `b'. The operation is performed
  6834. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6835. *----------------------------------------------------------------------------*}
  6836. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6837. var
  6838. aSign, zSign: flag;
  6839. aExp, bExp, expDiff: int32;
  6840. aSig0, aSig1, bSig: bits64;
  6841. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6842. z: floatx80;
  6843. label
  6844. invalid;
  6845. begin
  6846. aSig0 := extractFloatx80Frac( a );
  6847. aExp := extractFloatx80Exp( a );
  6848. aSign := extractFloatx80Sign( a );
  6849. bSig := extractFloatx80Frac( b );
  6850. bExp := extractFloatx80Exp( b );
  6851. if ( aExp = $7FFF ) then begin
  6852. if ( bits64( aSig0 shl 1 ) <> 0 )
  6853. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6854. result := propagateFloatx80NaN( a, b );
  6855. exit;
  6856. end;
  6857. goto invalid;
  6858. end;
  6859. if ( bExp = $7FFF ) then begin
  6860. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6861. result := propagateFloatx80NaN( a, b );
  6862. exit;
  6863. end;
  6864. result := a;
  6865. exit;
  6866. end;
  6867. if ( bExp = 0 ) then begin
  6868. if ( bSig = 0 ) then begin
  6869. invalid:
  6870. float_raise( float_flag_invalid );
  6871. z.low := floatx80_default_nan_low;
  6872. z.high := floatx80_default_nan_high;
  6873. result := z;
  6874. exit;
  6875. end;
  6876. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6877. end;
  6878. if ( aExp = 0 ) then begin
  6879. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6880. result := a;
  6881. exit;
  6882. end;
  6883. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6884. end;
  6885. bSig := bSig or $8000000000000000;
  6886. zSign := aSign;
  6887. expDiff := aExp - bExp;
  6888. aSig1 := 0;
  6889. if ( expDiff < 0 ) then begin
  6890. if ( expDiff < -1 ) then begin
  6891. result := a;
  6892. exit;
  6893. end;
  6894. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6895. expDiff := 0;
  6896. end;
  6897. q := ord( bSig <= aSig0 );
  6898. if ( q <> 0 ) then dec( aSig0, bSig );
  6899. dec( expDiff, 64 );
  6900. while ( 0 < expDiff ) do begin
  6901. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6902. if ( 2 < q ) then q := q - 2 else q := 0;
  6903. mul64To128( bSig, q, term0, term1 );
  6904. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6905. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6906. dec( expDiff, 62 );
  6907. end;
  6908. inc( expDiff, 64 );
  6909. if ( 0 < expDiff ) then begin
  6910. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6911. if ( 2 < q ) then q:= q - 2 else q := 0;
  6912. q := q shr ( 64 - expDiff );
  6913. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6914. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6915. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6916. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  6917. inc(q);
  6918. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6919. end;
  6920. end
  6921. else begin
  6922. term1 := 0;
  6923. term0 := bSig;
  6924. end;
  6925. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6926. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  6927. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  6928. and ( q and 1 <> 0 ) )
  6929. then begin
  6930. aSig0 := alternateASig0;
  6931. aSig1 := alternateASig1;
  6932. zSign := ord( zSign = 0 );
  6933. end;
  6934. result :=
  6935. normalizeRoundAndPackFloatx80(
  6936. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6937. end;
  6938. {*----------------------------------------------------------------------------
  6939. | Returns the square root of the extended double-precision floating-point
  6940. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6941. | for Binary Floating-Point Arithmetic.
  6942. *----------------------------------------------------------------------------*}
  6943. function floatx80_sqrt(a: floatx80): floatx80;
  6944. var
  6945. aSign: flag;
  6946. aExp, zExp: int32;
  6947. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6948. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6949. z: floatx80;
  6950. label
  6951. invalid;
  6952. begin
  6953. aSig0 := extractFloatx80Frac( a );
  6954. aExp := extractFloatx80Exp( a );
  6955. aSign := extractFloatx80Sign( a );
  6956. if ( aExp = $7FFF ) then begin
  6957. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  6958. result := propagateFloatx80NaN( a, a );
  6959. exit;
  6960. end;
  6961. if ( aSign = 0 ) then begin
  6962. result := a;
  6963. exit;
  6964. end;
  6965. goto invalid;
  6966. end;
  6967. if ( aSign <> 0 ) then begin
  6968. if ( ( aExp or aSig0 ) = 0 ) then begin
  6969. result := a;
  6970. exit;
  6971. end;
  6972. invalid:
  6973. float_raise( float_flag_invalid );
  6974. z.low := floatx80_default_nan_low;
  6975. z.high := floatx80_default_nan_high;
  6976. result := z;
  6977. exit;
  6978. end;
  6979. if ( aExp = 0 ) then begin
  6980. if ( aSig0 = 0 ) then begin
  6981. result := packFloatx80( 0, 0, 0 );
  6982. exit;
  6983. end;
  6984. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6985. end;
  6986. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  6987. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  6988. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6989. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6990. doubleZSig0 := zSig0 shl 1;
  6991. mul64To128( zSig0, zSig0, term0, term1 );
  6992. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6993. while ( sbits64( rem0 ) < 0 ) do begin
  6994. dec(zSig0);
  6995. dec( doubleZSig0, 2 );
  6996. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  6997. end;
  6998. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6999. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7000. if ( zSig1 = 0 ) then zSig1 := 1;
  7001. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7002. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7003. mul64To128( zSig1, zSig1, term2, term3 );
  7004. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7005. while ( sbits64( rem1 ) < 0 ) do begin
  7006. dec(zSig1);
  7007. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7008. term3 := term3 or 1;
  7009. term2 := term2 or doubleZSig0;
  7010. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7011. end;
  7012. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7013. end;
  7014. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7015. zSig0 := zSig0 or doubleZSig0;
  7016. result :=
  7017. roundAndPackFloatx80(
  7018. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7019. end;
  7020. {*----------------------------------------------------------------------------
  7021. | Returns 1 if the extended double-precision floating-point value `a' is
  7022. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7023. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7024. | Arithmetic.
  7025. *----------------------------------------------------------------------------*}
  7026. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7027. begin
  7028. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7029. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7030. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7031. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7032. ) then begin
  7033. if ( floatx80_is_signaling_nan( a )
  7034. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7035. float_raise( float_flag_invalid );
  7036. end;
  7037. result := 0;
  7038. exit;
  7039. end;
  7040. result := ord(
  7041. ( a.low = b.low )
  7042. and ( ( a.high = b.high )
  7043. or ( ( a.low = 0 )
  7044. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7045. ) );
  7046. end;
  7047. {*----------------------------------------------------------------------------
  7048. | Returns 1 if the extended double-precision floating-point value `a' is
  7049. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7050. | comparison is performed according to the IEC/IEEE Standard for Binary
  7051. | Floating-Point Arithmetic.
  7052. *----------------------------------------------------------------------------*}
  7053. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7054. var
  7055. aSign, bSign: flag;
  7056. begin
  7057. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7058. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7059. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7060. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7061. then begin
  7062. float_raise( float_flag_invalid );
  7063. result := 0;
  7064. exit;
  7065. end;
  7066. aSign := extractFloatx80Sign( a );
  7067. bSign := extractFloatx80Sign( b );
  7068. if ( aSign <> bSign ) then begin
  7069. result := ord(
  7070. ( aSign <> 0 )
  7071. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7072. exit;
  7073. end;
  7074. if aSign<>0 then
  7075. result := le128( b.high, b.low, a.high, a.low )
  7076. else
  7077. result := le128( a.high, a.low, b.high, b.low );
  7078. end;
  7079. {*----------------------------------------------------------------------------
  7080. | Returns 1 if the extended double-precision floating-point value `a' is
  7081. | less than the corresponding value `b', and 0 otherwise. The comparison
  7082. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7083. | Arithmetic.
  7084. *----------------------------------------------------------------------------*}
  7085. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7086. var
  7087. aSign, bSign: flag;
  7088. begin
  7089. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7090. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7091. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7092. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7093. then begin
  7094. float_raise( float_flag_invalid );
  7095. result := 0;
  7096. exit;
  7097. end;
  7098. aSign := extractFloatx80Sign( a );
  7099. bSign := extractFloatx80Sign( b );
  7100. if ( aSign <> bSign ) then begin
  7101. result := ord(
  7102. ( aSign <> 0 )
  7103. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7104. exit;
  7105. end;
  7106. if aSign <> 0 then
  7107. result := lt128( b.high, b.low, a.high, a.low )
  7108. else
  7109. result := lt128( a.high, a.low, b.high, b.low );
  7110. end;
  7111. {*----------------------------------------------------------------------------
  7112. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7113. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7114. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7115. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7116. *----------------------------------------------------------------------------*}
  7117. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7118. begin
  7119. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7120. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7121. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7122. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7123. then begin
  7124. float_raise( float_flag_invalid );
  7125. result := 0;
  7126. exit;
  7127. end;
  7128. result := ord(
  7129. ( a.low = b.low )
  7130. and ( ( a.high = b.high )
  7131. or ( ( a.low = 0 )
  7132. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7133. ) );
  7134. end;
  7135. {*----------------------------------------------------------------------------
  7136. | Returns 1 if the extended double-precision floating-point value `a' is less
  7137. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7138. | do not cause an exception. Otherwise, the comparison is performed according
  7139. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7140. *----------------------------------------------------------------------------*}
  7141. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7142. var
  7143. aSign, bSign: flag;
  7144. begin
  7145. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7146. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7147. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7148. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7149. then begin
  7150. if ( floatx80_is_signaling_nan( a )
  7151. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7152. float_raise( float_flag_invalid );
  7153. end;
  7154. result := 0;
  7155. exit;
  7156. end;
  7157. aSign := extractFloatx80Sign( a );
  7158. bSign := extractFloatx80Sign( b );
  7159. if ( aSign <> bSign ) then begin
  7160. result := ord(
  7161. ( aSign <> 0 )
  7162. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7163. exit;
  7164. end;
  7165. if aSign <> 0 then
  7166. result := le128( b.high, b.low, a.high, a.low )
  7167. else
  7168. result := le128( a.high, a.low, b.high, b.low );
  7169. end;
  7170. {*----------------------------------------------------------------------------
  7171. | Returns 1 if the extended double-precision floating-point value `a' is less
  7172. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7173. | an exception. Otherwise, the comparison is performed according to the
  7174. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7175. *----------------------------------------------------------------------------*}
  7176. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7177. var
  7178. aSign, bSign: flag;
  7179. begin
  7180. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7181. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7182. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7183. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7184. then begin
  7185. if ( floatx80_is_signaling_nan( a )
  7186. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7187. float_raise( float_flag_invalid );
  7188. end;
  7189. result := 0;
  7190. exit;
  7191. end;
  7192. aSign := extractFloatx80Sign( a );
  7193. bSign := extractFloatx80Sign( b );
  7194. if ( aSign <> bSign ) then begin
  7195. result := ord(
  7196. ( aSign <> 0 )
  7197. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7198. exit;
  7199. end;
  7200. if aSign <> 0 then
  7201. result := lt128( b.high, b.low, a.high, a.low )
  7202. else
  7203. result := lt128( a.high, a.low, b.high, b.low );
  7204. end;
  7205. {$endif FPC_SOFTFLOAT_FLOATX80}
  7206. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7207. {*----------------------------------------------------------------------------
  7208. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7209. | floating-point value `a'.
  7210. *----------------------------------------------------------------------------*}
  7211. function extractFloat128Frac1(a : float128): bits64;
  7212. begin
  7213. result:=a.low;
  7214. end;
  7215. {*----------------------------------------------------------------------------
  7216. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7217. | floating-point value `a'.
  7218. *----------------------------------------------------------------------------*}
  7219. function extractFloat128Frac0(a : float128): bits64;
  7220. begin
  7221. result:=a.high and int64($0000FFFFFFFFFFFF);
  7222. end;
  7223. {*----------------------------------------------------------------------------
  7224. | Returns the exponent bits of the quadruple-precision floating-point value
  7225. | `a'.
  7226. *----------------------------------------------------------------------------*}
  7227. function extractFloat128Exp(a : float128): int32;
  7228. begin
  7229. result:=( a.high shr 48 ) and $7FFF;
  7230. end;
  7231. {*----------------------------------------------------------------------------
  7232. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7233. *----------------------------------------------------------------------------*}
  7234. function extractFloat128Sign(a : float128): flag;
  7235. begin
  7236. result:=a.high shr 63;
  7237. end;
  7238. {*----------------------------------------------------------------------------
  7239. | Normalizes the subnormal quadruple-precision floating-point value
  7240. | represented by the denormalized significand formed by the concatenation of
  7241. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7242. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7243. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7244. | least significant 64 bits of the normalized significand are stored at the
  7245. | location pointed to by `zSig1Ptr'.
  7246. *----------------------------------------------------------------------------*}
  7247. procedure normalizeFloat128Subnormal(
  7248. aSig0: bits64;
  7249. aSig1: bits64;
  7250. var zExpPtr: int32;
  7251. var zSig0Ptr: bits64;
  7252. var zSig1Ptr: bits64);
  7253. var
  7254. shiftCount: int8;
  7255. begin
  7256. if ( aSig0 = 0 ) then
  7257. begin
  7258. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7259. if ( shiftCount < 0 ) then
  7260. begin
  7261. zSig0Ptr := aSig1 shr ( - shiftCount );
  7262. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7263. end
  7264. else begin
  7265. zSig0Ptr := aSig1 shl shiftCount;
  7266. zSig1Ptr := 0;
  7267. end;
  7268. zExpPtr := - shiftCount - 63;
  7269. end
  7270. else begin
  7271. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7272. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7273. zExpPtr := 1 - shiftCount;
  7274. end;
  7275. end;
  7276. {*----------------------------------------------------------------------------
  7277. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7278. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7279. | floating-point value, returning the result. After being shifted into the
  7280. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7281. | added together to form the most significant 32 bits of the result. This
  7282. | means that any integer portion of `zSig0' will be added into the exponent.
  7283. | Since a properly normalized significand will have an integer portion equal
  7284. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7285. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7286. | significand.
  7287. *----------------------------------------------------------------------------*}
  7288. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7289. var
  7290. z: float128;
  7291. begin
  7292. z.low := zSig1;
  7293. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7294. result:=z;
  7295. end;
  7296. {*----------------------------------------------------------------------------
  7297. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7298. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7299. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7300. | corresponding to the abstract input. Ordinarily, the abstract value is
  7301. | simply rounded and packed into the quadruple-precision format, with the
  7302. | inexact exception raised if the abstract input cannot be represented
  7303. | exactly. However, if the abstract value is too large, the overflow and
  7304. | inexact exceptions are raised and an infinity or maximal finite value is
  7305. | returned. If the abstract value is too small, the input value is rounded to
  7306. | a subnormal number, and the underflow and inexact exceptions are raised if
  7307. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7308. | precision floating-point number.
  7309. | The input significand must be normalized or smaller. If the input
  7310. | significand is not normalized, `zExp' must be 0; in that case, the result
  7311. | returned is a subnormal number, and it must not require rounding. In the
  7312. | usual case that the input significand is normalized, `zExp' must be 1 less
  7313. | than the ``true'' floating-point exponent. The handling of underflow and
  7314. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7315. *----------------------------------------------------------------------------*}
  7316. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7317. var
  7318. roundingMode: int8;
  7319. roundNearestEven, increment, isTiny: flag;
  7320. begin
  7321. roundingMode := softfloat_rounding_mode;
  7322. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7323. increment := ord( sbits64(zSig2) < 0 );
  7324. if ( roundNearestEven=0 ) then
  7325. begin
  7326. if ( roundingMode = float_round_to_zero ) then
  7327. begin
  7328. increment := 0;
  7329. end
  7330. else begin
  7331. if ( zSign<>0 ) then
  7332. begin
  7333. increment := ord( roundingMode = float_round_down ) and zSig2;
  7334. end
  7335. else begin
  7336. increment := ord( roundingMode = float_round_up ) and zSig2;
  7337. end;
  7338. end;
  7339. end;
  7340. if ( $7FFD <= bits32(zExp) ) then
  7341. begin
  7342. if ( ord( $7FFD < zExp )
  7343. or ( ord( zExp = $7FFD )
  7344. and eq128(
  7345. int64( $0001FFFFFFFFFFFF ),
  7346. bits64( $FFFFFFFFFFFFFFFF ),
  7347. zSig0,
  7348. zSig1
  7349. )
  7350. and increment
  7351. )
  7352. )<>0 then
  7353. begin
  7354. float_raise( [float_flag_overflow,float_flag_inexact] );
  7355. if ( ord( roundingMode = float_round_to_zero )
  7356. or ( zSign and ord( roundingMode = float_round_up ) )
  7357. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7358. )<>0 then
  7359. begin
  7360. result :=
  7361. packFloat128(
  7362. zSign,
  7363. $7FFE,
  7364. int64( $0000FFFFFFFFFFFF ),
  7365. bits64( $FFFFFFFFFFFFFFFF )
  7366. );
  7367. exit;
  7368. end;
  7369. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7370. exit;
  7371. end;
  7372. if ( zExp < 0 ) then
  7373. begin
  7374. isTiny :=
  7375. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7376. or ( zExp < -1 )
  7377. or not( increment<>0 )
  7378. or boolean(lt128(
  7379. zSig0,
  7380. zSig1,
  7381. int64( $0001FFFFFFFFFFFF ),
  7382. bits64( $FFFFFFFFFFFFFFFF )
  7383. )));
  7384. shift128ExtraRightJamming(
  7385. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7386. zExp := 0;
  7387. if ( isTiny and zSig2 )<>0 then
  7388. float_raise( float_flag_underflow );
  7389. if ( roundNearestEven<>0 ) then
  7390. begin
  7391. increment := ord( sbits64(zSig2) < 0 );
  7392. end
  7393. else begin
  7394. if ( zSign<>0 ) then
  7395. begin
  7396. increment := ord( roundingMode = float_round_down ) and zSig2;
  7397. end
  7398. else begin
  7399. increment := ord( roundingMode = float_round_up ) and zSig2;
  7400. end;
  7401. end;
  7402. end;
  7403. end;
  7404. if ( zSig2<>0 ) then
  7405. set_inexact_flag;
  7406. if ( increment<>0 ) then
  7407. begin
  7408. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7409. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7410. end
  7411. else begin
  7412. if ( ( zSig0 or zSig1 ) = 0 ) then
  7413. zExp := 0;
  7414. end;
  7415. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7416. end;
  7417. {*----------------------------------------------------------------------------
  7418. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7419. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7420. | returns the proper quadruple-precision floating-point value corresponding
  7421. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7422. | except that the input significand has fewer bits and does not have to be
  7423. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7424. | point exponent.
  7425. *----------------------------------------------------------------------------*}
  7426. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7427. var
  7428. shiftCount: int8;
  7429. zSig2: bits64;
  7430. begin
  7431. if ( zSig0 = 0 ) then
  7432. begin
  7433. zSig0 := zSig1;
  7434. zSig1 := 0;
  7435. dec(zExp, 64);
  7436. end;
  7437. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7438. if ( 0 <= shiftCount ) then
  7439. begin
  7440. zSig2 := 0;
  7441. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7442. end
  7443. else begin
  7444. shift128ExtraRightJamming(
  7445. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7446. end;
  7447. dec(zExp, shiftCount);
  7448. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7449. end;
  7450. {*----------------------------------------------------------------------------
  7451. | Returns the result of converting the quadruple-precision floating-point
  7452. | value `a' to the 32-bit two's complement integer format. The conversion
  7453. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7454. | Arithmetic---which means in particular that the conversion is rounded
  7455. | according to the current rounding mode. If `a' is a NaN, the largest
  7456. | positive integer is returned. Otherwise, if the conversion overflows, the
  7457. | largest integer with the same sign as `a' is returned.
  7458. *----------------------------------------------------------------------------*}
  7459. function float128_to_int32(a: float128): int32;
  7460. var
  7461. aSign: flag;
  7462. aExp, shiftCount: int32;
  7463. aSig0, aSig1: bits64;
  7464. begin
  7465. aSig1 := extractFloat128Frac1( a );
  7466. aSig0 := extractFloat128Frac0( a );
  7467. aExp := extractFloat128Exp( a );
  7468. aSign := extractFloat128Sign( a );
  7469. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7470. aSign := 0;
  7471. if ( aExp<>0 ) then
  7472. aSig0 := aSig0 or int64( $0001000000000000 );
  7473. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7474. shiftCount := $4028 - aExp;
  7475. if ( 0 < shiftCount ) then
  7476. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7477. result := roundAndPackInt32( aSign, aSig0 );
  7478. end;
  7479. {*----------------------------------------------------------------------------
  7480. | Returns the result of converting the quadruple-precision floating-point
  7481. | value `a' to the 32-bit two's complement integer format. The conversion
  7482. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7483. | Arithmetic, except that the conversion is always rounded toward zero. If
  7484. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7485. | conversion overflows, the largest integer with the same sign as `a' is
  7486. | returned.
  7487. *----------------------------------------------------------------------------*}
  7488. function float128_to_int32_round_to_zero(a: float128): int32;
  7489. var
  7490. aSign: flag;
  7491. aExp, shiftCount: int32;
  7492. aSig0, aSig1, savedASig: bits64;
  7493. z: int32;
  7494. label
  7495. invalid;
  7496. begin
  7497. aSig1 := extractFloat128Frac1( a );
  7498. aSig0 := extractFloat128Frac0( a );
  7499. aExp := extractFloat128Exp( a );
  7500. aSign := extractFloat128Sign( a );
  7501. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7502. if ( $401E < aExp ) then
  7503. begin
  7504. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7505. aSign := 0;
  7506. goto invalid;
  7507. end
  7508. else if ( aExp < $3FFF ) then
  7509. begin
  7510. if ( aExp or aSig0 )<>0 then
  7511. set_inexact_flag;
  7512. result := 0;
  7513. exit;
  7514. end;
  7515. aSig0 := aSig0 or int64( $0001000000000000 );
  7516. shiftCount := $402F - aExp;
  7517. savedASig := aSig0;
  7518. aSig0 := aSig0 shr shiftCount;
  7519. z := aSig0;
  7520. if ( aSign )<>0 then
  7521. z := - z;
  7522. if ( ord( z < 0 ) xor aSign )<>0 then
  7523. begin
  7524. invalid:
  7525. float_raise( float_flag_invalid );
  7526. if aSign<>0 then
  7527. result:= int32( $80000000 )
  7528. else
  7529. result:=$7FFFFFFF;
  7530. exit;
  7531. end;
  7532. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7533. begin
  7534. set_inexact_flag;
  7535. end;
  7536. result := z;
  7537. end;
  7538. {*----------------------------------------------------------------------------
  7539. | Returns the result of converting the quadruple-precision floating-point
  7540. | value `a' to the 64-bit two's complement integer format. The conversion
  7541. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7542. | Arithmetic---which means in particular that the conversion is rounded
  7543. | according to the current rounding mode. If `a' is a NaN, the largest
  7544. | positive integer is returned. Otherwise, if the conversion overflows, the
  7545. | largest integer with the same sign as `a' is returned.
  7546. *----------------------------------------------------------------------------*}
  7547. function float128_to_int64(a: float128): int64;
  7548. var
  7549. aSign: flag;
  7550. aExp, shiftCount: int32;
  7551. aSig0, aSig1: bits64;
  7552. begin
  7553. aSig1 := extractFloat128Frac1( a );
  7554. aSig0 := extractFloat128Frac0( a );
  7555. aExp := extractFloat128Exp( a );
  7556. aSign := extractFloat128Sign( a );
  7557. if ( aExp<>0 ) then
  7558. aSig0 := aSig0 or int64( $0001000000000000 );
  7559. shiftCount := $402F - aExp;
  7560. if ( shiftCount <= 0 ) then
  7561. begin
  7562. if ( $403E < aExp ) then
  7563. begin
  7564. float_raise( float_flag_invalid );
  7565. if ( (aSign=0)
  7566. or ( ( aExp = $7FFF )
  7567. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7568. )
  7569. ) then
  7570. begin
  7571. result := int64( $7FFFFFFFFFFFFFFF );
  7572. exit;
  7573. end;
  7574. result := int64( $8000000000000000 );
  7575. exit;
  7576. end;
  7577. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7578. end
  7579. else begin
  7580. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7581. end;
  7582. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7583. end;
  7584. {*----------------------------------------------------------------------------
  7585. | Returns the result of converting the quadruple-precision floating-point
  7586. | value `a' to the 64-bit two's complement integer format. The conversion
  7587. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7588. | Arithmetic, except that the conversion is always rounded toward zero.
  7589. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7590. | the conversion overflows, the largest integer with the same sign as `a' is
  7591. | returned.
  7592. *----------------------------------------------------------------------------*}
  7593. function float128_to_int64_round_to_zero(a: float128): int64;
  7594. var
  7595. aSign: flag;
  7596. aExp, shiftCount: int32;
  7597. aSig0, aSig1: bits64;
  7598. z: int64;
  7599. begin
  7600. aSig1 := extractFloat128Frac1( a );
  7601. aSig0 := extractFloat128Frac0( a );
  7602. aExp := extractFloat128Exp( a );
  7603. aSign := extractFloat128Sign( a );
  7604. if ( aExp<>0 ) then
  7605. aSig0 := aSig0 or int64( $0001000000000000 );
  7606. shiftCount := aExp - $402F;
  7607. if ( 0 < shiftCount ) then
  7608. begin
  7609. if ( $403E <= aExp ) then
  7610. begin
  7611. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7612. if ( ( a.high = bits64( $C03E000000000000 ) )
  7613. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7614. begin
  7615. if ( aSig1<>0 ) then
  7616. set_inexact_flag;
  7617. end
  7618. else begin
  7619. float_raise( float_flag_invalid );
  7620. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7621. begin
  7622. result := int64( $7FFFFFFFFFFFFFFF );
  7623. exit;
  7624. end;
  7625. end;
  7626. result := int64( $8000000000000000 );
  7627. exit;
  7628. end;
  7629. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7630. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7631. begin
  7632. set_inexact_flag;
  7633. end;
  7634. end
  7635. else begin
  7636. if ( aExp < $3FFF ) then
  7637. begin
  7638. if ( aExp or aSig0 or aSig1 )<>0 then
  7639. begin
  7640. set_inexact_flag;
  7641. end;
  7642. result := 0;
  7643. exit;
  7644. end;
  7645. z := aSig0 shr ( - shiftCount );
  7646. if ( (aSig1<>0)
  7647. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7648. begin
  7649. set_inexact_flag;
  7650. end;
  7651. end;
  7652. if ( aSign<>0 ) then
  7653. z := - z;
  7654. result := z;
  7655. end;
  7656. {*----------------------------------------------------------------------------
  7657. | Returns the result of converting the quadruple-precision floating-point
  7658. | value `a' to the single-precision floating-point format. The conversion
  7659. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7660. | Arithmetic.
  7661. *----------------------------------------------------------------------------*}
  7662. function float128_to_float32(a: float128): float32;
  7663. var
  7664. aSign: flag;
  7665. aExp: int32;
  7666. aSig0, aSig1: bits64;
  7667. zSig: bits32;
  7668. begin
  7669. aSig1 := extractFloat128Frac1( a );
  7670. aSig0 := extractFloat128Frac0( a );
  7671. aExp := extractFloat128Exp( a );
  7672. aSign := extractFloat128Sign( a );
  7673. if ( aExp = $7FFF ) then
  7674. begin
  7675. if ( aSig0 or aSig1 )<>0 then
  7676. begin
  7677. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7678. exit;
  7679. end;
  7680. result := packFloat32( aSign, $FF, 0 );
  7681. exit;
  7682. end;
  7683. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7684. shift64RightJamming( aSig0, 18, aSig0 );
  7685. zSig := aSig0;
  7686. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7687. begin
  7688. zSig := zSig or $40000000;
  7689. dec(aExp,$3F81);
  7690. end;
  7691. result := roundAndPackFloat32( aSign, aExp, zSig );
  7692. end;
  7693. {*----------------------------------------------------------------------------
  7694. | Returns the result of converting the quadruple-precision floating-point
  7695. | value `a' to the double-precision floating-point format. The conversion
  7696. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7697. | Arithmetic.
  7698. *----------------------------------------------------------------------------*}
  7699. function float128_to_float64(a: float128): float64;
  7700. var
  7701. aSign: flag;
  7702. aExp: int32;
  7703. aSig0, aSig1: bits64;
  7704. begin
  7705. aSig1 := extractFloat128Frac1( a );
  7706. aSig0 := extractFloat128Frac0( a );
  7707. aExp := extractFloat128Exp( a );
  7708. aSign := extractFloat128Sign( a );
  7709. if ( aExp = $7FFF ) then
  7710. begin
  7711. if ( aSig0 or aSig1 )<>0 then
  7712. begin
  7713. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7714. exit;
  7715. end;
  7716. result:=packFloat64( aSign, $7FF, 0);
  7717. exit;
  7718. end;
  7719. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7720. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7721. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7722. begin
  7723. aSig0 := aSig0 or int64( $4000000000000000 );
  7724. dec(aExp,$3C01);
  7725. end;
  7726. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7727. end;
  7728. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7729. {*----------------------------------------------------------------------------
  7730. | Returns the result of converting the quadruple-precision floating-point
  7731. | value `a' to the extended double-precision floating-point format. The
  7732. | conversion is performed according to the IEC/IEEE Standard for Binary
  7733. | Floating-Point Arithmetic.
  7734. *----------------------------------------------------------------------------*}
  7735. function float128_to_floatx80(a: float128): floatx80;
  7736. var
  7737. aSign: flag;
  7738. aExp: int32;
  7739. aSig0, aSig1: bits64;
  7740. begin
  7741. aSig1 := extractFloat128Frac1( a );
  7742. aSig0 := extractFloat128Frac0( a );
  7743. aExp := extractFloat128Exp( a );
  7744. aSign := extractFloat128Sign( a );
  7745. if ( aExp = $7FFF ) then begin
  7746. if ( aSig0 or aSig1 <> 0 ) then begin
  7747. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7748. exit;
  7749. end;
  7750. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7751. exit;
  7752. end;
  7753. if ( aExp = 0 ) then begin
  7754. if ( ( aSig0 or aSig1 ) = 0 ) then
  7755. begin
  7756. result := packFloatx80( aSign, 0, 0 );
  7757. exit;
  7758. end;
  7759. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7760. end
  7761. else begin
  7762. aSig0 := aSig0 or int64( $0001000000000000 );
  7763. end;
  7764. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7765. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7766. end;
  7767. {$endif FPC_SOFTFLOAT_FLOATX80}
  7768. {*----------------------------------------------------------------------------
  7769. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7770. | Returns the result as a quadruple-precision floating-point value. The
  7771. | operation is performed according to the IEC/IEEE Standard for Binary
  7772. | Floating-Point Arithmetic.
  7773. *----------------------------------------------------------------------------*}
  7774. function float128_round_to_int(a: float128): float128;
  7775. var
  7776. aSign: flag;
  7777. aExp: int32;
  7778. lastBitMask, roundBitsMask: bits64;
  7779. roundingMode: int8;
  7780. z: float128;
  7781. begin
  7782. aExp := extractFloat128Exp( a );
  7783. if ( $402F <= aExp ) then
  7784. begin
  7785. if ( $406F <= aExp ) then
  7786. begin
  7787. if ( ( aExp = $7FFF )
  7788. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7789. ) then
  7790. begin
  7791. result := propagateFloat128NaN( a, a );
  7792. exit;
  7793. end;
  7794. result := a;
  7795. exit;
  7796. end;
  7797. lastBitMask := 1;
  7798. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7799. roundBitsMask := lastBitMask - 1;
  7800. z := a;
  7801. roundingMode := softfloat_rounding_mode;
  7802. if ( roundingMode = float_round_nearest_even ) then
  7803. begin
  7804. if ( lastBitMask )<>0 then
  7805. begin
  7806. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7807. if ( ( z.low and roundBitsMask ) = 0 ) then
  7808. z.low := z.low and not(lastBitMask);
  7809. end
  7810. else begin
  7811. if ( sbits64(z.low) < 0 ) then
  7812. begin
  7813. inc(z.high);
  7814. if ( bits64( z.low shl 1 ) = 0 ) then
  7815. z.high := z.high and not bits64( 1 );
  7816. end;
  7817. end;
  7818. end
  7819. else if ( roundingMode <> float_round_to_zero ) then
  7820. begin
  7821. if ( extractFloat128Sign( z )
  7822. xor ord( roundingMode = float_round_up ) )<>0 then
  7823. begin
  7824. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7825. end;
  7826. end;
  7827. z.low := z.low and not(roundBitsMask);
  7828. end
  7829. else begin
  7830. if ( aExp < $3FFF ) then
  7831. begin
  7832. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7833. begin
  7834. result := a;
  7835. exit;
  7836. end;
  7837. set_inexact_flag;
  7838. aSign := extractFloat128Sign( a );
  7839. case softfloat_rounding_mode of
  7840. float_round_nearest_even:
  7841. if ( ( aExp = $3FFE )
  7842. and ( (extractFloat128Frac0( a )<>0)
  7843. or (extractFloat128Frac1( a )<>0) )
  7844. ) then begin
  7845. begin
  7846. result := packFloat128( aSign, $3FFF, 0, 0 );
  7847. exit;
  7848. end;
  7849. end;
  7850. float_round_down:
  7851. begin
  7852. if aSign<>0 then
  7853. result:=packFloat128( 1, $3FFF, 0, 0 )
  7854. else
  7855. result:=packFloat128( 0, 0, 0, 0 );
  7856. exit;
  7857. end;
  7858. float_round_up:
  7859. begin
  7860. if aSign<>0 then
  7861. result := packFloat128( 1, 0, 0, 0 )
  7862. else
  7863. result:=packFloat128( 0, $3FFF, 0, 0 );
  7864. exit;
  7865. end;
  7866. end;
  7867. result := packFloat128( aSign, 0, 0, 0 );
  7868. exit;
  7869. end;
  7870. lastBitMask := 1;
  7871. lastBitMask := lastBitMask shl ($402F - aExp);
  7872. roundBitsMask := lastBitMask - 1;
  7873. z.low := 0;
  7874. z.high := a.high;
  7875. roundingMode := softfloat_rounding_mode;
  7876. if ( roundingMode = float_round_nearest_even ) then begin
  7877. inc(z.high,lastBitMask shr 1);
  7878. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7879. z.high := z.high and not(lastBitMask);
  7880. end;
  7881. end
  7882. else if ( roundingMode <> float_round_to_zero ) then begin
  7883. if ( (extractFloat128Sign( z )<>0)
  7884. xor ( roundingMode = float_round_up ) ) then begin
  7885. z.high := z.high or ord( a.low <> 0 );
  7886. z.high := z.high+roundBitsMask;
  7887. end;
  7888. end;
  7889. z.high := z.high and not(roundBitsMask);
  7890. end;
  7891. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7892. set_inexact_flag;
  7893. end;
  7894. result := z;
  7895. end;
  7896. {*----------------------------------------------------------------------------
  7897. | Returns the result of adding the absolute values of the quadruple-precision
  7898. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7899. | before being returned. `zSign' is ignored if the result is a NaN.
  7900. | The addition is performed according to the IEC/IEEE Standard for Binary
  7901. | Floating-Point Arithmetic.
  7902. *----------------------------------------------------------------------------*}
  7903. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7904. var
  7905. aExp, bExp, zExp: int32;
  7906. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7907. expDiff: int32;
  7908. label
  7909. shiftRight1,roundAndPack;
  7910. begin
  7911. aSig1 := extractFloat128Frac1( a );
  7912. aSig0 := extractFloat128Frac0( a );
  7913. aExp := extractFloat128Exp( a );
  7914. bSig1 := extractFloat128Frac1( b );
  7915. bSig0 := extractFloat128Frac0( b );
  7916. bExp := extractFloat128Exp( b );
  7917. expDiff := aExp - bExp;
  7918. if ( 0 < expDiff ) then begin
  7919. if ( aExp = $7FFF ) then begin
  7920. if ( aSig0 or aSig1 )<>0 then
  7921. begin
  7922. result := propagateFloat128NaN( a, b );
  7923. exit;
  7924. end;
  7925. result := a;
  7926. exit;
  7927. end;
  7928. if ( bExp = 0 ) then begin
  7929. dec(expDiff);
  7930. end
  7931. else begin
  7932. bSig0 := bSig0 or int64( $0001000000000000 );
  7933. end;
  7934. shift128ExtraRightJamming(
  7935. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7936. zExp := aExp;
  7937. end
  7938. else if ( expDiff < 0 ) then begin
  7939. if ( bExp = $7FFF ) then begin
  7940. if ( bSig0 or bSig1 )<>0 then
  7941. begin
  7942. result := propagateFloat128NaN( a, b );
  7943. exit;
  7944. end;
  7945. result := packFloat128( zSign, $7FFF, 0, 0 );
  7946. exit;
  7947. end;
  7948. if ( aExp = 0 ) then begin
  7949. inc(expDiff);
  7950. end
  7951. else begin
  7952. aSig0 := aSig0 or int64( $0001000000000000 );
  7953. end;
  7954. shift128ExtraRightJamming(
  7955. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7956. zExp := bExp;
  7957. end
  7958. else begin
  7959. if ( aExp = $7FFF ) then begin
  7960. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7961. result := propagateFloat128NaN( a, b );
  7962. exit;
  7963. end;
  7964. result := a;
  7965. exit;
  7966. end;
  7967. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7968. if ( aExp = 0 ) then
  7969. begin
  7970. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7971. exit;
  7972. end;
  7973. zSig2 := 0;
  7974. zSig0 := zSig0 or int64( $0002000000000000 );
  7975. zExp := aExp;
  7976. goto shiftRight1;
  7977. end;
  7978. aSig0 := aSig0 or int64( $0001000000000000 );
  7979. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7980. dec(zExp);
  7981. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7982. inc(zExp);
  7983. shiftRight1:
  7984. shift128ExtraRightJamming(
  7985. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7986. roundAndPack:
  7987. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7988. end;
  7989. {*----------------------------------------------------------------------------
  7990. | Returns the result of subtracting the absolute values of the quadruple-
  7991. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7992. | difference is negated before being returned. `zSign' is ignored if the
  7993. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7994. | Standard for Binary Floating-Point Arithmetic.
  7995. *----------------------------------------------------------------------------*}
  7996. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7997. var
  7998. aExp, bExp, zExp: int32;
  7999. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8000. expDiff: int32;
  8001. z: float128;
  8002. label
  8003. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8004. begin
  8005. aSig1 := extractFloat128Frac1( a );
  8006. aSig0 := extractFloat128Frac0( a );
  8007. aExp := extractFloat128Exp( a );
  8008. bSig1 := extractFloat128Frac1( b );
  8009. bSig0 := extractFloat128Frac0( b );
  8010. bExp := extractFloat128Exp( b );
  8011. expDiff := aExp - bExp;
  8012. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8013. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8014. if ( 0 < expDiff ) then goto aExpBigger;
  8015. if ( expDiff < 0 ) then goto bExpBigger;
  8016. if ( aExp = $7FFF ) then begin
  8017. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8018. result := propagateFloat128NaN( a, b );
  8019. exit;
  8020. end;
  8021. float_raise( float_flag_invalid );
  8022. z.low := float128_default_nan_low;
  8023. z.high := float128_default_nan_high;
  8024. result := z;
  8025. exit;
  8026. end;
  8027. if ( aExp = 0 ) then begin
  8028. aExp := 1;
  8029. bExp := 1;
  8030. end;
  8031. if ( bSig0 < aSig0 ) then goto aBigger;
  8032. if ( aSig0 < bSig0 ) then goto bBigger;
  8033. if ( bSig1 < aSig1 ) then goto aBigger;
  8034. if ( aSig1 < bSig1 ) then goto bBigger;
  8035. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8036. exit;
  8037. bExpBigger:
  8038. if ( bExp = $7FFF ) then begin
  8039. if ( bSig0 or bSig1 )<>0 then
  8040. begin
  8041. result := propagateFloat128NaN( a, b );
  8042. exit;
  8043. end;
  8044. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8045. exit;
  8046. end;
  8047. if ( aExp = 0 ) then begin
  8048. inc(expDiff);
  8049. end
  8050. else begin
  8051. aSig0 := aSig0 or int64( $4000000000000000 );
  8052. end;
  8053. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8054. bSig0 := bSig0 or int64( $4000000000000000 );
  8055. bBigger:
  8056. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8057. zExp := bExp;
  8058. zSign := zSign xor 1;
  8059. goto normalizeRoundAndPack;
  8060. aExpBigger:
  8061. if ( aExp = $7FFF ) then begin
  8062. if ( aSig0 or aSig1 )<>0 then
  8063. begin
  8064. result := propagateFloat128NaN( a, b );
  8065. exit;
  8066. end;
  8067. result := a;
  8068. exit;
  8069. end;
  8070. if ( bExp = 0 ) then begin
  8071. dec(expDiff);
  8072. end
  8073. else begin
  8074. bSig0 := bSig0 or int64( $4000000000000000 );
  8075. end;
  8076. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8077. aSig0 := aSig0 or int64( $4000000000000000 );
  8078. aBigger:
  8079. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8080. zExp := aExp;
  8081. normalizeRoundAndPack:
  8082. dec(zExp);
  8083. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8084. end;
  8085. {*----------------------------------------------------------------------------
  8086. | Returns the result of adding the quadruple-precision floating-point values
  8087. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8088. | for Binary Floating-Point Arithmetic.
  8089. *----------------------------------------------------------------------------*}
  8090. function float128_add(a: float128; b: float128): float128;
  8091. var
  8092. aSign, bSign: flag;
  8093. begin
  8094. aSign := extractFloat128Sign( a );
  8095. bSign := extractFloat128Sign( b );
  8096. if ( aSign = bSign ) then begin
  8097. result := addFloat128Sigs( a, b, aSign );
  8098. end
  8099. else begin
  8100. result := subFloat128Sigs( a, b, aSign );
  8101. end;
  8102. end;
  8103. {*----------------------------------------------------------------------------
  8104. | Returns the result of subtracting the quadruple-precision floating-point
  8105. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8106. | Standard for Binary Floating-Point Arithmetic.
  8107. *----------------------------------------------------------------------------*}
  8108. function float128_sub(a: float128; b: float128): float128;
  8109. var
  8110. aSign, bSign: flag;
  8111. begin
  8112. aSign := extractFloat128Sign( a );
  8113. bSign := extractFloat128Sign( b );
  8114. if ( aSign = bSign ) then begin
  8115. result := subFloat128Sigs( a, b, aSign );
  8116. end
  8117. else begin
  8118. result := addFloat128Sigs( a, b, aSign );
  8119. end;
  8120. end;
  8121. {*----------------------------------------------------------------------------
  8122. | Returns the result of multiplying the quadruple-precision floating-point
  8123. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8124. | Standard for Binary Floating-Point Arithmetic.
  8125. *----------------------------------------------------------------------------*}
  8126. function float128_mul(a: float128; b: float128): float128;
  8127. var
  8128. aSign, bSign, zSign: flag;
  8129. aExp, bExp, zExp: int32;
  8130. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8131. z: float128;
  8132. label
  8133. invalid;
  8134. begin
  8135. aSig1 := extractFloat128Frac1( a );
  8136. aSig0 := extractFloat128Frac0( a );
  8137. aExp := extractFloat128Exp( a );
  8138. aSign := extractFloat128Sign( a );
  8139. bSig1 := extractFloat128Frac1( b );
  8140. bSig0 := extractFloat128Frac0( b );
  8141. bExp := extractFloat128Exp( b );
  8142. bSign := extractFloat128Sign( b );
  8143. zSign := aSign xor bSign;
  8144. if ( aExp = $7FFF ) then begin
  8145. if ( (( aSig0 or aSig1 )<>0)
  8146. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8147. result := propagateFloat128NaN( a, b );
  8148. exit;
  8149. end;
  8150. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8151. result := packFloat128( zSign, $7FFF, 0, 0 );
  8152. exit;
  8153. end;
  8154. if ( bExp = $7FFF ) then begin
  8155. if ( bSig0 or bSig1 )<>0 then
  8156. begin
  8157. result := propagateFloat128NaN( a, b );
  8158. exit;
  8159. end;
  8160. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8161. invalid:
  8162. float_raise( float_flag_invalid );
  8163. z.low := float128_default_nan_low;
  8164. z.high := float128_default_nan_high;
  8165. result := z;
  8166. exit;
  8167. end;
  8168. result := packFloat128( zSign, $7FFF, 0, 0 );
  8169. exit;
  8170. end;
  8171. if ( aExp = 0 ) then begin
  8172. if ( ( aSig0 or aSig1 ) = 0 ) then
  8173. begin
  8174. result := packFloat128( zSign, 0, 0, 0 );
  8175. exit;
  8176. end;
  8177. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8178. end;
  8179. if ( bExp = 0 ) then begin
  8180. if ( ( bSig0 or bSig1 ) = 0 ) then
  8181. begin
  8182. result := packFloat128( zSign, 0, 0, 0 );
  8183. exit;
  8184. end;
  8185. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8186. end;
  8187. zExp := aExp + bExp - $4000;
  8188. aSig0 := aSig0 or int64( $0001000000000000 );
  8189. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8190. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8191. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8192. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8193. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8194. shift128ExtraRightJamming(
  8195. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8196. inc(zExp);
  8197. end;
  8198. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8199. end;
  8200. {*----------------------------------------------------------------------------
  8201. | Returns the result of dividing the quadruple-precision floating-point value
  8202. | `a' by the corresponding value `b'. The operation is performed according to
  8203. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8204. *----------------------------------------------------------------------------*}
  8205. function float128_div(a: float128; b: float128): float128;
  8206. var
  8207. aSign, bSign, zSign: flag;
  8208. aExp, bExp, zExp: int32;
  8209. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8210. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8211. z: float128;
  8212. label
  8213. invalid;
  8214. begin
  8215. aSig1 := extractFloat128Frac1( a );
  8216. aSig0 := extractFloat128Frac0( a );
  8217. aExp := extractFloat128Exp( a );
  8218. aSign := extractFloat128Sign( a );
  8219. bSig1 := extractFloat128Frac1( b );
  8220. bSig0 := extractFloat128Frac0( b );
  8221. bExp := extractFloat128Exp( b );
  8222. bSign := extractFloat128Sign( b );
  8223. zSign := aSign xor bSign;
  8224. if ( aExp = $7FFF ) then begin
  8225. if ( aSig0 or aSig1 )<>0 then
  8226. begin
  8227. result := propagateFloat128NaN( a, b );
  8228. exit;
  8229. end;
  8230. if ( bExp = $7FFF ) then begin
  8231. if ( bSig0 or bSig1 )<>0 then
  8232. begin
  8233. result := propagateFloat128NaN( a, b );
  8234. exit;
  8235. end;
  8236. goto invalid;
  8237. end;
  8238. result := packFloat128( zSign, $7FFF, 0, 0 );
  8239. exit;
  8240. end;
  8241. if ( bExp = $7FFF ) then begin
  8242. if ( bSig0 or bSig1 )<>0 then
  8243. begin
  8244. result := propagateFloat128NaN( a, b );
  8245. exit;
  8246. end;
  8247. result := packFloat128( zSign, 0, 0, 0 );
  8248. exit;
  8249. end;
  8250. if ( bExp = 0 ) then begin
  8251. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8252. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8253. invalid:
  8254. float_raise( float_flag_invalid );
  8255. z.low := float128_default_nan_low;
  8256. z.high := float128_default_nan_high;
  8257. result := z;
  8258. exit;
  8259. end;
  8260. float_raise( float_flag_divbyzero );
  8261. result := packFloat128( zSign, $7FFF, 0, 0 );
  8262. exit;
  8263. end;
  8264. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8265. end;
  8266. if ( aExp = 0 ) then begin
  8267. if ( ( aSig0 or aSig1 ) = 0 ) then
  8268. begin
  8269. result := packFloat128( zSign, 0, 0, 0 );
  8270. exit;
  8271. end;
  8272. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8273. end;
  8274. zExp := aExp - bExp + $3FFD;
  8275. shortShift128Left(
  8276. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8277. shortShift128Left(
  8278. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8279. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8280. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8281. inc(zExp);
  8282. end;
  8283. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8284. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8285. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8286. while ( sbits64(rem0) < 0 ) do begin
  8287. dec(zSig0);
  8288. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8289. end;
  8290. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8291. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8292. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8293. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8294. while ( sbits64(rem1) < 0 ) do begin
  8295. dec(zSig1);
  8296. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8297. end;
  8298. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8299. end;
  8300. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8301. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8302. end;
  8303. {*----------------------------------------------------------------------------
  8304. | Returns the remainder of the quadruple-precision floating-point value `a'
  8305. | with respect to the corresponding value `b'. The operation is performed
  8306. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8307. *----------------------------------------------------------------------------*}
  8308. function float128_rem(a: float128; b: float128): float128;
  8309. var
  8310. aSign, zSign: flag;
  8311. aExp, bExp, expDiff: int32;
  8312. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8313. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8314. sigMean0: sbits64;
  8315. z: float128;
  8316. label
  8317. invalid;
  8318. begin
  8319. aSig1 := extractFloat128Frac1( a );
  8320. aSig0 := extractFloat128Frac0( a );
  8321. aExp := extractFloat128Exp( a );
  8322. aSign := extractFloat128Sign( a );
  8323. bSig1 := extractFloat128Frac1( b );
  8324. bSig0 := extractFloat128Frac0( b );
  8325. bExp := extractFloat128Exp( b );
  8326. if ( aExp = $7FFF ) then begin
  8327. if ( (( aSig0 or aSig1 )<>0)
  8328. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8329. result := propagateFloat128NaN( a, b );
  8330. exit;
  8331. end;
  8332. goto invalid;
  8333. end;
  8334. if ( bExp = $7FFF ) then begin
  8335. if ( bSig0 or bSig1 )<>0 then
  8336. begin
  8337. result := propagateFloat128NaN( a, b );
  8338. exit;
  8339. end;
  8340. result := a;
  8341. exit;
  8342. end;
  8343. if ( bExp = 0 ) then begin
  8344. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8345. invalid:
  8346. float_raise( float_flag_invalid );
  8347. z.low := float128_default_nan_low;
  8348. z.high := float128_default_nan_high;
  8349. result := z;
  8350. exit;
  8351. end;
  8352. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8353. end;
  8354. if ( aExp = 0 ) then begin
  8355. if ( ( aSig0 or aSig1 ) = 0 ) then
  8356. begin
  8357. result := a;
  8358. exit;
  8359. end;
  8360. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8361. end;
  8362. expDiff := aExp - bExp;
  8363. if ( expDiff < -1 ) then
  8364. begin
  8365. result := a;
  8366. exit;
  8367. end;
  8368. shortShift128Left(
  8369. aSig0 or int64( $0001000000000000 ),
  8370. aSig1,
  8371. 15 - ord( expDiff < 0 ),
  8372. aSig0,
  8373. aSig1
  8374. );
  8375. shortShift128Left(
  8376. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8377. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8378. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8379. dec(expDiff,64);
  8380. while ( 0 < expDiff ) do begin
  8381. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8382. if ( 4 < q ) then
  8383. q := q - 4
  8384. else
  8385. q := 0;
  8386. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8387. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8388. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8389. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8390. dec(expDiff,61);
  8391. end;
  8392. if ( -64 < expDiff ) then begin
  8393. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8394. if ( 4 < q ) then
  8395. q := q - 4
  8396. else
  8397. q := 0;
  8398. q := q shr (- expDiff);
  8399. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8400. inc(expDiff,52);
  8401. if ( expDiff < 0 ) then begin
  8402. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8403. end
  8404. else begin
  8405. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8406. end;
  8407. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8408. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8409. end
  8410. else begin
  8411. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8412. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8413. end;
  8414. repeat
  8415. alternateASig0 := aSig0;
  8416. alternateASig1 := aSig1;
  8417. inc(q);
  8418. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8419. until not( 0 <= sbits64(aSig0) );
  8420. add128(
  8421. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8422. if ( ( sigMean0 < 0 )
  8423. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8424. aSig0 := alternateASig0;
  8425. aSig1 := alternateASig1;
  8426. end;
  8427. zSign := ord( sbits64(aSig0) < 0 );
  8428. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8429. result :=
  8430. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8431. end;
  8432. {*----------------------------------------------------------------------------
  8433. | Returns the square root of the quadruple-precision floating-point value `a'.
  8434. | The operation is performed according to the IEC/IEEE Standard for Binary
  8435. | Floating-Point Arithmetic.
  8436. *----------------------------------------------------------------------------*}
  8437. function float128_sqrt(a: float128): float128;
  8438. var
  8439. aSign: flag;
  8440. aExp, zExp: int32;
  8441. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8442. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8443. z: float128;
  8444. label
  8445. invalid;
  8446. begin
  8447. aSig1 := extractFloat128Frac1( a );
  8448. aSig0 := extractFloat128Frac0( a );
  8449. aExp := extractFloat128Exp( a );
  8450. aSign := extractFloat128Sign( a );
  8451. if ( aExp = $7FFF ) then begin
  8452. if ( aSig0 or aSig1 )<>0 then
  8453. begin
  8454. result := propagateFloat128NaN( a, a );
  8455. exit;
  8456. end;
  8457. if ( aSign=0 ) then
  8458. begin
  8459. result := a;
  8460. exit;
  8461. end;
  8462. goto invalid;
  8463. end;
  8464. if ( aSign<>0 ) then begin
  8465. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8466. begin
  8467. result := a;
  8468. exit;
  8469. end;
  8470. invalid:
  8471. float_raise( float_flag_invalid );
  8472. z.low := float128_default_nan_low;
  8473. z.high := float128_default_nan_high;
  8474. result := z;
  8475. exit;
  8476. end;
  8477. if ( aExp = 0 ) then begin
  8478. if ( ( aSig0 or aSig1 ) = 0 ) then
  8479. begin
  8480. result := packFloat128( 0, 0, 0, 0 );
  8481. exit;
  8482. end;
  8483. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8484. end;
  8485. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8486. aSig0 := aSig0 or int64( $0001000000000000 );
  8487. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8488. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8489. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8490. doubleZSig0 := zSig0 shl 1;
  8491. mul64To128( zSig0, zSig0, term0, term1 );
  8492. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8493. while ( sbits64(rem0) < 0 ) do begin
  8494. dec(zSig0);
  8495. dec(doubleZSig0,2);
  8496. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8497. end;
  8498. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8499. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8500. if ( zSig1 = 0 ) then zSig1 := 1;
  8501. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8502. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8503. mul64To128( zSig1, zSig1, term2, term3 );
  8504. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8505. while ( sbits64(rem1) < 0 ) do begin
  8506. dec(zSig1);
  8507. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8508. term3 := term3 or 1;
  8509. term2 := term2 or doubleZSig0;
  8510. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8511. end;
  8512. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8513. end;
  8514. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8515. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8516. end;
  8517. {*----------------------------------------------------------------------------
  8518. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8519. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8520. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8521. *----------------------------------------------------------------------------*}
  8522. function float128_eq(a: float128; b: float128): flag;
  8523. begin
  8524. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8525. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8526. or ( ( extractFloat128Exp( b ) = $7FFF )
  8527. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8528. ) then begin
  8529. if ( (float128_is_signaling_nan( a )<>0)
  8530. or (float128_is_signaling_nan( b )<>0) ) then begin
  8531. float_raise( float_flag_invalid );
  8532. end;
  8533. result := 0;
  8534. exit;
  8535. end;
  8536. result := ord(
  8537. ( a.low = b.low )
  8538. and ( ( a.high = b.high )
  8539. or ( ( a.low = 0 )
  8540. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8541. ));
  8542. end;
  8543. {*----------------------------------------------------------------------------
  8544. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8545. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8546. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8547. | Arithmetic.
  8548. *----------------------------------------------------------------------------*}
  8549. function float128_le(a: float128; b: float128): flag;
  8550. var
  8551. aSign, bSign: flag;
  8552. begin
  8553. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8554. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8555. or ( ( extractFloat128Exp( b ) = $7FFF )
  8556. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8557. ) then begin
  8558. float_raise( float_flag_invalid );
  8559. result := 0;
  8560. exit;
  8561. end;
  8562. aSign := extractFloat128Sign( a );
  8563. bSign := extractFloat128Sign( b );
  8564. if ( aSign <> bSign ) then begin
  8565. result := ord(
  8566. (aSign<>0)
  8567. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8568. = 0 ));
  8569. exit;
  8570. end;
  8571. if aSign<>0 then
  8572. result := le128( b.high, b.low, a.high, a.low )
  8573. else
  8574. result := le128( a.high, a.low, b.high, b.low );
  8575. end;
  8576. {*----------------------------------------------------------------------------
  8577. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8578. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8579. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8580. *----------------------------------------------------------------------------*}
  8581. function float128_lt(a: float128; b: float128): flag;
  8582. var
  8583. aSign, bSign: flag;
  8584. begin
  8585. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8586. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8587. or ( ( extractFloat128Exp( b ) = $7FFF )
  8588. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8589. ) then begin
  8590. float_raise( float_flag_invalid );
  8591. result := 0;
  8592. exit;
  8593. end;
  8594. aSign := extractFloat128Sign( a );
  8595. bSign := extractFloat128Sign( b );
  8596. if ( aSign <> bSign ) then begin
  8597. result := ord(
  8598. (aSign<>0)
  8599. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8600. <> 0 ));
  8601. exit;
  8602. end;
  8603. if aSign<>0 then
  8604. result := lt128( b.high, b.low, a.high, a.low )
  8605. else
  8606. result := lt128( a.high, a.low, b.high, b.low );
  8607. end;
  8608. {*----------------------------------------------------------------------------
  8609. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8610. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8611. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8612. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8613. *----------------------------------------------------------------------------*}
  8614. function float128_eq_signaling(a: float128; b: float128): flag;
  8615. begin
  8616. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8617. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8618. or ( ( extractFloat128Exp( b ) = $7FFF )
  8619. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8620. ) then begin
  8621. float_raise( float_flag_invalid );
  8622. result := 0;
  8623. exit;
  8624. end;
  8625. result := ord(
  8626. ( a.low = b.low )
  8627. and ( ( a.high = b.high )
  8628. or ( ( a.low = 0 )
  8629. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8630. ));
  8631. end;
  8632. {*----------------------------------------------------------------------------
  8633. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8634. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8635. | cause an exception. Otherwise, the comparison is performed according to the
  8636. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8637. *----------------------------------------------------------------------------*}
  8638. function float128_le_quiet(a: float128; b: float128): flag;
  8639. var
  8640. aSign, bSign: flag;
  8641. begin
  8642. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8643. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8644. or ( ( extractFloat128Exp( b ) = $7FFF )
  8645. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8646. ) then begin
  8647. if ( (float128_is_signaling_nan( a )<>0)
  8648. or (float128_is_signaling_nan( b )<>0) ) then begin
  8649. float_raise( float_flag_invalid );
  8650. end;
  8651. result := 0;
  8652. exit;
  8653. end;
  8654. aSign := extractFloat128Sign( a );
  8655. bSign := extractFloat128Sign( b );
  8656. if ( aSign <> bSign ) then begin
  8657. result := ord(
  8658. (aSign<>0)
  8659. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8660. = 0 ));
  8661. exit;
  8662. end;
  8663. if aSign<>0 then
  8664. result := le128( b.high, b.low, a.high, a.low )
  8665. else
  8666. result := le128( a.high, a.low, b.high, b.low );
  8667. end;
  8668. {*----------------------------------------------------------------------------
  8669. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8670. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8671. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8672. | Standard for Binary Floating-Point Arithmetic.
  8673. *----------------------------------------------------------------------------*}
  8674. function float128_lt_quiet(a: float128; b: float128): flag;
  8675. var
  8676. aSign, bSign: flag;
  8677. begin
  8678. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8679. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8680. or ( ( extractFloat128Exp( b ) = $7FFF )
  8681. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8682. ) then begin
  8683. if ( (float128_is_signaling_nan( a )<>0)
  8684. or (float128_is_signaling_nan( b )<>0) ) then begin
  8685. float_raise( float_flag_invalid );
  8686. end;
  8687. result := 0;
  8688. exit;
  8689. end;
  8690. aSign := extractFloat128Sign( a );
  8691. bSign := extractFloat128Sign( b );
  8692. if ( aSign <> bSign ) then begin
  8693. result := ord(
  8694. (aSign<>0)
  8695. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8696. <> 0 ));
  8697. exit;
  8698. end;
  8699. if aSign<>0 then
  8700. result:=lt128( b.high, b.low, a.high, a.low )
  8701. else
  8702. result:=lt128( a.high, a.low, b.high, b.low );
  8703. end;
  8704. {----------------------------------------------------------------------------
  8705. | Returns the result of converting the double-precision floating-point value
  8706. | `a' to the quadruple-precision floating-point format. The conversion is
  8707. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8708. | Arithmetic.
  8709. *----------------------------------------------------------------------------}
  8710. function float64_to_float128( a : float64) : float128;
  8711. var
  8712. aSign : flag;
  8713. aExp : int16;
  8714. aSig, zSig0, zSig1 : bits64;
  8715. begin
  8716. aSig := extractFloat64Frac( a );
  8717. aExp := extractFloat64Exp( a );
  8718. aSign := extractFloat64Sign( a );
  8719. if ( aExp = $7FF ) then begin
  8720. if ( aSig<>0 ) then begin
  8721. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8722. exit;
  8723. end;
  8724. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8725. exit;
  8726. end;
  8727. if ( aExp = 0 ) then begin
  8728. if ( aSig = 0 ) then
  8729. begin
  8730. result:=packFloat128( aSign, 0, 0, 0 );
  8731. exit;
  8732. end;
  8733. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8734. dec(aExp);
  8735. end;
  8736. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8737. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8738. end;
  8739. {$endif FPC_SOFTFLOAT_FLOAT128}
  8740. {$endif not(defined(fpc_softfpu_interface))}
  8741. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8742. end.
  8743. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}