softfpu.pp 324 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768277827882798280828182828283828482858286828782888289829082918292829382948295829682978298829983008301830283038304830583068307830883098310831183128313831483158316831783188319832083218322832383248325832683278328832983308331833283338334833583368337833883398340834183428343834483458346834783488349835083518352835383548355835683578358835983608361836283638364836583668367836883698370837183728373837483758376837783788379838083818382838383848385838683878388838983908391839283938394839583968397839883998400840184028403840484058406840784088409841084118412841384148415841684178418841984208421842284238424842584268427842884298430843184328433843484358436843784388439844084418442844384448445844684478448844984508451845284538454845584568457845884598460846184628463846484658466846784688469847084718472847384748475847684778478847984808481848284838484848584868487848884898490849184928493849484958496849784988499850085018502850385048505850685078508850985108511851285138514851585168517851885198520852185228523852485258526852785288529853085318532853385348535853685378538853985408541854285438544854585468547854885498550855185528553855485558556855785588559856085618562856385648565856685678568856985708571857285738574857585768577857885798580858185828583858485858586858785888589859085918592859385948595859685978598859986008601860286038604860586068607860886098610861186128613861486158616861786188619862086218622862386248625862686278628862986308631863286338634863586368637863886398640864186428643864486458646864786488649865086518652865386548655865686578658865986608661866286638664866586668667866886698670867186728673867486758676867786788679868086818682868386848685868686878688868986908691869286938694869586968697869886998700870187028703870487058706870787088709871087118712871387148715871687178718871987208721872287238724872587268727872887298730873187328733873487358736873787388739874087418742874387448745874687478748874987508751875287538754875587568757875887598760876187628763876487658766876787688769877087718772877387748775877687778778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927792789279928092819282928392849285928692879288928992909291929292939294929592969297929892999300930193029303930493059306930793089309931093119312931393149315931693179318931993209321932293239324932593269327932893299330933193329333
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. bits8 = byte;
  88. sbits8 = shortint;
  89. bits16 = word;
  90. sbits16 = smallint;
  91. sbits32 = longint;
  92. bits32 = longword;
  93. {$ifndef fpc}
  94. qword = int64;
  95. {$endif}
  96. { now part of the system unit
  97. uint64 = qword;
  98. }
  99. bits64 = qword;
  100. sbits64 = int64;
  101. {$ifdef ENDIAN_LITTLE}
  102. float64 = record
  103. case byte of
  104. 1: (low,high : bits32);
  105. // force the record to be aligned like a double
  106. // else *_to_double will fail for cpus like sparc
  107. // and avoid expensive unpacking/packing operations
  108. 2: (dummy : double);
  109. end;
  110. int64rec = record
  111. case byte of
  112. 1: (low,high : bits32);
  113. // force the record to be aligned like a double
  114. // else *_to_double will fail for cpus like sparc
  115. // and avoid expensive unpacking/packing operations
  116. 2: (dummy : int64);
  117. end;
  118. floatx80 = record
  119. case byte of
  120. 1: (low : qword;high : word);
  121. // force the record to be aligned like a double
  122. // else *_to_double will fail for cpus like sparc
  123. // and avoid expensive unpacking/packing operations
  124. 2: (dummy : extended);
  125. end;
  126. float128 = record
  127. case byte of
  128. 1: (low,high : qword);
  129. // force the record to be aligned like a double
  130. // else *_to_double will fail for cpus like sparc
  131. // and avoid expensive unpacking/packing operations
  132. 2: (dummy : qword);
  133. end;
  134. {$else}
  135. float64 = record
  136. case byte of
  137. 1: (high,low : bits32);
  138. // force the record to be aligned like a double
  139. // else *_to_double will fail for cpus like sparc
  140. 2: (dummy : double);
  141. end;
  142. int64rec = record
  143. case byte of
  144. 1: (high,low : bits32);
  145. // force the record to be aligned like a double
  146. // else *_to_double will fail for cpus like sparc
  147. // and avoid expensive unpacking/packing operations
  148. 2: (dummy : int64);
  149. end;
  150. floatx80 = record
  151. case byte of
  152. 1: (high : word;low : qword);
  153. // force the record to be aligned like a double
  154. // else *_to_double will fail for cpus like sparc
  155. // and avoid expensive unpacking/packing operations
  156. 2: (dummy : qword);
  157. end;
  158. float128 = record
  159. case byte of
  160. 1: (high : qword;low : qword);
  161. // force the record to be aligned like a double
  162. // else *_to_double will fail for cpus like sparc
  163. // and avoid expensive unpacking/packing operations
  164. 2: (dummy : qword);
  165. end;
  166. {$endif}
  167. {$define FPC_SYSTEM_HAS_float64}
  168. {*
  169. -------------------------------------------------------------------------------
  170. Returns 1 if the double-precision floating-point value `a' is less than
  171. the corresponding value `b', and 0 otherwise. The comparison is performed
  172. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  173. -------------------------------------------------------------------------------
  174. *}
  175. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  176. {*
  177. -------------------------------------------------------------------------------
  178. Returns 1 if the double-precision floating-point value `a' is less than
  179. or equal to the corresponding value `b', and 0 otherwise. The comparison
  180. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  181. Arithmetic.
  182. -------------------------------------------------------------------------------
  183. *}
  184. Function float64_le(a: float64;b: float64): flag; compilerproc;
  185. {*
  186. -------------------------------------------------------------------------------
  187. Returns 1 if the double-precision floating-point value `a' is equal to
  188. the corresponding value `b', and 0 otherwise. The comparison is performed
  189. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  190. -------------------------------------------------------------------------------
  191. *}
  192. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  193. {*
  194. -------------------------------------------------------------------------------
  195. Returns the square root of the double-precision floating-point value `a'.
  196. The operation is performed according to the IEC/IEEE Standard for Binary
  197. Floating-Point Arithmetic.
  198. -------------------------------------------------------------------------------
  199. *}
  200. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  201. {*
  202. -------------------------------------------------------------------------------
  203. Returns the remainder of the double-precision floating-point value `a'
  204. with respect to the corresponding value `b'. The operation is performed
  205. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  206. -------------------------------------------------------------------------------
  207. *}
  208. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  209. {*
  210. -------------------------------------------------------------------------------
  211. Returns the result of dividing the double-precision floating-point value `a'
  212. by the corresponding value `b'. The operation is performed according to the
  213. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  214. -------------------------------------------------------------------------------
  215. *}
  216. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  217. {*
  218. -------------------------------------------------------------------------------
  219. Returns the result of multiplying the double-precision floating-point values
  220. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  221. for Binary Floating-Point Arithmetic.
  222. -------------------------------------------------------------------------------
  223. *}
  224. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  225. {*
  226. -------------------------------------------------------------------------------
  227. Returns the result of subtracting the double-precision floating-point values
  228. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  229. for Binary Floating-Point Arithmetic.
  230. -------------------------------------------------------------------------------
  231. *}
  232. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  233. {*
  234. -------------------------------------------------------------------------------
  235. Returns the result of adding the double-precision floating-point values `a'
  236. and `b'. The operation is performed according to the IEC/IEEE Standard for
  237. Binary Floating-Point Arithmetic.
  238. -------------------------------------------------------------------------------
  239. *}
  240. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  241. {*
  242. -------------------------------------------------------------------------------
  243. Rounds the double-precision floating-point value `a' to an integer,
  244. and returns the result as a double-precision floating-point value. The
  245. operation is performed according to the IEC/IEEE Standard for Binary
  246. Floating-Point Arithmetic.
  247. -------------------------------------------------------------------------------
  248. *}
  249. Function float64_round_to_int(a: float64) : float64; compilerproc;
  250. {*
  251. -------------------------------------------------------------------------------
  252. Returns the result of converting the double-precision floating-point value
  253. `a' to the single-precision floating-point format. The conversion is
  254. performed according to the IEC/IEEE Standard for Binary Floating-Point
  255. Arithmetic.
  256. -------------------------------------------------------------------------------
  257. *}
  258. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  259. {*
  260. -------------------------------------------------------------------------------
  261. Returns the result of converting the double-precision floating-point value
  262. `a' to the 32-bit two's complement integer format. The conversion is
  263. performed according to the IEC/IEEE Standard for Binary Floating-Point
  264. Arithmetic, except that the conversion is always rounded toward zero.
  265. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  266. the conversion overflows, the largest integer with the same sign as `a' is
  267. returned.
  268. -------------------------------------------------------------------------------
  269. *}
  270. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  271. {*
  272. -------------------------------------------------------------------------------
  273. Returns the result of converting the double-precision floating-point value
  274. `a' to the 32-bit two's complement integer format. The conversion is
  275. performed according to the IEC/IEEE Standard for Binary Floating-Point
  276. Arithmetic---which means in particular that the conversion is rounded
  277. according to the current rounding mode. If `a' is a NaN, the largest
  278. positive integer is returned. Otherwise, if the conversion overflows, the
  279. largest integer with the same sign as `a' is returned.
  280. -------------------------------------------------------------------------------
  281. *}
  282. Function float64_to_int32(a: float64): int32; compilerproc;
  283. {*
  284. -------------------------------------------------------------------------------
  285. Returns 1 if the single-precision floating-point value `a' is less than
  286. the corresponding value `b', and 0 otherwise. The comparison is performed
  287. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  288. -------------------------------------------------------------------------------
  289. *}
  290. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  291. {*
  292. -------------------------------------------------------------------------------
  293. Returns 1 if the single-precision floating-point value `a' is less than
  294. or equal to the corresponding value `b', and 0 otherwise. The comparison
  295. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  296. Arithmetic.
  297. -------------------------------------------------------------------------------
  298. *}
  299. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  300. {*
  301. -------------------------------------------------------------------------------
  302. Returns 1 if the single-precision floating-point value `a' is equal to
  303. the corresponding value `b', and 0 otherwise. The comparison is performed
  304. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  305. -------------------------------------------------------------------------------
  306. *}
  307. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  308. {*
  309. -------------------------------------------------------------------------------
  310. Returns the square root of the single-precision floating-point value `a'.
  311. The operation is performed according to the IEC/IEEE Standard for Binary
  312. Floating-Point Arithmetic.
  313. -------------------------------------------------------------------------------
  314. *}
  315. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  316. {*
  317. -------------------------------------------------------------------------------
  318. Returns the remainder of the single-precision floating-point value `a'
  319. with respect to the corresponding value `b'. The operation is performed
  320. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  321. -------------------------------------------------------------------------------
  322. *}
  323. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  324. {*
  325. -------------------------------------------------------------------------------
  326. Returns the result of dividing the single-precision floating-point value `a'
  327. by the corresponding value `b'. The operation is performed according to the
  328. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  329. -------------------------------------------------------------------------------
  330. *}
  331. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  332. {*
  333. -------------------------------------------------------------------------------
  334. Returns the result of multiplying the single-precision floating-point values
  335. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  336. for Binary Floating-Point Arithmetic.
  337. -------------------------------------------------------------------------------
  338. *}
  339. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  340. {*
  341. -------------------------------------------------------------------------------
  342. Returns the result of subtracting the single-precision floating-point values
  343. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  344. for Binary Floating-Point Arithmetic.
  345. -------------------------------------------------------------------------------
  346. *}
  347. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  348. {*
  349. -------------------------------------------------------------------------------
  350. Returns the result of adding the single-precision floating-point values `a'
  351. and `b'. The operation is performed according to the IEC/IEEE Standard for
  352. Binary Floating-Point Arithmetic.
  353. -------------------------------------------------------------------------------
  354. *}
  355. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  356. {*
  357. -------------------------------------------------------------------------------
  358. Rounds the single-precision floating-point value `a' to an integer,
  359. and returns the result as a single-precision floating-point value. The
  360. operation is performed according to the IEC/IEEE Standard for Binary
  361. Floating-Point Arithmetic.
  362. -------------------------------------------------------------------------------
  363. *}
  364. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  365. {*
  366. -------------------------------------------------------------------------------
  367. Returns the result of converting the single-precision floating-point value
  368. `a' to the double-precision floating-point format. The conversion is
  369. performed according to the IEC/IEEE Standard for Binary Floating-Point
  370. Arithmetic.
  371. -------------------------------------------------------------------------------
  372. *}
  373. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  374. {*
  375. -------------------------------------------------------------------------------
  376. Returns the result of converting the single-precision floating-point value
  377. `a' to the 32-bit two's complement integer format. The conversion is
  378. performed according to the IEC/IEEE Standard for Binary Floating-Point
  379. Arithmetic, except that the conversion is always rounded toward zero.
  380. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  381. the conversion overflows, the largest integer with the same sign as `a' is
  382. returned.
  383. -------------------------------------------------------------------------------
  384. *}
  385. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  386. {*
  387. -------------------------------------------------------------------------------
  388. Returns the result of converting the single-precision floating-point value
  389. `a' to the 32-bit two's complement integer format. The conversion is
  390. performed according to the IEC/IEEE Standard for Binary Floating-Point
  391. Arithmetic---which means in particular that the conversion is rounded
  392. according to the current rounding mode. If `a' is a NaN, the largest
  393. positive integer is returned. Otherwise, if the conversion overflows, the
  394. largest integer with the same sign as `a' is returned.
  395. -------------------------------------------------------------------------------
  396. *}
  397. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  398. {*
  399. -------------------------------------------------------------------------------
  400. Returns the result of converting the 32-bit two's complement integer `a' to
  401. the double-precision floating-point format. The conversion is performed
  402. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  403. -------------------------------------------------------------------------------
  404. *}
  405. Function int32_to_float64( a: int32) : float64; compilerproc;
  406. {*
  407. -------------------------------------------------------------------------------
  408. Returns the result of converting the 32-bit two's complement integer `a' to
  409. the single-precision floating-point format. The conversion is performed
  410. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  411. -------------------------------------------------------------------------------
  412. *}
  413. Function int32_to_float32( a: int32): float32rec; compilerproc;
  414. {*----------------------------------------------------------------------------
  415. | Returns the result of converting the 64-bit two's complement integer `a'
  416. | to the double-precision floating-point format. The conversion is performed
  417. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  418. *----------------------------------------------------------------------------*}
  419. Function int64_to_float64( a: int64 ): float64; compilerproc;
  420. Function qword_to_float64( a: qword ): float64; compilerproc;
  421. {*----------------------------------------------------------------------------
  422. | Returns the result of converting the 64-bit two's complement integer `a'
  423. | to the single-precision floating-point format. The conversion is performed
  424. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  425. *----------------------------------------------------------------------------*}
  426. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  427. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  428. // +++
  429. function float32_to_int64( a: float32 ): int64;
  430. function float32_to_int64_round_to_zero( a: float32 ): int64;
  431. function float32_eq_signaling( a: float32; b: float32) : flag;
  432. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  433. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  434. function float32_is_signaling_nan( a : float32 ): flag;
  435. function float32_is_nan( a : float32 ): flag;
  436. function float64_to_int64( a: float64 ): int64;
  437. function float64_to_int64_round_to_zero( a: float64 ): int64;
  438. function float64_eq_signaling( a: float64; b: float64): flag;
  439. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  440. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  441. function float64_is_signaling_nan( a : float64 ): flag;
  442. function float64_is_nan( a : float64 ): flag;
  443. // ===
  444. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  445. {*----------------------------------------------------------------------------
  446. | Extended double-precision rounding precision
  447. *----------------------------------------------------------------------------*}
  448. var // threadvar!?
  449. floatx80_rounding_precision : int8 = 80;
  450. function int32_to_floatx80( a: int32 ): floatx80;
  451. function int64_to_floatx80( a: int64 ): floatx80;
  452. function qword_to_floatx80( a: qword ): floatx80;
  453. function float32_to_floatx80( a: float32 ): floatx80;
  454. function float64_to_floatx80( a: float64 ): floatx80;
  455. function floatx80_to_int32( a: floatx80 ): int32;
  456. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  457. function floatx80_to_int64( a: floatx80 ): int64;
  458. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  459. function floatx80_to_float32( a: floatx80 ): float32;
  460. function floatx80_to_float64( a: floatx80 ): float64;
  461. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  462. function floatx80_to_float128( a: floatx80 ): float128;
  463. {$endif FPC_SOFTFLOAT_FLOAT128}
  464. function floatx80_round_to_int( a: floatx80 ): floatx80;
  465. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  466. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  467. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  468. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  469. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  470. function floatx80_sqrt( a: floatx80 ): floatx80;
  471. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  472. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  473. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  474. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  475. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  476. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  477. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  478. function floatx80_is_nan(a : floatx80 ): flag;
  479. {$endif FPC_SOFTFLOAT_FLOATX80}
  480. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  481. function int32_to_float128( a: int32 ): float128;
  482. function int64_to_float128( a: int64 ): float128;
  483. function qword_to_float128( a: qword ): float128;
  484. function float32_to_float128( a: float32 ): float128;
  485. function float128_is_nan( a : float128): flag;
  486. function float128_is_signaling_nan( a : float128): flag;
  487. function float128_to_int32(a: float128): int32;
  488. function float128_to_int32_round_to_zero(a: float128): int32;
  489. function float128_to_int64(a: float128): int64;
  490. function float128_to_int64_round_to_zero(a: float128): int64;
  491. function float128_to_float32(a: float128): float32;
  492. function float128_to_float64(a: float128): float64;
  493. function float64_to_float128( a : float64) : float128;
  494. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  495. function float128_to_floatx80(a: float128): floatx80;
  496. {$endif FPC_SOFTFLOAT_FLOATX80}
  497. function float128_round_to_int(a: float128): float128;
  498. function float128_add(a: float128; b: float128): float128;
  499. function float128_sub(a: float128; b: float128): float128;
  500. function float128_mul(a: float128; b: float128): float128;
  501. function float128_div(a: float128; b: float128): float128;
  502. function float128_rem(a: float128; b: float128): float128;
  503. function float128_sqrt(a: float128): float128;
  504. function float128_eq(a: float128; b: float128): flag;
  505. function float128_le(a: float128; b: float128): flag;
  506. function float128_lt(a: float128; b: float128): flag;
  507. function float128_eq_signaling(a: float128; b: float128): flag;
  508. function float128_le_quiet(a: float128; b: float128): flag;
  509. function float128_lt_quiet(a: float128; b: float128): flag;
  510. {$endif FPC_SOFTFLOAT_FLOAT128}
  511. CONST
  512. {-------------------------------------------------------------------------------
  513. Software IEC/IEEE floating-point underflow tininess-detection mode.
  514. -------------------------------------------------------------------------------
  515. *}
  516. float_tininess_after_rounding = 0;
  517. float_tininess_before_rounding = 1;
  518. {*
  519. -------------------------------------------------------------------------------
  520. Underflow tininess-detection mode, statically initialized to default value.
  521. (The declaration in `softfloat.h' must match the `int8' type here.)
  522. -------------------------------------------------------------------------------
  523. *}
  524. var // threadvar!?
  525. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  526. {$endif not(defined(fpc_softfpu_implementation))}
  527. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  528. implementation
  529. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  530. {$if not(defined(fpc_softfpu_interface))}
  531. (*****************************************************************************)
  532. (*----------------------------------------------------------------------------*)
  533. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  534. (* division and square root approximations. (Can be specialized to target if *)
  535. (* desired.) *)
  536. (* ---------------------------------------------------------------------------*)
  537. (*****************************************************************************)
  538. {*----------------------------------------------------------------------------
  539. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  540. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  541. | input. If `zSign' is 1, the input is negated before being converted to an
  542. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  543. | is simply rounded to an integer, with the inexact exception raised if the
  544. | input cannot be represented exactly as an integer. However, if the fixed-
  545. | point input is too large, the invalid exception is raised and the largest
  546. | positive or negative integer is returned.
  547. *----------------------------------------------------------------------------*}
  548. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  549. var
  550. roundingMode: int8;
  551. roundNearestEven: flag;
  552. roundIncrement, roundBits: int8;
  553. z: int32;
  554. begin
  555. roundingMode := softfloat_rounding_mode;
  556. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  557. roundIncrement := $40;
  558. if ( roundNearestEven=0 ) then
  559. begin
  560. if ( roundingMode = float_round_to_zero ) then
  561. begin
  562. roundIncrement := 0;
  563. end
  564. else begin
  565. roundIncrement := $7F;
  566. if ( zSign<>0 ) then
  567. begin
  568. if ( roundingMode = float_round_up ) then
  569. roundIncrement := 0;
  570. end
  571. else begin
  572. if ( roundingMode = float_round_down ) then
  573. roundIncrement := 0;
  574. end;
  575. end;
  576. end;
  577. roundBits := absZ and $7F;
  578. absZ := ( absZ + roundIncrement ) shr 7;
  579. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  580. z := absZ;
  581. if ( zSign<>0 ) then
  582. z := - z;
  583. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  584. begin
  585. float_raise( float_flag_invalid );
  586. if zSign<>0 then
  587. result:=sbits32($80000000)
  588. else
  589. result:=$7FFFFFFF;
  590. exit;
  591. end;
  592. if ( roundBits<>0 ) then
  593. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  594. result:=z;
  595. end;
  596. {*----------------------------------------------------------------------------
  597. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  598. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  599. | and returns the properly rounded 64-bit integer corresponding to the input.
  600. | If `zSign' is 1, the input is negated before being converted to an integer.
  601. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  602. | the inexact exception raised if the input cannot be represented exactly as
  603. | an integer. However, if the fixed-point input is too large, the invalid
  604. | exception is raised and the largest positive or negative integer is
  605. | returned.
  606. *----------------------------------------------------------------------------*}
  607. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  608. var
  609. roundingMode: int8;
  610. roundNearestEven, increment: flag;
  611. z: int64;
  612. label
  613. overflow;
  614. begin
  615. roundingMode := softfloat_rounding_mode;
  616. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  617. increment := ord( sbits64(absZ1) < 0 );
  618. if ( roundNearestEven=0 ) then
  619. begin
  620. if ( roundingMode = float_round_to_zero ) then
  621. begin
  622. increment := 0;
  623. end
  624. else begin
  625. if ( zSign<>0 ) then
  626. begin
  627. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  628. end
  629. else begin
  630. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  631. end;
  632. end;
  633. end;
  634. if ( increment<>0 ) then
  635. begin
  636. inc(absZ0);
  637. if ( absZ0 = 0 ) then
  638. goto overflow;
  639. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  640. end;
  641. z := absZ0;
  642. if ( zSign<>0 ) then
  643. z := - z;
  644. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  645. begin
  646. overflow:
  647. float_raise( float_flag_invalid );
  648. if zSign<>0 then
  649. result:=int64($8000000000000000)
  650. else
  651. result:=int64($7FFFFFFFFFFFFFFF);
  652. exit;
  653. end;
  654. if ( absZ1<>0 ) then
  655. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  656. result:=z;
  657. end;
  658. {*
  659. -------------------------------------------------------------------------------
  660. Shifts `a' right by the number of bits given in `count'. If any nonzero
  661. bits are shifted off, they are ``jammed'' into the least significant bit of
  662. the result by setting the least significant bit to 1. The value of `count'
  663. can be arbitrarily large; in particular, if `count' is greater than 32, the
  664. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  665. The result is stored in the location pointed to by `zPtr'.
  666. -------------------------------------------------------------------------------
  667. *}
  668. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  669. var
  670. z: Bits32;
  671. Begin
  672. if ( count = 0 ) then
  673. z := a
  674. else
  675. if ( count < 32 ) then
  676. Begin
  677. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  678. End
  679. else
  680. Begin
  681. z := bits32( a <> 0 );
  682. End;
  683. zPtr := z;
  684. End;
  685. {*----------------------------------------------------------------------------
  686. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  687. | number of bits given in `count'. Any bits shifted off are lost. The value
  688. | of `count' can be arbitrarily large; in particular, if `count' is greater
  689. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  690. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  691. *----------------------------------------------------------------------------*}
  692. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  693. var
  694. z0, z1: bits64;
  695. negCount: int8;
  696. begin
  697. negCount := ( - count ) and 63;
  698. if ( count = 0 ) then
  699. begin
  700. z1 := a1;
  701. z0 := a0;
  702. end
  703. else if ( count < 64 ) then
  704. begin
  705. z1 := ( a0 shl negCount ) or ( a1 shr count );
  706. z0 := a0 shr count;
  707. end
  708. else
  709. begin
  710. if ( count < 64 ) then
  711. z1 := a0 shr ( count and 63 )
  712. else
  713. z1 := 0;
  714. z0 := 0;
  715. end;
  716. z1Ptr := z1;
  717. z0Ptr := z0;
  718. end;
  719. {*----------------------------------------------------------------------------
  720. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  721. | number of bits given in `count'. If any nonzero bits are shifted off, they
  722. | are ``jammed'' into the least significant bit of the result by setting the
  723. | least significant bit to 1. The value of `count' can be arbitrarily large;
  724. | in particular, if `count' is greater than 128, the result will be either
  725. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  726. | nonzero. The result is broken into two 64-bit pieces which are stored at
  727. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  728. *----------------------------------------------------------------------------*}
  729. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  730. var
  731. z0,z1 : bits64;
  732. negCount : int8;
  733. begin
  734. negCount := ( - count ) and 63;
  735. if ( count = 0 ) then begin
  736. z1 := a1;
  737. z0 := a0;
  738. end
  739. else if ( count < 64 ) then begin
  740. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  741. z0 := a0>>count;
  742. end
  743. else begin
  744. if ( count = 64 ) then begin
  745. z1 := a0 or ord( a1 <> 0 );
  746. end
  747. else if ( count < 128 ) then begin
  748. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  749. end
  750. else begin
  751. z1 := ord( ( a0 or a1 ) <> 0 );
  752. end;
  753. z0 := 0;
  754. end;
  755. z1Ptr := z1;
  756. z0Ptr := z0;
  757. end;
  758. {*
  759. -------------------------------------------------------------------------------
  760. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  761. number of bits given in `count'. Any bits shifted off are lost. The value
  762. of `count' can be arbitrarily large; in particular, if `count' is greater
  763. than 64, the result will be 0. The result is broken into two 32-bit pieces
  764. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  765. -------------------------------------------------------------------------------
  766. *}
  767. Procedure
  768. shift64Right(
  769. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  770. Var
  771. z0, z1: bits32;
  772. negCount : int8;
  773. Begin
  774. negCount := ( - count ) AND 31;
  775. if ( count = 0 ) then
  776. Begin
  777. z1 := a1;
  778. z0 := a0;
  779. End
  780. else if ( count < 32 ) then
  781. Begin
  782. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  783. z0 := a0 shr count;
  784. End
  785. else
  786. Begin
  787. if (count < 64) then
  788. z1 := ( a0 shr ( count AND 31 ) )
  789. else
  790. z1 := 0;
  791. z0 := 0;
  792. End;
  793. z1Ptr := z1;
  794. z0Ptr := z0;
  795. End;
  796. {*
  797. -------------------------------------------------------------------------------
  798. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  799. number of bits given in `count'. If any nonzero bits are shifted off, they
  800. are ``jammed'' into the least significant bit of the result by setting the
  801. least significant bit to 1. The value of `count' can be arbitrarily large;
  802. in particular, if `count' is greater than 64, the result will be either 0
  803. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  804. nonzero. The result is broken into two 32-bit pieces which are stored at
  805. the locations pointed to by `z0Ptr' and `z1Ptr'.
  806. -------------------------------------------------------------------------------
  807. *}
  808. Procedure
  809. shift64RightJamming(
  810. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  811. VAR
  812. z0, z1 : bits32;
  813. negCount : int8;
  814. Begin
  815. negCount := ( - count ) AND 31;
  816. if ( count = 0 ) then
  817. Begin
  818. z1 := a1;
  819. z0 := a0;
  820. End
  821. else
  822. if ( count < 32 ) then
  823. Begin
  824. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  825. z0 := a0 shr count;
  826. End
  827. else
  828. Begin
  829. if ( count = 32 ) then
  830. Begin
  831. z1 := a0 OR bits32( a1 <> 0 );
  832. End
  833. else
  834. if ( count < 64 ) Then
  835. Begin
  836. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  837. End
  838. else
  839. Begin
  840. z1 := bits32( ( a0 OR a1 ) <> 0 );
  841. End;
  842. z0 := 0;
  843. End;
  844. z1Ptr := z1;
  845. z0Ptr := z0;
  846. End;
  847. {*----------------------------------------------------------------------------
  848. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  849. | bits are shifted off, they are ``jammed'' into the least significant bit of
  850. | the result by setting the least significant bit to 1. The value of `count'
  851. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  852. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  853. | The result is stored in the location pointed to by `zPtr'.
  854. *----------------------------------------------------------------------------*}
  855. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  856. var
  857. z: bits64;
  858. begin
  859. if ( count = 0 ) then
  860. begin
  861. z := a;
  862. end
  863. else if ( count < 64 ) then
  864. begin
  865. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  866. end
  867. else
  868. begin
  869. z := ord( a <> 0 );
  870. end;
  871. zPtr := z;
  872. end;
  873. {$if not defined(shift64ExtraRightJamming)}
  874. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  875. overload;
  876. forward;
  877. {$endif}
  878. {*
  879. -------------------------------------------------------------------------------
  880. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  881. by 32 _plus_ the number of bits given in `count'. The shifted result is
  882. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  883. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  884. off form a third 32-bit result as follows: The _last_ bit shifted off is
  885. the most-significant bit of the extra result, and the other 31 bits of the
  886. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  887. were all zero. This extra result is stored in the location pointed to by
  888. `z2Ptr'. The value of `count' can be arbitrarily large.
  889. (This routine makes more sense if `a0', `a1', and `a2' are considered
  890. to form a fixed-point value with binary point between `a1' and `a2'. This
  891. fixed-point value is shifted right by the number of bits given in `count',
  892. and the integer part of the result is returned at the locations pointed to
  893. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  894. corrupted as described above, and is returned at the location pointed to by
  895. `z2Ptr'.)
  896. -------------------------------------------------------------------------------
  897. }
  898. Procedure
  899. shift64ExtraRightJamming(
  900. a0: bits32;
  901. a1: bits32;
  902. a2: bits32;
  903. count: int16;
  904. VAR z0Ptr: bits32;
  905. VAR z1Ptr: bits32;
  906. VAR z2Ptr: bits32
  907. ); overload;
  908. Var
  909. z0, z1, z2: bits32;
  910. negCount : int8;
  911. Begin
  912. negCount := ( - count ) AND 31;
  913. if ( count = 0 ) then
  914. Begin
  915. z2 := a2;
  916. z1 := a1;
  917. z0 := a0;
  918. End
  919. else
  920. Begin
  921. if ( count < 32 ) Then
  922. Begin
  923. z2 := a1 shl negCount;
  924. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  925. z0 := a0 shr count;
  926. End
  927. else
  928. Begin
  929. if ( count = 32 ) then
  930. Begin
  931. z2 := a1;
  932. z1 := a0;
  933. End
  934. else
  935. Begin
  936. a2 := a2 or a1;
  937. if ( count < 64 ) then
  938. Begin
  939. z2 := a0 shl negCount;
  940. z1 := a0 shr ( count AND 31 );
  941. End
  942. else
  943. Begin
  944. if count = 64 then
  945. z2 := a0
  946. else
  947. z2 := bits32(a0 <> 0);
  948. z1 := 0;
  949. End;
  950. End;
  951. z0 := 0;
  952. End;
  953. z2 := z2 or bits32( a2 <> 0 );
  954. End;
  955. z2Ptr := z2;
  956. z1Ptr := z1;
  957. z0Ptr := z0;
  958. End;
  959. {*
  960. -------------------------------------------------------------------------------
  961. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  962. number of bits given in `count'. Any bits shifted off are lost. The value
  963. of `count' must be less than 32. The result is broken into two 32-bit
  964. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  965. -------------------------------------------------------------------------------
  966. *}
  967. Procedure
  968. shortShift64Left(
  969. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  970. Begin
  971. z1Ptr := a1 shl count;
  972. if count = 0 then
  973. z0Ptr := a0
  974. else
  975. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  976. End;
  977. {*
  978. -------------------------------------------------------------------------------
  979. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  980. by the number of bits given in `count'. Any bits shifted off are lost.
  981. The value of `count' must be less than 32. The result is broken into three
  982. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  983. `z1Ptr', and `z2Ptr'.
  984. -------------------------------------------------------------------------------
  985. *}
  986. Procedure
  987. shortShift96Left(
  988. a0: bits32;
  989. a1: bits32;
  990. a2: bits32;
  991. count: int16;
  992. VAR z0Ptr: bits32;
  993. VAR z1Ptr: bits32;
  994. VAR z2Ptr: bits32
  995. );
  996. Var
  997. z0, z1, z2: bits32;
  998. negCount: int8;
  999. Begin
  1000. z2 := a2 shl count;
  1001. z1 := a1 shl count;
  1002. z0 := a0 shl count;
  1003. if ( 0 < count ) then
  1004. Begin
  1005. negCount := ( ( - count ) AND 31 );
  1006. z1 := z1 or (a2 shr negCount);
  1007. z0 := z0 or (a1 shr negCount);
  1008. End;
  1009. z2Ptr := z2;
  1010. z1Ptr := z1;
  1011. z0Ptr := z0;
  1012. End;
  1013. {*----------------------------------------------------------------------------
  1014. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1015. | number of bits given in `count'. Any bits shifted off are lost. The value
  1016. | of `count' must be less than 64. The result is broken into two 64-bit
  1017. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1018. *----------------------------------------------------------------------------*}
  1019. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1020. begin
  1021. z1Ptr := a1 shl count;
  1022. if count=0 then
  1023. z0Ptr:=a0
  1024. else
  1025. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1026. end;
  1027. {*
  1028. -------------------------------------------------------------------------------
  1029. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1030. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1031. any carry out is lost. The result is broken into two 32-bit pieces which
  1032. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1033. -------------------------------------------------------------------------------
  1034. *}
  1035. Procedure
  1036. add64(
  1037. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  1038. Var
  1039. z1: bits32;
  1040. Begin
  1041. z1 := a1 + b1;
  1042. z1Ptr := z1;
  1043. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1044. End;
  1045. {*
  1046. -------------------------------------------------------------------------------
  1047. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1048. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1049. modulo 2^96, so any carry out is lost. The result is broken into three
  1050. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1051. `z1Ptr', and `z2Ptr'.
  1052. -------------------------------------------------------------------------------
  1053. *}
  1054. Procedure
  1055. add96(
  1056. a0: bits32;
  1057. a1: bits32;
  1058. a2: bits32;
  1059. b0: bits32;
  1060. b1: bits32;
  1061. b2: bits32;
  1062. VAR z0Ptr: bits32;
  1063. VAR z1Ptr: bits32;
  1064. VAR z2Ptr: bits32
  1065. );
  1066. var
  1067. z0, z1, z2: bits32;
  1068. carry0, carry1: int8;
  1069. Begin
  1070. z2 := a2 + b2;
  1071. carry1 := int8( z2 < a2 );
  1072. z1 := a1 + b1;
  1073. carry0 := int8( z1 < a1 );
  1074. z0 := a0 + b0;
  1075. z1 := z1 + carry1;
  1076. z0 := z0 + bits32( z1 < carry1 );
  1077. z0 := z0 + carry0;
  1078. z2Ptr := z2;
  1079. z1Ptr := z1;
  1080. z0Ptr := z0;
  1081. End;
  1082. {*----------------------------------------------------------------------------
  1083. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1084. | by the number of bits given in `count'. Any bits shifted off are lost.
  1085. | The value of `count' must be less than 64. The result is broken into three
  1086. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1087. | `z1Ptr', and `z2Ptr'.
  1088. *----------------------------------------------------------------------------*}
  1089. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1090. var
  1091. z0, z1, z2 : bits64;
  1092. negCount : int8;
  1093. begin
  1094. z2 := a2 shl count;
  1095. z1 := a1 shl count;
  1096. z0 := a0 shl count;
  1097. if ( 0 < count ) then
  1098. begin
  1099. negCount := ( ( - count ) and 63 );
  1100. z1 := z1 or (a2 shr negCount);
  1101. z0 := z0 or (a1 shr negCount);
  1102. end;
  1103. z2Ptr := z2;
  1104. z1Ptr := z1;
  1105. z0Ptr := z0;
  1106. end;
  1107. {*----------------------------------------------------------------------------
  1108. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1109. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1110. | any carry out is lost. The result is broken into two 64-bit pieces which
  1111. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1112. *----------------------------------------------------------------------------*}
  1113. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1114. var
  1115. z1 : bits64;
  1116. begin
  1117. z1 := a1 + b1;
  1118. z1Ptr := z1;
  1119. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1120. end;
  1121. {*----------------------------------------------------------------------------
  1122. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1123. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1124. | modulo 2^192, so any carry out is lost. The result is broken into three
  1125. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1126. | `z1Ptr', and `z2Ptr'.
  1127. *----------------------------------------------------------------------------*}
  1128. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1129. var
  1130. z0, z1, z2 : bits64;
  1131. carry0, carry1 : int8;
  1132. begin
  1133. z2 := a2 + b2;
  1134. carry1 := ord( z2 < a2 );
  1135. z1 := a1 + b1;
  1136. carry0 := ord( z1 < a1 );
  1137. z0 := a0 + b0;
  1138. inc(z1, carry1);
  1139. inc(z0, ord( z1 < carry1 ));
  1140. inc(z0, carry0);
  1141. z2Ptr := z2;
  1142. z1Ptr := z1;
  1143. z0Ptr := z0;
  1144. end;
  1145. {*
  1146. -------------------------------------------------------------------------------
  1147. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1148. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1149. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1150. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1151. `z1Ptr'.
  1152. -------------------------------------------------------------------------------
  1153. *}
  1154. Procedure
  1155. sub64(
  1156. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1157. Begin
  1158. z1Ptr := a1 - b1;
  1159. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1160. End;
  1161. {*
  1162. -------------------------------------------------------------------------------
  1163. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1164. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1165. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1166. into three 32-bit pieces which are stored at the locations pointed to by
  1167. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1168. -------------------------------------------------------------------------------
  1169. *}
  1170. Procedure
  1171. sub96(
  1172. a0:bits32;
  1173. a1:bits32;
  1174. a2:bits32;
  1175. b0:bits32;
  1176. b1:bits32;
  1177. b2:bits32;
  1178. VAR z0Ptr:bits32;
  1179. VAR z1Ptr:bits32;
  1180. VAR z2Ptr:bits32
  1181. );
  1182. Var
  1183. z0, z1, z2: bits32;
  1184. borrow0, borrow1: int8;
  1185. Begin
  1186. z2 := a2 - b2;
  1187. borrow1 := int8( a2 < b2 );
  1188. z1 := a1 - b1;
  1189. borrow0 := int8( a1 < b1 );
  1190. z0 := a0 - b0;
  1191. z0 := z0 - bits32( z1 < borrow1 );
  1192. z1 := z1 - borrow1;
  1193. z0 := z0 -borrow0;
  1194. z2Ptr := z2;
  1195. z1Ptr := z1;
  1196. z0Ptr := z0;
  1197. End;
  1198. {*----------------------------------------------------------------------------
  1199. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1200. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1201. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1202. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1203. | `z1Ptr'.
  1204. *----------------------------------------------------------------------------*}
  1205. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1206. begin
  1207. z1Ptr := a1 - b1;
  1208. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1209. end;
  1210. {*----------------------------------------------------------------------------
  1211. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1212. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1213. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1214. | result is broken into three 64-bit pieces which are stored at the locations
  1215. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1216. *----------------------------------------------------------------------------*}
  1217. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1218. var
  1219. z0, z1, z2 : bits64;
  1220. borrow0, borrow1 : int8;
  1221. begin
  1222. z2 := a2 - b2;
  1223. borrow1 := ord( a2 < b2 );
  1224. z1 := a1 - b1;
  1225. borrow0 := ord( a1 < b1 );
  1226. z0 := a0 - b0;
  1227. dec(z0, ord( z1 < borrow1 ));
  1228. dec(z1, borrow1);
  1229. dec(z0, borrow0);
  1230. z2Ptr := z2;
  1231. z1Ptr := z1;
  1232. z0Ptr := z0;
  1233. end;
  1234. {*
  1235. -------------------------------------------------------------------------------
  1236. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1237. into two 32-bit pieces which are stored at the locations pointed to by
  1238. `z0Ptr' and `z1Ptr'.
  1239. -------------------------------------------------------------------------------
  1240. *}
  1241. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1242. :bits32 );
  1243. Var
  1244. aHigh, aLow, bHigh, bLow: bits16;
  1245. z0, zMiddleA, zMiddleB, z1: bits32;
  1246. Begin
  1247. aLow := a and $ffff;
  1248. aHigh := a shr 16;
  1249. bLow := b and $ffff;
  1250. bHigh := b shr 16;
  1251. z1 := ( bits32( aLow) ) * bLow;
  1252. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1253. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1254. z0 := ( bits32 (aHigh) ) * bHigh;
  1255. zMiddleA := zMiddleA + zMiddleB;
  1256. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1257. zMiddleA := zmiddleA shl 16;
  1258. z1 := z1 + zMiddleA;
  1259. z0 := z0 + bits32( z1 < zMiddleA );
  1260. z1Ptr := z1;
  1261. z0Ptr := z0;
  1262. End;
  1263. {*
  1264. -------------------------------------------------------------------------------
  1265. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1266. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1267. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1268. `z2Ptr'.
  1269. -------------------------------------------------------------------------------
  1270. *}
  1271. Procedure
  1272. mul64By32To96(
  1273. a0:bits32;
  1274. a1:bits32;
  1275. b:bits32;
  1276. VAR z0Ptr:bits32;
  1277. VAR z1Ptr:bits32;
  1278. VAR z2Ptr:bits32
  1279. );
  1280. Var
  1281. z0, z1, z2, more1: bits32;
  1282. Begin
  1283. mul32To64( a1, b, z1, z2 );
  1284. mul32To64( a0, b, z0, more1 );
  1285. add64( z0, more1, 0, z1, z0, z1 );
  1286. z2Ptr := z2;
  1287. z1Ptr := z1;
  1288. z0Ptr := z0;
  1289. End;
  1290. {*
  1291. -------------------------------------------------------------------------------
  1292. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1293. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1294. product. The product is broken into four 32-bit pieces which are stored at
  1295. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1296. -------------------------------------------------------------------------------
  1297. *}
  1298. Procedure
  1299. mul64To128(
  1300. a0:bits32;
  1301. a1:bits32;
  1302. b0:bits32;
  1303. b1:bits32;
  1304. VAR z0Ptr:bits32;
  1305. VAR z1Ptr:bits32;
  1306. VAR z2Ptr:bits32;
  1307. VAR z3Ptr:bits32
  1308. );
  1309. Var
  1310. z0, z1, z2, z3: bits32;
  1311. more1, more2: bits32;
  1312. Begin
  1313. mul32To64( a1, b1, z2, z3 );
  1314. mul32To64( a1, b0, z1, more2 );
  1315. add64( z1, more2, 0, z2, z1, z2 );
  1316. mul32To64( a0, b0, z0, more1 );
  1317. add64( z0, more1, 0, z1, z0, z1 );
  1318. mul32To64( a0, b1, more1, more2 );
  1319. add64( more1, more2, 0, z2, more1, z2 );
  1320. add64( z0, z1, 0, more1, z0, z1 );
  1321. z3Ptr := z3;
  1322. z2Ptr := z2;
  1323. z1Ptr := z1;
  1324. z0Ptr := z0;
  1325. End;
  1326. {*----------------------------------------------------------------------------
  1327. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1328. | into two 64-bit pieces which are stored at the locations pointed to by
  1329. | `z0Ptr' and `z1Ptr'.
  1330. *----------------------------------------------------------------------------*}
  1331. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1332. var
  1333. aHigh, aLow, bHigh, bLow : bits32;
  1334. z0, zMiddleA, zMiddleB, z1 : bits64;
  1335. begin
  1336. aLow := a;
  1337. aHigh := a shr 32;
  1338. bLow := b;
  1339. bHigh := b shr 32;
  1340. z1 := ( bits64(aLow) ) * bLow;
  1341. zMiddleA := ( bits64( aLow )) * bHigh;
  1342. zMiddleB := ( bits64( aHigh )) * bLow;
  1343. z0 := ( bits64(aHigh) ) * bHigh;
  1344. inc(zMiddleA, zMiddleB);
  1345. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1346. zMiddleA := zMiddleA shl 32;
  1347. inc(z1, zMiddleA);
  1348. inc(z0, ord( z1 < zMiddleA ));
  1349. z1Ptr := z1;
  1350. z0Ptr := z0;
  1351. end;
  1352. {*----------------------------------------------------------------------------
  1353. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1354. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1355. | product. The product is broken into four 64-bit pieces which are stored at
  1356. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1357. *----------------------------------------------------------------------------*}
  1358. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1359. var
  1360. z0,z1,z2,z3,more1,more2 : bits64;
  1361. begin
  1362. mul64To128( a1, b1, z2, z3 );
  1363. mul64To128( a1, b0, z1, more2 );
  1364. add128( z1, more2, 0, z2, z1, z2 );
  1365. mul64To128( a0, b0, z0, more1 );
  1366. add128( z0, more1, 0, z1, z0, z1 );
  1367. mul64To128( a0, b1, more1, more2 );
  1368. add128( more1, more2, 0, z2, more1, z2 );
  1369. add128( z0, z1, 0, more1, z0, z1 );
  1370. z3Ptr := z3;
  1371. z2Ptr := z2;
  1372. z1Ptr := z1;
  1373. z0Ptr := z0;
  1374. end;
  1375. {*----------------------------------------------------------------------------
  1376. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1377. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1378. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1379. | `z2Ptr'.
  1380. *----------------------------------------------------------------------------*}
  1381. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1382. var
  1383. z0, z1, z2, more1 : bits64;
  1384. begin
  1385. mul64To128( a1, b, z1, z2 );
  1386. mul64To128( a0, b, z0, more1 );
  1387. add128( z0, more1, 0, z1, z0, z1 );
  1388. z2Ptr := z2;
  1389. z1Ptr := z1;
  1390. z0Ptr := z0;
  1391. end;
  1392. {*----------------------------------------------------------------------------
  1393. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1394. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1395. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1396. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1397. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1398. | unsigned integer is returned.
  1399. *----------------------------------------------------------------------------*}
  1400. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1401. var
  1402. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1403. begin
  1404. if ( b <= a0 ) then
  1405. begin
  1406. result:=qword( $FFFFFFFFFFFFFFFF );
  1407. exit;
  1408. end;
  1409. b0 := b shr 32;
  1410. if ( b0 shl 32 <= a0 ) then
  1411. z:=qword( $FFFFFFFF00000000 )
  1412. else
  1413. z:=( a0 div b0 ) shl 32;
  1414. mul64To128( b, z, term0, term1 );
  1415. sub128( a0, a1, term0, term1, rem0, rem1 );
  1416. while ( ( sbits64(rem0) ) < 0 ) do begin
  1417. dec(z,qword( $100000000 ));
  1418. b1 := b shl 32;
  1419. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1420. end;
  1421. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1422. if ( b0 shl 32 <= rem0 ) then
  1423. z:=z or $FFFFFFFF
  1424. else
  1425. z:=z or rem0 div b0;
  1426. result:=z;
  1427. end;
  1428. {*
  1429. -------------------------------------------------------------------------------
  1430. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1431. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1432. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1433. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1434. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1435. unsigned integer is returned.
  1436. -------------------------------------------------------------------------------
  1437. *}
  1438. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1439. Var
  1440. b0, b1: bits32;
  1441. rem0, rem1, term0, term1: bits32;
  1442. z: bits32;
  1443. Begin
  1444. if ( b <= a0 ) then
  1445. Begin
  1446. estimateDiv64To32 := $FFFFFFFF;
  1447. exit;
  1448. End;
  1449. b0 := b shr 16;
  1450. if ( b0 shl 16 <= a0 ) then
  1451. z:= $FFFF0000
  1452. else
  1453. z:= ( a0 div b0 ) shl 16;
  1454. mul32To64( b, z, term0, term1 );
  1455. sub64( a0, a1, term0, term1, rem0, rem1 );
  1456. while ( ( sbits32 (rem0) ) < 0 ) do
  1457. Begin
  1458. z := z - $10000;
  1459. b1 := b shl 16;
  1460. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1461. End;
  1462. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1463. if ( b0 shl 16 <= rem0 ) then
  1464. z := z or $FFFF
  1465. else
  1466. z := z or (rem0 div b0);
  1467. estimateDiv64To32 := z;
  1468. End;
  1469. {*
  1470. -------------------------------------------------------------------------------
  1471. Returns an approximation to the square root of the 32-bit significand given
  1472. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1473. `aExp' (the least significant bit) is 1, the integer returned approximates
  1474. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1475. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1476. case, the approximation returned lies strictly within +/-2 of the exact
  1477. value.
  1478. -------------------------------------------------------------------------------
  1479. *}
  1480. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1481. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1482. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1483. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1484. );
  1485. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1486. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1487. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1488. );
  1489. Var
  1490. index: int8;
  1491. z: bits32;
  1492. Begin
  1493. index := ( a shr 27 ) AND 15;
  1494. if ( aExp AND 1 ) <> 0 then
  1495. Begin
  1496. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1497. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1498. a := a shr 1;
  1499. End
  1500. else
  1501. Begin
  1502. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1503. z := a div z + z;
  1504. if ( $20000 <= z ) then
  1505. z := $FFFF8000
  1506. else
  1507. z := ( z shl 15 );
  1508. if ( z <= a ) then
  1509. Begin
  1510. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1511. exit;
  1512. End;
  1513. End;
  1514. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1515. End;
  1516. {*
  1517. -------------------------------------------------------------------------------
  1518. Returns the number of leading 0 bits before the most-significant 1 bit of
  1519. `a'. If `a' is zero, 32 is returned.
  1520. -------------------------------------------------------------------------------
  1521. *}
  1522. Function countLeadingZeros32( a:bits32 ): int8;
  1523. const countLeadingZerosHigh:array[0..255] of int8 = (
  1524. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1525. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1526. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1527. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1528. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1529. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1530. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1531. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1532. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1533. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1534. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1535. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1536. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1537. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1538. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1539. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1540. );
  1541. Var
  1542. shiftCount: int8;
  1543. Begin
  1544. shiftCount := 0;
  1545. if ( a < $10000 ) then
  1546. Begin
  1547. shiftCount := shiftcount + 16;
  1548. a := a shl 16;
  1549. End;
  1550. if ( a < $1000000 ) then
  1551. Begin
  1552. shiftCount := shiftcount + 8;
  1553. a := a shl 8;
  1554. end;
  1555. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1556. countLeadingZeros32:= shiftCount;
  1557. End;
  1558. {*----------------------------------------------------------------------------
  1559. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1560. | `a'. If `a' is zero, 64 is returned.
  1561. *----------------------------------------------------------------------------*}
  1562. function countLeadingZeros64( a : bits64): int8;
  1563. var
  1564. shiftcount : int8;
  1565. Begin
  1566. shiftCount := 0;
  1567. if ( a < bits64(bits64(1) shl 32 )) then
  1568. shiftCount := shiftcount + 32
  1569. else
  1570. a := a shr 32;
  1571. shiftCount := shiftCount + countLeadingZeros32( a );
  1572. countLeadingZeros64:= shiftCount;
  1573. End;
  1574. {*
  1575. -------------------------------------------------------------------------------
  1576. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1577. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1578. returns 0.
  1579. -------------------------------------------------------------------------------
  1580. *}
  1581. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1582. Begin
  1583. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1584. End;
  1585. {*
  1586. -------------------------------------------------------------------------------
  1587. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1588. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1589. Otherwise, returns 0.
  1590. -------------------------------------------------------------------------------
  1591. *}
  1592. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1593. Begin
  1594. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1595. End;
  1596. {*
  1597. -------------------------------------------------------------------------------
  1598. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1599. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1600. returns 0.
  1601. -------------------------------------------------------------------------------
  1602. *}
  1603. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1604. Begin
  1605. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1606. End;
  1607. {*
  1608. -------------------------------------------------------------------------------
  1609. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1610. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1611. returns 0.
  1612. -------------------------------------------------------------------------------
  1613. *}
  1614. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1615. Begin
  1616. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1617. End;
  1618. const
  1619. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1620. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1621. (*****************************************************************************)
  1622. (* End Low-Level arithmetic *)
  1623. (*****************************************************************************)
  1624. {*
  1625. -------------------------------------------------------------------------------
  1626. Functions and definitions to determine: (1) whether tininess for underflow
  1627. is detected before or after rounding by default, (2) what (if anything)
  1628. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1629. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1630. are propagated from function inputs to output. These details are ENDIAN
  1631. specific
  1632. -------------------------------------------------------------------------------
  1633. *}
  1634. {$IFDEF ENDIAN_LITTLE}
  1635. {*
  1636. -------------------------------------------------------------------------------
  1637. Internal canonical NaN format.
  1638. -------------------------------------------------------------------------------
  1639. *}
  1640. TYPE
  1641. commonNaNT = record
  1642. high, low : bits32;
  1643. sign: flag;
  1644. end;
  1645. {*
  1646. -------------------------------------------------------------------------------
  1647. The pattern for a default generated single-precision NaN.
  1648. -------------------------------------------------------------------------------
  1649. *}
  1650. const float32_default_nan = $FFC00000;
  1651. {*
  1652. -------------------------------------------------------------------------------
  1653. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1654. otherwise returns 0.
  1655. -------------------------------------------------------------------------------
  1656. *}
  1657. Function float32_is_nan( a : float32 ): flag;
  1658. Begin
  1659. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1660. End;
  1661. {*
  1662. -------------------------------------------------------------------------------
  1663. Returns 1 if the single-precision floating-point value `a' is a signaling
  1664. NaN; otherwise returns 0.
  1665. -------------------------------------------------------------------------------
  1666. *}
  1667. Function float32_is_signaling_nan( a : float32 ): flag;
  1668. Begin
  1669. float32_is_signaling_nan := flag
  1670. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1671. End;
  1672. {*
  1673. -------------------------------------------------------------------------------
  1674. Returns the result of converting the single-precision floating-point NaN
  1675. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1676. exception is raised.
  1677. -------------------------------------------------------------------------------
  1678. *}
  1679. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1680. var
  1681. z : commonNaNT ;
  1682. Begin
  1683. if ( float32_is_signaling_nan( a ) <> 0) then
  1684. float_raise( float_flag_invalid );
  1685. z.sign := a shr 31;
  1686. z.low := 0;
  1687. z.high := a shl 9;
  1688. c := z;
  1689. End;
  1690. {*
  1691. -------------------------------------------------------------------------------
  1692. Returns the result of converting the canonical NaN `a' to the single-
  1693. precision floating-point format.
  1694. -------------------------------------------------------------------------------
  1695. *}
  1696. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1697. Begin
  1698. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1699. End;
  1700. {*
  1701. -------------------------------------------------------------------------------
  1702. Takes two single-precision floating-point values `a' and `b', one of which
  1703. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1704. signaling NaN, the invalid exception is raised.
  1705. -------------------------------------------------------------------------------
  1706. *}
  1707. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1708. Var
  1709. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1710. label returnLargerSignificand;
  1711. Begin
  1712. aIsNaN := float32_is_nan( a );
  1713. aIsSignalingNaN := float32_is_signaling_nan( a );
  1714. bIsNaN := float32_is_nan( b );
  1715. bIsSignalingNaN := float32_is_signaling_nan( b );
  1716. a := a or $00400000;
  1717. b := b or $00400000;
  1718. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1719. float_raise( float_flag_invalid );
  1720. if ( aIsSignalingNaN )<> 0 then
  1721. Begin
  1722. if ( bIsSignalingNaN ) <> 0 then
  1723. goto returnLargerSignificand;
  1724. if bIsNan <> 0 then
  1725. propagateFloat32NaN := b
  1726. else
  1727. propagateFloat32NaN := a;
  1728. exit;
  1729. End
  1730. else if ( aIsNaN <> 0) then
  1731. Begin
  1732. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1733. Begin
  1734. propagateFloat32NaN := a;
  1735. exit;
  1736. End;
  1737. returnLargerSignificand:
  1738. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1739. Begin
  1740. propagateFloat32NaN := b;
  1741. exit;
  1742. End;
  1743. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1744. Begin
  1745. propagateFloat32NaN := a;
  1746. End;
  1747. if a < b then
  1748. propagateFloat32NaN := a
  1749. else
  1750. propagateFloat32NaN := b;
  1751. exit;
  1752. End
  1753. else
  1754. Begin
  1755. propagateFloat32NaN := b;
  1756. exit;
  1757. End;
  1758. End;
  1759. {*
  1760. -------------------------------------------------------------------------------
  1761. The pattern for a default generated double-precision NaN. The `high' and
  1762. `low' values hold the most- and least-significant bits, respectively.
  1763. -------------------------------------------------------------------------------
  1764. *}
  1765. const
  1766. float64_default_nan_high = $FFF80000;
  1767. float64_default_nan_low = $00000000;
  1768. {*
  1769. -------------------------------------------------------------------------------
  1770. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1771. otherwise returns 0.
  1772. -------------------------------------------------------------------------------
  1773. *}
  1774. Function float64_is_nan( a : float64 ) : flag;
  1775. Begin
  1776. float64_is_nan :=
  1777. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1778. and ( a.low or ( a.high and $000FFFFF ) );
  1779. End;
  1780. {*
  1781. -------------------------------------------------------------------------------
  1782. Returns 1 if the double-precision floating-point value `a' is a signaling
  1783. NaN; otherwise returns 0.
  1784. -------------------------------------------------------------------------------
  1785. *}
  1786. Function float64_is_signaling_nan( a : float64 ): flag;
  1787. Begin
  1788. float64_is_signaling_nan :=
  1789. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1790. and ( a.low or ( a.high and $0007FFFF ) );
  1791. End;
  1792. {*
  1793. -------------------------------------------------------------------------------
  1794. Returns the result of converting the double-precision floating-point NaN
  1795. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1796. exception is raised.
  1797. -------------------------------------------------------------------------------
  1798. *}
  1799. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1800. Var
  1801. z : commonNaNT;
  1802. Begin
  1803. if ( float64_is_signaling_nan( a )<>0 ) then
  1804. float_raise( float_flag_invalid );
  1805. z.sign := a.high shr 31;
  1806. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1807. c := z;
  1808. End;
  1809. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1810. Var
  1811. z : commonNaNT;
  1812. Begin
  1813. if ( float64_is_signaling_nan( a )<>0 ) then
  1814. float_raise( float_flag_invalid );
  1815. z.sign := a.high shr 31;
  1816. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1817. result := z;
  1818. End;
  1819. {*
  1820. -------------------------------------------------------------------------------
  1821. Returns the result of converting the canonical NaN `a' to the double-
  1822. precision floating-point format.
  1823. -------------------------------------------------------------------------------
  1824. *}
  1825. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1826. Var
  1827. z: float64;
  1828. Begin
  1829. shift64Right( a.high, a.low, 12, z.high, z.low );
  1830. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1831. c := z;
  1832. End;
  1833. {*
  1834. -------------------------------------------------------------------------------
  1835. Takes two double-precision floating-point values `a' and `b', one of which
  1836. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1837. signaling NaN, the invalid exception is raised.
  1838. -------------------------------------------------------------------------------
  1839. *}
  1840. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1841. Var
  1842. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1843. label returnLargerSignificand;
  1844. Begin
  1845. aIsNaN := float64_is_nan( a );
  1846. aIsSignalingNaN := float64_is_signaling_nan( a );
  1847. bIsNaN := float64_is_nan( b );
  1848. bIsSignalingNaN := float64_is_signaling_nan( b );
  1849. a.high := a.high or $00080000;
  1850. b.high := b.high or $00080000;
  1851. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1852. float_raise( float_flag_invalid );
  1853. if ( aIsSignalingNaN )<>0 then
  1854. Begin
  1855. if ( bIsSignalingNaN )<>0 then
  1856. goto returnLargerSignificand;
  1857. if bIsNan <> 0 then
  1858. c := b
  1859. else
  1860. c := a;
  1861. exit;
  1862. End
  1863. else if ( aIsNaN )<> 0 then
  1864. Begin
  1865. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1866. Begin
  1867. c := a;
  1868. exit;
  1869. End;
  1870. returnLargerSignificand:
  1871. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1872. Begin
  1873. c := b;
  1874. exit;
  1875. End;
  1876. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1877. Begin
  1878. c := a;
  1879. exit;
  1880. End;
  1881. if a.high < b.high then
  1882. c := a
  1883. else
  1884. c := b;
  1885. exit;
  1886. End
  1887. else
  1888. Begin
  1889. c := b;
  1890. exit;
  1891. End;
  1892. End;
  1893. {*----------------------------------------------------------------------------
  1894. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1895. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1896. | returns 0.
  1897. *----------------------------------------------------------------------------*}
  1898. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1899. begin
  1900. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1901. end;
  1902. {*----------------------------------------------------------------------------
  1903. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1904. | otherwise returns 0.
  1905. *----------------------------------------------------------------------------*}
  1906. function float128_is_nan( a : float128): flag;
  1907. begin
  1908. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1909. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1910. end;
  1911. {*----------------------------------------------------------------------------
  1912. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1913. | signaling NaN; otherwise returns 0.
  1914. *----------------------------------------------------------------------------*}
  1915. function float128_is_signaling_nan( a : float128): flag;
  1916. begin
  1917. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1918. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1919. end;
  1920. {*----------------------------------------------------------------------------
  1921. | Returns the result of converting the quadruple-precision floating-point NaN
  1922. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1923. | exception is raised.
  1924. *----------------------------------------------------------------------------*}
  1925. function float128ToCommonNaN( a : float128): commonNaNT;
  1926. var
  1927. z: commonNaNT;
  1928. qhigh,qlow : qword;
  1929. begin
  1930. if ( float128_is_signaling_nan( a )<>0) then
  1931. float_raise( float_flag_invalid );
  1932. z.sign := a.high shr 63;
  1933. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1934. z.high:=qhigh shr 32;
  1935. z.low:=qhigh and $ffffffff;
  1936. result:=z;
  1937. end;
  1938. {*----------------------------------------------------------------------------
  1939. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1940. | precision floating-point format.
  1941. *----------------------------------------------------------------------------*}
  1942. function commonNaNToFloat128( a : commonNaNT): float128;
  1943. var
  1944. z: float128;
  1945. begin
  1946. shift128Right( a.high, a.low, 16, z.high, z.low );
  1947. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1948. result:=z;
  1949. end;
  1950. {*----------------------------------------------------------------------------
  1951. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1952. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1953. | `b' is a signaling NaN, the invalid exception is raised.
  1954. *----------------------------------------------------------------------------*}
  1955. function propagateFloat128NaN( a: float128; b : float128): float128;
  1956. var
  1957. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1958. label
  1959. returnLargerSignificand;
  1960. begin
  1961. aIsNaN := float128_is_nan( a );
  1962. aIsSignalingNaN := float128_is_signaling_nan( a );
  1963. bIsNaN := float128_is_nan( b );
  1964. bIsSignalingNaN := float128_is_signaling_nan( b );
  1965. a.high := a.high or int64( $0000800000000000 );
  1966. b.high := b.high or int64( $0000800000000000 );
  1967. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1968. float_raise( float_flag_invalid );
  1969. if ( aIsSignalingNaN )<>0 then
  1970. begin
  1971. if ( bIsSignalingNaN )<>0 then
  1972. goto returnLargerSignificand;
  1973. if bIsNaN<>0 then
  1974. result := b
  1975. else
  1976. result := a;
  1977. exit;
  1978. end
  1979. else if ( aIsNaN )<>0 then
  1980. begin
  1981. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1982. begin
  1983. result := a;
  1984. exit;
  1985. end;
  1986. returnLargerSignificand:
  1987. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1988. begin
  1989. result := b;
  1990. exit;
  1991. end;
  1992. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1993. begin
  1994. result := a;
  1995. exit
  1996. end;
  1997. if ( a.high < b.high ) then
  1998. result := a
  1999. else
  2000. result := b;
  2001. exit;
  2002. end
  2003. else
  2004. result:=b;
  2005. end;
  2006. {$ELSE}
  2007. { Big endian code }
  2008. (*----------------------------------------------------------------------------
  2009. | Internal canonical NaN format.
  2010. *----------------------------------------------------------------------------*)
  2011. type
  2012. commonNANT = record
  2013. high, low : bits32;
  2014. sign : flag;
  2015. end;
  2016. (*----------------------------------------------------------------------------
  2017. | The pattern for a default generated single-precision NaN.
  2018. *----------------------------------------------------------------------------*)
  2019. const float32_default_nan = $7FFFFFFF;
  2020. (*----------------------------------------------------------------------------
  2021. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  2022. | otherwise returns 0.
  2023. *----------------------------------------------------------------------------*)
  2024. function float32_is_nan(a: float32): flag;
  2025. begin
  2026. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2027. end;
  2028. (*----------------------------------------------------------------------------
  2029. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2030. | NaN; otherwise returns 0.
  2031. *----------------------------------------------------------------------------*)
  2032. function float32_is_signaling_nan(a: float32):flag;
  2033. begin
  2034. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2035. end;
  2036. (*----------------------------------------------------------------------------
  2037. | Returns the result of converting the single-precision floating-point NaN
  2038. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2039. | exception is raised.
  2040. *----------------------------------------------------------------------------*)
  2041. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  2042. var
  2043. z: commonNANT;
  2044. begin
  2045. if float32_is_signaling_nan(a)<>0 then
  2046. float_raise(float_flag_invalid);
  2047. z.sign := a shr 31;
  2048. z.low := 0;
  2049. z.high := a shl 9;
  2050. c:=z;
  2051. end;
  2052. (*----------------------------------------------------------------------------
  2053. | Returns the result of converting the canonical NaN `a' to the single-
  2054. | precision floating-point format.
  2055. *----------------------------------------------------------------------------*)
  2056. function CommonNanToFloat32(a : CommonNaNT): float32;
  2057. begin
  2058. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2059. end;
  2060. (*----------------------------------------------------------------------------
  2061. | Takes two single-precision floating-point values `a' and `b', one of which
  2062. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2063. | signaling NaN, the invalid exception is raised.
  2064. *----------------------------------------------------------------------------*)
  2065. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2066. var
  2067. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2068. begin
  2069. aIsNaN := float32_is_nan( a );
  2070. aIsSignalingNaN := float32_is_signaling_nan( a );
  2071. bIsNaN := float32_is_nan( b );
  2072. bIsSignalingNaN := float32_is_signaling_nan( b );
  2073. a := a or $00400000;
  2074. b := b or $00400000;
  2075. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2076. float_raise( float_flag_invalid );
  2077. if bIsSignalingNaN<>0 then
  2078. propagateFloat32Nan := b
  2079. else if aIsSignalingNan<>0 then
  2080. propagateFloat32Nan := a
  2081. else if bIsNan<>0 then
  2082. propagateFloat32Nan := b
  2083. else
  2084. propagateFloat32Nan := a;
  2085. end;
  2086. (*----------------------------------------------------------------------------
  2087. | The pattern for a default generated double-precision NaN. The `high' and
  2088. | `low' values hold the most- and least-significant bits, respectively.
  2089. *----------------------------------------------------------------------------*)
  2090. const
  2091. float64_default_nan_high = $7FFFFFFF;
  2092. float64_default_nan_low = $FFFFFFFF;
  2093. (*----------------------------------------------------------------------------
  2094. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2095. | otherwise returns 0.
  2096. *----------------------------------------------------------------------------*)
  2097. function float64_is_nan(a: float64): flag;
  2098. begin
  2099. float64_is_nan := flag (
  2100. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2101. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2102. end;
  2103. (*----------------------------------------------------------------------------
  2104. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2105. | NaN; otherwise returns 0.
  2106. *----------------------------------------------------------------------------*)
  2107. function float64_is_signaling_nan( a:float64): flag;
  2108. begin
  2109. float64_is_signaling_nan := flag(
  2110. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2111. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2112. end;
  2113. (*----------------------------------------------------------------------------
  2114. | Returns the result of converting the double-precision floating-point NaN
  2115. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2116. | exception is raised.
  2117. *----------------------------------------------------------------------------*)
  2118. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2119. var
  2120. z : commonNaNT;
  2121. begin
  2122. if ( float64_is_signaling_nan( a )<>0 ) then
  2123. float_raise( float_flag_invalid );
  2124. z.sign := a.high shr 31;
  2125. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2126. c:=z;
  2127. end;
  2128. (*----------------------------------------------------------------------------
  2129. | Returns the result of converting the canonical NaN `a' to the double-
  2130. | precision floating-point format.
  2131. *----------------------------------------------------------------------------*)
  2132. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2133. var
  2134. z: float64;
  2135. begin
  2136. shift64Right( a.high, a.low, 12, z.high, z.low );
  2137. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2138. c:=z;
  2139. end;
  2140. (*----------------------------------------------------------------------------
  2141. | Takes two double-precision floating-point values `a' and `b', one of which
  2142. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2143. | signaling NaN, the invalid exception is raised.
  2144. *----------------------------------------------------------------------------*)
  2145. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2146. var
  2147. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2148. begin
  2149. aIsNaN := float64_is_nan( a );
  2150. aIsSignalingNaN := float64_is_signaling_nan( a );
  2151. bIsNaN := float64_is_nan( b );
  2152. bIsSignalingNaN := float64_is_signaling_nan( b );
  2153. a.high := a.high or $00080000;
  2154. b.high := b.high or $00080000;
  2155. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2156. float_raise( float_flag_invalid );
  2157. if bIsSignalingNaN<>0 then
  2158. c := b
  2159. else if aIsSignalingNan<>0 then
  2160. c := a
  2161. else if bIsNan<>0 then
  2162. c := b
  2163. else
  2164. c := a;
  2165. end;
  2166. {$ENDIF}
  2167. (****************************************************************************)
  2168. (* END ENDIAN SPECIFIC CODE *)
  2169. (****************************************************************************)
  2170. {*
  2171. -------------------------------------------------------------------------------
  2172. Returns the fraction bits of the single-precision floating-point value `a'.
  2173. -------------------------------------------------------------------------------
  2174. *}
  2175. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2176. Begin
  2177. ExtractFloat32Frac := A AND $007FFFFF;
  2178. End;
  2179. {*
  2180. -------------------------------------------------------------------------------
  2181. Returns the exponent bits of the single-precision floating-point value `a'.
  2182. -------------------------------------------------------------------------------
  2183. *}
  2184. Function extractFloat32Exp( a: float32 ): Int16;
  2185. Begin
  2186. extractFloat32Exp := (a shr 23) AND $FF;
  2187. End;
  2188. {*
  2189. -------------------------------------------------------------------------------
  2190. Returns the sign bit of the single-precision floating-point value `a'.
  2191. -------------------------------------------------------------------------------
  2192. *}
  2193. Function extractFloat32Sign( a: float32 ): Flag;
  2194. Begin
  2195. extractFloat32Sign := a shr 31;
  2196. End;
  2197. {*
  2198. -------------------------------------------------------------------------------
  2199. Normalizes the subnormal single-precision floating-point value represented
  2200. by the denormalized significand `aSig'. The normalized exponent and
  2201. significand are stored at the locations pointed to by `zExpPtr' and
  2202. `zSigPtr', respectively.
  2203. -------------------------------------------------------------------------------
  2204. *}
  2205. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2206. Var
  2207. ShiftCount : BYTE;
  2208. Begin
  2209. shiftCount := countLeadingZeros32( aSig ) - 8;
  2210. zSigPtr := aSig shl shiftCount;
  2211. zExpPtr := 1 - shiftCount;
  2212. End;
  2213. {*
  2214. -------------------------------------------------------------------------------
  2215. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2216. single-precision floating-point value, returning the result. After being
  2217. shifted into the proper positions, the three fields are simply added
  2218. together to form the result. This means that any integer portion of `zSig'
  2219. will be added into the exponent. Since a properly normalized significand
  2220. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2221. than the desired result exponent whenever `zSig' is a complete, normalized
  2222. significand.
  2223. -------------------------------------------------------------------------------
  2224. *}
  2225. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2226. Begin
  2227. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2228. + zSig;
  2229. End;
  2230. {*
  2231. -------------------------------------------------------------------------------
  2232. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2233. and significand `zSig', and returns the proper single-precision floating-
  2234. point value corresponding to the abstract input. Ordinarily, the abstract
  2235. value is simply rounded and packed into the single-precision format, with
  2236. the inexact exception raised if the abstract input cannot be represented
  2237. exactly. However, if the abstract value is too large, the overflow and
  2238. inexact exceptions are raised and an infinity or maximal finite value is
  2239. returned. If the abstract value is too small, the input value is rounded to
  2240. a subnormal number, and the underflow and inexact exceptions are raised if
  2241. the abstract input cannot be represented exactly as a subnormal single-
  2242. precision floating-point number.
  2243. The input significand `zSig' has its binary point between bits 30
  2244. and 29, which is 7 bits to the left of the usual location. This shifted
  2245. significand must be normalized or smaller. If `zSig' is not normalized,
  2246. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2247. and it must not require rounding. In the usual case that `zSig' is
  2248. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2249. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2250. Binary Floating-Point Arithmetic.
  2251. -------------------------------------------------------------------------------
  2252. *}
  2253. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2254. Var
  2255. roundingMode : BYTE;
  2256. roundNearestEven : Flag;
  2257. roundIncrement, roundBits : BYTE;
  2258. IsTiny : Flag;
  2259. Begin
  2260. roundingMode := softfloat_rounding_mode;
  2261. if (roundingMode = float_round_nearest_even) then
  2262. Begin
  2263. roundNearestEven := Flag(TRUE);
  2264. end
  2265. else
  2266. roundNearestEven := Flag(FALSE);
  2267. roundIncrement := $40;
  2268. if ( Boolean(roundNearestEven) = FALSE) then
  2269. Begin
  2270. if ( roundingMode = float_round_to_zero ) Then
  2271. Begin
  2272. roundIncrement := 0;
  2273. End
  2274. else
  2275. Begin
  2276. roundIncrement := $7F;
  2277. if ( zSign <> 0 ) then
  2278. Begin
  2279. if roundingMode = float_round_up then roundIncrement := 0;
  2280. End
  2281. else
  2282. Begin
  2283. if roundingMode = float_round_down then roundIncrement := 0;
  2284. End;
  2285. End
  2286. End;
  2287. roundBits := zSig AND $7F;
  2288. if ($FD <= bits16 (zExp) ) then
  2289. Begin
  2290. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2291. Begin
  2292. float_raise( float_flag_overflow OR float_flag_inexact );
  2293. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2294. exit;
  2295. End;
  2296. if ( zExp < 0 ) then
  2297. Begin
  2298. isTiny :=
  2299. flag(( softfloat_detect_tininess = float_tininess_before_rounding )
  2300. OR ( zExp < -1 )
  2301. OR ( (zSig + roundIncrement) < $80000000 ));
  2302. shift32RightJamming( zSig, - zExp, zSig );
  2303. zExp := 0;
  2304. roundBits := zSig AND $7F;
  2305. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2306. float_raise( float_flag_underflow );
  2307. End;
  2308. End;
  2309. if ( roundBits )<> 0 then
  2310. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2311. zSig := ( zSig + roundIncrement ) shr 7;
  2312. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2313. if ( zSig = 0 ) then zExp := 0;
  2314. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2315. exit;
  2316. End;
  2317. {*
  2318. -------------------------------------------------------------------------------
  2319. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2320. and significand `zSig', and returns the proper single-precision floating-
  2321. point value corresponding to the abstract input. This routine is just like
  2322. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2323. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2324. floating-point exponent.
  2325. -------------------------------------------------------------------------------
  2326. *}
  2327. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2328. Var
  2329. ShiftCount : int8;
  2330. Begin
  2331. shiftCount := countLeadingZeros32( zSig ) - 1;
  2332. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2333. End;
  2334. {*
  2335. -------------------------------------------------------------------------------
  2336. Returns the most-significant 20 fraction bits of the double-precision
  2337. floating-point value `a'.
  2338. -------------------------------------------------------------------------------
  2339. *}
  2340. Function extractFloat64Frac0(a: float64): bits32;
  2341. Begin
  2342. extractFloat64Frac0 := a.high and $000FFFFF;
  2343. End;
  2344. {*
  2345. -------------------------------------------------------------------------------
  2346. Returns the least-significant 32 fraction bits of the double-precision
  2347. floating-point value `a'.
  2348. -------------------------------------------------------------------------------
  2349. *}
  2350. Function extractFloat64Frac1(a: float64): bits32;
  2351. Begin
  2352. extractFloat64Frac1 := a.low;
  2353. End;
  2354. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2355. Function extractFloat64Frac(a: float64): bits64;
  2356. Begin
  2357. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2358. End;
  2359. {*
  2360. -------------------------------------------------------------------------------
  2361. Returns the exponent bits of the double-precision floating-point value `a'.
  2362. -------------------------------------------------------------------------------
  2363. *}
  2364. Function extractFloat64Exp(a: float64): int16;
  2365. Begin
  2366. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2367. End;
  2368. {*
  2369. -------------------------------------------------------------------------------
  2370. Returns the sign bit of the double-precision floating-point value `a'.
  2371. -------------------------------------------------------------------------------
  2372. *}
  2373. Function extractFloat64Sign(a: float64) : flag;
  2374. Begin
  2375. extractFloat64Sign := a.high shr 31;
  2376. End;
  2377. {*
  2378. -------------------------------------------------------------------------------
  2379. Normalizes the subnormal double-precision floating-point value represented
  2380. by the denormalized significand formed by the concatenation of `aSig0' and
  2381. `aSig1'. The normalized exponent is stored at the location pointed to by
  2382. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2383. stored at the location pointed to by `zSig0Ptr', and the least significant
  2384. 32 bits of the normalized significand are stored at the location pointed to
  2385. by `zSig1Ptr'.
  2386. -------------------------------------------------------------------------------
  2387. *}
  2388. Procedure normalizeFloat64Subnormal(
  2389. aSig0: bits32;
  2390. aSig1: bits32;
  2391. VAR zExpPtr : Int16;
  2392. VAR zSig0Ptr : Bits32;
  2393. VAR zSig1Ptr : Bits32
  2394. );
  2395. Var
  2396. ShiftCount : Int8;
  2397. Begin
  2398. if ( aSig0 = 0 ) then
  2399. Begin
  2400. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2401. if ( shiftCount < 0 ) then
  2402. Begin
  2403. zSig0Ptr := aSig1 shr ( - shiftCount );
  2404. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2405. End
  2406. else
  2407. Begin
  2408. zSig0Ptr := aSig1 shl shiftCount;
  2409. zSig1Ptr := 0;
  2410. End;
  2411. zExpPtr := - shiftCount - 31;
  2412. End
  2413. else
  2414. Begin
  2415. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2416. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2417. zExpPtr := 1 - shiftCount;
  2418. End;
  2419. End;
  2420. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2421. var
  2422. shiftCount : int8;
  2423. begin
  2424. shiftCount := countLeadingZeros64( aSig ) - 11;
  2425. zSigPtr := aSig shl shiftCount;
  2426. zExpPtr := 1 - shiftCount;
  2427. end;
  2428. {*
  2429. -------------------------------------------------------------------------------
  2430. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2431. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2432. point value, returning the result. After being shifted into the proper
  2433. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2434. together to form the most significant 32 bits of the result. This means
  2435. that any integer portion of `zSig0' will be added into the exponent. Since
  2436. a properly normalized significand will have an integer portion equal to 1,
  2437. the `zExp' input should be 1 less than the desired result exponent whenever
  2438. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2439. -------------------------------------------------------------------------------
  2440. *}
  2441. Procedure
  2442. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2443. var
  2444. z: Float64;
  2445. Begin
  2446. z.low := zSig1;
  2447. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2448. c := z;
  2449. End;
  2450. {*----------------------------------------------------------------------------
  2451. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2452. | double-precision floating-point value, returning the result. After being
  2453. | shifted into the proper positions, the three fields are simply added
  2454. | together to form the result. This means that any integer portion of `zSig'
  2455. | will be added into the exponent. Since a properly normalized significand
  2456. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2457. | than the desired result exponent whenever `zSig' is a complete, normalized
  2458. | significand.
  2459. *----------------------------------------------------------------------------*}
  2460. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2461. begin
  2462. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2463. end;
  2464. {*
  2465. -------------------------------------------------------------------------------
  2466. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2467. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2468. and `zSig2', and returns the proper double-precision floating-point value
  2469. corresponding to the abstract input. Ordinarily, the abstract value is
  2470. simply rounded and packed into the double-precision format, with the inexact
  2471. exception raised if the abstract input cannot be represented exactly.
  2472. However, if the abstract value is too large, the overflow and inexact
  2473. exceptions are raised and an infinity or maximal finite value is returned.
  2474. If the abstract value is too small, the input value is rounded to a
  2475. subnormal number, and the underflow and inexact exceptions are raised if the
  2476. abstract input cannot be represented exactly as a subnormal double-precision
  2477. floating-point number.
  2478. The input significand must be normalized or smaller. If the input
  2479. significand is not normalized, `zExp' must be 0; in that case, the result
  2480. returned is a subnormal number, and it must not require rounding. In the
  2481. usual case that the input significand is normalized, `zExp' must be 1 less
  2482. than the ``true'' floating-point exponent. The handling of underflow and
  2483. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2484. -------------------------------------------------------------------------------
  2485. *}
  2486. Procedure
  2487. roundAndPackFloat64(
  2488. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2489. Var
  2490. roundingMode : Int8;
  2491. roundNearestEven, increment, isTiny : Flag;
  2492. Begin
  2493. roundingMode := softfloat_rounding_mode;
  2494. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2495. increment := flag( sbits32 (zSig2) < 0 );
  2496. if ( roundNearestEven = flag(FALSE) ) then
  2497. Begin
  2498. if ( roundingMode = float_round_to_zero ) then
  2499. increment := 0
  2500. else
  2501. Begin
  2502. if ( zSign )<> 0 then
  2503. Begin
  2504. increment := flag( roundingMode = float_round_down ) and zSig2;
  2505. End
  2506. else
  2507. Begin
  2508. increment := flag( roundingMode = float_round_up ) and zSig2;
  2509. End
  2510. End
  2511. End;
  2512. if ( $7FD <= bits16 (zExp) ) then
  2513. Begin
  2514. if (( $7FD < zExp )
  2515. or (( zExp = $7FD )
  2516. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2517. and (increment<>0)
  2518. )
  2519. ) then
  2520. Begin
  2521. float_raise( float_flag_overflow OR float_flag_inexact );
  2522. if (( roundingMode = float_round_to_zero )
  2523. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2524. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2525. ) then
  2526. Begin
  2527. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2528. exit;
  2529. End;
  2530. packFloat64( zSign, $7FF, 0, 0, c );
  2531. exit;
  2532. End;
  2533. if ( zExp < 0 ) then
  2534. Begin
  2535. isTiny :=
  2536. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2537. or flag( zExp < -1 )
  2538. or flag(increment = 0)
  2539. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2540. shift64ExtraRightJamming(
  2541. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2542. zExp := 0;
  2543. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2544. if ( roundNearestEven )<>0 then
  2545. Begin
  2546. increment := flag( sbits32 (zSig2) < 0 );
  2547. End
  2548. else
  2549. Begin
  2550. if ( zSign )<>0 then
  2551. Begin
  2552. increment := flag( roundingMode = float_round_down ) and zSig2;
  2553. End
  2554. else
  2555. Begin
  2556. increment := flag( roundingMode = float_round_up ) and zSig2;
  2557. End
  2558. End;
  2559. End;
  2560. End;
  2561. if ( zSig2 )<>0 then
  2562. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2563. if ( increment )<>0 then
  2564. Begin
  2565. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2566. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2567. End
  2568. else
  2569. Begin
  2570. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2571. End;
  2572. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2573. End;
  2574. {*----------------------------------------------------------------------------
  2575. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2576. | and significand `zSig', and returns the proper double-precision floating-
  2577. | point value corresponding to the abstract input. Ordinarily, the abstract
  2578. | value is simply rounded and packed into the double-precision format, with
  2579. | the inexact exception raised if the abstract input cannot be represented
  2580. | exactly. However, if the abstract value is too large, the overflow and
  2581. | inexact exceptions are raised and an infinity or maximal finite value is
  2582. | returned. If the abstract value is too small, the input value is rounded
  2583. | to a subnormal number, and the underflow and inexact exceptions are raised
  2584. | if the abstract input cannot be represented exactly as a subnormal double-
  2585. | precision floating-point number.
  2586. | The input significand `zSig' has its binary point between bits 62
  2587. | and 61, which is 10 bits to the left of the usual location. This shifted
  2588. | significand must be normalized or smaller. If `zSig' is not normalized,
  2589. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2590. | and it must not require rounding. In the usual case that `zSig' is
  2591. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2592. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2593. | Binary Floating-Point Arithmetic.
  2594. *----------------------------------------------------------------------------*}
  2595. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2596. var
  2597. roundingMode: int8;
  2598. roundNearestEven: flag;
  2599. roundIncrement, roundBits: int16;
  2600. isTiny: flag;
  2601. begin
  2602. roundingMode := softfloat_rounding_mode;
  2603. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2604. roundIncrement := $200;
  2605. if ( roundNearestEven=0 ) then
  2606. begin
  2607. if ( roundingMode = float_round_to_zero ) then
  2608. begin
  2609. roundIncrement := 0;
  2610. end
  2611. else begin
  2612. roundIncrement := $3FF;
  2613. if ( zSign<>0 ) then
  2614. begin
  2615. if ( roundingMode = float_round_up ) then
  2616. roundIncrement := 0;
  2617. end
  2618. else begin
  2619. if ( roundingMode = float_round_down ) then
  2620. roundIncrement := 0;
  2621. end
  2622. end
  2623. end;
  2624. roundBits := zSig and $3FF;
  2625. if ( $7FD <= bits16(zExp) ) then
  2626. begin
  2627. if ( ( $7FD < zExp )
  2628. or ( ( zExp = $7FD )
  2629. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2630. ) then
  2631. begin
  2632. float_raise( float_flag_overflow or float_flag_inexact );
  2633. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2634. exit;
  2635. end;
  2636. if ( zExp < 0 ) then
  2637. begin
  2638. isTiny := ord(
  2639. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2640. or ( zExp < -1 )
  2641. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2642. shift64RightJamming( zSig, - zExp, zSig );
  2643. zExp := 0;
  2644. roundBits := zSig and $3FF;
  2645. if ( isTiny and roundBits )<>0 then
  2646. float_raise( float_flag_underflow );
  2647. end
  2648. end;
  2649. if ( roundBits<>0 ) then
  2650. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2651. zSig := ( zSig + roundIncrement ) shr 10;
  2652. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2653. if ( zSig = 0 ) then
  2654. zExp := 0;
  2655. result:=packFloat64( zSign, zExp, zSig );
  2656. end;
  2657. {*
  2658. -------------------------------------------------------------------------------
  2659. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2660. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2661. returns the proper double-precision floating-point value corresponding
  2662. to the abstract input. This routine is just like `roundAndPackFloat64'
  2663. except that the input significand has fewer bits and does not have to be
  2664. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2665. point exponent.
  2666. -------------------------------------------------------------------------------
  2667. *}
  2668. Procedure
  2669. normalizeRoundAndPackFloat64(
  2670. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2671. Var
  2672. shiftCount : int8;
  2673. zSig2 : bits32;
  2674. Begin
  2675. if ( zSig0 = 0 ) then
  2676. Begin
  2677. zSig0 := zSig1;
  2678. zSig1 := 0;
  2679. zExp := zExp -32;
  2680. End;
  2681. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2682. if ( 0 <= shiftCount ) then
  2683. Begin
  2684. zSig2 := 0;
  2685. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2686. End
  2687. else
  2688. Begin
  2689. shift64ExtraRightJamming
  2690. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2691. End;
  2692. zExp := zExp - shiftCount;
  2693. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2694. End;
  2695. {*
  2696. -------------------------------------------------------------------------------
  2697. Returns the result of converting the 32-bit two's complement integer `a' to
  2698. the single-precision floating-point format. The conversion is performed
  2699. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2700. -------------------------------------------------------------------------------
  2701. *}
  2702. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2703. Var
  2704. zSign : Flag;
  2705. Begin
  2706. if ( a = 0 ) then
  2707. Begin
  2708. int32_to_float32.float32 := 0;
  2709. exit;
  2710. End;
  2711. if ( a = sbits32 ($80000000) ) then
  2712. Begin
  2713. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2714. exit;
  2715. end;
  2716. zSign := flag( a < 0 );
  2717. If zSign<>0 then
  2718. a := -a;
  2719. int32_to_float32.float32:=
  2720. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2721. End;
  2722. {*
  2723. -------------------------------------------------------------------------------
  2724. Returns the result of converting the 32-bit two's complement integer `a' to
  2725. the double-precision floating-point format. The conversion is performed
  2726. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2727. -------------------------------------------------------------------------------
  2728. *}
  2729. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2730. var
  2731. zSign : flag;
  2732. absA : bits32;
  2733. shiftCount : int8;
  2734. zSig0, zSig1 : bits32;
  2735. Begin
  2736. if ( a = 0 ) then
  2737. Begin
  2738. packFloat64( 0, 0, 0, 0, result );
  2739. exit;
  2740. end;
  2741. zSign := flag( a < 0 );
  2742. if ZSign<>0 then
  2743. AbsA := -a
  2744. else
  2745. AbsA := a;
  2746. shiftCount := countLeadingZeros32( absA ) - 11;
  2747. if ( 0 <= shiftCount ) then
  2748. Begin
  2749. zSig0 := absA shl shiftCount;
  2750. zSig1 := 0;
  2751. End
  2752. else
  2753. Begin
  2754. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2755. End;
  2756. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2757. End;
  2758. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2759. {$if not defined(packFloatx80)}
  2760. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2761. forward;
  2762. {$endif}
  2763. {*----------------------------------------------------------------------------
  2764. | Returns the result of converting the 32-bit two's complement integer `a'
  2765. | to the extended double-precision floating-point format. The conversion
  2766. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2767. | Arithmetic.
  2768. *----------------------------------------------------------------------------*}
  2769. function int32_to_floatx80( a: int32 ): floatx80;
  2770. var
  2771. zSign: flag;
  2772. absA: uint32;
  2773. shiftCount: int8;
  2774. zSig: bits64;
  2775. begin
  2776. if ( a = 0 ) then begin
  2777. result := packFloatx80( 0, 0, 0 );
  2778. exit;
  2779. end;
  2780. zSign := ord( a < 0 );
  2781. if zSign <> 0 then absA := - a else absA := a;
  2782. shiftCount := countLeadingZeros32( absA ) + 32;
  2783. zSig := absA;
  2784. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2785. end;
  2786. {$endif FPC_SOFTFLOAT_FLOATX80}
  2787. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2788. {$if not defined(packFloat128)}
  2789. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2790. forward;
  2791. {$endif}
  2792. {*----------------------------------------------------------------------------
  2793. | Returns the result of converting the 32-bit two's complement integer `a' to
  2794. | the quadruple-precision floating-point format. The conversion is performed
  2795. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2796. *----------------------------------------------------------------------------*}
  2797. function int32_to_float128( a: int32 ): float128;
  2798. var
  2799. zSign: flag;
  2800. absA: uint32;
  2801. shiftCount: int8;
  2802. zSig0: bits64;
  2803. begin
  2804. if ( a = 0 ) then begin
  2805. result := packFloat128( 0, 0, 0, 0 );
  2806. exit;
  2807. end;
  2808. zSign := ord( a < 0 );
  2809. if zSign <> 0 then absA := - a else absA := a;
  2810. shiftCount := countLeadingZeros32( absA ) + 17;
  2811. zSig0 := absA;
  2812. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2813. end;
  2814. {$endif FPC_SOFTFLOAT_FLOAT128}
  2815. {*
  2816. -------------------------------------------------------------------------------
  2817. Returns the result of converting the single-precision floating-point value
  2818. `a' to the 32-bit two's complement integer format. The conversion is
  2819. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2820. Arithmetic---which means in particular that the conversion is rounded
  2821. according to the current rounding mode. If `a' is a NaN, the largest
  2822. positive integer is returned. Otherwise, if the conversion overflows, the
  2823. largest integer with the same sign as `a' is returned.
  2824. -------------------------------------------------------------------------------
  2825. *}
  2826. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2827. Var
  2828. aSign: flag;
  2829. aExp, shiftCount: int16;
  2830. aSig, aSigExtra: bits32;
  2831. z: int32;
  2832. roundingMode: int8;
  2833. Begin
  2834. aSig := extractFloat32Frac( a.float32 );
  2835. aExp := extractFloat32Exp( a.float32 );
  2836. aSign := extractFloat32Sign( a.float32 );
  2837. shiftCount := aExp - $96;
  2838. if ( 0 <= shiftCount ) then
  2839. Begin
  2840. if ( $9E <= aExp ) then
  2841. Begin
  2842. if ( a.float32 <> $CF000000 ) then
  2843. Begin
  2844. float_raise( float_flag_invalid );
  2845. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2846. Begin
  2847. float32_to_int32 := $7FFFFFFF;
  2848. exit;
  2849. End;
  2850. End;
  2851. float32_to_int32 := sbits32 ($80000000);
  2852. exit;
  2853. End;
  2854. z := ( aSig or $00800000 ) shl shiftCount;
  2855. if ( aSign<>0 ) then z := - z;
  2856. End
  2857. else
  2858. Begin
  2859. if ( aExp < $7E ) then
  2860. Begin
  2861. aSigExtra := aExp OR aSig;
  2862. z := 0;
  2863. End
  2864. else
  2865. Begin
  2866. aSig := aSig OR $00800000;
  2867. aSigExtra := aSig shl ( shiftCount and 31 );
  2868. z := aSig shr ( - shiftCount );
  2869. End;
  2870. if ( aSigExtra<>0 ) then
  2871. softfloat_exception_flags := softfloat_exception_flags
  2872. or float_flag_inexact;
  2873. roundingMode := softfloat_rounding_mode;
  2874. if ( roundingMode = float_round_nearest_even ) then
  2875. Begin
  2876. if ( sbits32 (aSigExtra) < 0 ) then
  2877. Begin
  2878. Inc(z);
  2879. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2880. z := z and not 1;
  2881. End;
  2882. if ( aSign<>0 ) then
  2883. z := - z;
  2884. End
  2885. else
  2886. Begin
  2887. aSigExtra := flag( aSigExtra <> 0 );
  2888. if ( aSign<>0 ) then
  2889. Begin
  2890. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2891. z := - z;
  2892. End
  2893. else
  2894. Begin
  2895. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2896. End
  2897. End;
  2898. End;
  2899. float32_to_int32 := z;
  2900. End;
  2901. {*
  2902. -------------------------------------------------------------------------------
  2903. Returns the result of converting the single-precision floating-point value
  2904. `a' to the 32-bit two's complement integer format. The conversion is
  2905. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2906. Arithmetic, except that the conversion is always rounded toward zero.
  2907. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2908. the conversion overflows, the largest integer with the same sign as `a' is
  2909. returned.
  2910. -------------------------------------------------------------------------------
  2911. *}
  2912. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2913. Var
  2914. aSign : flag;
  2915. aExp, shiftCount : int16;
  2916. aSig : bits32;
  2917. z : int32;
  2918. Begin
  2919. aSig := extractFloat32Frac( a.float32 );
  2920. aExp := extractFloat32Exp( a.float32 );
  2921. aSign := extractFloat32Sign( a.float32 );
  2922. shiftCount := aExp - $9E;
  2923. if ( 0 <= shiftCount ) then
  2924. Begin
  2925. if ( a.float32 <> $CF000000 ) then
  2926. Begin
  2927. float_raise( float_flag_invalid );
  2928. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2929. Begin
  2930. float32_to_int32_round_to_zero := $7FFFFFFF;
  2931. exit;
  2932. end;
  2933. End;
  2934. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2935. exit;
  2936. End
  2937. else
  2938. if ( aExp <= $7E ) then
  2939. Begin
  2940. if ( aExp or aSig )<>0 then
  2941. softfloat_exception_flags :=
  2942. softfloat_exception_flags or float_flag_inexact;
  2943. float32_to_int32_round_to_zero := 0;
  2944. exit;
  2945. End;
  2946. aSig := ( aSig or $00800000 ) shl 8;
  2947. z := aSig shr ( - shiftCount );
  2948. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2949. Begin
  2950. softfloat_exception_flags :=
  2951. softfloat_exception_flags or float_flag_inexact;
  2952. End;
  2953. if ( aSign<>0 ) then z := - z;
  2954. float32_to_int32_round_to_zero := z;
  2955. End;
  2956. {*----------------------------------------------------------------------------
  2957. | Returns the result of converting the single-precision floating-point value
  2958. | `a' to the 64-bit two's complement integer format. The conversion is
  2959. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2960. | Arithmetic---which means in particular that the conversion is rounded
  2961. | according to the current rounding mode. If `a' is a NaN, the largest
  2962. | positive integer is returned. Otherwise, if the conversion overflows, the
  2963. | largest integer with the same sign as `a' is returned.
  2964. *----------------------------------------------------------------------------*}
  2965. function float32_to_int64( a: float32 ): int64;
  2966. var
  2967. aSign: flag;
  2968. aExp, shiftCount: int16;
  2969. aSig: bits32;
  2970. aSig64, aSigExtra: bits64;
  2971. begin
  2972. aSig := extractFloat32Frac( a );
  2973. aExp := extractFloat32Exp( a );
  2974. aSign := extractFloat32Sign( a );
  2975. shiftCount := $BE - aExp;
  2976. if ( shiftCount < 0 ) then begin
  2977. float_raise( float_flag_invalid );
  2978. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2979. result := $7FFFFFFFFFFFFFFF;
  2980. exit;
  2981. end;
  2982. result := $8000000000000000;
  2983. exit;
  2984. end;
  2985. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  2986. aSig64 := aSig;
  2987. aSig64 := aSig64 shl 40;
  2988. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  2989. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  2990. end;
  2991. {*----------------------------------------------------------------------------
  2992. | Returns the result of converting the single-precision floating-point value
  2993. | `a' to the 64-bit two's complement integer format. The conversion is
  2994. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2995. | Arithmetic, except that the conversion is always rounded toward zero. If
  2996. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  2997. | conversion overflows, the largest integer with the same sign as `a' is
  2998. | returned.
  2999. *----------------------------------------------------------------------------*}
  3000. function float32_to_int64_round_to_zero( a: float32 ): int64;
  3001. var
  3002. aSign: flag;
  3003. aExp, shiftCount: int16;
  3004. aSig: bits32;
  3005. aSig64: bits64;
  3006. z: int64;
  3007. begin
  3008. aSig := extractFloat32Frac( a );
  3009. aExp := extractFloat32Exp( a );
  3010. aSign := extractFloat32Sign( a );
  3011. shiftCount := aExp - $BE;
  3012. if ( 0 <= shiftCount ) then begin
  3013. if ( a <> $DF000000 ) then begin
  3014. float_raise( float_flag_invalid );
  3015. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3016. result := $7FFFFFFFFFFFFFFF;
  3017. exit;
  3018. end;
  3019. end;
  3020. result := $8000000000000000;
  3021. exit;
  3022. end
  3023. else if ( aExp <= $7E ) then begin
  3024. if ( aExp or aSig <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3025. result := 0;
  3026. exit;
  3027. end;
  3028. aSig64 := aSig or $00800000;
  3029. aSig64 := aSig64 shl 40;
  3030. z := aSig64 shr ( - shiftCount );
  3031. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3032. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3033. if ( aSign <> 0 ) then z := - z;
  3034. result := z;
  3035. end;
  3036. {*
  3037. -------------------------------------------------------------------------------
  3038. Returns the result of converting the single-precision floating-point value
  3039. `a' to the double-precision floating-point format. The conversion is
  3040. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3041. Arithmetic.
  3042. -------------------------------------------------------------------------------
  3043. *}
  3044. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  3045. Var
  3046. aSign : flag;
  3047. aExp : int16;
  3048. aSig, zSig0, zSig1: bits32;
  3049. tmp : CommonNanT;
  3050. Begin
  3051. aSig := extractFloat32Frac( a.float32 );
  3052. aExp := extractFloat32Exp( a.float32 );
  3053. aSign := extractFloat32Sign( a.float32 );
  3054. if ( aExp = $FF ) then
  3055. Begin
  3056. if ( aSig<>0 ) then
  3057. Begin
  3058. float32ToCommonNaN(a.float32, tmp);
  3059. commonNaNToFloat64(tmp , result);
  3060. exit;
  3061. End;
  3062. packFloat64( aSign, $7FF, 0, 0, result);
  3063. exit;
  3064. End;
  3065. if ( aExp = 0 ) then
  3066. Begin
  3067. if ( aSig = 0 ) then
  3068. Begin
  3069. packFloat64( aSign, 0, 0, 0, result );
  3070. exit;
  3071. end;
  3072. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3073. Dec(aExp);
  3074. End;
  3075. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3076. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3077. End;
  3078. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3079. {*----------------------------------------------------------------------------
  3080. | Returns the result of converting the canonical NaN `a' to the extended
  3081. | double-precision floating-point format.
  3082. *----------------------------------------------------------------------------*}
  3083. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3084. var
  3085. z : floatx80;
  3086. begin
  3087. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3088. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3089. result := z;
  3090. end;
  3091. {*----------------------------------------------------------------------------
  3092. | Returns the result of converting the single-precision floating-point value
  3093. | `a' to the extended double-precision floating-point format. The conversion
  3094. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3095. | Arithmetic.
  3096. *----------------------------------------------------------------------------*}
  3097. function float32_to_floatx80( a: float32 ): floatx80;
  3098. var
  3099. aSign: flag;
  3100. aExp: int16;
  3101. aSig: bits32;
  3102. tmp: commonNaNT;
  3103. begin
  3104. aSig := extractFloat32Frac( a );
  3105. aExp := extractFloat32Exp( a );
  3106. aSign := extractFloat32Sign( a );
  3107. if ( aExp = $FF ) then begin
  3108. if ( aSig <> 0 ) then begin
  3109. float32ToCommonNaN( a, tmp );
  3110. result := commonNaNToFloatx80( tmp );
  3111. exit;
  3112. end;
  3113. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3114. exit;
  3115. end;
  3116. if ( aExp = 0 ) then begin
  3117. if ( aSig = 0 ) then begin
  3118. result := packFloatx80( aSign, 0, 0 );
  3119. exit;
  3120. end;
  3121. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3122. end;
  3123. aSig := aSig or $00800000;
  3124. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3125. end;
  3126. {$endif FPC_SOFTFLOAT_FLOATX80}
  3127. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3128. {*----------------------------------------------------------------------------
  3129. | Returns the result of converting the single-precision floating-point value
  3130. | `a' to the double-precision floating-point format. The conversion is
  3131. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3132. | Arithmetic.
  3133. *----------------------------------------------------------------------------*}
  3134. function float32_to_float128( a: float32 ): float128;
  3135. var
  3136. aSign: flag;
  3137. aExp: int16;
  3138. aSig: bits32;
  3139. tmp: commonNaNT;
  3140. begin
  3141. aSig := extractFloat32Frac( a );
  3142. aExp := extractFloat32Exp( a );
  3143. aSign := extractFloat32Sign( a );
  3144. if ( aExp = $FF ) then begin
  3145. if ( aSig <> 0 ) then begin
  3146. float32ToCommonNaN( a, tmp );
  3147. result := commonNaNToFloat128( tmp );
  3148. exit;
  3149. end;
  3150. result := packFloat128( aSign, $7FFF, 0, 0 );
  3151. exit;
  3152. end;
  3153. if ( aExp = 0 ) then begin
  3154. if ( aSig = 0 ) then begin
  3155. result := packFloat128( aSign, 0, 0, 0 );
  3156. exit;
  3157. end;
  3158. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3159. dec( aExp );
  3160. end;
  3161. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3162. end;
  3163. {$endif FPC_SOFTFLOAT_FLOAT128}
  3164. {*
  3165. -------------------------------------------------------------------------------
  3166. Rounds the single-precision floating-point value `a' to an integer,
  3167. and returns the result as a single-precision floating-point value. The
  3168. operation is performed according to the IEC/IEEE Standard for Binary
  3169. Floating-Point Arithmetic.
  3170. -------------------------------------------------------------------------------
  3171. *}
  3172. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  3173. Var
  3174. aSign: flag;
  3175. aExp: int16;
  3176. lastBitMask, roundBitsMask: bits32;
  3177. roundingMode: int8;
  3178. z: float32;
  3179. Begin
  3180. aExp := extractFloat32Exp( a.float32 );
  3181. if ( $96 <= aExp ) then
  3182. Begin
  3183. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3184. Begin
  3185. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3186. exit;
  3187. End;
  3188. float32_round_to_int:=a;
  3189. exit;
  3190. End;
  3191. if ( aExp <= $7E ) then
  3192. Begin
  3193. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3194. Begin
  3195. float32_round_to_int:=a;
  3196. exit;
  3197. end;
  3198. softfloat_exception_flags
  3199. := softfloat_exception_flags OR float_flag_inexact;
  3200. aSign := extractFloat32Sign( a.float32 );
  3201. case ( softfloat_rounding_mode ) of
  3202. float_round_nearest_even:
  3203. Begin
  3204. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3205. Begin
  3206. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3207. exit;
  3208. End;
  3209. End;
  3210. float_round_down:
  3211. Begin
  3212. if aSign <> 0 then
  3213. float32_round_to_int.float32 := $BF800000
  3214. else
  3215. float32_round_to_int.float32 := 0;
  3216. exit;
  3217. End;
  3218. float_round_up:
  3219. Begin
  3220. if aSign <> 0 then
  3221. float32_round_to_int.float32 := $80000000
  3222. else
  3223. float32_round_to_int.float32 := $3F800000;
  3224. exit;
  3225. End;
  3226. end;
  3227. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3228. End;
  3229. lastBitMask := 1;
  3230. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3231. lastBitMask := lastBitMask shl ($96 - aExp);
  3232. roundBitsMask := lastBitMask - 1;
  3233. z := a.float32;
  3234. roundingMode := softfloat_rounding_mode;
  3235. if ( roundingMode = float_round_nearest_even ) then
  3236. Begin
  3237. z := z + (lastBitMask shr 1);
  3238. if ( ( z and roundBitsMask ) = 0 ) then
  3239. z := z and not lastBitMask;
  3240. End
  3241. else if ( roundingMode <> float_round_to_zero ) then
  3242. Begin
  3243. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3244. Begin
  3245. z := z + roundBitsMask;
  3246. End;
  3247. End;
  3248. z := z and not roundBitsMask;
  3249. if ( z <> a.float32 ) then
  3250. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3251. float32_round_to_int.float32 := z;
  3252. End;
  3253. {*
  3254. -------------------------------------------------------------------------------
  3255. Returns the result of adding the absolute values of the single-precision
  3256. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3257. before being returned. `zSign' is ignored if the result is a NaN.
  3258. The addition is performed according to the IEC/IEEE Standard for Binary
  3259. Floating-Point Arithmetic.
  3260. -------------------------------------------------------------------------------
  3261. *}
  3262. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3263. Var
  3264. aExp, bExp, zExp: int16;
  3265. aSig, bSig, zSig: bits32;
  3266. expDiff: int16;
  3267. label roundAndPack;
  3268. Begin
  3269. aSig:=extractFloat32Frac( a );
  3270. aExp:=extractFloat32Exp( a );
  3271. bSig:=extractFloat32Frac( b );
  3272. bExp := extractFloat32Exp( b );
  3273. expDiff := aExp - bExp;
  3274. aSig := aSig shl 6;
  3275. bSig := bSig shl 6;
  3276. if ( 0 < expDiff ) then
  3277. Begin
  3278. if ( aExp = $FF ) then
  3279. Begin
  3280. if ( aSig <> 0) then
  3281. Begin
  3282. addFloat32Sigs := propagateFloat32NaN( a, b );
  3283. exit;
  3284. End;
  3285. addFloat32Sigs := a;
  3286. exit;
  3287. End;
  3288. if ( bExp = 0 ) then
  3289. Begin
  3290. Dec(expDiff);
  3291. End
  3292. else
  3293. Begin
  3294. bSig := bSig or $20000000;
  3295. End;
  3296. shift32RightJamming( bSig, expDiff, bSig );
  3297. zExp := aExp;
  3298. End
  3299. else
  3300. If ( expDiff < 0 ) then
  3301. Begin
  3302. if ( bExp = $FF ) then
  3303. Begin
  3304. if ( bSig<>0 ) then
  3305. Begin
  3306. addFloat32Sigs := propagateFloat32NaN( a, b );
  3307. exit;
  3308. end;
  3309. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3310. exit;
  3311. End;
  3312. if ( aExp = 0 ) then
  3313. Begin
  3314. Inc(expDiff);
  3315. End
  3316. else
  3317. Begin
  3318. aSig := aSig OR $20000000;
  3319. End;
  3320. shift32RightJamming( aSig, - expDiff, aSig );
  3321. zExp := bExp;
  3322. End
  3323. else
  3324. Begin
  3325. if ( aExp = $FF ) then
  3326. Begin
  3327. if ( aSig OR bSig )<> 0 then
  3328. Begin
  3329. addFloat32Sigs := propagateFloat32NaN( a, b );
  3330. exit;
  3331. end;
  3332. addFloat32Sigs := a;
  3333. exit;
  3334. End;
  3335. if ( aExp = 0 ) then
  3336. Begin
  3337. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3338. exit;
  3339. end;
  3340. zSig := $40000000 + aSig + bSig;
  3341. zExp := aExp;
  3342. goto roundAndPack;
  3343. End;
  3344. aSig := aSig OR $20000000;
  3345. zSig := ( aSig + bSig ) shl 1;
  3346. Dec(zExp);
  3347. if ( sbits32 (zSig) < 0 ) then
  3348. Begin
  3349. zSig := aSig + bSig;
  3350. Inc(zExp);
  3351. End;
  3352. roundAndPack:
  3353. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3354. End;
  3355. {*
  3356. -------------------------------------------------------------------------------
  3357. Returns the result of subtracting the absolute values of the single-
  3358. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3359. difference is negated before being returned. `zSign' is ignored if the
  3360. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3361. Standard for Binary Floating-Point Arithmetic.
  3362. -------------------------------------------------------------------------------
  3363. *}
  3364. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3365. Var
  3366. aExp, bExp, zExp: int16;
  3367. aSig, bSig, zSig: bits32;
  3368. expDiff : int16;
  3369. label aExpBigger;
  3370. label bExpBigger;
  3371. label aBigger;
  3372. label bBigger;
  3373. label normalizeRoundAndPack;
  3374. Begin
  3375. aSig := extractFloat32Frac( a );
  3376. aExp := extractFloat32Exp( a );
  3377. bSig := extractFloat32Frac( b );
  3378. bExp := extractFloat32Exp( b );
  3379. expDiff := aExp - bExp;
  3380. aSig := aSig shl 7;
  3381. bSig := bSig shl 7;
  3382. if ( 0 < expDiff ) then goto aExpBigger;
  3383. if ( expDiff < 0 ) then goto bExpBigger;
  3384. if ( aExp = $FF ) then
  3385. Begin
  3386. if ( aSig OR bSig )<> 0 then
  3387. Begin
  3388. subFloat32Sigs := propagateFloat32NaN( a, b );
  3389. exit;
  3390. End;
  3391. float_raise( float_flag_invalid );
  3392. subFloat32Sigs := float32_default_nan;
  3393. exit;
  3394. End;
  3395. if ( aExp = 0 ) then
  3396. Begin
  3397. aExp := 1;
  3398. bExp := 1;
  3399. End;
  3400. if ( bSig < aSig ) Then goto aBigger;
  3401. if ( aSig < bSig ) Then goto bBigger;
  3402. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3403. exit;
  3404. bExpBigger:
  3405. if ( bExp = $FF ) then
  3406. Begin
  3407. if ( bSig<>0 ) then
  3408. Begin
  3409. subFloat32Sigs := propagateFloat32NaN( a, b );
  3410. exit;
  3411. End;
  3412. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3413. exit;
  3414. End;
  3415. if ( aExp = 0 ) then
  3416. Begin
  3417. Inc(expDiff);
  3418. End
  3419. else
  3420. Begin
  3421. aSig := aSig OR $40000000;
  3422. End;
  3423. shift32RightJamming( aSig, - expDiff, aSig );
  3424. bSig := bSig OR $40000000;
  3425. bBigger:
  3426. zSig := bSig - aSig;
  3427. zExp := bExp;
  3428. zSign := zSign xor 1;
  3429. goto normalizeRoundAndPack;
  3430. aExpBigger:
  3431. if ( aExp = $FF ) then
  3432. Begin
  3433. if ( aSig <> 0) then
  3434. Begin
  3435. subFloat32Sigs := propagateFloat32NaN( a, b );
  3436. exit;
  3437. End;
  3438. subFloat32Sigs := a;
  3439. exit;
  3440. End;
  3441. if ( bExp = 0 ) then
  3442. Begin
  3443. Dec(expDiff);
  3444. End
  3445. else
  3446. Begin
  3447. bSig := bSig OR $40000000;
  3448. End;
  3449. shift32RightJamming( bSig, expDiff, bSig );
  3450. aSig := aSig OR $40000000;
  3451. aBigger:
  3452. zSig := aSig - bSig;
  3453. zExp := aExp;
  3454. normalizeRoundAndPack:
  3455. Dec(zExp);
  3456. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3457. End;
  3458. {*
  3459. -------------------------------------------------------------------------------
  3460. Returns the result of adding the single-precision floating-point values `a'
  3461. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3462. Binary Floating-Point Arithmetic.
  3463. -------------------------------------------------------------------------------
  3464. *}
  3465. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3466. Var
  3467. aSign, bSign: Flag;
  3468. Begin
  3469. aSign := extractFloat32Sign( a.float32 );
  3470. bSign := extractFloat32Sign( b.float32 );
  3471. if ( aSign = bSign ) then
  3472. Begin
  3473. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3474. End
  3475. else
  3476. Begin
  3477. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3478. End;
  3479. End;
  3480. {*
  3481. -------------------------------------------------------------------------------
  3482. Returns the result of subtracting the single-precision floating-point values
  3483. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3484. for Binary Floating-Point Arithmetic.
  3485. -------------------------------------------------------------------------------
  3486. *}
  3487. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3488. Var
  3489. aSign, bSign: flag;
  3490. Begin
  3491. aSign := extractFloat32Sign( a.float32 );
  3492. bSign := extractFloat32Sign( b.float32 );
  3493. if ( aSign = bSign ) then
  3494. Begin
  3495. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3496. End
  3497. else
  3498. Begin
  3499. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3500. End;
  3501. End;
  3502. {*
  3503. -------------------------------------------------------------------------------
  3504. Returns the result of multiplying the single-precision floating-point values
  3505. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3506. for Binary Floating-Point Arithmetic.
  3507. -------------------------------------------------------------------------------
  3508. *}
  3509. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3510. Var
  3511. aSign, bSign, zSign: flag;
  3512. aExp, bExp, zExp : int16;
  3513. aSig, bSig, zSig0, zSig1: bits32;
  3514. Begin
  3515. aSig := extractFloat32Frac( a.float32 );
  3516. aExp := extractFloat32Exp( a.float32 );
  3517. aSign := extractFloat32Sign( a.float32 );
  3518. bSig := extractFloat32Frac( b.float32 );
  3519. bExp := extractFloat32Exp( b.float32 );
  3520. bSign := extractFloat32Sign( b.float32 );
  3521. zSign := aSign xor bSign;
  3522. if ( aExp = $FF ) then
  3523. Begin
  3524. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3525. Begin
  3526. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3527. End;
  3528. if ( ( bExp OR bSig ) = 0 ) then
  3529. Begin
  3530. float_raise( float_flag_invalid );
  3531. float32_mul.float32 := float32_default_nan;
  3532. exit;
  3533. End;
  3534. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3535. exit;
  3536. End;
  3537. if ( bExp = $FF ) then
  3538. Begin
  3539. if ( bSig <> 0 ) then
  3540. Begin
  3541. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3542. exit;
  3543. End;
  3544. if ( ( aExp OR aSig ) = 0 ) then
  3545. Begin
  3546. float_raise( float_flag_invalid );
  3547. float32_mul.float32 := float32_default_nan;
  3548. exit;
  3549. End;
  3550. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3551. exit;
  3552. End;
  3553. if ( aExp = 0 ) then
  3554. Begin
  3555. if ( aSig = 0 ) then
  3556. Begin
  3557. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3558. exit;
  3559. End;
  3560. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3561. End;
  3562. if ( bExp = 0 ) then
  3563. Begin
  3564. if ( bSig = 0 ) then
  3565. Begin
  3566. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3567. exit;
  3568. End;
  3569. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3570. End;
  3571. zExp := aExp + bExp - $7F;
  3572. aSig := ( aSig OR $00800000 ) shl 7;
  3573. bSig := ( bSig OR $00800000 ) shl 8;
  3574. mul32To64( aSig, bSig, zSig0, zSig1 );
  3575. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3576. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3577. Begin
  3578. zSig0 := zSig0 shl 1;
  3579. Dec(zExp);
  3580. End;
  3581. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3582. End;
  3583. {*
  3584. -------------------------------------------------------------------------------
  3585. Returns the result of dividing the single-precision floating-point value `a'
  3586. by the corresponding value `b'. The operation is performed according to the
  3587. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3588. -------------------------------------------------------------------------------
  3589. *}
  3590. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3591. Var
  3592. aSign, bSign, zSign: flag;
  3593. aExp, bExp, zExp: int16;
  3594. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3595. Begin
  3596. aSig := extractFloat32Frac( a.float32 );
  3597. aExp := extractFloat32Exp( a.float32 );
  3598. aSign := extractFloat32Sign( a.float32 );
  3599. bSig := extractFloat32Frac( b.float32 );
  3600. bExp := extractFloat32Exp( b.float32 );
  3601. bSign := extractFloat32Sign( b.float32 );
  3602. zSign := aSign xor bSign;
  3603. if ( aExp = $FF ) then
  3604. Begin
  3605. if ( aSig <> 0 ) then
  3606. Begin
  3607. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3608. exit;
  3609. End;
  3610. if ( bExp = $FF ) then
  3611. Begin
  3612. if ( bSig <> 0) then
  3613. Begin
  3614. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3615. End;
  3616. float_raise( float_flag_invalid );
  3617. float32_div.float32 := float32_default_nan;
  3618. exit;
  3619. End;
  3620. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3621. exit;
  3622. End;
  3623. if ( bExp = $FF ) then
  3624. Begin
  3625. if ( bSig <> 0) then
  3626. Begin
  3627. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3628. exit;
  3629. End;
  3630. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3631. exit;
  3632. End;
  3633. if ( bExp = 0 ) Then
  3634. Begin
  3635. if ( bSig = 0 ) Then
  3636. Begin
  3637. if ( ( aExp OR aSig ) = 0 ) then
  3638. Begin
  3639. float_raise( float_flag_invalid );
  3640. float32_div.float32 := float32_default_nan;
  3641. exit;
  3642. End;
  3643. float_raise( float_flag_divbyzero );
  3644. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3645. exit;
  3646. End;
  3647. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3648. End;
  3649. if ( aExp = 0 ) Then
  3650. Begin
  3651. if ( aSig = 0 ) Then
  3652. Begin
  3653. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3654. exit;
  3655. End;
  3656. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3657. End;
  3658. zExp := aExp - bExp + $7D;
  3659. aSig := ( aSig OR $00800000 ) shl 7;
  3660. bSig := ( bSig OR $00800000 ) shl 8;
  3661. if ( bSig <= ( aSig + aSig ) ) then
  3662. Begin
  3663. aSig := aSig shr 1;
  3664. Inc(zExp);
  3665. End;
  3666. zSig := estimateDiv64To32( aSig, 0, bSig );
  3667. if ( ( zSig and $3F ) <= 2 ) then
  3668. Begin
  3669. mul32To64( bSig, zSig, term0, term1 );
  3670. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3671. while ( sbits32 (rem0) < 0 ) do
  3672. Begin
  3673. Dec(zSig);
  3674. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3675. End;
  3676. zSig := zSig or bits32( rem1 <> 0 );
  3677. End;
  3678. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3679. End;
  3680. {*
  3681. -------------------------------------------------------------------------------
  3682. Returns the remainder of the single-precision floating-point value `a'
  3683. with respect to the corresponding value `b'. The operation is performed
  3684. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3685. -------------------------------------------------------------------------------
  3686. *}
  3687. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3688. Var
  3689. aSign, zSign: flag;
  3690. aExp, bExp, expDiff: int16;
  3691. aSig, bSig, q, alternateASig: bits32;
  3692. sigMean: sbits32;
  3693. Begin
  3694. aSig := extractFloat32Frac( a.float32 );
  3695. aExp := extractFloat32Exp( a.float32 );
  3696. aSign := extractFloat32Sign( a.float32 );
  3697. bSig := extractFloat32Frac( b.float32 );
  3698. bExp := extractFloat32Exp( b.float32 );
  3699. if ( aExp = $FF ) then
  3700. Begin
  3701. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3702. Begin
  3703. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3704. exit;
  3705. End;
  3706. float_raise( float_flag_invalid );
  3707. float32_rem.float32 := float32_default_nan;
  3708. exit;
  3709. End;
  3710. if ( bExp = $FF ) then
  3711. Begin
  3712. if ( bSig <> 0 ) then
  3713. Begin
  3714. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3715. exit;
  3716. End;
  3717. float32_rem := a;
  3718. exit;
  3719. End;
  3720. if ( bExp = 0 ) then
  3721. Begin
  3722. if ( bSig = 0 ) then
  3723. Begin
  3724. float_raise( float_flag_invalid );
  3725. float32_rem.float32 := float32_default_nan;
  3726. exit;
  3727. End;
  3728. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3729. End;
  3730. if ( aExp = 0 ) then
  3731. Begin
  3732. if ( aSig = 0 ) then
  3733. Begin
  3734. float32_rem := a;
  3735. exit;
  3736. End;
  3737. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3738. End;
  3739. expDiff := aExp - bExp;
  3740. aSig := ( aSig OR $00800000 ) shl 8;
  3741. bSig := ( bSig OR $00800000 ) shl 8;
  3742. if ( expDiff < 0 ) then
  3743. Begin
  3744. if ( expDiff < -1 ) then
  3745. Begin
  3746. float32_rem := a;
  3747. exit;
  3748. End;
  3749. aSig := aSig shr 1;
  3750. End;
  3751. q := bits32( bSig <= aSig );
  3752. if ( q <> 0) then
  3753. aSig := aSig - bSig;
  3754. expDiff := expDiff - 32;
  3755. while ( 0 < expDiff ) do
  3756. Begin
  3757. q := estimateDiv64To32( aSig, 0, bSig );
  3758. if (2 < q) then
  3759. q := q - 2
  3760. else
  3761. q := 0;
  3762. aSig := - ( ( bSig shr 2 ) * q );
  3763. expDiff := expDiff - 30;
  3764. End;
  3765. expDiff := expDiff + 32;
  3766. if ( 0 < expDiff ) then
  3767. Begin
  3768. q := estimateDiv64To32( aSig, 0, bSig );
  3769. if (2 < q) then
  3770. q := q - 2
  3771. else
  3772. q := 0;
  3773. q := q shr (32 - expDiff);
  3774. bSig := bSig shr 2;
  3775. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3776. End
  3777. else
  3778. Begin
  3779. aSig := aSig shr 2;
  3780. bSig := bSig shr 2;
  3781. End;
  3782. Repeat
  3783. alternateASig := aSig;
  3784. Inc(q);
  3785. aSig := aSig - bSig;
  3786. Until not ( 0 <= sbits32 (aSig) );
  3787. sigMean := aSig + alternateASig;
  3788. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3789. Begin
  3790. aSig := alternateASig;
  3791. End;
  3792. zSign := flag( sbits32 (aSig) < 0 );
  3793. if ( zSign<>0 ) then
  3794. aSig := - aSig;
  3795. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3796. End;
  3797. {*
  3798. -------------------------------------------------------------------------------
  3799. Returns the square root of the single-precision floating-point value `a'.
  3800. The operation is performed according to the IEC/IEEE Standard for Binary
  3801. Floating-Point Arithmetic.
  3802. -------------------------------------------------------------------------------
  3803. *}
  3804. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3805. Var
  3806. aSign : flag;
  3807. aExp, zExp : int16;
  3808. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3809. label roundAndPack;
  3810. Begin
  3811. aSig := extractFloat32Frac( a.float32 );
  3812. aExp := extractFloat32Exp( a.float32 );
  3813. aSign := extractFloat32Sign( a.float32 );
  3814. if ( aExp = $FF ) then
  3815. Begin
  3816. if ( aSig <> 0) then
  3817. Begin
  3818. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3819. exit;
  3820. End;
  3821. if ( aSign = 0) then
  3822. Begin
  3823. float32_sqrt := a;
  3824. exit;
  3825. End;
  3826. float_raise( float_flag_invalid );
  3827. float32_sqrt.float32 := float32_default_nan;
  3828. exit;
  3829. End;
  3830. if ( aSign <> 0) then
  3831. Begin
  3832. if ( ( aExp OR aSig ) = 0 ) then
  3833. Begin
  3834. float32_sqrt := a;
  3835. exit;
  3836. End;
  3837. float_raise( float_flag_invalid );
  3838. float32_sqrt.float32 := float32_default_nan;
  3839. exit;
  3840. End;
  3841. if ( aExp = 0 ) then
  3842. Begin
  3843. if ( aSig = 0 ) then
  3844. Begin
  3845. float32_sqrt.float32 := 0;
  3846. exit;
  3847. End;
  3848. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3849. End;
  3850. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3851. aSig := ( aSig OR $00800000 ) shl 8;
  3852. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3853. if ( ( zSig and $7F ) <= 5 ) then
  3854. Begin
  3855. if ( zSig < 2 ) then
  3856. Begin
  3857. zSig := $7FFFFFFF;
  3858. goto roundAndPack;
  3859. End
  3860. else
  3861. Begin
  3862. aSig := aSig shr (aExp and 1);
  3863. mul32To64( zSig, zSig, term0, term1 );
  3864. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3865. while ( sbits32 (rem0) < 0 ) do
  3866. Begin
  3867. Dec(zSig);
  3868. shortShift64Left( 0, zSig, 1, term0, term1 );
  3869. term1 := term1 or 1;
  3870. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3871. End;
  3872. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3873. End;
  3874. End;
  3875. shift32RightJamming( zSig, 1, zSig );
  3876. roundAndPack:
  3877. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3878. End;
  3879. {*
  3880. -------------------------------------------------------------------------------
  3881. Returns 1 if the single-precision floating-point value `a' is equal to
  3882. the corresponding value `b', and 0 otherwise. The comparison is performed
  3883. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3884. -------------------------------------------------------------------------------
  3885. *}
  3886. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3887. Begin
  3888. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3889. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3890. ) then
  3891. Begin
  3892. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3893. Begin
  3894. float_raise( float_flag_invalid );
  3895. End;
  3896. float32_eq := 0;
  3897. exit;
  3898. End;
  3899. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3900. End;
  3901. {*
  3902. -------------------------------------------------------------------------------
  3903. Returns 1 if the single-precision floating-point value `a' is less than
  3904. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3905. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3906. Arithmetic.
  3907. -------------------------------------------------------------------------------
  3908. *}
  3909. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3910. var
  3911. aSign, bSign: flag;
  3912. Begin
  3913. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3914. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3915. ) then
  3916. Begin
  3917. float_raise( float_flag_invalid );
  3918. float32_le := 0;
  3919. exit;
  3920. End;
  3921. aSign := extractFloat32Sign( a.float32 );
  3922. bSign := extractFloat32Sign( b.float32 );
  3923. if ( aSign <> bSign ) then
  3924. Begin
  3925. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3926. exit;
  3927. End;
  3928. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3929. End;
  3930. {*
  3931. -------------------------------------------------------------------------------
  3932. Returns 1 if the single-precision floating-point value `a' is less than
  3933. the corresponding value `b', and 0 otherwise. The comparison is performed
  3934. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3935. -------------------------------------------------------------------------------
  3936. *}
  3937. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3938. var
  3939. aSign, bSign: flag;
  3940. Begin
  3941. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3942. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3943. ) then
  3944. Begin
  3945. float_raise( float_flag_invalid );
  3946. float32_lt :=0;
  3947. exit;
  3948. End;
  3949. aSign := extractFloat32Sign( a.float32 );
  3950. bSign := extractFloat32Sign( b.float32 );
  3951. if ( aSign <> bSign ) then
  3952. Begin
  3953. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3954. exit;
  3955. End;
  3956. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3957. End;
  3958. {*
  3959. -------------------------------------------------------------------------------
  3960. Returns 1 if the single-precision floating-point value `a' is equal to
  3961. the corresponding value `b', and 0 otherwise. The invalid exception is
  3962. raised if either operand is a NaN. Otherwise, the comparison is performed
  3963. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3964. -------------------------------------------------------------------------------
  3965. *}
  3966. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3967. Begin
  3968. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3969. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3970. ) then
  3971. Begin
  3972. float_raise( float_flag_invalid );
  3973. float32_eq_signaling := 0;
  3974. exit;
  3975. End;
  3976. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3977. End;
  3978. {*
  3979. -------------------------------------------------------------------------------
  3980. Returns 1 if the single-precision floating-point value `a' is less than or
  3981. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3982. cause an exception. Otherwise, the comparison is performed according to the
  3983. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3984. -------------------------------------------------------------------------------
  3985. *}
  3986. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3987. Var
  3988. aSign, bSign: flag;
  3989. Begin
  3990. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3991. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3992. ) then
  3993. Begin
  3994. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3995. Begin
  3996. float_raise( float_flag_invalid );
  3997. End;
  3998. float32_le_quiet := 0;
  3999. exit;
  4000. End;
  4001. aSign := extractFloat32Sign( a );
  4002. bSign := extractFloat32Sign( b );
  4003. if ( aSign <> bSign ) then
  4004. Begin
  4005. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  4006. exit;
  4007. End;
  4008. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  4009. End;
  4010. {*
  4011. -------------------------------------------------------------------------------
  4012. Returns 1 if the single-precision floating-point value `a' is less than
  4013. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4014. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4015. Standard for Binary Floating-Point Arithmetic.
  4016. -------------------------------------------------------------------------------
  4017. *}
  4018. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4019. Var
  4020. aSign, bSign: flag;
  4021. Begin
  4022. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4023. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4024. ) then
  4025. Begin
  4026. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4027. Begin
  4028. float_raise( float_flag_invalid );
  4029. End;
  4030. float32_lt_quiet := 0;
  4031. exit;
  4032. End;
  4033. aSign := extractFloat32Sign( a );
  4034. bSign := extractFloat32Sign( b );
  4035. if ( aSign <> bSign ) then
  4036. Begin
  4037. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4038. exit;
  4039. End;
  4040. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4041. End;
  4042. {*
  4043. -------------------------------------------------------------------------------
  4044. Returns the result of converting the double-precision floating-point value
  4045. `a' to the 32-bit two's complement integer format. The conversion is
  4046. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4047. Arithmetic---which means in particular that the conversion is rounded
  4048. according to the current rounding mode. If `a' is a NaN, the largest
  4049. positive integer is returned. Otherwise, if the conversion overflows, the
  4050. largest integer with the same sign as `a' is returned.
  4051. -------------------------------------------------------------------------------
  4052. *}
  4053. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4054. var
  4055. aSign: flag;
  4056. aExp, shiftCount: int16;
  4057. aSig0, aSig1, absZ, aSigExtra: bits32;
  4058. z: int32;
  4059. roundingMode: int8;
  4060. label invalid;
  4061. Begin
  4062. aSig1 := extractFloat64Frac1( a );
  4063. aSig0 := extractFloat64Frac0( a );
  4064. aExp := extractFloat64Exp( a );
  4065. aSign := extractFloat64Sign( a );
  4066. shiftCount := aExp - $413;
  4067. if ( 0 <= shiftCount ) then
  4068. Begin
  4069. if ( $41E < aExp ) then
  4070. Begin
  4071. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4072. aSign := 0;
  4073. goto invalid;
  4074. End;
  4075. shortShift64Left(
  4076. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4077. if ( $80000000 < absZ ) then
  4078. goto invalid;
  4079. End
  4080. else
  4081. Begin
  4082. aSig1 := flag( aSig1 <> 0 );
  4083. if ( aExp < $3FE ) then
  4084. Begin
  4085. aSigExtra := aExp OR aSig0 OR aSig1;
  4086. absZ := 0;
  4087. End
  4088. else
  4089. Begin
  4090. aSig0 := aSig0 OR $00100000;
  4091. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4092. absZ := aSig0 shr ( - shiftCount );
  4093. End;
  4094. End;
  4095. roundingMode := softfloat_rounding_mode;
  4096. if ( roundingMode = float_round_nearest_even ) then
  4097. Begin
  4098. if ( sbits32(aSigExtra) < 0 ) then
  4099. Begin
  4100. Inc(absZ);
  4101. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4102. absZ := absZ and not 1;
  4103. End;
  4104. if aSign <> 0 then
  4105. z := - absZ
  4106. else
  4107. z := absZ;
  4108. End
  4109. else
  4110. Begin
  4111. aSigExtra := bits32( aSigExtra <> 0 );
  4112. if ( aSign <> 0) then
  4113. Begin
  4114. z := - ( absZ
  4115. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4116. End
  4117. else
  4118. Begin
  4119. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4120. End
  4121. End;
  4122. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4123. Begin
  4124. invalid:
  4125. float_raise( float_flag_invalid );
  4126. if (aSign <> 0 ) then
  4127. float64_to_int32 := sbits32 ($80000000)
  4128. else
  4129. float64_to_int32 := $7FFFFFFF;
  4130. exit;
  4131. End;
  4132. if ( aSigExtra <> 0) then
  4133. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  4134. float64_to_int32 := z;
  4135. End;
  4136. {*
  4137. -------------------------------------------------------------------------------
  4138. Returns the result of converting the double-precision floating-point value
  4139. `a' to the 32-bit two's complement integer format. The conversion is
  4140. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4141. Arithmetic, except that the conversion is always rounded toward zero.
  4142. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4143. the conversion overflows, the largest integer with the same sign as `a' is
  4144. returned.
  4145. -------------------------------------------------------------------------------
  4146. *}
  4147. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4148. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4149. Var
  4150. aSign: flag;
  4151. aExp, shiftCount: int16;
  4152. aSig0, aSig1, absZ, aSigExtra: bits32;
  4153. z: int32;
  4154. label invalid;
  4155. Begin
  4156. aSig1 := extractFloat64Frac1( a );
  4157. aSig0 := extractFloat64Frac0( a );
  4158. aExp := extractFloat64Exp( a );
  4159. aSign := extractFloat64Sign( a );
  4160. shiftCount := aExp - $413;
  4161. if ( 0 <= shiftCount ) then
  4162. Begin
  4163. if ( $41E < aExp ) then
  4164. Begin
  4165. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4166. aSign := 0;
  4167. goto invalid;
  4168. End;
  4169. shortShift64Left(
  4170. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4171. End
  4172. else
  4173. Begin
  4174. if ( aExp < $3FF ) then
  4175. Begin
  4176. if ( aExp OR aSig0 OR aSig1 )<>0 then
  4177. Begin
  4178. softfloat_exception_flags :=
  4179. softfloat_exception_flags or float_flag_inexact;
  4180. End;
  4181. float64_to_int32_round_to_zero := 0;
  4182. exit;
  4183. End;
  4184. aSig0 := aSig0 or $00100000;
  4185. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4186. absZ := aSig0 shr ( - shiftCount );
  4187. End;
  4188. if aSign <> 0 then
  4189. z := - absZ
  4190. else
  4191. z := absZ;
  4192. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4193. Begin
  4194. invalid:
  4195. float_raise( float_flag_invalid );
  4196. if (aSign <> 0) then
  4197. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4198. else
  4199. float64_to_int32_round_to_zero := $7FFFFFFF;
  4200. exit;
  4201. End;
  4202. if ( aSigExtra <> 0) then
  4203. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  4204. float64_to_int32_round_to_zero := z;
  4205. End;
  4206. {*----------------------------------------------------------------------------
  4207. | Returns the result of converting the double-precision floating-point value
  4208. | `a' to the 64-bit two's complement integer format. The conversion is
  4209. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4210. | Arithmetic---which means in particular that the conversion is rounded
  4211. | according to the current rounding mode. If `a' is a NaN, the largest
  4212. | positive integer is returned. Otherwise, if the conversion overflows, the
  4213. | largest integer with the same sign as `a' is returned.
  4214. *----------------------------------------------------------------------------*}
  4215. function float64_to_int64( a: float64 ): int64;
  4216. var
  4217. aSign: flag;
  4218. aExp, shiftCount: int16;
  4219. aSig, aSigExtra: bits64;
  4220. begin
  4221. aSig := extractFloat64Frac( a );
  4222. aExp := extractFloat64Exp( a );
  4223. aSign := extractFloat64Sign( a );
  4224. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4225. shiftCount := $433 - aExp;
  4226. if ( shiftCount <= 0 ) then begin
  4227. if ( $43E < aExp ) then begin
  4228. float_raise( float_flag_invalid );
  4229. if ( ( aSign = 0 )
  4230. or ( ( aExp = $7FF )
  4231. and ( aSig <> $0010000000000000 ) )
  4232. ) then begin
  4233. result := $7FFFFFFFFFFFFFFF;
  4234. exit;
  4235. end;
  4236. result := $8000000000000000;
  4237. exit;
  4238. end;
  4239. aSigExtra := 0;
  4240. aSig := aSig shl ( - shiftCount );
  4241. end
  4242. else
  4243. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4244. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4245. end;
  4246. {*----------------------------------------------------------------------------
  4247. | Returns the result of converting the double-precision floating-point value
  4248. | `a' to the 64-bit two's complement integer format. The conversion is
  4249. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4250. | Arithmetic, except that the conversion is always rounded toward zero.
  4251. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4252. | the conversion overflows, the largest integer with the same sign as `a' is
  4253. | returned.
  4254. *----------------------------------------------------------------------------*}
  4255. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4256. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4257. var
  4258. aSign: flag;
  4259. aExp, shiftCount: int16;
  4260. aSig: bits64;
  4261. z: int64;
  4262. begin
  4263. aSig := extractFloat64Frac( a );
  4264. aExp := extractFloat64Exp( a );
  4265. aSign := extractFloat64Sign( a );
  4266. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4267. shiftCount := aExp - $433;
  4268. if ( 0 <= shiftCount ) then begin
  4269. if ( $43E <= aExp ) then begin
  4270. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4271. float_raise( float_flag_invalid );
  4272. if ( ( aSign = 0 )
  4273. or ( ( aExp = $7FF )
  4274. and ( aSig <> $0010000000000000 ) )
  4275. ) then begin
  4276. result := $7FFFFFFFFFFFFFFF;
  4277. exit;
  4278. end;
  4279. end;
  4280. result := $8000000000000000;
  4281. exit;
  4282. end;
  4283. z := aSig shl shiftCount;
  4284. end
  4285. else begin
  4286. if ( aExp < $3FE ) then begin
  4287. if ( aExp or aSig <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  4288. result := 0;
  4289. exit;
  4290. end;
  4291. z := aSig shr ( - shiftCount );
  4292. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4293. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  4294. end;
  4295. if ( aSign <> 0 ) then z := - z;
  4296. result := z;
  4297. end;
  4298. {*
  4299. -------------------------------------------------------------------------------
  4300. Returns the result of converting the double-precision floating-point value
  4301. `a' to the single-precision floating-point format. The conversion is
  4302. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4303. Arithmetic.
  4304. -------------------------------------------------------------------------------
  4305. *}
  4306. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  4307. Var
  4308. aSign: flag;
  4309. aExp: int16;
  4310. aSig0, aSig1, zSig: bits32;
  4311. allZero: bits32;
  4312. tmp : CommonNanT;
  4313. Begin
  4314. aSig1 := extractFloat64Frac1( a );
  4315. aSig0 := extractFloat64Frac0( a );
  4316. aExp := extractFloat64Exp( a );
  4317. aSign := extractFloat64Sign( a );
  4318. if ( aExp = $7FF ) then
  4319. Begin
  4320. if ( aSig0 OR aSig1 ) <> 0 then
  4321. Begin
  4322. float64ToCommonNaN( a, tmp );
  4323. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4324. exit;
  4325. End;
  4326. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4327. exit;
  4328. End;
  4329. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4330. if ( aExp <> 0) then
  4331. zSig := zSig OR $40000000;
  4332. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4333. End;
  4334. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4335. {*----------------------------------------------------------------------------
  4336. | Returns the result of converting the double-precision floating-point value
  4337. | `a' to the extended double-precision floating-point format. The conversion
  4338. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4339. | Arithmetic.
  4340. *----------------------------------------------------------------------------*}
  4341. function float64_to_floatx80( a: float64 ): floatx80;
  4342. var
  4343. aSign: flag;
  4344. aExp: int16;
  4345. aSig: bits64;
  4346. begin
  4347. aSig := extractFloat64Frac( a );
  4348. aExp := extractFloat64Exp( a );
  4349. aSign := extractFloat64Sign( a );
  4350. if ( aExp = $7FF ) then begin
  4351. if ( aSig <> 0 ) then begin
  4352. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4353. exit;
  4354. end;
  4355. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4356. exit;
  4357. end;
  4358. if ( aExp = 0 ) then begin
  4359. if ( aSig = 0 ) then begin
  4360. result := packFloatx80( aSign, 0, 0 );
  4361. exit;
  4362. end;
  4363. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4364. end;
  4365. result :=
  4366. packFloatx80(
  4367. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4368. end;
  4369. {$endif FPC_SOFTFLOAT_FLOATX80}
  4370. {*
  4371. -------------------------------------------------------------------------------
  4372. Rounds the double-precision floating-point value `a' to an integer,
  4373. and returns the result as a double-precision floating-point value. The
  4374. operation is performed according to the IEC/IEEE Standard for Binary
  4375. Floating-Point Arithmetic.
  4376. -------------------------------------------------------------------------------
  4377. *}
  4378. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4379. Var
  4380. aSign: flag;
  4381. aExp: int16;
  4382. lastBitMask, roundBitsMask: bits32;
  4383. roundingMode: int8;
  4384. z: float64;
  4385. Begin
  4386. aExp := extractFloat64Exp( a );
  4387. if ( $413 <= aExp ) then
  4388. Begin
  4389. if ( $433 <= aExp ) then
  4390. Begin
  4391. if ( ( aExp = $7FF )
  4392. AND
  4393. (
  4394. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4395. ) <>0)
  4396. ) then
  4397. Begin
  4398. propagateFloat64NaN( a, a, result );
  4399. exit;
  4400. End;
  4401. result := a;
  4402. exit;
  4403. End;
  4404. lastBitMask := 1;
  4405. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4406. roundBitsMask := lastBitMask - 1;
  4407. z := a;
  4408. roundingMode := softfloat_rounding_mode;
  4409. if ( roundingMode = float_round_nearest_even ) then
  4410. Begin
  4411. if ( lastBitMask <> 0) then
  4412. Begin
  4413. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4414. if ( ( z.low and roundBitsMask ) = 0 ) then
  4415. z.low := z.low and not lastBitMask;
  4416. End
  4417. else
  4418. Begin
  4419. if ( sbits32 (z.low) < 0 ) then
  4420. Begin
  4421. Inc(z.high);
  4422. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4423. z.high := z.high and not 1;
  4424. End;
  4425. End;
  4426. End
  4427. else if ( roundingMode <> float_round_to_zero ) then
  4428. Begin
  4429. if ( extractFloat64Sign( z )
  4430. xor flag( roundingMode = float_round_up ) )<> 0 then
  4431. Begin
  4432. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4433. End;
  4434. End;
  4435. z.low := z.low and not roundBitsMask;
  4436. End
  4437. else
  4438. Begin
  4439. if ( aExp <= $3FE ) then
  4440. Begin
  4441. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4442. Begin
  4443. result := a;
  4444. exit;
  4445. End;
  4446. softfloat_exception_flags := softfloat_exception_flags or
  4447. float_flag_inexact;
  4448. aSign := extractFloat64Sign( a );
  4449. case ( softfloat_rounding_mode ) of
  4450. float_round_nearest_even:
  4451. Begin
  4452. if ( ( aExp = $3FE )
  4453. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4454. ) then
  4455. Begin
  4456. packFloat64( aSign, $3FF, 0, 0, result );
  4457. exit;
  4458. End;
  4459. End;
  4460. float_round_down:
  4461. Begin
  4462. if aSign<>0 then
  4463. packFloat64( 1, $3FF, 0, 0, result )
  4464. else
  4465. packFloat64( 0, 0, 0, 0, result );
  4466. exit;
  4467. End;
  4468. float_round_up:
  4469. Begin
  4470. if aSign <> 0 then
  4471. packFloat64( 1, 0, 0, 0, result )
  4472. else
  4473. packFloat64( 0, $3FF, 0, 0, result );
  4474. exit;
  4475. End;
  4476. end;
  4477. packFloat64( aSign, 0, 0, 0, result );
  4478. exit;
  4479. End;
  4480. lastBitMask := 1;
  4481. lastBitMask := lastBitMask shl ($413 - aExp);
  4482. roundBitsMask := lastBitMask - 1;
  4483. z.low := 0;
  4484. z.high := a.high;
  4485. roundingMode := softfloat_rounding_mode;
  4486. if ( roundingMode = float_round_nearest_even ) then
  4487. Begin
  4488. z.high := z.high + lastBitMask shr 1;
  4489. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4490. Begin
  4491. z.high := z.high and not lastBitMask;
  4492. End;
  4493. End
  4494. else if ( roundingMode <> float_round_to_zero ) then
  4495. Begin
  4496. if ( extractFloat64Sign( z )
  4497. xor flag( roundingMode = float_round_up ) )<> 0 then
  4498. Begin
  4499. z.high := z.high or bits32( a.low <> 0 );
  4500. z.high := z.high + roundBitsMask;
  4501. End;
  4502. End;
  4503. z.high := z.high and not roundBitsMask;
  4504. End;
  4505. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4506. Begin
  4507. softfloat_exception_flags :=
  4508. softfloat_exception_flags or float_flag_inexact;
  4509. End;
  4510. result := z;
  4511. End;
  4512. {*
  4513. -------------------------------------------------------------------------------
  4514. Returns the result of adding the absolute values of the double-precision
  4515. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4516. before being returned. `zSign' is ignored if the result is a NaN.
  4517. The addition is performed according to the IEC/IEEE Standard for Binary
  4518. Floating-Point Arithmetic.
  4519. -------------------------------------------------------------------------------
  4520. *}
  4521. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4522. Var
  4523. aExp, bExp, zExp: int16;
  4524. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4525. expDiff: int16;
  4526. label shiftRight1;
  4527. label roundAndPack;
  4528. Begin
  4529. aSig1 := extractFloat64Frac1( a );
  4530. aSig0 := extractFloat64Frac0( a );
  4531. aExp := extractFloat64Exp( a );
  4532. bSig1 := extractFloat64Frac1( b );
  4533. bSig0 := extractFloat64Frac0( b );
  4534. bExp := extractFloat64Exp( b );
  4535. expDiff := aExp - bExp;
  4536. if ( 0 < expDiff ) then
  4537. Begin
  4538. if ( aExp = $7FF ) then
  4539. Begin
  4540. if ( aSig0 OR aSig1 ) <> 0 then
  4541. Begin
  4542. propagateFloat64NaN( a, b, out );
  4543. exit;
  4544. end;
  4545. out := a;
  4546. exit;
  4547. End;
  4548. if ( bExp = 0 ) then
  4549. Begin
  4550. Dec(expDiff);
  4551. End
  4552. else
  4553. Begin
  4554. bSig0 := bSig0 or $00100000;
  4555. End;
  4556. shift64ExtraRightJamming(
  4557. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4558. zExp := aExp;
  4559. End
  4560. else if ( expDiff < 0 ) then
  4561. Begin
  4562. if ( bExp = $7FF ) then
  4563. Begin
  4564. if ( bSig0 OR bSig1 ) <> 0 then
  4565. Begin
  4566. propagateFloat64NaN( a, b, out );
  4567. exit;
  4568. End;
  4569. packFloat64( zSign, $7FF, 0, 0, out );
  4570. End;
  4571. if ( aExp = 0 ) then
  4572. Begin
  4573. Inc(expDiff);
  4574. End
  4575. else
  4576. Begin
  4577. aSig0 := aSig0 or $00100000;
  4578. End;
  4579. shift64ExtraRightJamming(
  4580. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4581. zExp := bExp;
  4582. End
  4583. else
  4584. Begin
  4585. if ( aExp = $7FF ) then
  4586. Begin
  4587. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4588. Begin
  4589. propagateFloat64NaN( a, b, out );
  4590. exit;
  4591. End;
  4592. out := a;
  4593. exit;
  4594. End;
  4595. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4596. if ( aExp = 0 ) then
  4597. Begin
  4598. packFloat64( zSign, 0, zSig0, zSig1, out );
  4599. exit;
  4600. End;
  4601. zSig2 := 0;
  4602. zSig0 := zSig0 or $00200000;
  4603. zExp := aExp;
  4604. goto shiftRight1;
  4605. End;
  4606. aSig0 := aSig0 or $00100000;
  4607. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4608. Dec(zExp);
  4609. if ( zSig0 < $00200000 ) then
  4610. goto roundAndPack;
  4611. Inc(zExp);
  4612. shiftRight1:
  4613. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4614. roundAndPack:
  4615. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4616. End;
  4617. {*
  4618. -------------------------------------------------------------------------------
  4619. Returns the result of subtracting the absolute values of the double-
  4620. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4621. difference is negated before being returned. `zSign' is ignored if the
  4622. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4623. Standard for Binary Floating-Point Arithmetic.
  4624. -------------------------------------------------------------------------------
  4625. *}
  4626. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4627. Var
  4628. aExp, bExp, zExp: int16;
  4629. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4630. expDiff: int16;
  4631. z: float64;
  4632. label aExpBigger;
  4633. label bExpBigger;
  4634. label aBigger;
  4635. label bBigger;
  4636. label normalizeRoundAndPack;
  4637. Begin
  4638. aSig1 := extractFloat64Frac1( a );
  4639. aSig0 := extractFloat64Frac0( a );
  4640. aExp := extractFloat64Exp( a );
  4641. bSig1 := extractFloat64Frac1( b );
  4642. bSig0 := extractFloat64Frac0( b );
  4643. bExp := extractFloat64Exp( b );
  4644. expDiff := aExp - bExp;
  4645. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4646. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4647. if ( 0 < expDiff ) then goto aExpBigger;
  4648. if ( expDiff < 0 ) then goto bExpBigger;
  4649. if ( aExp = $7FF ) then
  4650. Begin
  4651. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4652. Begin
  4653. propagateFloat64NaN( a, b, out );
  4654. exit;
  4655. End;
  4656. float_raise( float_flag_invalid );
  4657. z.low := float64_default_nan_low;
  4658. z.high := float64_default_nan_high;
  4659. out := z;
  4660. exit;
  4661. End;
  4662. if ( aExp = 0 ) then
  4663. Begin
  4664. aExp := 1;
  4665. bExp := 1;
  4666. End;
  4667. if ( bSig0 < aSig0 ) then goto aBigger;
  4668. if ( aSig0 < bSig0 ) then goto bBigger;
  4669. if ( bSig1 < aSig1 ) then goto aBigger;
  4670. if ( aSig1 < bSig1 ) then goto bBigger;
  4671. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4672. exit;
  4673. bExpBigger:
  4674. if ( bExp = $7FF ) then
  4675. Begin
  4676. if ( bSig0 OR bSig1 ) <> 0 then
  4677. Begin
  4678. propagateFloat64NaN( a, b, out );
  4679. exit;
  4680. End;
  4681. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4682. exit;
  4683. End;
  4684. if ( aExp = 0 ) then
  4685. Begin
  4686. Inc(expDiff);
  4687. End
  4688. else
  4689. Begin
  4690. aSig0 := aSig0 or $40000000;
  4691. End;
  4692. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4693. bSig0 := bSig0 or $40000000;
  4694. bBigger:
  4695. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4696. zExp := bExp;
  4697. zSign := zSign xor 1;
  4698. goto normalizeRoundAndPack;
  4699. aExpBigger:
  4700. if ( aExp = $7FF ) then
  4701. Begin
  4702. if ( aSig0 OR aSig1 ) <> 0 then
  4703. Begin
  4704. propagateFloat64NaN( a, b, out );
  4705. exit;
  4706. End;
  4707. out := a;
  4708. exit;
  4709. End;
  4710. if ( bExp = 0 ) then
  4711. Begin
  4712. Dec(expDiff);
  4713. End
  4714. else
  4715. Begin
  4716. bSig0 := bSig0 or $40000000;
  4717. End;
  4718. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4719. aSig0 := aSig0 or $40000000;
  4720. aBigger:
  4721. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4722. zExp := aExp;
  4723. normalizeRoundAndPack:
  4724. Dec(zExp);
  4725. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4726. End;
  4727. {*
  4728. -------------------------------------------------------------------------------
  4729. Returns the result of adding the double-precision floating-point values `a'
  4730. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4731. Binary Floating-Point Arithmetic.
  4732. -------------------------------------------------------------------------------
  4733. *}
  4734. Function float64_add( a: float64; b : float64) : Float64;
  4735. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4736. Var
  4737. aSign, bSign: flag;
  4738. Begin
  4739. aSign := extractFloat64Sign( a );
  4740. bSign := extractFloat64Sign( b );
  4741. if ( aSign = bSign ) then
  4742. Begin
  4743. addFloat64Sigs( a, b, aSign, result );
  4744. End
  4745. else
  4746. Begin
  4747. subFloat64Sigs( a, b, aSign, result );
  4748. End;
  4749. End;
  4750. {*
  4751. -------------------------------------------------------------------------------
  4752. Returns the result of subtracting the double-precision floating-point values
  4753. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4754. for Binary Floating-Point Arithmetic.
  4755. -------------------------------------------------------------------------------
  4756. *}
  4757. Function float64_sub(a: float64; b : float64) : Float64;
  4758. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4759. Var
  4760. aSign, bSign: flag;
  4761. Begin
  4762. aSign := extractFloat64Sign( a );
  4763. bSign := extractFloat64Sign( b );
  4764. if ( aSign = bSign ) then
  4765. Begin
  4766. subFloat64Sigs( a, b, aSign, result );
  4767. End
  4768. else
  4769. Begin
  4770. addFloat64Sigs( a, b, aSign, result );
  4771. End;
  4772. End;
  4773. {*
  4774. -------------------------------------------------------------------------------
  4775. Returns the result of multiplying the double-precision floating-point values
  4776. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4777. for Binary Floating-Point Arithmetic.
  4778. -------------------------------------------------------------------------------
  4779. *}
  4780. Function float64_mul( a: float64; b:float64) : Float64;
  4781. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4782. Var
  4783. aSign, bSign, zSign: flag;
  4784. aExp, bExp, zExp: int16;
  4785. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4786. z: float64;
  4787. label invalid;
  4788. Begin
  4789. aSig1 := extractFloat64Frac1( a );
  4790. aSig0 := extractFloat64Frac0( a );
  4791. aExp := extractFloat64Exp( a );
  4792. aSign := extractFloat64Sign( a );
  4793. bSig1 := extractFloat64Frac1( b );
  4794. bSig0 := extractFloat64Frac0( b );
  4795. bExp := extractFloat64Exp( b );
  4796. bSign := extractFloat64Sign( b );
  4797. zSign := aSign xor bSign;
  4798. if ( aExp = $7FF ) then
  4799. Begin
  4800. if ( (( aSig0 OR aSig1 ) <>0)
  4801. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4802. Begin
  4803. propagateFloat64NaN( a, b, result );
  4804. exit;
  4805. End;
  4806. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4807. packFloat64( zSign, $7FF, 0, 0, result );
  4808. exit;
  4809. End;
  4810. if ( bExp = $7FF ) then
  4811. Begin
  4812. if ( bSig0 OR bSig1 )<> 0 then
  4813. Begin
  4814. propagateFloat64NaN( a, b, result );
  4815. exit;
  4816. End;
  4817. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4818. Begin
  4819. invalid:
  4820. float_raise( float_flag_invalid );
  4821. z.low := float64_default_nan_low;
  4822. z.high := float64_default_nan_high;
  4823. result := z;
  4824. exit;
  4825. End;
  4826. packFloat64( zSign, $7FF, 0, 0, result );
  4827. exit;
  4828. End;
  4829. if ( aExp = 0 ) then
  4830. Begin
  4831. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4832. Begin
  4833. packFloat64( zSign, 0, 0, 0, result );
  4834. exit;
  4835. End;
  4836. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4837. End;
  4838. if ( bExp = 0 ) then
  4839. Begin
  4840. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4841. Begin
  4842. packFloat64( zSign, 0, 0, 0, result );
  4843. exit;
  4844. End;
  4845. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4846. End;
  4847. zExp := aExp + bExp - $400;
  4848. aSig0 := aSig0 or $00100000;
  4849. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4850. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4851. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4852. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4853. if ( $00200000 <= zSig0 ) then
  4854. Begin
  4855. shift64ExtraRightJamming(
  4856. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4857. Inc(zExp);
  4858. End;
  4859. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4860. End;
  4861. {*
  4862. -------------------------------------------------------------------------------
  4863. Returns the result of dividing the double-precision floating-point value `a'
  4864. by the corresponding value `b'. The operation is performed according to the
  4865. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4866. -------------------------------------------------------------------------------
  4867. *}
  4868. Function float64_div(a: float64; b : float64) : Float64;
  4869. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4870. Var
  4871. aSign, bSign, zSign: flag;
  4872. aExp, bExp, zExp: int16;
  4873. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4874. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4875. z: float64;
  4876. label invalid;
  4877. Begin
  4878. aSig1 := extractFloat64Frac1( a );
  4879. aSig0 := extractFloat64Frac0( a );
  4880. aExp := extractFloat64Exp( a );
  4881. aSign := extractFloat64Sign( a );
  4882. bSig1 := extractFloat64Frac1( b );
  4883. bSig0 := extractFloat64Frac0( b );
  4884. bExp := extractFloat64Exp( b );
  4885. bSign := extractFloat64Sign( b );
  4886. zSign := aSign xor bSign;
  4887. if ( aExp = $7FF ) then
  4888. Begin
  4889. if ( aSig0 OR aSig1 )<> 0 then
  4890. Begin
  4891. propagateFloat64NaN( a, b, result );
  4892. exit;
  4893. end;
  4894. if ( bExp = $7FF ) then
  4895. Begin
  4896. if ( bSig0 OR bSig1 )<>0 then
  4897. Begin
  4898. propagateFloat64NaN( a, b, result );
  4899. exit;
  4900. End;
  4901. goto invalid;
  4902. End;
  4903. packFloat64( zSign, $7FF, 0, 0, result );
  4904. exit;
  4905. End;
  4906. if ( bExp = $7FF ) then
  4907. Begin
  4908. if ( bSig0 OR bSig1 )<> 0 then
  4909. Begin
  4910. propagateFloat64NaN( a, b, result );
  4911. exit;
  4912. End;
  4913. packFloat64( zSign, 0, 0, 0, result );
  4914. exit;
  4915. End;
  4916. if ( bExp = 0 ) then
  4917. Begin
  4918. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4919. Begin
  4920. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4921. Begin
  4922. invalid:
  4923. float_raise( float_flag_invalid );
  4924. z.low := float64_default_nan_low;
  4925. z.high := float64_default_nan_high;
  4926. result := z;
  4927. exit;
  4928. End;
  4929. float_raise( float_flag_divbyzero );
  4930. packFloat64( zSign, $7FF, 0, 0, result );
  4931. exit;
  4932. End;
  4933. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4934. End;
  4935. if ( aExp = 0 ) then
  4936. Begin
  4937. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4938. Begin
  4939. packFloat64( zSign, 0, 0, 0, result );
  4940. exit;
  4941. End;
  4942. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4943. End;
  4944. zExp := aExp - bExp + $3FD;
  4945. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4946. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4947. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4948. Begin
  4949. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4950. Inc(zExp);
  4951. End;
  4952. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4953. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4954. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4955. while ( sbits32 (rem0) < 0 ) do
  4956. Begin
  4957. Dec(zSig0);
  4958. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4959. End;
  4960. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4961. if ( ( zSig1 and $3FF ) <= 4 ) then
  4962. Begin
  4963. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4964. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4965. while ( sbits32 (rem1) < 0 ) do
  4966. Begin
  4967. Dec(zSig1);
  4968. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4969. End;
  4970. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4971. End;
  4972. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4973. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4974. End;
  4975. {*
  4976. -------------------------------------------------------------------------------
  4977. Returns the remainder of the double-precision floating-point value `a'
  4978. with respect to the corresponding value `b'. The operation is performed
  4979. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4980. -------------------------------------------------------------------------------
  4981. *}
  4982. Function float64_rem(a: float64; b : float64) : float64;
  4983. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4984. Var
  4985. aSign, zSign: flag;
  4986. aExp, bExp, expDiff: int16;
  4987. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4988. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4989. sigMean0: sbits32;
  4990. z: float64;
  4991. label invalid;
  4992. Begin
  4993. aSig1 := extractFloat64Frac1( a );
  4994. aSig0 := extractFloat64Frac0( a );
  4995. aExp := extractFloat64Exp( a );
  4996. aSign := extractFloat64Sign( a );
  4997. bSig1 := extractFloat64Frac1( b );
  4998. bSig0 := extractFloat64Frac0( b );
  4999. bExp := extractFloat64Exp( b );
  5000. if ( aExp = $7FF ) then
  5001. Begin
  5002. if ((( aSig0 OR aSig1 )<>0)
  5003. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  5004. Begin
  5005. propagateFloat64NaN( a, b, result );
  5006. exit;
  5007. End;
  5008. goto invalid;
  5009. End;
  5010. if ( bExp = $7FF ) then
  5011. Begin
  5012. if ( bSig0 OR bSig1 ) <> 0 then
  5013. Begin
  5014. propagateFloat64NaN( a, b, result );
  5015. exit;
  5016. End;
  5017. result := a;
  5018. exit;
  5019. End;
  5020. if ( bExp = 0 ) then
  5021. Begin
  5022. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5023. Begin
  5024. invalid:
  5025. float_raise( float_flag_invalid );
  5026. z.low := float64_default_nan_low;
  5027. z.high := float64_default_nan_high;
  5028. result := z;
  5029. exit;
  5030. End;
  5031. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5032. End;
  5033. if ( aExp = 0 ) then
  5034. Begin
  5035. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5036. Begin
  5037. result := a;
  5038. exit;
  5039. End;
  5040. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5041. End;
  5042. expDiff := aExp - bExp;
  5043. if ( expDiff < -1 ) then
  5044. Begin
  5045. result := a;
  5046. exit;
  5047. End;
  5048. shortShift64Left(
  5049. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5050. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5051. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5052. if ( q )<>0 then
  5053. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5054. expDiff := expDiff - 32;
  5055. while ( 0 < expDiff ) do
  5056. Begin
  5057. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5058. if 4 < q then
  5059. q:= q - 4
  5060. else
  5061. q := 0;
  5062. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5063. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5064. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5065. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5066. expDiff := expDiff - 29;
  5067. End;
  5068. if ( -32 < expDiff ) then
  5069. Begin
  5070. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5071. if 4 < q then
  5072. q := q - 4
  5073. else
  5074. q := 0;
  5075. q := q shr (- expDiff);
  5076. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5077. expDiff := expDiff + 24;
  5078. if ( expDiff < 0 ) then
  5079. Begin
  5080. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5081. End
  5082. else
  5083. Begin
  5084. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5085. End;
  5086. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5087. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5088. End
  5089. else
  5090. Begin
  5091. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5092. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5093. End;
  5094. Repeat
  5095. alternateASig0 := aSig0;
  5096. alternateASig1 := aSig1;
  5097. Inc(q);
  5098. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5099. Until not ( 0 <= sbits32 (aSig0) );
  5100. add64(
  5101. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5102. if ( ( sigMean0 < 0 )
  5103. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5104. Begin
  5105. aSig0 := alternateASig0;
  5106. aSig1 := alternateASig1;
  5107. End;
  5108. zSign := flag( sbits32 (aSig0) < 0 );
  5109. if ( zSign <> 0 ) then
  5110. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5111. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5112. End;
  5113. {*
  5114. -------------------------------------------------------------------------------
  5115. Returns the square root of the double-precision floating-point value `a'.
  5116. The operation is performed according to the IEC/IEEE Standard for Binary
  5117. Floating-Point Arithmetic.
  5118. -------------------------------------------------------------------------------
  5119. *}
  5120. Procedure float64_sqrt( a: float64; var out: float64 );
  5121. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5122. Var
  5123. aSign: flag;
  5124. aExp, zExp: int16;
  5125. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5126. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5127. z: float64;
  5128. label invalid;
  5129. Begin
  5130. aSig1 := extractFloat64Frac1( a );
  5131. aSig0 := extractFloat64Frac0( a );
  5132. aExp := extractFloat64Exp( a );
  5133. aSign := extractFloat64Sign( a );
  5134. if ( aExp = $7FF ) then
  5135. Begin
  5136. if ( aSig0 OR aSig1 ) <> 0 then
  5137. Begin
  5138. propagateFloat64NaN( a, a, out );
  5139. exit;
  5140. End;
  5141. if ( aSign = 0) then
  5142. Begin
  5143. out := a;
  5144. exit;
  5145. End;
  5146. goto invalid;
  5147. End;
  5148. if ( aSign <> 0 ) then
  5149. Begin
  5150. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  5151. Begin
  5152. out := a;
  5153. exit;
  5154. End;
  5155. invalid:
  5156. float_raise( float_flag_invalid );
  5157. z.low := float64_default_nan_low;
  5158. z.high := float64_default_nan_high;
  5159. out := z;
  5160. exit;
  5161. End;
  5162. if ( aExp = 0 ) then
  5163. Begin
  5164. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5165. Begin
  5166. packFloat64( 0, 0, 0, 0, out );
  5167. exit;
  5168. End;
  5169. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5170. End;
  5171. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5172. aSig0 := aSig0 or $00100000;
  5173. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5174. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5175. if ( zSig0 = 0 ) then
  5176. zSig0 := $7FFFFFFF;
  5177. doubleZSig0 := zSig0 + zSig0;
  5178. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5179. mul32To64( zSig0, zSig0, term0, term1 );
  5180. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5181. while ( sbits32 (rem0) < 0 ) do
  5182. Begin
  5183. Dec(zSig0);
  5184. doubleZSig0 := doubleZSig0 - 2;
  5185. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5186. End;
  5187. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5188. if ( ( zSig1 and $1FF ) <= 5 ) then
  5189. Begin
  5190. if ( zSig1 = 0 ) then
  5191. zSig1 := 1;
  5192. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5193. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5194. mul32To64( zSig1, zSig1, term2, term3 );
  5195. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5196. while ( sbits32 (rem1) < 0 ) do
  5197. Begin
  5198. Dec(zSig1);
  5199. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5200. term3 := term3 or 1;
  5201. term2 := term2 or doubleZSig0;
  5202. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5203. End;
  5204. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5205. End;
  5206. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5207. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  5208. End;
  5209. {*
  5210. -------------------------------------------------------------------------------
  5211. Returns 1 if the double-precision floating-point value `a' is equal to
  5212. the corresponding value `b', and 0 otherwise. The comparison is performed
  5213. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5214. -------------------------------------------------------------------------------
  5215. *}
  5216. Function float64_eq(a: float64; b: float64): flag;
  5217. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5218. Begin
  5219. if
  5220. (
  5221. ( extractFloat64Exp( a ) = $7FF )
  5222. AND
  5223. (
  5224. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5225. )
  5226. )
  5227. OR (
  5228. ( extractFloat64Exp( b ) = $7FF )
  5229. AND (
  5230. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5231. )
  5232. )
  5233. ) then
  5234. Begin
  5235. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5236. float_raise( float_flag_invalid );
  5237. float64_eq := 0;
  5238. exit;
  5239. End;
  5240. float64_eq := flag(
  5241. ( a.low = b.low )
  5242. AND ( ( a.high = b.high )
  5243. OR ( ( a.low = 0 )
  5244. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5245. ));
  5246. End;
  5247. {*
  5248. -------------------------------------------------------------------------------
  5249. Returns 1 if the double-precision floating-point value `a' is less than
  5250. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5251. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5252. Arithmetic.
  5253. -------------------------------------------------------------------------------
  5254. *}
  5255. Function float64_le(a: float64;b: float64): flag;
  5256. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5257. Var
  5258. aSign, bSign: flag;
  5259. Begin
  5260. if
  5261. (
  5262. ( extractFloat64Exp( a ) = $7FF )
  5263. AND
  5264. (
  5265. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5266. )
  5267. )
  5268. OR (
  5269. ( extractFloat64Exp( b ) = $7FF )
  5270. AND (
  5271. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5272. )
  5273. )
  5274. ) then
  5275. Begin
  5276. float_raise( float_flag_invalid );
  5277. float64_le := 0;
  5278. exit;
  5279. End;
  5280. aSign := extractFloat64Sign( a );
  5281. bSign := extractFloat64Sign( b );
  5282. if ( aSign <> bSign ) then
  5283. Begin
  5284. float64_le := flag(
  5285. (aSign <> 0)
  5286. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5287. = 0 ));
  5288. exit;
  5289. End;
  5290. if aSign <> 0 then
  5291. float64_le := le64( b.high, b.low, a.high, a.low )
  5292. else
  5293. float64_le := le64( a.high, a.low, b.high, b.low );
  5294. End;
  5295. {*
  5296. -------------------------------------------------------------------------------
  5297. Returns 1 if the double-precision floating-point value `a' is less than
  5298. the corresponding value `b', and 0 otherwise. The comparison is performed
  5299. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5300. -------------------------------------------------------------------------------
  5301. *}
  5302. Function float64_lt(a: float64;b: float64): flag;
  5303. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5304. Var
  5305. aSign, bSign: flag;
  5306. Begin
  5307. if
  5308. (
  5309. ( extractFloat64Exp( a ) = $7FF )
  5310. AND
  5311. (
  5312. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5313. )
  5314. )
  5315. OR (
  5316. ( extractFloat64Exp( b ) = $7FF )
  5317. AND (
  5318. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5319. )
  5320. )
  5321. ) then
  5322. Begin
  5323. float_raise( float_flag_invalid );
  5324. float64_lt := 0;
  5325. exit;
  5326. End;
  5327. aSign := extractFloat64Sign( a );
  5328. bSign := extractFloat64Sign( b );
  5329. if ( aSign <> bSign ) then
  5330. Begin
  5331. float64_lt := flag(
  5332. (aSign <> 0)
  5333. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5334. <> 0 ));
  5335. exit;
  5336. End;
  5337. if aSign <> 0 then
  5338. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5339. else
  5340. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5341. End;
  5342. {*
  5343. -------------------------------------------------------------------------------
  5344. Returns 1 if the double-precision floating-point value `a' is equal to
  5345. the corresponding value `b', and 0 otherwise. The invalid exception is
  5346. raised if either operand is a NaN. Otherwise, the comparison is performed
  5347. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5348. -------------------------------------------------------------------------------
  5349. *}
  5350. Function float64_eq_signaling( a: float64; b: float64): flag;
  5351. Begin
  5352. if
  5353. (
  5354. ( extractFloat64Exp( a ) = $7FF )
  5355. AND
  5356. (
  5357. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5358. )
  5359. )
  5360. OR (
  5361. ( extractFloat64Exp( b ) = $7FF )
  5362. AND (
  5363. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5364. )
  5365. )
  5366. ) then
  5367. Begin
  5368. float_raise( float_flag_invalid );
  5369. float64_eq_signaling := 0;
  5370. exit;
  5371. End;
  5372. float64_eq_signaling := flag(
  5373. ( a.low = b.low )
  5374. AND ( ( a.high = b.high )
  5375. OR ( ( a.low = 0 )
  5376. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5377. ));
  5378. End;
  5379. {*
  5380. -------------------------------------------------------------------------------
  5381. Returns 1 if the double-precision floating-point value `a' is less than or
  5382. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5383. cause an exception. Otherwise, the comparison is performed according to the
  5384. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5385. -------------------------------------------------------------------------------
  5386. *}
  5387. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5388. Var
  5389. aSign, bSign : flag;
  5390. Begin
  5391. if
  5392. (
  5393. ( extractFloat64Exp( a ) = $7FF )
  5394. AND
  5395. (
  5396. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5397. )
  5398. )
  5399. OR (
  5400. ( extractFloat64Exp( b ) = $7FF )
  5401. AND (
  5402. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5403. )
  5404. )
  5405. ) then
  5406. Begin
  5407. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5408. float_raise( float_flag_invalid );
  5409. float64_le_quiet := 0;
  5410. exit;
  5411. End;
  5412. aSign := extractFloat64Sign( a );
  5413. bSign := extractFloat64Sign( b );
  5414. if ( aSign <> bSign ) then
  5415. Begin
  5416. float64_le_quiet := flag
  5417. ((aSign <> 0)
  5418. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5419. = 0 ));
  5420. exit;
  5421. End;
  5422. if aSign <> 0 then
  5423. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5424. else
  5425. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5426. End;
  5427. {*
  5428. -------------------------------------------------------------------------------
  5429. Returns 1 if the double-precision floating-point value `a' is less than
  5430. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5431. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5432. Standard for Binary Floating-Point Arithmetic.
  5433. -------------------------------------------------------------------------------
  5434. *}
  5435. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5436. Var
  5437. aSign, bSign: flag;
  5438. Begin
  5439. if
  5440. (
  5441. ( extractFloat64Exp( a ) = $7FF )
  5442. AND
  5443. (
  5444. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5445. )
  5446. )
  5447. OR (
  5448. ( extractFloat64Exp( b ) = $7FF )
  5449. AND (
  5450. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5451. )
  5452. )
  5453. ) then
  5454. Begin
  5455. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5456. float_raise( float_flag_invalid );
  5457. float64_lt_quiet := 0;
  5458. exit;
  5459. End;
  5460. aSign := extractFloat64Sign( a );
  5461. bSign := extractFloat64Sign( b );
  5462. if ( aSign <> bSign ) then
  5463. Begin
  5464. float64_lt_quiet := flag(
  5465. (aSign<>0)
  5466. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5467. <> 0 ));
  5468. exit;
  5469. End;
  5470. If aSign <> 0 then
  5471. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5472. else
  5473. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5474. End;
  5475. {*----------------------------------------------------------------------------
  5476. | Returns the result of converting the 64-bit two's complement integer `a'
  5477. | to the single-precision floating-point format. The conversion is performed
  5478. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5479. *----------------------------------------------------------------------------*}
  5480. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5481. var
  5482. zSign : flag;
  5483. absA : uint64;
  5484. shiftCount: int8;
  5485. intval : int64rec;
  5486. Begin
  5487. if ( a = 0 ) then
  5488. begin
  5489. int64_to_float32.float32 := 0;
  5490. exit;
  5491. end;
  5492. if a < 0 then
  5493. zSign := flag(TRUE)
  5494. else
  5495. zSign := flag(FALSE);
  5496. if zSign<>0 then
  5497. absA := -a
  5498. else
  5499. absA := a;
  5500. shiftCount := countLeadingZeros64( absA ) - 40;
  5501. if ( 0 <= shiftCount ) then
  5502. begin
  5503. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5504. end
  5505. else
  5506. begin
  5507. shiftCount := shiftCount + 7;
  5508. if ( shiftCount < 0 ) then
  5509. begin
  5510. intval.low := int64rec(AbsA).low;
  5511. intval.high := int64rec(AbsA).high;
  5512. shift64RightJamming( intval.high, intval.low, - shiftCount,
  5513. intval.high, intval.low);
  5514. int64rec(absA).low := intval.low;
  5515. int64rec(absA).high := intval.high;
  5516. end
  5517. else
  5518. absA := absA shl shiftCount;
  5519. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5520. end;
  5521. End;
  5522. {*----------------------------------------------------------------------------
  5523. | Returns the result of converting the 64-bit two's complement integer `a'
  5524. | to the single-precision floating-point format. The conversion is performed
  5525. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5526. | Unisgned version.
  5527. *----------------------------------------------------------------------------*}
  5528. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5529. var
  5530. zSign : flag;
  5531. absA : uint64;
  5532. shiftCount: int8;
  5533. intval : int64rec;
  5534. Begin
  5535. if ( a = 0 ) then
  5536. begin
  5537. qword_to_float32.float32 := 0;
  5538. exit;
  5539. end;
  5540. zSign := flag(FALSE);
  5541. absA := a;
  5542. shiftCount := countLeadingZeros64( absA ) - 40;
  5543. if ( 0 <= shiftCount ) then
  5544. begin
  5545. qword_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5546. end
  5547. else
  5548. begin
  5549. shiftCount := shiftCount + 7;
  5550. if ( shiftCount < 0 ) then
  5551. begin
  5552. intval.low := int64rec(AbsA).low;
  5553. intval.high := int64rec(AbsA).high;
  5554. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5555. intval.low, intval.high);
  5556. int64rec(absA).low := intval.low;
  5557. int64rec(absA).high := intval.high;
  5558. end
  5559. else
  5560. absA := absA shl shiftCount;
  5561. qword_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5562. end;
  5563. End;
  5564. {*----------------------------------------------------------------------------
  5565. | Returns the result of converting the 64-bit two's complement integer `a'
  5566. | to the double-precision floating-point format. The conversion is performed
  5567. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5568. *----------------------------------------------------------------------------*}
  5569. function qword_to_float64( a: qword ): float64;
  5570. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5571. var
  5572. zSign : flag;
  5573. float_result : float64;
  5574. AbsA : bits64;
  5575. shiftcount : int8;
  5576. zSig0, zSig1 : bits32;
  5577. Begin
  5578. if ( a = 0 ) then
  5579. Begin
  5580. packFloat64( 0, 0, 0, 0, result );
  5581. exit;
  5582. end;
  5583. zSign := flag(FALSE);
  5584. AbsA := a;
  5585. shiftCount := countLeadingZeros64( absA ) - 11;
  5586. if ( 0 <= shiftCount ) then
  5587. Begin
  5588. absA := absA shl shiftcount;
  5589. zSig0:=int64rec(absA).high;
  5590. zSig1:=int64rec(absA).low;
  5591. End
  5592. else
  5593. Begin
  5594. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5595. - shiftCount, zSig0, zSig1 );
  5596. End;
  5597. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5598. qword_to_float64:= float_result;
  5599. End;
  5600. {*----------------------------------------------------------------------------
  5601. | Returns the result of converting the 64-bit two's complement integer `a'
  5602. | to the double-precision floating-point format. The conversion is performed
  5603. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5604. *----------------------------------------------------------------------------*}
  5605. function int64_to_float64( a: int64 ): float64;
  5606. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5607. var
  5608. zSign : flag;
  5609. float_result : float64;
  5610. AbsA : bits64;
  5611. shiftcount : int8;
  5612. zSig0, zSig1 : bits32;
  5613. Begin
  5614. if ( a = 0 ) then
  5615. Begin
  5616. packFloat64( 0, 0, 0, 0, result );
  5617. exit;
  5618. end;
  5619. zSign := flag( a < 0 );
  5620. if ZSign<>0 then
  5621. AbsA := -a
  5622. else
  5623. AbsA := a;
  5624. shiftCount := countLeadingZeros64( absA ) - 11;
  5625. if ( 0 <= shiftCount ) then
  5626. Begin
  5627. absA := absA shl shiftcount;
  5628. zSig0:=int64rec(absA).high;
  5629. zSig1:=int64rec(absA).low;
  5630. End
  5631. else
  5632. Begin
  5633. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5634. - shiftCount, zSig0, zSig1 );
  5635. End;
  5636. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5637. int64_to_float64:= float_result;
  5638. End;
  5639. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5640. {*----------------------------------------------------------------------------
  5641. | Returns the result of converting the 64-bit two's complement integer `a'
  5642. | to the extended double-precision floating-point format. The conversion
  5643. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5644. | Arithmetic.
  5645. *----------------------------------------------------------------------------*}
  5646. function int64_to_floatx80( a: int64 ): floatx80;
  5647. var
  5648. zSign: flag;
  5649. absA: uint64;
  5650. shiftCount: int8;
  5651. begin
  5652. if ( a = 0 ) then begin
  5653. result := packFloatx80( 0, 0, 0 );
  5654. exit;
  5655. end;
  5656. zSign := ord( a < 0 );
  5657. if zSign <> 0 then absA := - a else absA := a;
  5658. shiftCount := countLeadingZeros64( absA );
  5659. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5660. end;
  5661. {*----------------------------------------------------------------------------
  5662. | Returns the result of converting the 64-bit two's complement integer `a'
  5663. | to the extended double-precision floating-point format. The conversion
  5664. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5665. | Arithmetic.
  5666. | Unsigned version.
  5667. *----------------------------------------------------------------------------*}
  5668. function qword_to_floatx80( a: qword ): floatx80;
  5669. var
  5670. absA: bits64;
  5671. shiftCount: int8;
  5672. begin
  5673. if ( a = 0 ) then begin
  5674. result := packFloatx80( 0, 0, 0 );
  5675. exit;
  5676. end;
  5677. absA := a;
  5678. shiftCount := countLeadingZeros64( absA );
  5679. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5680. end;
  5681. {$endif FPC_SOFTFLOAT_FLOATX80}
  5682. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5683. {*----------------------------------------------------------------------------
  5684. | Returns the result of converting the 64-bit two's complement integer `a' to
  5685. | the quadruple-precision floating-point format. The conversion is performed
  5686. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5687. *----------------------------------------------------------------------------*}
  5688. function int64_to_float128( a: int64 ): float128;
  5689. var
  5690. zSign: flag;
  5691. absA: uint64;
  5692. shiftCount: int8;
  5693. zExp: int32;
  5694. zSig0, zSig1: bits64;
  5695. begin
  5696. if ( a = 0 ) then begin
  5697. result := packFloat128( 0, 0, 0, 0 );
  5698. exit;
  5699. end;
  5700. zSign := ord( a < 0 );
  5701. if zSign <> 0 then absA := - a else absA := a;
  5702. shiftCount := countLeadingZeros64( absA ) + 49;
  5703. zExp := $406E - shiftCount;
  5704. if ( 64 <= shiftCount ) then begin
  5705. zSig1 := 0;
  5706. zSig0 := absA;
  5707. dec( shiftCount, 64 );
  5708. end
  5709. else begin
  5710. zSig1 := absA;
  5711. zSig0 := 0;
  5712. end;
  5713. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5714. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5715. end;
  5716. {*----------------------------------------------------------------------------
  5717. | Returns the result of converting the 64-bit two's complement integer `a' to
  5718. | the quadruple-precision floating-point format. The conversion is performed
  5719. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5720. | Unsigned version.
  5721. *----------------------------------------------------------------------------*}
  5722. function qword_to_float128( a: qword ): float128;
  5723. var
  5724. absA: bits64;
  5725. shiftCount: int8;
  5726. zExp: int32;
  5727. zSig0, zSig1: bits64;
  5728. begin
  5729. if ( a = 0 ) then begin
  5730. result := packFloat128( 0, 0, 0, 0 );
  5731. exit;
  5732. end;
  5733. absA := a;
  5734. shiftCount := countLeadingZeros64( absA ) + 49;
  5735. zExp := $406E - shiftCount;
  5736. if ( 64 <= shiftCount ) then begin
  5737. zSig1 := 0;
  5738. zSig0 := absA;
  5739. dec( shiftCount, 64 );
  5740. end
  5741. else begin
  5742. zSig1 := absA;
  5743. zSig0 := 0;
  5744. end;
  5745. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5746. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5747. end;
  5748. {$endif FPC_SOFTFLOAT_FLOAT128}
  5749. {*----------------------------------------------------------------------------
  5750. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5751. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5752. | Otherwise, returns 0.
  5753. *----------------------------------------------------------------------------*}
  5754. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5755. begin
  5756. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5757. end;
  5758. {*----------------------------------------------------------------------------
  5759. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5760. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5761. | Otherwise, returns 0.
  5762. *----------------------------------------------------------------------------*}
  5763. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5764. begin
  5765. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5766. end;
  5767. {*----------------------------------------------------------------------------
  5768. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5769. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5770. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5771. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5772. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5773. | the most-significant bit of the extra result, and the other 63 bits of the
  5774. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5775. | were all zero. This extra result is stored in the location pointed to by
  5776. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5777. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5778. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5779. | fixed-point value is shifted right by the number of bits given in `count',
  5780. | and the integer part of the result is returned at the locations pointed to
  5781. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5782. | corrupted as described above, and is returned at the location pointed to by
  5783. | `z2Ptr'.)
  5784. *----------------------------------------------------------------------------*}
  5785. procedure shift128ExtraRightJamming(
  5786. a0: bits64;
  5787. a1: bits64;
  5788. a2: bits64;
  5789. count: int16;
  5790. var z0Ptr: bits64;
  5791. var z1Ptr: bits64;
  5792. var z2Ptr: bits64);
  5793. var
  5794. z0, z1, z2: bits64;
  5795. negCount: int8;
  5796. begin
  5797. negCount := ( - count ) and 63;
  5798. if ( count = 0 ) then
  5799. begin
  5800. z2 := a2;
  5801. z1 := a1;
  5802. z0 := a0;
  5803. end
  5804. else begin
  5805. if ( count < 64 ) then
  5806. begin
  5807. z2 := a1 shr negCount;
  5808. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5809. z0 := a0 shr count;
  5810. end
  5811. else begin
  5812. if ( count = 64 ) then
  5813. begin
  5814. z2 := a1;
  5815. z1 := a0;
  5816. end
  5817. else begin
  5818. a2 := a2 or a1;
  5819. if ( count < 128 ) then
  5820. begin
  5821. z2 := a0 shl negCount;
  5822. z1 := a0 shr ( count and 63 );
  5823. end
  5824. else begin
  5825. if ( count = 128 ) then
  5826. z2 := a0
  5827. else
  5828. z2 := ord( a0 <> 0 );
  5829. z1 := 0;
  5830. end;
  5831. end;
  5832. z0 := 0;
  5833. end;
  5834. z2 := z2 or ord( a2 <> 0 );
  5835. end;
  5836. z2Ptr := z2;
  5837. z1Ptr := z1;
  5838. z0Ptr := z0;
  5839. end;
  5840. {*----------------------------------------------------------------------------
  5841. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5842. | _plus_ the number of bits given in `count'. The shifted result is at most
  5843. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5844. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5845. | shifted off is the most-significant bit of the extra result, and the other
  5846. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5847. | bits shifted off were all zero. This extra result is stored in the location
  5848. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5849. | (This routine makes more sense if `a0' and `a1' are considered to form
  5850. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5851. | point value is shifted right by the number of bits given in `count', and
  5852. | the integer part of the result is returned at the location pointed to by
  5853. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5854. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5855. *----------------------------------------------------------------------------*}
  5856. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5857. var
  5858. z0, z1: bits64;
  5859. negCount: int8;
  5860. begin
  5861. negCount := ( - count ) and 63;
  5862. if ( count = 0 ) then
  5863. begin
  5864. z1 := a1;
  5865. z0 := a0;
  5866. end
  5867. else if ( count < 64 ) then
  5868. begin
  5869. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5870. z0 := a0 shr count;
  5871. end
  5872. else begin
  5873. if ( count = 64 ) then
  5874. begin
  5875. z1 := a0 or ord( a1 <> 0 );
  5876. end
  5877. else begin
  5878. z1 := ord( ( a0 or a1 ) <> 0 );
  5879. end;
  5880. z0 := 0;
  5881. end;
  5882. z1Ptr := z1;
  5883. z0Ptr := z0;
  5884. end;
  5885. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5886. {*----------------------------------------------------------------------------
  5887. | Returns the fraction bits of the extended double-precision floating-point
  5888. | value `a'.
  5889. *----------------------------------------------------------------------------*}
  5890. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5891. begin
  5892. result:=a.low;
  5893. end;
  5894. {*----------------------------------------------------------------------------
  5895. | Returns the exponent bits of the extended double-precision floating-point
  5896. | value `a'.
  5897. *----------------------------------------------------------------------------*}
  5898. function extractFloatx80Exp(a : floatx80): int32;inline;
  5899. begin
  5900. result:=a.high and $7FFF;
  5901. end;
  5902. {*----------------------------------------------------------------------------
  5903. | Returns the sign bit of the extended double-precision floating-point value
  5904. | `a'.
  5905. *----------------------------------------------------------------------------*}
  5906. function extractFloatx80Sign(a : floatx80): flag;inline;
  5907. begin
  5908. result:=a.high shr 15;
  5909. end;
  5910. {*----------------------------------------------------------------------------
  5911. | Normalizes the subnormal extended double-precision floating-point value
  5912. | represented by the denormalized significand `aSig'. The normalized exponent
  5913. | and significand are stored at the locations pointed to by `zExpPtr' and
  5914. | `zSigPtr', respectively.
  5915. *----------------------------------------------------------------------------*}
  5916. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5917. var
  5918. shiftCount: int8;
  5919. begin
  5920. shiftCount := countLeadingZeros64( aSig );
  5921. zSigPtr := aSig shl shiftCount;
  5922. zExpPtr := 1 - shiftCount;
  5923. end;
  5924. {*----------------------------------------------------------------------------
  5925. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5926. | extended double-precision floating-point value, returning the result.
  5927. *----------------------------------------------------------------------------*}
  5928. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5929. var
  5930. z: floatx80;
  5931. begin
  5932. z.low := zSig;
  5933. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5934. result:=z;
  5935. end;
  5936. {*----------------------------------------------------------------------------
  5937. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5938. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5939. | and returns the proper extended double-precision floating-point value
  5940. | corresponding to the abstract input. Ordinarily, the abstract value is
  5941. | rounded and packed into the extended double-precision format, with the
  5942. | inexact exception raised if the abstract input cannot be represented
  5943. | exactly. However, if the abstract value is too large, the overflow and
  5944. | inexact exceptions are raised and an infinity or maximal finite value is
  5945. | returned. If the abstract value is too small, the input value is rounded to
  5946. | a subnormal number, and the underflow and inexact exceptions are raised if
  5947. | the abstract input cannot be represented exactly as a subnormal extended
  5948. | double-precision floating-point number.
  5949. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5950. | number of bits as single or double precision, respectively. Otherwise, the
  5951. | result is rounded to the full precision of the extended double-precision
  5952. | format.
  5953. | The input significand must be normalized or smaller. If the input
  5954. | significand is not normalized, `zExp' must be 0; in that case, the result
  5955. | returned is a subnormal number, and it must not require rounding. The
  5956. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5957. | Floating-Point Arithmetic.
  5958. *----------------------------------------------------------------------------*}
  5959. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5960. var
  5961. roundingMode: int8;
  5962. roundNearestEven, increment, isTiny: flag;
  5963. roundIncrement, roundMask, roundBits: int64;
  5964. label
  5965. precision80, overflow;
  5966. begin
  5967. roundingMode := softfloat_rounding_mode;
  5968. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5969. if ( roundingPrecision = 80 ) then
  5970. goto precision80;
  5971. if ( roundingPrecision = 64 ) then
  5972. begin
  5973. roundIncrement := int64( $0000000000000400 );
  5974. roundMask := int64( $00000000000007FF );
  5975. end
  5976. else if ( roundingPrecision = 32 ) then
  5977. begin
  5978. roundIncrement := int64( $0000008000000000 );
  5979. roundMask := int64( $000000FFFFFFFFFF );
  5980. end
  5981. else begin
  5982. goto precision80;
  5983. end;
  5984. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5985. if ( not (roundNearestEven<>0) ) then
  5986. begin
  5987. if ( roundingMode = float_round_to_zero ) then
  5988. begin
  5989. roundIncrement := 0;
  5990. end
  5991. else begin
  5992. roundIncrement := roundMask;
  5993. if ( zSign<>0 ) then
  5994. begin
  5995. if ( roundingMode = float_round_up ) then
  5996. roundIncrement := 0;
  5997. end
  5998. else begin
  5999. if ( roundingMode = float_round_down ) then
  6000. roundIncrement := 0;
  6001. end;
  6002. end;
  6003. end;
  6004. roundBits := zSig0 and roundMask;
  6005. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6006. if ( ( $7FFE < zExp )
  6007. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  6008. ) then begin
  6009. goto overflow;
  6010. end;
  6011. if ( zExp <= 0 ) then begin
  6012. isTiny := ord (
  6013. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6014. or ( zExp < 0 )
  6015. or ( zSig0 <= zSig0 + roundIncrement ) );
  6016. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  6017. zExp := 0;
  6018. roundBits := zSig0 and roundMask;
  6019. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6020. if ( roundBits <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6021. inc( zSig0, roundIncrement );
  6022. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6023. roundIncrement := roundMask + 1;
  6024. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6025. roundMask := roundMask or roundIncrement;
  6026. end;
  6027. zSig0 := not roundMask;
  6028. result:=packFloatx80( zSign, zExp, zSig0 );
  6029. exit;
  6030. end;
  6031. end;
  6032. if ( roundBits <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6033. inc( zSig0, roundIncrement );
  6034. if ( zSig0 < roundIncrement ) then begin
  6035. inc(zExp);
  6036. zSig0 := bits64( $8000000000000000 );
  6037. end;
  6038. roundIncrement := roundMask + 1;
  6039. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6040. roundMask := roundMask or roundIncrement;
  6041. end;
  6042. zSig0 := not roundMask;
  6043. if ( zSig0 = 0 ) then zExp := 0;
  6044. result:=packFloatx80( zSign, zExp, zSig0 );
  6045. exit;
  6046. precision80:
  6047. increment := ord ( sbits64( zSig1 ) < 0 );
  6048. if ( roundNearestEven = 0 ) then begin
  6049. if ( roundingMode = float_round_to_zero ) then begin
  6050. increment := 0;
  6051. end
  6052. else begin
  6053. if ( zSign <> 0 ) then begin
  6054. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6055. end
  6056. else begin
  6057. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6058. end;
  6059. end;
  6060. end;
  6061. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6062. if ( ( $7FFE < zExp )
  6063. or ( ( zExp = $7FFE )
  6064. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6065. and ( increment <> 0 )
  6066. )
  6067. ) then begin
  6068. roundMask := 0;
  6069. overflow:
  6070. float_raise( float_flag_overflow or float_flag_inexact );
  6071. if ( ( roundingMode = float_round_to_zero )
  6072. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6073. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6074. ) then begin
  6075. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6076. exit;
  6077. end;
  6078. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6079. exit;
  6080. end;
  6081. if ( zExp <= 0 ) then begin
  6082. isTiny := ord(
  6083. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6084. or ( zExp < 0 )
  6085. or ( increment = 0 )
  6086. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6087. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6088. zExp := 0;
  6089. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6090. if ( zSig1 <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6091. if ( roundNearestEven <> 0 ) then begin
  6092. increment := ord( sbits64( zSig1 ) < 0 );
  6093. end
  6094. else begin
  6095. if ( zSign <> 0 ) then begin
  6096. increment := ord( roundingMode = float_round_down ) and zSig1;
  6097. end
  6098. else begin
  6099. increment := ord( roundingMode = float_round_up ) and zSig1;
  6100. end;
  6101. end;
  6102. if ( increment <> 0 ) then begin
  6103. inc(zSig0);
  6104. zSig0 :=
  6105. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6106. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6107. end;
  6108. result:=packFloatx80( zSign, zExp, zSig0 );
  6109. exit;
  6110. end;
  6111. end;
  6112. if ( zSig1 <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6113. if ( increment <> 0 ) then begin
  6114. inc(zSig0);
  6115. if ( zSig0 = 0 ) then begin
  6116. inc(zExp);
  6117. zSig0 := bits64( $8000000000000000 );
  6118. end
  6119. else begin
  6120. zSig0 := not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6121. end;
  6122. end
  6123. else begin
  6124. if ( zSig0 = 0 ) then zExp := 0;
  6125. end;
  6126. result:=packFloatx80( zSign, zExp, zSig0 );
  6127. end;
  6128. {*----------------------------------------------------------------------------
  6129. | Takes an abstract floating-point value having sign `zSign', exponent
  6130. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6131. | and returns the proper extended double-precision floating-point value
  6132. | corresponding to the abstract input. This routine is just like
  6133. | `roundAndPackFloatx80' except that the input significand does not have to be
  6134. | normalized.
  6135. *----------------------------------------------------------------------------*}
  6136. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6137. var
  6138. shiftCount: int8;
  6139. begin
  6140. if ( zSig0 = 0 ) then begin
  6141. zSig0 := zSig1;
  6142. zSig1 := 0;
  6143. dec( zExp, 64 );
  6144. end;
  6145. shiftCount := countLeadingZeros64( zSig0 );
  6146. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6147. zExp := zExp - shiftCount;
  6148. result :=
  6149. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6150. end;
  6151. {*----------------------------------------------------------------------------
  6152. | Returns the result of converting the extended double-precision floating-
  6153. | point value `a' to the 32-bit two's complement integer format. The
  6154. | conversion is performed according to the IEC/IEEE Standard for Binary
  6155. | Floating-Point Arithmetic---which means in particular that the conversion
  6156. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6157. | largest positive integer is returned. Otherwise, if the conversion
  6158. | overflows, the largest integer with the same sign as `a' is returned.
  6159. *----------------------------------------------------------------------------*}
  6160. function floatx80_to_int32(a: floatx80): int32;
  6161. var
  6162. aSign: flag;
  6163. aExp, shiftCount: int32;
  6164. aSig: bits64;
  6165. begin
  6166. aSig := extractFloatx80Frac( a );
  6167. aExp := extractFloatx80Exp( a );
  6168. aSign := extractFloatx80Sign( a );
  6169. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6170. shiftCount := $4037 - aExp;
  6171. if ( shiftCount <= 0 ) then shiftCount := 1;
  6172. shift64RightJamming( aSig, shiftCount, aSig );
  6173. result := roundAndPackInt32( aSign, aSig );
  6174. end;
  6175. {*----------------------------------------------------------------------------
  6176. | Returns the result of converting the extended double-precision floating-
  6177. | point value `a' to the 32-bit two's complement integer format. The
  6178. | conversion is performed according to the IEC/IEEE Standard for Binary
  6179. | Floating-Point Arithmetic, except that the conversion is always rounded
  6180. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6181. | Otherwise, if the conversion overflows, the largest integer with the same
  6182. | sign as `a' is returned.
  6183. *----------------------------------------------------------------------------*}
  6184. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6185. var
  6186. aSign: flag;
  6187. aExp, shiftCount: int32;
  6188. aSig, savedASig: bits64;
  6189. z: int32;
  6190. label
  6191. invalid;
  6192. begin
  6193. aSig := extractFloatx80Frac( a );
  6194. aExp := extractFloatx80Exp( a );
  6195. aSign := extractFloatx80Sign( a );
  6196. if ( $401E < aExp ) then begin
  6197. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6198. goto invalid;
  6199. end
  6200. else if ( aExp < $3FFF ) then begin
  6201. if ( aExp or aSig <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6202. result := 0;
  6203. exit;
  6204. end;
  6205. shiftCount := $403E - aExp;
  6206. savedASig := aSig;
  6207. aSig := aSig shr shiftCount;
  6208. z := aSig;
  6209. if ( aSign <> 0 ) then z := - z;
  6210. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6211. invalid:
  6212. float_raise( float_flag_invalid );
  6213. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6214. exit;
  6215. end;
  6216. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6217. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6218. end;
  6219. result := z;
  6220. end;
  6221. {*----------------------------------------------------------------------------
  6222. | Returns the result of converting the extended double-precision floating-
  6223. | point value `a' to the 64-bit two's complement integer format. The
  6224. | conversion is performed according to the IEC/IEEE Standard for Binary
  6225. | Floating-Point Arithmetic---which means in particular that the conversion
  6226. | is rounded according to the current rounding mode. If `a' is a NaN,
  6227. | the largest positive integer is returned. Otherwise, if the conversion
  6228. | overflows, the largest integer with the same sign as `a' is returned.
  6229. *----------------------------------------------------------------------------*}
  6230. function floatx80_to_int64(a: floatx80): int64;
  6231. var
  6232. aSign: flag;
  6233. aExp, shiftCount: int32;
  6234. aSig, aSigExtra: bits64;
  6235. begin
  6236. aSig := extractFloatx80Frac( a );
  6237. aExp := extractFloatx80Exp( a );
  6238. aSign := extractFloatx80Sign( a );
  6239. shiftCount := $403E - aExp;
  6240. if ( shiftCount <= 0 ) then begin
  6241. if ( shiftCount <> 0 ) then begin
  6242. float_raise( float_flag_invalid );
  6243. if ( ( aSign = 0 )
  6244. or ( ( aExp = $7FFF )
  6245. and ( aSig <> bits64( $8000000000000000 ) ) )
  6246. ) then begin
  6247. result := $7FFFFFFFFFFFFFFF;
  6248. exit;
  6249. end;
  6250. result := $8000000000000000;
  6251. exit;
  6252. end;
  6253. aSigExtra := 0;
  6254. end
  6255. else begin
  6256. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6257. end;
  6258. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6259. end;
  6260. {*----------------------------------------------------------------------------
  6261. | Returns the result of converting the extended double-precision floating-
  6262. | point value `a' to the 64-bit two's complement integer format. The
  6263. | conversion is performed according to the IEC/IEEE Standard for Binary
  6264. | Floating-Point Arithmetic, except that the conversion is always rounded
  6265. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6266. | Otherwise, if the conversion overflows, the largest integer with the same
  6267. | sign as `a' is returned.
  6268. *----------------------------------------------------------------------------*}
  6269. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6270. var
  6271. aSign: flag;
  6272. aExp, shiftCount: int32;
  6273. aSig: bits64;
  6274. z: int64;
  6275. begin
  6276. aSig := extractFloatx80Frac( a );
  6277. aExp := extractFloatx80Exp( a );
  6278. aSign := extractFloatx80Sign( a );
  6279. shiftCount := aExp - $403E;
  6280. if ( 0 <= shiftCount ) then begin
  6281. aSig := $7FFFFFFFFFFFFFFF;
  6282. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6283. float_raise( float_flag_invalid );
  6284. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6285. result := $7FFFFFFFFFFFFFFF;
  6286. exit;
  6287. end;
  6288. end;
  6289. result := $8000000000000000;
  6290. exit;
  6291. end
  6292. else if ( aExp < $3FFF ) then begin
  6293. if ( aExp or aSig <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6294. result := 0;
  6295. exit;
  6296. end;
  6297. z := aSig shr ( - shiftCount );
  6298. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6299. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6300. end;
  6301. if ( aSign <> 0 ) then z := - z;
  6302. result := z;
  6303. end;
  6304. {*----------------------------------------------------------------------------
  6305. | The pattern for a default generated extended double-precision NaN. The
  6306. | `high' and `low' values hold the most- and least-significant bits,
  6307. | respectively.
  6308. *----------------------------------------------------------------------------*}
  6309. const
  6310. floatx80_default_nan_high = $FFFF;
  6311. floatx80_default_nan_low = bits64( $C000000000000000 );
  6312. {*----------------------------------------------------------------------------
  6313. | Returns 1 if the extended double-precision floating-point value `a' is a
  6314. | signaling NaN; otherwise returns 0.
  6315. *----------------------------------------------------------------------------*}
  6316. function floatx80_is_signaling_nan(a : floatx80): flag;
  6317. var
  6318. aLow: bits64;
  6319. begin
  6320. aLow := a.low and not $4000000000000000;
  6321. result := ord(
  6322. ( a.high and $7FFF = $7FFF )
  6323. and ( bits64( aLow shl 1 ) <> 0 )
  6324. and ( a.low = aLow ) );
  6325. end;
  6326. {*----------------------------------------------------------------------------
  6327. | Returns the result of converting the extended double-precision floating-
  6328. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6329. | invalid exception is raised.
  6330. *----------------------------------------------------------------------------*}
  6331. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6332. var
  6333. z: commonNaNT;
  6334. begin
  6335. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6336. z.sign := a.high shr 15;
  6337. z.low := 0;
  6338. z.high := a.low shl 1;
  6339. result := z;
  6340. end;
  6341. {*----------------------------------------------------------------------------
  6342. | Returns 1 if the extended double-precision floating-point value `a' is a
  6343. | NaN; otherwise returns 0.
  6344. *----------------------------------------------------------------------------*}
  6345. function floatx80_is_nan(a : floatx80 ): flag;
  6346. begin
  6347. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low<<1 ) <> 0 ) );
  6348. end;
  6349. {*----------------------------------------------------------------------------
  6350. | Takes two extended double-precision floating-point values `a' and `b', one
  6351. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6352. | `b' is a signaling NaN, the invalid exception is raised.
  6353. *----------------------------------------------------------------------------*}
  6354. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6355. var
  6356. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6357. label
  6358. returnLargerSignificand;
  6359. begin
  6360. aIsNaN := floatx80_is_nan( a );
  6361. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6362. bIsNaN := floatx80_is_nan( b );
  6363. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6364. a.low := a.low or $C000000000000000;
  6365. b.low := b.low or $C000000000000000;
  6366. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6367. if aIsSignalingNaN <> 0 then begin
  6368. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6369. if bIsNaN <> 0 then result := b else result := a;
  6370. exit;
  6371. end
  6372. else if aIsNaN <>0 then begin
  6373. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6374. result := a;
  6375. exit;
  6376. end;
  6377. returnLargerSignificand:
  6378. if ( a.low < b.low ) then begin
  6379. result := b;
  6380. exit;
  6381. end;
  6382. if ( b.low < a.low ) then begin
  6383. result := a;
  6384. exit;
  6385. end;
  6386. if a.high < b.high then result := a else result := b;
  6387. exit;
  6388. end
  6389. else
  6390. result := b;
  6391. end;
  6392. {*----------------------------------------------------------------------------
  6393. | Returns the result of converting the extended double-precision floating-
  6394. | point value `a' to the single-precision floating-point format. The
  6395. | conversion is performed according to the IEC/IEEE Standard for Binary
  6396. | Floating-Point Arithmetic.
  6397. *----------------------------------------------------------------------------*}
  6398. function floatx80_to_float32(a: floatx80): float32;
  6399. var
  6400. aSign: flag;
  6401. aExp: int32;
  6402. aSig: bits64;
  6403. begin
  6404. aSig := extractFloatx80Frac( a );
  6405. aExp := extractFloatx80Exp( a );
  6406. aSign := extractFloatx80Sign( a );
  6407. if ( aExp = $7FFF ) then begin
  6408. if bits64( aSig shl 1 ) <> 0 then begin
  6409. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6410. exit;
  6411. end;
  6412. result := packFloat32( aSign, $FF, 0 );
  6413. exit;
  6414. end;
  6415. shift64RightJamming( aSig, 33, aSig );
  6416. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6417. result := roundAndPackFloat32( aSign, aExp, aSig );
  6418. end;
  6419. {*----------------------------------------------------------------------------
  6420. | Returns the result of converting the extended double-precision floating-
  6421. | point value `a' to the double-precision floating-point format. The
  6422. | conversion is performed according to the IEC/IEEE Standard for Binary
  6423. | Floating-Point Arithmetic.
  6424. *----------------------------------------------------------------------------*}
  6425. function floatx80_to_float64(a: floatx80): float64;
  6426. var
  6427. aSign: flag;
  6428. aExp: int32;
  6429. aSig, zSig: bits64;
  6430. begin
  6431. aSig := extractFloatx80Frac( a );
  6432. aExp := extractFloatx80Exp( a );
  6433. aSign := extractFloatx80Sign( a );
  6434. if ( aExp = $7FFF ) then begin
  6435. if bits64( aSig shl 1 ) <> 0 then begin
  6436. commonNaNToFloat64( floatx80ToCommonNaN( a ), result );
  6437. exit;
  6438. end;
  6439. result := packFloat64( aSign, $7FF, 0 );
  6440. exit;
  6441. end;
  6442. shift64RightJamming( aSig, 1, zSig );
  6443. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6444. result := roundAndPackFloat64( aSign, aExp, zSig );
  6445. end;
  6446. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6447. {*----------------------------------------------------------------------------
  6448. | Returns the result of converting the extended double-precision floating-
  6449. | point value `a' to the quadruple-precision floating-point format. The
  6450. | conversion is performed according to the IEC/IEEE Standard for Binary
  6451. | Floating-Point Arithmetic.
  6452. *----------------------------------------------------------------------------*}
  6453. function floatx80_to_float128(a: floatx80): float128;
  6454. var
  6455. aSign: flag;
  6456. aExp: int16;
  6457. aSig, zSig0, zSig1: bits64;
  6458. begin
  6459. aSig := extractFloatx80Frac( a );
  6460. aExp := extractFloatx80Exp( a );
  6461. aSign := extractFloatx80Sign( a );
  6462. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6463. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6464. exit;
  6465. end;
  6466. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6467. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6468. end;
  6469. {$endif FPC_SOFTFLOAT_FLOAT128}
  6470. {*----------------------------------------------------------------------------
  6471. | Rounds the extended double-precision floating-point value `a' to an integer,
  6472. | and Returns the result as an extended quadruple-precision floating-point
  6473. | value. The operation is performed according to the IEC/IEEE Standard for
  6474. | Binary Floating-Point Arithmetic.
  6475. *----------------------------------------------------------------------------*}
  6476. function floatx80_round_to_int(a: floatx80): floatx80;
  6477. var
  6478. aSign: flag;
  6479. aExp: int32;
  6480. lastBitMask, roundBitsMask: bits64;
  6481. roundingMode: int8;
  6482. z: floatx80;
  6483. begin
  6484. aExp := extractFloatx80Exp( a );
  6485. if ( $403E <= aExp ) then begin
  6486. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6487. result := propagateFloatx80NaN( a, a );
  6488. exit;
  6489. end;
  6490. result := a;
  6491. exit;
  6492. end;
  6493. if ( aExp < $3FFF ) then begin
  6494. if ( ( aExp = 0 )
  6495. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6496. result := a;
  6497. exit;
  6498. end;
  6499. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6500. aSign := extractFloatx80Sign( a );
  6501. case softfloat_rounding_mode of
  6502. float_round_nearest_even:
  6503. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6504. ) then begin
  6505. result :=
  6506. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6507. exit;
  6508. end;
  6509. float_round_down: begin
  6510. if aSign <> 0 then
  6511. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6512. else
  6513. result := packFloatx80( 0, 0, 0 );
  6514. exit;
  6515. end;
  6516. float_round_up: begin
  6517. if aSign <> 0 then
  6518. result := packFloatx80( 1, 0, 0 )
  6519. else
  6520. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6521. exit;
  6522. end;
  6523. end;
  6524. result := packFloatx80( aSign, 0, 0 );
  6525. exit;
  6526. end;
  6527. lastBitMask := 1;
  6528. lastBitMask := lastBitMask shl ( $403E - aExp );
  6529. roundBitsMask := lastBitMask - 1;
  6530. z := a;
  6531. roundingMode := softfloat_rounding_mode;
  6532. if ( roundingMode = float_round_nearest_even ) then begin
  6533. inc( z.low, lastBitMask shr 1 );
  6534. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := not lastBitMask;
  6535. end
  6536. else if ( roundingMode <> float_round_to_zero ) then begin
  6537. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6538. inc( z.low, roundBitsMask );
  6539. end;
  6540. end;
  6541. z.low := not roundBitsMask;
  6542. if ( z.low = 0 ) then begin
  6543. inc(z.high);
  6544. z.low := bits64( $8000000000000000 );
  6545. end;
  6546. if ( z.low <> a.low ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6547. result := z;
  6548. end;
  6549. {*----------------------------------------------------------------------------
  6550. | Returns the result of adding the absolute values of the extended double-
  6551. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6552. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6553. | The addition is performed according to the IEC/IEEE Standard for Binary
  6554. | Floating-Point Arithmetic.
  6555. *----------------------------------------------------------------------------*}
  6556. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6557. var
  6558. aExp, bExp, zExp: int32;
  6559. aSig, bSig, zSig0, zSig1: bits64;
  6560. expDiff: int32;
  6561. label
  6562. shiftRight1, roundAndPack;
  6563. begin
  6564. aSig := extractFloatx80Frac( a );
  6565. aExp := extractFloatx80Exp( a );
  6566. bSig := extractFloatx80Frac( b );
  6567. bExp := extractFloatx80Exp( b );
  6568. expDiff := aExp - bExp;
  6569. if ( 0 < expDiff ) then begin
  6570. if ( aExp = $7FFF ) then begin
  6571. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6572. result := propagateFloatx80NaN( a, b );
  6573. exit;
  6574. end;
  6575. result := a;
  6576. exit;
  6577. end;
  6578. if ( bExp = 0 ) then dec(expDiff);
  6579. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6580. zExp := aExp;
  6581. end
  6582. else if ( expDiff < 0 ) then begin
  6583. if ( bExp = $7FFF ) then begin
  6584. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6585. result := propagateFloatx80NaN( a, b );
  6586. exit;
  6587. end;
  6588. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6589. exit;
  6590. end;
  6591. if ( aExp = 0 ) then inc(expDiff);
  6592. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6593. zExp := bExp;
  6594. end
  6595. else begin
  6596. if ( aExp = $7FFF ) then begin
  6597. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6598. result := propagateFloatx80NaN( a, b );
  6599. exit;
  6600. end;
  6601. result := a;
  6602. exit;
  6603. end;
  6604. zSig1 := 0;
  6605. zSig0 := aSig + bSig;
  6606. if ( aExp = 0 ) then begin
  6607. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6608. goto roundAndPack;
  6609. end;
  6610. zExp := aExp;
  6611. goto shiftRight1;
  6612. end;
  6613. zSig0 := aSig + bSig;
  6614. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6615. shiftRight1:
  6616. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6617. zSig0 := zSig0 or $8000000000000000;
  6618. inc(zExp);
  6619. roundAndPack:
  6620. result :=
  6621. roundAndPackFloatx80(
  6622. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6623. end;
  6624. {*----------------------------------------------------------------------------
  6625. | Returns the result of subtracting the absolute values of the extended
  6626. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6627. | difference is negated before being returned. `zSign' is ignored if the
  6628. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6629. | Standard for Binary Floating-Point Arithmetic.
  6630. *----------------------------------------------------------------------------*}
  6631. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6632. var
  6633. aExp, bExp, zExp: int32;
  6634. aSig, bSig, zSig0, zSig1: bits64;
  6635. expDiff: int32;
  6636. z: floatx80;
  6637. label
  6638. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6639. begin
  6640. aSig := extractFloatx80Frac( a );
  6641. aExp := extractFloatx80Exp( a );
  6642. bSig := extractFloatx80Frac( b );
  6643. bExp := extractFloatx80Exp( b );
  6644. expDiff := aExp - bExp;
  6645. if ( 0 < expDiff ) then goto aExpBigger;
  6646. if ( expDiff < 0 ) then goto bExpBigger;
  6647. if ( aExp = $7FFF ) then begin
  6648. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6649. result := propagateFloatx80NaN( a, b );
  6650. exit;
  6651. end;
  6652. float_raise( float_flag_invalid );
  6653. z.low := floatx80_default_nan_low;
  6654. z.high := floatx80_default_nan_high;
  6655. result := z;
  6656. exit;
  6657. end;
  6658. if ( aExp = 0 ) then begin
  6659. aExp := 1;
  6660. bExp := 1;
  6661. end;
  6662. zSig1 := 0;
  6663. if ( bSig < aSig ) then goto aBigger;
  6664. if ( aSig < bSig ) then goto bBigger;
  6665. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6666. exit;
  6667. bExpBigger:
  6668. if ( bExp = $7FFF ) then begin
  6669. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6670. result := propagateFloatx80NaN( a, b );
  6671. exit;
  6672. end;
  6673. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6674. exit;
  6675. end;
  6676. if ( aExp = 0 ) then inc(expDiff);
  6677. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6678. bBigger:
  6679. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6680. zExp := bExp;
  6681. zSign := zSign xor 1;
  6682. goto normalizeRoundAndPack;
  6683. aExpBigger:
  6684. if ( aExp = $7FFF ) then begin
  6685. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6686. result := propagateFloatx80NaN( a, b );
  6687. exit;
  6688. end;
  6689. result := a;
  6690. exit;
  6691. end;
  6692. if ( bExp = 0 ) then dec(expDiff);
  6693. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6694. aBigger:
  6695. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6696. zExp := aExp;
  6697. normalizeRoundAndPack:
  6698. result :=
  6699. normalizeRoundAndPackFloatx80(
  6700. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6701. end;
  6702. {*----------------------------------------------------------------------------
  6703. | Returns the result of adding the extended double-precision floating-point
  6704. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6705. | Standard for Binary Floating-Point Arithmetic.
  6706. *----------------------------------------------------------------------------*}
  6707. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6708. var
  6709. aSign, bSign: flag;
  6710. begin
  6711. aSign := extractFloatx80Sign( a );
  6712. bSign := extractFloatx80Sign( b );
  6713. if ( aSign = bSign ) then begin
  6714. result := addFloatx80Sigs( a, b, aSign );
  6715. end
  6716. else begin
  6717. result := subFloatx80Sigs( a, b, aSign );
  6718. end;
  6719. end;
  6720. {*----------------------------------------------------------------------------
  6721. | Returns the result of subtracting the extended double-precision floating-
  6722. | point values `a' and `b'. The operation is performed according to the
  6723. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6724. *----------------------------------------------------------------------------*}
  6725. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6726. var
  6727. aSign, bSign: flag;
  6728. begin
  6729. aSign := extractFloatx80Sign( a );
  6730. bSign := extractFloatx80Sign( b );
  6731. if ( aSign = bSign ) then begin
  6732. result := subFloatx80Sigs( a, b, aSign );
  6733. end
  6734. else begin
  6735. result := addFloatx80Sigs( a, b, aSign );
  6736. end;
  6737. end;
  6738. {*----------------------------------------------------------------------------
  6739. | Returns the result of multiplying the extended double-precision floating-
  6740. | point values `a' and `b'. The operation is performed according to the
  6741. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6742. *----------------------------------------------------------------------------*}
  6743. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6744. var
  6745. aSign, bSign, zSign: flag;
  6746. aExp, bExp, zExp: int32;
  6747. aSig, bSig, zSig0, zSig1: bits64;
  6748. z: floatx80;
  6749. label
  6750. invalid;
  6751. begin
  6752. aSig := extractFloatx80Frac( a );
  6753. aExp := extractFloatx80Exp( a );
  6754. aSign := extractFloatx80Sign( a );
  6755. bSig := extractFloatx80Frac( b );
  6756. bExp := extractFloatx80Exp( b );
  6757. bSign := extractFloatx80Sign( b );
  6758. zSign := aSign xor bSign;
  6759. if ( aExp = $7FFF ) then begin
  6760. if ( bits64( aSig shl 1 ) <> 0 )
  6761. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6762. result := propagateFloatx80NaN( a, b );
  6763. exit;
  6764. end;
  6765. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6766. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6767. exit;
  6768. end;
  6769. if ( bExp = $7FFF ) then begin
  6770. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6771. result := propagateFloatx80NaN( a, b );
  6772. exit;
  6773. end;
  6774. if ( ( aExp or aSig ) = 0 ) then begin
  6775. invalid:
  6776. float_raise( float_flag_invalid );
  6777. z.low := floatx80_default_nan_low;
  6778. z.high := floatx80_default_nan_high;
  6779. result := z;
  6780. exit;
  6781. end;
  6782. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6783. exit;
  6784. end;
  6785. if ( aExp = 0 ) then begin
  6786. if ( aSig = 0 ) then begin
  6787. result := packFloatx80( zSign, 0, 0 );
  6788. exit;
  6789. end;
  6790. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6791. end;
  6792. if ( bExp = 0 ) then begin
  6793. if ( bSig = 0 ) then begin
  6794. result := packFloatx80( zSign, 0, 0 );
  6795. exit;
  6796. end;
  6797. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6798. end;
  6799. zExp := aExp + bExp - $3FFE;
  6800. mul64To128( aSig, bSig, zSig0, zSig1 );
  6801. if 0 < sbits64( zSig0 ) then begin
  6802. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6803. dec(zExp);
  6804. end;
  6805. result :=
  6806. roundAndPackFloatx80(
  6807. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6808. end;
  6809. {*----------------------------------------------------------------------------
  6810. | Returns the result of dividing the extended double-precision floating-point
  6811. | value `a' by the corresponding value `b'. The operation is performed
  6812. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6813. *----------------------------------------------------------------------------*}
  6814. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6815. var
  6816. aSign, bSign, zSign: flag;
  6817. aExp, bExp, zExp: int32;
  6818. aSig, bSig, zSig0, zSig1: bits64;
  6819. rem0, rem1, rem2, term0, term1, term2: bits64;
  6820. z: floatx80;
  6821. label
  6822. invalid;
  6823. begin
  6824. aSig := extractFloatx80Frac( a );
  6825. aExp := extractFloatx80Exp( a );
  6826. aSign := extractFloatx80Sign( a );
  6827. bSig := extractFloatx80Frac( b );
  6828. bExp := extractFloatx80Exp( b );
  6829. bSign := extractFloatx80Sign( b );
  6830. zSign := aSign xor bSign;
  6831. if ( aExp = $7FFF ) then begin
  6832. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6833. result := propagateFloatx80NaN( a, b );
  6834. exit;
  6835. end;
  6836. if ( bExp = $7FFF ) then begin
  6837. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6838. result := propagateFloatx80NaN( a, b );
  6839. exit;
  6840. end;
  6841. goto invalid;
  6842. end;
  6843. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6844. exit;
  6845. end;
  6846. if ( bExp = $7FFF ) then begin
  6847. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6848. result := propagateFloatx80NaN( a, b );
  6849. exit;
  6850. end;
  6851. result := packFloatx80( zSign, 0, 0 );
  6852. exit;
  6853. end;
  6854. if ( bExp = 0 ) then begin
  6855. if ( bSig = 0 ) then begin
  6856. if ( ( aExp or aSig ) = 0 ) then begin
  6857. invalid:
  6858. float_raise( float_flag_invalid );
  6859. z.low := floatx80_default_nan_low;
  6860. z.high := floatx80_default_nan_high;
  6861. result := z;
  6862. exit;
  6863. end;
  6864. float_raise( float_flag_divbyzero );
  6865. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6866. exit;
  6867. end;
  6868. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6869. end;
  6870. if ( aExp = 0 ) then begin
  6871. if ( aSig = 0 ) then begin
  6872. result := packFloatx80( zSign, 0, 0 );
  6873. exit;
  6874. end;
  6875. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6876. end;
  6877. zExp := aExp - bExp + $3FFE;
  6878. rem1 := 0;
  6879. if ( bSig <= aSig ) then begin
  6880. shift128Right( aSig, 0, 1, aSig, rem1 );
  6881. inc(zExp);
  6882. end;
  6883. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6884. mul64To128( bSig, zSig0, term0, term1 );
  6885. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6886. while ( sbits64( rem0 ) < 0 ) do begin
  6887. dec(zSig0);
  6888. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6889. end;
  6890. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6891. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6892. mul64To128( bSig, zSig1, term1, term2 );
  6893. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6894. while ( sbits64( rem1 ) < 0 ) do begin
  6895. dec(zSig1);
  6896. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6897. end;
  6898. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6899. end;
  6900. result :=
  6901. roundAndPackFloatx80(
  6902. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6903. end;
  6904. {*----------------------------------------------------------------------------
  6905. | Returns the remainder of the extended double-precision floating-point value
  6906. | `a' with respect to the corresponding value `b'. The operation is performed
  6907. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6908. *----------------------------------------------------------------------------*}
  6909. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6910. var
  6911. aSign, zSign: flag;
  6912. aExp, bExp, expDiff: int32;
  6913. aSig0, aSig1, bSig: bits64;
  6914. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6915. z: floatx80;
  6916. label
  6917. invalid;
  6918. begin
  6919. aSig0 := extractFloatx80Frac( a );
  6920. aExp := extractFloatx80Exp( a );
  6921. aSign := extractFloatx80Sign( a );
  6922. bSig := extractFloatx80Frac( b );
  6923. bExp := extractFloatx80Exp( b );
  6924. if ( aExp = $7FFF ) then begin
  6925. if ( bits64( aSig0 shl 1 ) <> 0 )
  6926. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6927. result := propagateFloatx80NaN( a, b );
  6928. exit;
  6929. end;
  6930. goto invalid;
  6931. end;
  6932. if ( bExp = $7FFF ) then begin
  6933. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6934. result := propagateFloatx80NaN( a, b );
  6935. exit;
  6936. end;
  6937. result := a;
  6938. exit;
  6939. end;
  6940. if ( bExp = 0 ) then begin
  6941. if ( bSig = 0 ) then begin
  6942. invalid:
  6943. float_raise( float_flag_invalid );
  6944. z.low := floatx80_default_nan_low;
  6945. z.high := floatx80_default_nan_high;
  6946. result := z;
  6947. exit;
  6948. end;
  6949. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6950. end;
  6951. if ( aExp = 0 ) then begin
  6952. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6953. result := a;
  6954. exit;
  6955. end;
  6956. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6957. end;
  6958. bSig := bSig or $8000000000000000;
  6959. zSign := aSign;
  6960. expDiff := aExp - bExp;
  6961. aSig1 := 0;
  6962. if ( expDiff < 0 ) then begin
  6963. if ( expDiff < -1 ) then begin
  6964. result := a;
  6965. exit;
  6966. end;
  6967. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6968. expDiff := 0;
  6969. end;
  6970. q := ord( bSig <= aSig0 );
  6971. if ( q <> 0 ) then dec( aSig0, bSig );
  6972. dec( expDiff, 64 );
  6973. while ( 0 < expDiff ) do begin
  6974. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6975. if ( 2 < q ) then q := q - 2 else q := 0;
  6976. mul64To128( bSig, q, term0, term1 );
  6977. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6978. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6979. dec( expDiff, 62 );
  6980. end;
  6981. inc( expDiff, 64 );
  6982. if ( 0 < expDiff ) then begin
  6983. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6984. if ( 2 < q ) then q:= q - 2 else q := 0;
  6985. q := q shr ( 64 - expDiff );
  6986. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6987. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6988. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6989. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  6990. inc(q);
  6991. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6992. end;
  6993. end
  6994. else begin
  6995. term1 := 0;
  6996. term0 := bSig;
  6997. end;
  6998. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6999. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7000. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7001. and ( q and 1 <> 0 ) )
  7002. then begin
  7003. aSig0 := alternateASig0;
  7004. aSig1 := alternateASig1;
  7005. zSign := ord( zSign = 0 );
  7006. end;
  7007. result :=
  7008. normalizeRoundAndPackFloatx80(
  7009. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  7010. end;
  7011. {*----------------------------------------------------------------------------
  7012. | Returns the square root of the extended double-precision floating-point
  7013. | value `a'. The operation is performed according to the IEC/IEEE Standard
  7014. | for Binary Floating-Point Arithmetic.
  7015. *----------------------------------------------------------------------------*}
  7016. function floatx80_sqrt(a: floatx80): floatx80;
  7017. var
  7018. aSign: flag;
  7019. aExp, zExp: int32;
  7020. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7021. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7022. z: floatx80;
  7023. label
  7024. invalid;
  7025. begin
  7026. aSig0 := extractFloatx80Frac( a );
  7027. aExp := extractFloatx80Exp( a );
  7028. aSign := extractFloatx80Sign( a );
  7029. if ( aExp = $7FFF ) then begin
  7030. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7031. result := propagateFloatx80NaN( a, a );
  7032. exit;
  7033. end;
  7034. if ( aSign = 0 ) then begin
  7035. result := a;
  7036. exit;
  7037. end;
  7038. goto invalid;
  7039. end;
  7040. if ( aSign <> 0 ) then begin
  7041. if ( ( aExp or aSig0 ) = 0 ) then begin
  7042. result := a;
  7043. exit;
  7044. end;
  7045. invalid:
  7046. float_raise( float_flag_invalid );
  7047. z.low := floatx80_default_nan_low;
  7048. z.high := floatx80_default_nan_high;
  7049. result := z;
  7050. exit;
  7051. end;
  7052. if ( aExp = 0 ) then begin
  7053. if ( aSig0 = 0 ) then begin
  7054. result := packFloatx80( 0, 0, 0 );
  7055. exit;
  7056. end;
  7057. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7058. end;
  7059. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  7060. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  7061. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7062. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7063. doubleZSig0 := zSig0 shl 1;
  7064. mul64To128( zSig0, zSig0, term0, term1 );
  7065. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7066. while ( sbits64( rem0 ) < 0 ) do begin
  7067. dec(zSig0);
  7068. dec( doubleZSig0, 2 );
  7069. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  7070. end;
  7071. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7072. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7073. if ( zSig1 = 0 ) then zSig1 := 1;
  7074. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7075. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7076. mul64To128( zSig1, zSig1, term2, term3 );
  7077. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7078. while ( sbits64( rem1 ) < 0 ) do begin
  7079. dec(zSig1);
  7080. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7081. term3 := term3 or 1;
  7082. term2 := term2 or doubleZSig0;
  7083. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7084. end;
  7085. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7086. end;
  7087. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7088. zSig0 := zSig0 or doubleZSig0;
  7089. result :=
  7090. roundAndPackFloatx80(
  7091. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7092. end;
  7093. {*----------------------------------------------------------------------------
  7094. | Returns 1 if the extended double-precision floating-point value `a' is
  7095. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7096. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7097. | Arithmetic.
  7098. *----------------------------------------------------------------------------*}
  7099. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7100. begin
  7101. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7102. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7103. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7104. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7105. ) then begin
  7106. if ( floatx80_is_signaling_nan( a )
  7107. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7108. float_raise( float_flag_invalid );
  7109. end;
  7110. result := 0;
  7111. exit;
  7112. end;
  7113. result := ord(
  7114. ( a.low = b.low )
  7115. and ( ( a.high = b.high )
  7116. or ( ( a.low = 0 )
  7117. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7118. ) );
  7119. end;
  7120. {*----------------------------------------------------------------------------
  7121. | Returns 1 if the extended double-precision floating-point value `a' is
  7122. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7123. | comparison is performed according to the IEC/IEEE Standard for Binary
  7124. | Floating-Point Arithmetic.
  7125. *----------------------------------------------------------------------------*}
  7126. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7127. var
  7128. aSign, bSign: flag;
  7129. begin
  7130. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7131. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7132. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7133. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7134. then begin
  7135. float_raise( float_flag_invalid );
  7136. result := 0;
  7137. exit;
  7138. end;
  7139. aSign := extractFloatx80Sign( a );
  7140. bSign := extractFloatx80Sign( b );
  7141. if ( aSign <> bSign ) then begin
  7142. result := ord(
  7143. ( aSign <> 0 )
  7144. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7145. exit;
  7146. end;
  7147. if aSign<>0 then
  7148. result := le128( b.high, b.low, a.high, a.low )
  7149. else
  7150. result := le128( a.high, a.low, b.high, b.low );
  7151. end;
  7152. {*----------------------------------------------------------------------------
  7153. | Returns 1 if the extended double-precision floating-point value `a' is
  7154. | less than the corresponding value `b', and 0 otherwise. The comparison
  7155. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7156. | Arithmetic.
  7157. *----------------------------------------------------------------------------*}
  7158. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7159. var
  7160. aSign, bSign: flag;
  7161. begin
  7162. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7163. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7164. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7165. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7166. then begin
  7167. float_raise( float_flag_invalid );
  7168. result := 0;
  7169. exit;
  7170. end;
  7171. aSign := extractFloatx80Sign( a );
  7172. bSign := extractFloatx80Sign( b );
  7173. if ( aSign <> bSign ) then begin
  7174. result := ord(
  7175. ( aSign <> 0 )
  7176. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7177. exit;
  7178. end;
  7179. if aSign <> 0 then
  7180. result := lt128( b.high, b.low, a.high, a.low )
  7181. else
  7182. result := lt128( a.high, a.low, b.high, b.low );
  7183. end;
  7184. {*----------------------------------------------------------------------------
  7185. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7186. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7187. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7188. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7189. *----------------------------------------------------------------------------*}
  7190. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7191. begin
  7192. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7193. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7194. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7195. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7196. then begin
  7197. float_raise( float_flag_invalid );
  7198. result := 0;
  7199. exit;
  7200. end;
  7201. result := ord(
  7202. ( a.low = b.low )
  7203. and ( ( a.high = b.high )
  7204. or ( ( a.low = 0 )
  7205. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7206. ) );
  7207. end;
  7208. {*----------------------------------------------------------------------------
  7209. | Returns 1 if the extended double-precision floating-point value `a' is less
  7210. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7211. | do not cause an exception. Otherwise, the comparison is performed according
  7212. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7213. *----------------------------------------------------------------------------*}
  7214. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7215. var
  7216. aSign, bSign: flag;
  7217. begin
  7218. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7219. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7220. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7221. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7222. then begin
  7223. if ( floatx80_is_signaling_nan( a )
  7224. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7225. float_raise( float_flag_invalid );
  7226. end;
  7227. result := 0;
  7228. exit;
  7229. end;
  7230. aSign := extractFloatx80Sign( a );
  7231. bSign := extractFloatx80Sign( b );
  7232. if ( aSign <> bSign ) then begin
  7233. result := ord(
  7234. ( aSign <> 0 )
  7235. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7236. exit;
  7237. end;
  7238. if aSign <> 0 then
  7239. result := le128( b.high, b.low, a.high, a.low )
  7240. else
  7241. result := le128( a.high, a.low, b.high, b.low );
  7242. end;
  7243. {*----------------------------------------------------------------------------
  7244. | Returns 1 if the extended double-precision floating-point value `a' is less
  7245. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7246. | an exception. Otherwise, the comparison is performed according to the
  7247. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7248. *----------------------------------------------------------------------------*}
  7249. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7250. var
  7251. aSign, bSign: flag;
  7252. begin
  7253. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7254. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7255. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7256. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7257. then begin
  7258. if ( floatx80_is_signaling_nan( a )
  7259. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7260. float_raise( float_flag_invalid );
  7261. end;
  7262. result := 0;
  7263. exit;
  7264. end;
  7265. aSign := extractFloatx80Sign( a );
  7266. bSign := extractFloatx80Sign( b );
  7267. if ( aSign <> bSign ) then begin
  7268. result := ord(
  7269. ( aSign <> 0 )
  7270. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7271. exit;
  7272. end;
  7273. if aSign <> 0 then
  7274. result := lt128( b.high, b.low, a.high, a.low )
  7275. else
  7276. result := lt128( a.high, a.low, b.high, b.low );
  7277. end;
  7278. {$endif FPC_SOFTFLOAT_FLOATX80}
  7279. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7280. {*----------------------------------------------------------------------------
  7281. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7282. | floating-point value `a'.
  7283. *----------------------------------------------------------------------------*}
  7284. function extractFloat128Frac1(a : float128): bits64;
  7285. begin
  7286. result:=a.low;
  7287. end;
  7288. {*----------------------------------------------------------------------------
  7289. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7290. | floating-point value `a'.
  7291. *----------------------------------------------------------------------------*}
  7292. function extractFloat128Frac0(a : float128): bits64;
  7293. begin
  7294. result:=a.high and int64($0000FFFFFFFFFFFF);
  7295. end;
  7296. {*----------------------------------------------------------------------------
  7297. | Returns the exponent bits of the quadruple-precision floating-point value
  7298. | `a'.
  7299. *----------------------------------------------------------------------------*}
  7300. function extractFloat128Exp(a : float128): int32;
  7301. begin
  7302. result:=( a.high shr 48 ) and $7FFF;
  7303. end;
  7304. {*----------------------------------------------------------------------------
  7305. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7306. *----------------------------------------------------------------------------*}
  7307. function extractFloat128Sign(a : float128): flag;
  7308. begin
  7309. result:=a.high shr 63;
  7310. end;
  7311. {*----------------------------------------------------------------------------
  7312. | Normalizes the subnormal quadruple-precision floating-point value
  7313. | represented by the denormalized significand formed by the concatenation of
  7314. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7315. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7316. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7317. | least significant 64 bits of the normalized significand are stored at the
  7318. | location pointed to by `zSig1Ptr'.
  7319. *----------------------------------------------------------------------------*}
  7320. procedure normalizeFloat128Subnormal(
  7321. aSig0: bits64;
  7322. aSig1: bits64;
  7323. var zExpPtr: int32;
  7324. var zSig0Ptr: bits64;
  7325. var zSig1Ptr: bits64);
  7326. var
  7327. shiftCount: int8;
  7328. begin
  7329. if ( aSig0 = 0 ) then
  7330. begin
  7331. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7332. if ( shiftCount < 0 ) then
  7333. begin
  7334. zSig0Ptr := aSig1 shr ( - shiftCount );
  7335. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7336. end
  7337. else begin
  7338. zSig0Ptr := aSig1 shl shiftCount;
  7339. zSig1Ptr := 0;
  7340. end;
  7341. zExpPtr := - shiftCount - 63;
  7342. end
  7343. else begin
  7344. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7345. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7346. zExpPtr := 1 - shiftCount;
  7347. end;
  7348. end;
  7349. {*----------------------------------------------------------------------------
  7350. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7351. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7352. | floating-point value, returning the result. After being shifted into the
  7353. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7354. | added together to form the most significant 32 bits of the result. This
  7355. | means that any integer portion of `zSig0' will be added into the exponent.
  7356. | Since a properly normalized significand will have an integer portion equal
  7357. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7358. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7359. | significand.
  7360. *----------------------------------------------------------------------------*}
  7361. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7362. var
  7363. z: float128;
  7364. begin
  7365. z.low := zSig1;
  7366. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7367. result:=z;
  7368. end;
  7369. {*----------------------------------------------------------------------------
  7370. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7371. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7372. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7373. | corresponding to the abstract input. Ordinarily, the abstract value is
  7374. | simply rounded and packed into the quadruple-precision format, with the
  7375. | inexact exception raised if the abstract input cannot be represented
  7376. | exactly. However, if the abstract value is too large, the overflow and
  7377. | inexact exceptions are raised and an infinity or maximal finite value is
  7378. | returned. If the abstract value is too small, the input value is rounded to
  7379. | a subnormal number, and the underflow and inexact exceptions are raised if
  7380. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7381. | precision floating-point number.
  7382. | The input significand must be normalized or smaller. If the input
  7383. | significand is not normalized, `zExp' must be 0; in that case, the result
  7384. | returned is a subnormal number, and it must not require rounding. In the
  7385. | usual case that the input significand is normalized, `zExp' must be 1 less
  7386. | than the ``true'' floating-point exponent. The handling of underflow and
  7387. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7388. *----------------------------------------------------------------------------*}
  7389. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7390. var
  7391. roundingMode: int8;
  7392. roundNearestEven, increment, isTiny: flag;
  7393. begin
  7394. roundingMode := softfloat_rounding_mode;
  7395. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7396. increment := ord( sbits64(zSig2) < 0 );
  7397. if ( roundNearestEven=0 ) then
  7398. begin
  7399. if ( roundingMode = float_round_to_zero ) then
  7400. begin
  7401. increment := 0;
  7402. end
  7403. else begin
  7404. if ( zSign<>0 ) then
  7405. begin
  7406. increment := ord( roundingMode = float_round_down ) and zSig2;
  7407. end
  7408. else begin
  7409. increment := ord( roundingMode = float_round_up ) and zSig2;
  7410. end;
  7411. end;
  7412. end;
  7413. if ( $7FFD <= bits32(zExp) ) then
  7414. begin
  7415. if ( ord( $7FFD < zExp )
  7416. or ( ord( zExp = $7FFD )
  7417. and eq128(
  7418. int64( $0001FFFFFFFFFFFF ),
  7419. bits64( $FFFFFFFFFFFFFFFF ),
  7420. zSig0,
  7421. zSig1
  7422. )
  7423. and increment
  7424. )
  7425. )<>0 then
  7426. begin
  7427. float_raise( float_flag_overflow or float_flag_inexact );
  7428. if ( ord( roundingMode = float_round_to_zero )
  7429. or ( zSign and ord( roundingMode = float_round_up ) )
  7430. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  7431. )<>0 then
  7432. begin
  7433. result :=
  7434. packFloat128(
  7435. zSign,
  7436. $7FFE,
  7437. int64( $0000FFFFFFFFFFFF ),
  7438. bits64( $FFFFFFFFFFFFFFFF )
  7439. );
  7440. exit;
  7441. end;
  7442. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7443. exit;
  7444. end;
  7445. if ( zExp < 0 ) then
  7446. begin
  7447. isTiny :=
  7448. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7449. or ( zExp < -1 )
  7450. or not( increment<>0 )
  7451. or boolean(lt128(
  7452. zSig0,
  7453. zSig1,
  7454. int64( $0001FFFFFFFFFFFF ),
  7455. bits64( $FFFFFFFFFFFFFFFF )
  7456. )));
  7457. shift128ExtraRightJamming(
  7458. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7459. zExp := 0;
  7460. if ( isTiny and zSig2 )<>0 then
  7461. float_raise( float_flag_underflow );
  7462. if ( roundNearestEven<>0 ) then
  7463. begin
  7464. increment := ord( sbits64(zSig2) < 0 );
  7465. end
  7466. else begin
  7467. if ( zSign<>0 ) then
  7468. begin
  7469. increment := ord( roundingMode = float_round_down ) and zSig2;
  7470. end
  7471. else begin
  7472. increment := ord( roundingMode = float_round_up ) and zSig2;
  7473. end;
  7474. end;
  7475. end;
  7476. end;
  7477. if ( zSig2<>0 ) then
  7478. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7479. if ( increment<>0 ) then
  7480. begin
  7481. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7482. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7483. end
  7484. else begin
  7485. if ( ( zSig0 or zSig1 ) = 0 ) then
  7486. zExp := 0;
  7487. end;
  7488. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7489. end;
  7490. {*----------------------------------------------------------------------------
  7491. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7492. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7493. | returns the proper quadruple-precision floating-point value corresponding
  7494. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7495. | except that the input significand has fewer bits and does not have to be
  7496. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7497. | point exponent.
  7498. *----------------------------------------------------------------------------*}
  7499. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7500. var
  7501. shiftCount: int8;
  7502. zSig2: bits64;
  7503. begin
  7504. if ( zSig0 = 0 ) then
  7505. begin
  7506. zSig0 := zSig1;
  7507. zSig1 := 0;
  7508. dec(zExp, 64);
  7509. end;
  7510. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7511. if ( 0 <= shiftCount ) then
  7512. begin
  7513. zSig2 := 0;
  7514. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7515. end
  7516. else begin
  7517. shift128ExtraRightJamming(
  7518. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7519. end;
  7520. dec(zExp, shiftCount);
  7521. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7522. end;
  7523. {*----------------------------------------------------------------------------
  7524. | Returns the result of converting the quadruple-precision floating-point
  7525. | value `a' to the 32-bit two's complement integer format. The conversion
  7526. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7527. | Arithmetic---which means in particular that the conversion is rounded
  7528. | according to the current rounding mode. If `a' is a NaN, the largest
  7529. | positive integer is returned. Otherwise, if the conversion overflows, the
  7530. | largest integer with the same sign as `a' is returned.
  7531. *----------------------------------------------------------------------------*}
  7532. function float128_to_int32(a: float128): int32;
  7533. var
  7534. aSign: flag;
  7535. aExp, shiftCount: int32;
  7536. aSig0, aSig1: bits64;
  7537. begin
  7538. aSig1 := extractFloat128Frac1( a );
  7539. aSig0 := extractFloat128Frac0( a );
  7540. aExp := extractFloat128Exp( a );
  7541. aSign := extractFloat128Sign( a );
  7542. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7543. aSign := 0;
  7544. if ( aExp<>0 ) then
  7545. aSig0 := aSig0 or int64( $0001000000000000 );
  7546. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7547. shiftCount := $4028 - aExp;
  7548. if ( 0 < shiftCount ) then
  7549. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7550. result := roundAndPackInt32( aSign, aSig0 );
  7551. end;
  7552. {*----------------------------------------------------------------------------
  7553. | Returns the result of converting the quadruple-precision floating-point
  7554. | value `a' to the 32-bit two's complement integer format. The conversion
  7555. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7556. | Arithmetic, except that the conversion is always rounded toward zero. If
  7557. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7558. | conversion overflows, the largest integer with the same sign as `a' is
  7559. | returned.
  7560. *----------------------------------------------------------------------------*}
  7561. function float128_to_int32_round_to_zero(a: float128): int32;
  7562. var
  7563. aSign: flag;
  7564. aExp, shiftCount: int32;
  7565. aSig0, aSig1, savedASig: bits64;
  7566. z: int32;
  7567. label
  7568. invalid;
  7569. begin
  7570. aSig1 := extractFloat128Frac1( a );
  7571. aSig0 := extractFloat128Frac0( a );
  7572. aExp := extractFloat128Exp( a );
  7573. aSign := extractFloat128Sign( a );
  7574. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7575. if ( $401E < aExp ) then
  7576. begin
  7577. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7578. aSign := 0;
  7579. goto invalid;
  7580. end
  7581. else if ( aExp < $3FFF ) then
  7582. begin
  7583. if ( aExp or aSig0 )<>0 then
  7584. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7585. result := 0;
  7586. exit;
  7587. end;
  7588. aSig0 := aSig0 or int64( $0001000000000000 );
  7589. shiftCount := $402F - aExp;
  7590. savedASig := aSig0;
  7591. aSig0 := aSig0 shr shiftCount;
  7592. z := aSig0;
  7593. if ( aSign )<>0 then
  7594. z := - z;
  7595. if ( ord( z < 0 ) xor aSign )<>0 then
  7596. begin
  7597. invalid:
  7598. float_raise( float_flag_invalid );
  7599. if aSign<>0 then
  7600. result:= int32( $80000000 )
  7601. else
  7602. result:=$7FFFFFFF;
  7603. exit;
  7604. end;
  7605. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7606. begin
  7607. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7608. end;
  7609. result := z;
  7610. end;
  7611. {*----------------------------------------------------------------------------
  7612. | Returns the result of converting the quadruple-precision floating-point
  7613. | value `a' to the 64-bit two's complement integer format. The conversion
  7614. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7615. | Arithmetic---which means in particular that the conversion is rounded
  7616. | according to the current rounding mode. If `a' is a NaN, the largest
  7617. | positive integer is returned. Otherwise, if the conversion overflows, the
  7618. | largest integer with the same sign as `a' is returned.
  7619. *----------------------------------------------------------------------------*}
  7620. function float128_to_int64(a: float128): int64;
  7621. var
  7622. aSign: flag;
  7623. aExp, shiftCount: int32;
  7624. aSig0, aSig1: bits64;
  7625. begin
  7626. aSig1 := extractFloat128Frac1( a );
  7627. aSig0 := extractFloat128Frac0( a );
  7628. aExp := extractFloat128Exp( a );
  7629. aSign := extractFloat128Sign( a );
  7630. if ( aExp<>0 ) then
  7631. aSig0 := aSig0 or int64( $0001000000000000 );
  7632. shiftCount := $402F - aExp;
  7633. if ( shiftCount <= 0 ) then
  7634. begin
  7635. if ( $403E < aExp ) then
  7636. begin
  7637. float_raise( float_flag_invalid );
  7638. if ( (aSign=0)
  7639. or ( ( aExp = $7FFF )
  7640. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7641. )
  7642. ) then
  7643. begin
  7644. result := int64( $7FFFFFFFFFFFFFFF );
  7645. exit;
  7646. end;
  7647. result := int64( $8000000000000000 );
  7648. exit;
  7649. end;
  7650. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7651. end
  7652. else begin
  7653. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7654. end;
  7655. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7656. end;
  7657. {*----------------------------------------------------------------------------
  7658. | Returns the result of converting the quadruple-precision floating-point
  7659. | value `a' to the 64-bit two's complement integer format. The conversion
  7660. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7661. | Arithmetic, except that the conversion is always rounded toward zero.
  7662. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7663. | the conversion overflows, the largest integer with the same sign as `a' is
  7664. | returned.
  7665. *----------------------------------------------------------------------------*}
  7666. function float128_to_int64_round_to_zero(a: float128): int64;
  7667. var
  7668. aSign: flag;
  7669. aExp, shiftCount: int32;
  7670. aSig0, aSig1: bits64;
  7671. z: int64;
  7672. begin
  7673. aSig1 := extractFloat128Frac1( a );
  7674. aSig0 := extractFloat128Frac0( a );
  7675. aExp := extractFloat128Exp( a );
  7676. aSign := extractFloat128Sign( a );
  7677. if ( aExp<>0 ) then
  7678. aSig0 := aSig0 or int64( $0001000000000000 );
  7679. shiftCount := aExp - $402F;
  7680. if ( 0 < shiftCount ) then
  7681. begin
  7682. if ( $403E <= aExp ) then
  7683. begin
  7684. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7685. if ( ( a.high = bits64( $C03E000000000000 ) )
  7686. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7687. begin
  7688. if ( aSig1<>0 ) then
  7689. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7690. end
  7691. else begin
  7692. float_raise( float_flag_invalid );
  7693. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7694. begin
  7695. result := int64( $7FFFFFFFFFFFFFFF );
  7696. exit;
  7697. end;
  7698. end;
  7699. result := int64( $8000000000000000 );
  7700. exit;
  7701. end;
  7702. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  7703. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7704. begin
  7705. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7706. end;
  7707. end
  7708. else begin
  7709. if ( aExp < $3FFF ) then
  7710. begin
  7711. if ( aExp or aSig0 or aSig1 )<>0 then
  7712. begin
  7713. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7714. end;
  7715. result := 0;
  7716. exit;
  7717. end;
  7718. z := aSig0 shr ( - shiftCount );
  7719. if ( (aSig1<>0)
  7720. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7721. begin
  7722. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7723. end;
  7724. end;
  7725. if ( aSign<>0 ) then
  7726. z := - z;
  7727. result := z;
  7728. end;
  7729. {*----------------------------------------------------------------------------
  7730. | Returns the result of converting the quadruple-precision floating-point
  7731. | value `a' to the single-precision floating-point format. The conversion
  7732. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7733. | Arithmetic.
  7734. *----------------------------------------------------------------------------*}
  7735. function float128_to_float32(a: float128): float32;
  7736. var
  7737. aSign: flag;
  7738. aExp: int32;
  7739. aSig0, aSig1: bits64;
  7740. zSig: bits32;
  7741. begin
  7742. aSig1 := extractFloat128Frac1( a );
  7743. aSig0 := extractFloat128Frac0( a );
  7744. aExp := extractFloat128Exp( a );
  7745. aSign := extractFloat128Sign( a );
  7746. if ( aExp = $7FFF ) then
  7747. begin
  7748. if ( aSig0 or aSig1 )<>0 then
  7749. begin
  7750. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7751. exit;
  7752. end;
  7753. result := packFloat32( aSign, $FF, 0 );
  7754. exit;
  7755. end;
  7756. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7757. shift64RightJamming( aSig0, 18, aSig0 );
  7758. zSig := aSig0;
  7759. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7760. begin
  7761. zSig := zSig or $40000000;
  7762. dec(aExp,$3F81);
  7763. end;
  7764. result := roundAndPackFloat32( aSign, aExp, zSig );
  7765. end;
  7766. {*----------------------------------------------------------------------------
  7767. | Returns the result of converting the quadruple-precision floating-point
  7768. | value `a' to the double-precision floating-point format. The conversion
  7769. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7770. | Arithmetic.
  7771. *----------------------------------------------------------------------------*}
  7772. function float128_to_float64(a: float128): float64;
  7773. var
  7774. aSign: flag;
  7775. aExp: int32;
  7776. aSig0, aSig1: bits64;
  7777. begin
  7778. aSig1 := extractFloat128Frac1( a );
  7779. aSig0 := extractFloat128Frac0( a );
  7780. aExp := extractFloat128Exp( a );
  7781. aSign := extractFloat128Sign( a );
  7782. if ( aExp = $7FFF ) then
  7783. begin
  7784. if ( aSig0 or aSig1 )<>0 then
  7785. begin
  7786. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7787. exit;
  7788. end;
  7789. result:=packFloat64( aSign, $7FF, 0);
  7790. exit;
  7791. end;
  7792. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7793. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7794. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7795. begin
  7796. aSig0 := aSig0 or int64( $4000000000000000 );
  7797. dec(aExp,$3C01);
  7798. end;
  7799. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7800. end;
  7801. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7802. {*----------------------------------------------------------------------------
  7803. | Returns the result of converting the quadruple-precision floating-point
  7804. | value `a' to the extended double-precision floating-point format. The
  7805. | conversion is performed according to the IEC/IEEE Standard for Binary
  7806. | Floating-Point Arithmetic.
  7807. *----------------------------------------------------------------------------*}
  7808. function float128_to_floatx80(a: float128): floatx80;
  7809. var
  7810. aSign: flag;
  7811. aExp: int32;
  7812. aSig0, aSig1: bits64;
  7813. begin
  7814. aSig1 := extractFloat128Frac1( a );
  7815. aSig0 := extractFloat128Frac0( a );
  7816. aExp := extractFloat128Exp( a );
  7817. aSign := extractFloat128Sign( a );
  7818. if ( aExp = $7FFF ) then begin
  7819. if ( aSig0 or aSig1 <> 0 ) then begin
  7820. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7821. exit;
  7822. end;
  7823. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7824. exit;
  7825. end;
  7826. if ( aExp = 0 ) then begin
  7827. if ( ( aSig0 or aSig1 ) = 0 ) then
  7828. begin
  7829. result := packFloatx80( aSign, 0, 0 );
  7830. exit;
  7831. end;
  7832. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7833. end
  7834. else begin
  7835. aSig0 := aSig0 or int64( $0001000000000000 );
  7836. end;
  7837. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7838. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7839. end;
  7840. {$endif FPC_SOFTFLOAT_FLOATX80}
  7841. {*----------------------------------------------------------------------------
  7842. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7843. | Returns the result as a quadruple-precision floating-point value. The
  7844. | operation is performed according to the IEC/IEEE Standard for Binary
  7845. | Floating-Point Arithmetic.
  7846. *----------------------------------------------------------------------------*}
  7847. function float128_round_to_int(a: float128): float128;
  7848. var
  7849. aSign: flag;
  7850. aExp: int32;
  7851. lastBitMask, roundBitsMask: bits64;
  7852. roundingMode: int8;
  7853. z: float128;
  7854. begin
  7855. aExp := extractFloat128Exp( a );
  7856. if ( $402F <= aExp ) then
  7857. begin
  7858. if ( $406F <= aExp ) then
  7859. begin
  7860. if ( ( aExp = $7FFF )
  7861. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7862. ) then
  7863. begin
  7864. result := propagateFloat128NaN( a, a );
  7865. exit;
  7866. end;
  7867. result := a;
  7868. exit;
  7869. end;
  7870. lastBitMask := 1;
  7871. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7872. roundBitsMask := lastBitMask - 1;
  7873. z := a;
  7874. roundingMode := softfloat_rounding_mode;
  7875. if ( roundingMode = float_round_nearest_even ) then
  7876. begin
  7877. if ( lastBitMask )<>0 then
  7878. begin
  7879. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7880. if ( ( z.low and roundBitsMask ) = 0 ) then
  7881. z.low := z.low and not(lastBitMask);
  7882. end
  7883. else begin
  7884. if ( sbits64(z.low) < 0 ) then
  7885. begin
  7886. inc(z.high);
  7887. if ( bits64( z.low shl 1 ) = 0 ) then
  7888. z.high := z.high and not bits64( 1 );
  7889. end;
  7890. end;
  7891. end
  7892. else if ( roundingMode <> float_round_to_zero ) then
  7893. begin
  7894. if ( extractFloat128Sign( z )
  7895. xor ord( roundingMode = float_round_up ) )<>0 then
  7896. begin
  7897. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7898. end;
  7899. end;
  7900. z.low := z.low and not(roundBitsMask);
  7901. end
  7902. else begin
  7903. if ( aExp < $3FFF ) then
  7904. begin
  7905. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7906. begin
  7907. result := a;
  7908. exit;
  7909. end;
  7910. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7911. aSign := extractFloat128Sign( a );
  7912. case softfloat_rounding_mode of
  7913. float_round_nearest_even:
  7914. if ( ( aExp = $3FFE )
  7915. and ( (extractFloat128Frac0( a )<>0)
  7916. or (extractFloat128Frac1( a )<>0) )
  7917. ) then begin
  7918. begin
  7919. result := packFloat128( aSign, $3FFF, 0, 0 );
  7920. exit;
  7921. end;
  7922. end;
  7923. float_round_down:
  7924. begin
  7925. if aSign<>0 then
  7926. result:=packFloat128( 1, $3FFF, 0, 0 )
  7927. else
  7928. result:=packFloat128( 0, 0, 0, 0 );
  7929. exit;
  7930. end;
  7931. float_round_up:
  7932. begin
  7933. if aSign<>0 then
  7934. result := packFloat128( 1, 0, 0, 0 )
  7935. else
  7936. result:=packFloat128( 0, $3FFF, 0, 0 );
  7937. exit;
  7938. end;
  7939. end;
  7940. result := packFloat128( aSign, 0, 0, 0 );
  7941. exit;
  7942. end;
  7943. lastBitMask := 1;
  7944. lastBitMask := lastBitMask shl ($402F - aExp);
  7945. roundBitsMask := lastBitMask - 1;
  7946. z.low := 0;
  7947. z.high := a.high;
  7948. roundingMode := softfloat_rounding_mode;
  7949. if ( roundingMode = float_round_nearest_even ) then begin
  7950. inc(z.high,lastBitMask shr 1);
  7951. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7952. z.high := z.high and not(lastBitMask);
  7953. end;
  7954. end
  7955. else if ( roundingMode <> float_round_to_zero ) then begin
  7956. if ( (extractFloat128Sign( z )<>0)
  7957. xor ( roundingMode = float_round_up ) ) then begin
  7958. z.high := z.high or ord( a.low <> 0 );
  7959. z.high := z.high+roundBitsMask;
  7960. end;
  7961. end;
  7962. z.high := z.high and not(roundBitsMask);
  7963. end;
  7964. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7965. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7966. end;
  7967. result := z;
  7968. end;
  7969. {*----------------------------------------------------------------------------
  7970. | Returns the result of adding the absolute values of the quadruple-precision
  7971. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7972. | before being returned. `zSign' is ignored if the result is a NaN.
  7973. | The addition is performed according to the IEC/IEEE Standard for Binary
  7974. | Floating-Point Arithmetic.
  7975. *----------------------------------------------------------------------------*}
  7976. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7977. var
  7978. aExp, bExp, zExp: int32;
  7979. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7980. expDiff: int32;
  7981. label
  7982. shiftRight1,roundAndPack;
  7983. begin
  7984. aSig1 := extractFloat128Frac1( a );
  7985. aSig0 := extractFloat128Frac0( a );
  7986. aExp := extractFloat128Exp( a );
  7987. bSig1 := extractFloat128Frac1( b );
  7988. bSig0 := extractFloat128Frac0( b );
  7989. bExp := extractFloat128Exp( b );
  7990. expDiff := aExp - bExp;
  7991. if ( 0 < expDiff ) then begin
  7992. if ( aExp = $7FFF ) then begin
  7993. if ( aSig0 or aSig1 )<>0 then
  7994. begin
  7995. result := propagateFloat128NaN( a, b );
  7996. exit;
  7997. end;
  7998. result := a;
  7999. exit;
  8000. end;
  8001. if ( bExp = 0 ) then begin
  8002. dec(expDiff);
  8003. end
  8004. else begin
  8005. bSig0 := bSig0 or int64( $0001000000000000 );
  8006. end;
  8007. shift128ExtraRightJamming(
  8008. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  8009. zExp := aExp;
  8010. end
  8011. else if ( expDiff < 0 ) then begin
  8012. if ( bExp = $7FFF ) then begin
  8013. if ( bSig0 or bSig1 )<>0 then
  8014. begin
  8015. result := propagateFloat128NaN( a, b );
  8016. exit;
  8017. end;
  8018. result := packFloat128( zSign, $7FFF, 0, 0 );
  8019. exit;
  8020. end;
  8021. if ( aExp = 0 ) then begin
  8022. inc(expDiff);
  8023. end
  8024. else begin
  8025. aSig0 := aSig0 or int64( $0001000000000000 );
  8026. end;
  8027. shift128ExtraRightJamming(
  8028. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8029. zExp := bExp;
  8030. end
  8031. else begin
  8032. if ( aExp = $7FFF ) then begin
  8033. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8034. result := propagateFloat128NaN( a, b );
  8035. exit;
  8036. end;
  8037. result := a;
  8038. exit;
  8039. end;
  8040. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8041. if ( aExp = 0 ) then
  8042. begin
  8043. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8044. exit;
  8045. end;
  8046. zSig2 := 0;
  8047. zSig0 := zSig0 or int64( $0002000000000000 );
  8048. zExp := aExp;
  8049. goto shiftRight1;
  8050. end;
  8051. aSig0 := aSig0 or int64( $0001000000000000 );
  8052. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8053. dec(zExp);
  8054. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8055. inc(zExp);
  8056. shiftRight1:
  8057. shift128ExtraRightJamming(
  8058. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8059. roundAndPack:
  8060. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8061. end;
  8062. {*----------------------------------------------------------------------------
  8063. | Returns the result of subtracting the absolute values of the quadruple-
  8064. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8065. | difference is negated before being returned. `zSign' is ignored if the
  8066. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8067. | Standard for Binary Floating-Point Arithmetic.
  8068. *----------------------------------------------------------------------------*}
  8069. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8070. var
  8071. aExp, bExp, zExp: int32;
  8072. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8073. expDiff: int32;
  8074. z: float128;
  8075. label
  8076. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8077. begin
  8078. aSig1 := extractFloat128Frac1( a );
  8079. aSig0 := extractFloat128Frac0( a );
  8080. aExp := extractFloat128Exp( a );
  8081. bSig1 := extractFloat128Frac1( b );
  8082. bSig0 := extractFloat128Frac0( b );
  8083. bExp := extractFloat128Exp( b );
  8084. expDiff := aExp - bExp;
  8085. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8086. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8087. if ( 0 < expDiff ) then goto aExpBigger;
  8088. if ( expDiff < 0 ) then goto bExpBigger;
  8089. if ( aExp = $7FFF ) then begin
  8090. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8091. result := propagateFloat128NaN( a, b );
  8092. exit;
  8093. end;
  8094. float_raise( float_flag_invalid );
  8095. z.low := float128_default_nan_low;
  8096. z.high := float128_default_nan_high;
  8097. result := z;
  8098. exit;
  8099. end;
  8100. if ( aExp = 0 ) then begin
  8101. aExp := 1;
  8102. bExp := 1;
  8103. end;
  8104. if ( bSig0 < aSig0 ) then goto aBigger;
  8105. if ( aSig0 < bSig0 ) then goto bBigger;
  8106. if ( bSig1 < aSig1 ) then goto aBigger;
  8107. if ( aSig1 < bSig1 ) then goto bBigger;
  8108. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8109. exit;
  8110. bExpBigger:
  8111. if ( bExp = $7FFF ) then begin
  8112. if ( bSig0 or bSig1 )<>0 then
  8113. begin
  8114. result := propagateFloat128NaN( a, b );
  8115. exit;
  8116. end;
  8117. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8118. exit;
  8119. end;
  8120. if ( aExp = 0 ) then begin
  8121. inc(expDiff);
  8122. end
  8123. else begin
  8124. aSig0 := aSig0 or int64( $4000000000000000 );
  8125. end;
  8126. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8127. bSig0 := bSig0 or int64( $4000000000000000 );
  8128. bBigger:
  8129. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8130. zExp := bExp;
  8131. zSign := zSign xor 1;
  8132. goto normalizeRoundAndPack;
  8133. aExpBigger:
  8134. if ( aExp = $7FFF ) then begin
  8135. if ( aSig0 or aSig1 )<>0 then
  8136. begin
  8137. result := propagateFloat128NaN( a, b );
  8138. exit;
  8139. end;
  8140. result := a;
  8141. exit;
  8142. end;
  8143. if ( bExp = 0 ) then begin
  8144. dec(expDiff);
  8145. end
  8146. else begin
  8147. bSig0 := bSig0 or int64( $4000000000000000 );
  8148. end;
  8149. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8150. aSig0 := aSig0 or int64( $4000000000000000 );
  8151. aBigger:
  8152. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8153. zExp := aExp;
  8154. normalizeRoundAndPack:
  8155. dec(zExp);
  8156. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8157. end;
  8158. {*----------------------------------------------------------------------------
  8159. | Returns the result of adding the quadruple-precision floating-point values
  8160. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8161. | for Binary Floating-Point Arithmetic.
  8162. *----------------------------------------------------------------------------*}
  8163. function float128_add(a: float128; b: float128): float128;
  8164. var
  8165. aSign, bSign: flag;
  8166. begin
  8167. aSign := extractFloat128Sign( a );
  8168. bSign := extractFloat128Sign( b );
  8169. if ( aSign = bSign ) then begin
  8170. result := addFloat128Sigs( a, b, aSign );
  8171. end
  8172. else begin
  8173. result := subFloat128Sigs( a, b, aSign );
  8174. end;
  8175. end;
  8176. {*----------------------------------------------------------------------------
  8177. | Returns the result of subtracting the quadruple-precision floating-point
  8178. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8179. | Standard for Binary Floating-Point Arithmetic.
  8180. *----------------------------------------------------------------------------*}
  8181. function float128_sub(a: float128; b: float128): float128;
  8182. var
  8183. aSign, bSign: flag;
  8184. begin
  8185. aSign := extractFloat128Sign( a );
  8186. bSign := extractFloat128Sign( b );
  8187. if ( aSign = bSign ) then begin
  8188. result := subFloat128Sigs( a, b, aSign );
  8189. end
  8190. else begin
  8191. result := addFloat128Sigs( a, b, aSign );
  8192. end;
  8193. end;
  8194. {*----------------------------------------------------------------------------
  8195. | Returns the result of multiplying the quadruple-precision floating-point
  8196. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8197. | Standard for Binary Floating-Point Arithmetic.
  8198. *----------------------------------------------------------------------------*}
  8199. function float128_mul(a: float128; b: float128): float128;
  8200. var
  8201. aSign, bSign, zSign: flag;
  8202. aExp, bExp, zExp: int32;
  8203. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8204. z: float128;
  8205. label
  8206. invalid;
  8207. begin
  8208. aSig1 := extractFloat128Frac1( a );
  8209. aSig0 := extractFloat128Frac0( a );
  8210. aExp := extractFloat128Exp( a );
  8211. aSign := extractFloat128Sign( a );
  8212. bSig1 := extractFloat128Frac1( b );
  8213. bSig0 := extractFloat128Frac0( b );
  8214. bExp := extractFloat128Exp( b );
  8215. bSign := extractFloat128Sign( b );
  8216. zSign := aSign xor bSign;
  8217. if ( aExp = $7FFF ) then begin
  8218. if ( (( aSig0 or aSig1 )<>0)
  8219. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8220. result := propagateFloat128NaN( a, b );
  8221. exit;
  8222. end;
  8223. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8224. result := packFloat128( zSign, $7FFF, 0, 0 );
  8225. exit;
  8226. end;
  8227. if ( bExp = $7FFF ) then begin
  8228. if ( bSig0 or bSig1 )<>0 then
  8229. begin
  8230. result := propagateFloat128NaN( a, b );
  8231. exit;
  8232. end;
  8233. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8234. invalid:
  8235. float_raise( float_flag_invalid );
  8236. z.low := float128_default_nan_low;
  8237. z.high := float128_default_nan_high;
  8238. result := z;
  8239. exit;
  8240. end;
  8241. result := packFloat128( zSign, $7FFF, 0, 0 );
  8242. exit;
  8243. end;
  8244. if ( aExp = 0 ) then begin
  8245. if ( ( aSig0 or aSig1 ) = 0 ) then
  8246. begin
  8247. result := packFloat128( zSign, 0, 0, 0 );
  8248. exit;
  8249. end;
  8250. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8251. end;
  8252. if ( bExp = 0 ) then begin
  8253. if ( ( bSig0 or bSig1 ) = 0 ) then
  8254. begin
  8255. result := packFloat128( zSign, 0, 0, 0 );
  8256. exit;
  8257. end;
  8258. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8259. end;
  8260. zExp := aExp + bExp - $4000;
  8261. aSig0 := aSig0 or int64( $0001000000000000 );
  8262. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8263. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8264. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8265. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8266. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8267. shift128ExtraRightJamming(
  8268. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8269. inc(zExp);
  8270. end;
  8271. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8272. end;
  8273. {*----------------------------------------------------------------------------
  8274. | Returns the result of dividing the quadruple-precision floating-point value
  8275. | `a' by the corresponding value `b'. The operation is performed according to
  8276. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8277. *----------------------------------------------------------------------------*}
  8278. function float128_div(a: float128; b: float128): float128;
  8279. var
  8280. aSign, bSign, zSign: flag;
  8281. aExp, bExp, zExp: int32;
  8282. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8283. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8284. z: float128;
  8285. label
  8286. invalid;
  8287. begin
  8288. aSig1 := extractFloat128Frac1( a );
  8289. aSig0 := extractFloat128Frac0( a );
  8290. aExp := extractFloat128Exp( a );
  8291. aSign := extractFloat128Sign( a );
  8292. bSig1 := extractFloat128Frac1( b );
  8293. bSig0 := extractFloat128Frac0( b );
  8294. bExp := extractFloat128Exp( b );
  8295. bSign := extractFloat128Sign( b );
  8296. zSign := aSign xor bSign;
  8297. if ( aExp = $7FFF ) then begin
  8298. if ( aSig0 or aSig1 )<>0 then
  8299. begin
  8300. result := propagateFloat128NaN( a, b );
  8301. exit;
  8302. end;
  8303. if ( bExp = $7FFF ) then begin
  8304. if ( bSig0 or bSig1 )<>0 then
  8305. begin
  8306. result := propagateFloat128NaN( a, b );
  8307. exit;
  8308. end;
  8309. goto invalid;
  8310. end;
  8311. result := packFloat128( zSign, $7FFF, 0, 0 );
  8312. exit;
  8313. end;
  8314. if ( bExp = $7FFF ) then begin
  8315. if ( bSig0 or bSig1 )<>0 then
  8316. begin
  8317. result := propagateFloat128NaN( a, b );
  8318. exit;
  8319. end;
  8320. result := packFloat128( zSign, 0, 0, 0 );
  8321. exit;
  8322. end;
  8323. if ( bExp = 0 ) then begin
  8324. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8325. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8326. invalid:
  8327. float_raise( float_flag_invalid );
  8328. z.low := float128_default_nan_low;
  8329. z.high := float128_default_nan_high;
  8330. result := z;
  8331. exit;
  8332. end;
  8333. float_raise( float_flag_divbyzero );
  8334. result := packFloat128( zSign, $7FFF, 0, 0 );
  8335. exit;
  8336. end;
  8337. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8338. end;
  8339. if ( aExp = 0 ) then begin
  8340. if ( ( aSig0 or aSig1 ) = 0 ) then
  8341. begin
  8342. result := packFloat128( zSign, 0, 0, 0 );
  8343. exit;
  8344. end;
  8345. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8346. end;
  8347. zExp := aExp - bExp + $3FFD;
  8348. shortShift128Left(
  8349. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8350. shortShift128Left(
  8351. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8352. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8353. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8354. inc(zExp);
  8355. end;
  8356. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8357. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8358. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8359. while ( sbits64(rem0) < 0 ) do begin
  8360. dec(zSig0);
  8361. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8362. end;
  8363. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8364. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8365. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8366. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8367. while ( sbits64(rem1) < 0 ) do begin
  8368. dec(zSig1);
  8369. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8370. end;
  8371. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8372. end;
  8373. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8374. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8375. end;
  8376. {*----------------------------------------------------------------------------
  8377. | Returns the remainder of the quadruple-precision floating-point value `a'
  8378. | with respect to the corresponding value `b'. The operation is performed
  8379. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8380. *----------------------------------------------------------------------------*}
  8381. function float128_rem(a: float128; b: float128): float128;
  8382. var
  8383. aSign, zSign: flag;
  8384. aExp, bExp, expDiff: int32;
  8385. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8386. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8387. sigMean0: sbits64;
  8388. z: float128;
  8389. label
  8390. invalid;
  8391. begin
  8392. aSig1 := extractFloat128Frac1( a );
  8393. aSig0 := extractFloat128Frac0( a );
  8394. aExp := extractFloat128Exp( a );
  8395. aSign := extractFloat128Sign( a );
  8396. bSig1 := extractFloat128Frac1( b );
  8397. bSig0 := extractFloat128Frac0( b );
  8398. bExp := extractFloat128Exp( b );
  8399. if ( aExp = $7FFF ) then begin
  8400. if ( (( aSig0 or aSig1 )<>0)
  8401. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8402. result := propagateFloat128NaN( a, b );
  8403. exit;
  8404. end;
  8405. goto invalid;
  8406. end;
  8407. if ( bExp = $7FFF ) then begin
  8408. if ( bSig0 or bSig1 )<>0 then
  8409. begin
  8410. result := propagateFloat128NaN( a, b );
  8411. exit;
  8412. end;
  8413. result := a;
  8414. exit;
  8415. end;
  8416. if ( bExp = 0 ) then begin
  8417. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8418. invalid:
  8419. float_raise( float_flag_invalid );
  8420. z.low := float128_default_nan_low;
  8421. z.high := float128_default_nan_high;
  8422. result := z;
  8423. exit;
  8424. end;
  8425. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8426. end;
  8427. if ( aExp = 0 ) then begin
  8428. if ( ( aSig0 or aSig1 ) = 0 ) then
  8429. begin
  8430. result := a;
  8431. exit;
  8432. end;
  8433. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8434. end;
  8435. expDiff := aExp - bExp;
  8436. if ( expDiff < -1 ) then
  8437. begin
  8438. result := a;
  8439. exit;
  8440. end;
  8441. shortShift128Left(
  8442. aSig0 or int64( $0001000000000000 ),
  8443. aSig1,
  8444. 15 - ord( expDiff < 0 ),
  8445. aSig0,
  8446. aSig1
  8447. );
  8448. shortShift128Left(
  8449. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8450. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8451. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8452. dec(expDiff,64);
  8453. while ( 0 < expDiff ) do begin
  8454. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8455. if ( 4 < q ) then
  8456. q := q - 4
  8457. else
  8458. q := 0;
  8459. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8460. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8461. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8462. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8463. dec(expDiff,61);
  8464. end;
  8465. if ( -64 < expDiff ) then begin
  8466. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8467. if ( 4 < q ) then
  8468. q := q - 4
  8469. else
  8470. q := 0;
  8471. q := q shr (- expDiff);
  8472. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8473. inc(expDiff,52);
  8474. if ( expDiff < 0 ) then begin
  8475. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8476. end
  8477. else begin
  8478. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8479. end;
  8480. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8481. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8482. end
  8483. else begin
  8484. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8485. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8486. end;
  8487. repeat
  8488. alternateASig0 := aSig0;
  8489. alternateASig1 := aSig1;
  8490. inc(q);
  8491. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8492. until not( 0 <= sbits64(aSig0) );
  8493. add128(
  8494. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8495. if ( ( sigMean0 < 0 )
  8496. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8497. aSig0 := alternateASig0;
  8498. aSig1 := alternateASig1;
  8499. end;
  8500. zSign := ord( sbits64(aSig0) < 0 );
  8501. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8502. result :=
  8503. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8504. end;
  8505. {*----------------------------------------------------------------------------
  8506. | Returns the square root of the quadruple-precision floating-point value `a'.
  8507. | The operation is performed according to the IEC/IEEE Standard for Binary
  8508. | Floating-Point Arithmetic.
  8509. *----------------------------------------------------------------------------*}
  8510. function float128_sqrt(a: float128): float128;
  8511. var
  8512. aSign: flag;
  8513. aExp, zExp: int32;
  8514. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8515. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8516. z: float128;
  8517. label
  8518. invalid;
  8519. begin
  8520. aSig1 := extractFloat128Frac1( a );
  8521. aSig0 := extractFloat128Frac0( a );
  8522. aExp := extractFloat128Exp( a );
  8523. aSign := extractFloat128Sign( a );
  8524. if ( aExp = $7FFF ) then begin
  8525. if ( aSig0 or aSig1 )<>0 then
  8526. begin
  8527. result := propagateFloat128NaN( a, a );
  8528. exit;
  8529. end;
  8530. if ( aSign=0 ) then
  8531. begin
  8532. result := a;
  8533. exit;
  8534. end;
  8535. goto invalid;
  8536. end;
  8537. if ( aSign<>0 ) then begin
  8538. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8539. begin
  8540. result := a;
  8541. exit;
  8542. end;
  8543. invalid:
  8544. float_raise( float_flag_invalid );
  8545. z.low := float128_default_nan_low;
  8546. z.high := float128_default_nan_high;
  8547. result := z;
  8548. exit;
  8549. end;
  8550. if ( aExp = 0 ) then begin
  8551. if ( ( aSig0 or aSig1 ) = 0 ) then
  8552. begin
  8553. result := packFloat128( 0, 0, 0, 0 );
  8554. exit;
  8555. end;
  8556. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8557. end;
  8558. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  8559. aSig0 := aSig0 or int64( $0001000000000000 );
  8560. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  8561. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8562. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8563. doubleZSig0 := zSig0 shl 1;
  8564. mul64To128( zSig0, zSig0, term0, term1 );
  8565. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8566. while ( sbits64(rem0) < 0 ) do begin
  8567. dec(zSig0);
  8568. dec(doubleZSig0,2);
  8569. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8570. end;
  8571. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8572. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8573. if ( zSig1 = 0 ) then zSig1 := 1;
  8574. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8575. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8576. mul64To128( zSig1, zSig1, term2, term3 );
  8577. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8578. while ( sbits64(rem1) < 0 ) do begin
  8579. dec(zSig1);
  8580. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8581. term3 := term3 or 1;
  8582. term2 := term2 or doubleZSig0;
  8583. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8584. end;
  8585. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8586. end;
  8587. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8588. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8589. end;
  8590. {*----------------------------------------------------------------------------
  8591. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8592. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8593. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8594. *----------------------------------------------------------------------------*}
  8595. function float128_eq(a: float128; b: float128): flag;
  8596. begin
  8597. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8598. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8599. or ( ( extractFloat128Exp( b ) = $7FFF )
  8600. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8601. ) then begin
  8602. if ( (float128_is_signaling_nan( a )<>0)
  8603. or (float128_is_signaling_nan( b )<>0) ) then begin
  8604. float_raise( float_flag_invalid );
  8605. end;
  8606. result := 0;
  8607. exit;
  8608. end;
  8609. result := ord(
  8610. ( a.low = b.low )
  8611. and ( ( a.high = b.high )
  8612. or ( ( a.low = 0 )
  8613. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8614. ));
  8615. end;
  8616. {*----------------------------------------------------------------------------
  8617. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8618. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8619. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8620. | Arithmetic.
  8621. *----------------------------------------------------------------------------*}
  8622. function float128_le(a: float128; b: float128): flag;
  8623. var
  8624. aSign, bSign: flag;
  8625. begin
  8626. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8627. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8628. or ( ( extractFloat128Exp( b ) = $7FFF )
  8629. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8630. ) then begin
  8631. float_raise( float_flag_invalid );
  8632. result := 0;
  8633. exit;
  8634. end;
  8635. aSign := extractFloat128Sign( a );
  8636. bSign := extractFloat128Sign( b );
  8637. if ( aSign <> bSign ) then begin
  8638. result := ord(
  8639. (aSign<>0)
  8640. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8641. = 0 ));
  8642. exit;
  8643. end;
  8644. if aSign<>0 then
  8645. result := le128( b.high, b.low, a.high, a.low )
  8646. else
  8647. result := le128( a.high, a.low, b.high, b.low );
  8648. end;
  8649. {*----------------------------------------------------------------------------
  8650. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8651. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8652. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8653. *----------------------------------------------------------------------------*}
  8654. function float128_lt(a: float128; b: float128): flag;
  8655. var
  8656. aSign, bSign: flag;
  8657. begin
  8658. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8659. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8660. or ( ( extractFloat128Exp( b ) = $7FFF )
  8661. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8662. ) then begin
  8663. float_raise( float_flag_invalid );
  8664. result := 0;
  8665. exit;
  8666. end;
  8667. aSign := extractFloat128Sign( a );
  8668. bSign := extractFloat128Sign( b );
  8669. if ( aSign <> bSign ) then begin
  8670. result := ord(
  8671. (aSign<>0)
  8672. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8673. <> 0 ));
  8674. exit;
  8675. end;
  8676. if aSign<>0 then
  8677. result := lt128( b.high, b.low, a.high, a.low )
  8678. else
  8679. result := lt128( a.high, a.low, b.high, b.low );
  8680. end;
  8681. {*----------------------------------------------------------------------------
  8682. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8683. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8684. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8685. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8686. *----------------------------------------------------------------------------*}
  8687. function float128_eq_signaling(a: float128; b: float128): flag;
  8688. begin
  8689. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8690. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8691. or ( ( extractFloat128Exp( b ) = $7FFF )
  8692. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8693. ) then begin
  8694. float_raise( float_flag_invalid );
  8695. result := 0;
  8696. exit;
  8697. end;
  8698. result := ord(
  8699. ( a.low = b.low )
  8700. and ( ( a.high = b.high )
  8701. or ( ( a.low = 0 )
  8702. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8703. ));
  8704. end;
  8705. {*----------------------------------------------------------------------------
  8706. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8707. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8708. | cause an exception. Otherwise, the comparison is performed according to the
  8709. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8710. *----------------------------------------------------------------------------*}
  8711. function float128_le_quiet(a: float128; b: float128): flag;
  8712. var
  8713. aSign, bSign: flag;
  8714. begin
  8715. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8716. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8717. or ( ( extractFloat128Exp( b ) = $7FFF )
  8718. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8719. ) then begin
  8720. if ( (float128_is_signaling_nan( a )<>0)
  8721. or (float128_is_signaling_nan( b )<>0) ) then begin
  8722. float_raise( float_flag_invalid );
  8723. end;
  8724. result := 0;
  8725. exit;
  8726. end;
  8727. aSign := extractFloat128Sign( a );
  8728. bSign := extractFloat128Sign( b );
  8729. if ( aSign <> bSign ) then begin
  8730. result := ord(
  8731. (aSign<>0)
  8732. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8733. = 0 ));
  8734. exit;
  8735. end;
  8736. if aSign<>0 then
  8737. result := le128( b.high, b.low, a.high, a.low )
  8738. else
  8739. result := le128( a.high, a.low, b.high, b.low );
  8740. end;
  8741. {*----------------------------------------------------------------------------
  8742. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8743. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8744. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8745. | Standard for Binary Floating-Point Arithmetic.
  8746. *----------------------------------------------------------------------------*}
  8747. function float128_lt_quiet(a: float128; b: float128): flag;
  8748. var
  8749. aSign, bSign: flag;
  8750. begin
  8751. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8752. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8753. or ( ( extractFloat128Exp( b ) = $7FFF )
  8754. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8755. ) then begin
  8756. if ( (float128_is_signaling_nan( a )<>0)
  8757. or (float128_is_signaling_nan( b )<>0) ) then begin
  8758. float_raise( float_flag_invalid );
  8759. end;
  8760. result := 0;
  8761. exit;
  8762. end;
  8763. aSign := extractFloat128Sign( a );
  8764. bSign := extractFloat128Sign( b );
  8765. if ( aSign <> bSign ) then begin
  8766. result := ord(
  8767. (aSign<>0)
  8768. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8769. <> 0 ));
  8770. exit;
  8771. end;
  8772. if aSign<>0 then
  8773. result:=lt128( b.high, b.low, a.high, a.low )
  8774. else
  8775. result:=lt128( a.high, a.low, b.high, b.low );
  8776. end;
  8777. {----------------------------------------------------------------------------
  8778. | Returns the result of converting the double-precision floating-point value
  8779. | `a' to the quadruple-precision floating-point format. The conversion is
  8780. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8781. | Arithmetic.
  8782. *----------------------------------------------------------------------------}
  8783. function float64_to_float128( a : float64) : float128;
  8784. var
  8785. aSign : flag;
  8786. aExp : int16;
  8787. aSig, zSig0, zSig1 : bits64;
  8788. begin
  8789. aSig := extractFloat64Frac( a );
  8790. aExp := extractFloat64Exp( a );
  8791. aSign := extractFloat64Sign( a );
  8792. if ( aExp = $7FF ) then begin
  8793. if ( aSig<>0 ) then begin
  8794. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8795. exit;
  8796. end;
  8797. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8798. exit;
  8799. end;
  8800. if ( aExp = 0 ) then begin
  8801. if ( aSig = 0 ) then
  8802. begin
  8803. result:=packFloat128( aSign, 0, 0, 0 );
  8804. exit;
  8805. end;
  8806. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8807. dec(aExp);
  8808. end;
  8809. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8810. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8811. end;
  8812. {$endif FPC_SOFTFLOAT_FLOAT128}
  8813. {$endif not(defined(fpc_softfpu_interface))}
  8814. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8815. end.
  8816. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}