softfpu.pp 325 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768277827882798280828182828283828482858286828782888289829082918292829382948295829682978298829983008301830283038304830583068307830883098310831183128313831483158316831783188319832083218322832383248325832683278328832983308331833283338334833583368337833883398340834183428343834483458346834783488349835083518352835383548355835683578358835983608361836283638364836583668367836883698370837183728373837483758376837783788379838083818382838383848385838683878388838983908391839283938394839583968397839883998400840184028403840484058406840784088409841084118412841384148415841684178418841984208421842284238424842584268427842884298430843184328433843484358436843784388439844084418442844384448445844684478448844984508451845284538454845584568457845884598460846184628463846484658466846784688469847084718472847384748475847684778478847984808481848284838484848584868487848884898490849184928493849484958496849784988499850085018502850385048505850685078508850985108511851285138514851585168517851885198520852185228523852485258526852785288529853085318532853385348535853685378538853985408541854285438544854585468547854885498550855185528553855485558556855785588559856085618562856385648565856685678568856985708571857285738574857585768577857885798580858185828583858485858586858785888589859085918592859385948595859685978598859986008601860286038604860586068607860886098610861186128613861486158616861786188619862086218622862386248625862686278628862986308631863286338634863586368637863886398640864186428643864486458646864786488649865086518652865386548655865686578658865986608661866286638664866586668667866886698670867186728673867486758676867786788679868086818682868386848685868686878688868986908691869286938694869586968697869886998700870187028703870487058706870787088709871087118712871387148715871687178718871987208721872287238724872587268727872887298730873187328733873487358736873787388739874087418742874387448745874687478748874987508751875287538754875587568757875887598760876187628763876487658766876787688769877087718772877387748775877687778778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927792789279928092819282928392849285928692879288928992909291929292939294929592969297929892999300930193029303930493059306930793089309931093119312931393149315931693179318931993209321932293239324932593269327932893299330933193329333933493359336933793389339934093419342934393449345934693479348934993509351935293539354935593569357935893599360936193629363936493659366936793689369937093719372937393749375937693779378937993809381938293839384938593869387
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. bits8 = byte;
  88. sbits8 = shortint;
  89. bits16 = word;
  90. sbits16 = smallint;
  91. sbits32 = longint;
  92. bits32 = longword;
  93. {$ifndef fpc}
  94. qword = int64;
  95. {$endif}
  96. { now part of the system unit
  97. uint64 = qword;
  98. }
  99. bits64 = qword;
  100. sbits64 = int64;
  101. {$ifdef ENDIAN_LITTLE}
  102. float64 = record
  103. case byte of
  104. 1: (low,high : bits32);
  105. // force the record to be aligned like a double
  106. // else *_to_double will fail for cpus like sparc
  107. // and avoid expensive unpacking/packing operations
  108. 2: (dummy : double);
  109. end;
  110. floatx80 = record
  111. case byte of
  112. 1: (low : qword;high : word);
  113. // force the record to be aligned like a double
  114. // else *_to_double will fail for cpus like sparc
  115. // and avoid expensive unpacking/packing operations
  116. 2: (dummy : extended);
  117. end;
  118. float128 = record
  119. case byte of
  120. 1: (low,high : qword);
  121. // force the record to be aligned like a double
  122. // else *_to_double will fail for cpus like sparc
  123. // and avoid expensive unpacking/packing operations
  124. 2: (dummy : qword);
  125. end;
  126. {$else}
  127. float64 = record
  128. case byte of
  129. 1: (high,low : bits32);
  130. // force the record to be aligned like a double
  131. // else *_to_double will fail for cpus like sparc
  132. 2: (dummy : double);
  133. end;
  134. floatx80 = record
  135. case byte of
  136. 1: (high : word;low : qword);
  137. // force the record to be aligned like a double
  138. // else *_to_double will fail for cpus like sparc
  139. // and avoid expensive unpacking/packing operations
  140. 2: (dummy : qword);
  141. end;
  142. float128 = record
  143. case byte of
  144. 1: (high : qword;low : qword);
  145. // force the record to be aligned like a double
  146. // else *_to_double will fail for cpus like sparc
  147. // and avoid expensive unpacking/packing operations
  148. 2: (dummy : qword);
  149. end;
  150. {$endif}
  151. {$define FPC_SYSTEM_HAS_float64}
  152. {*
  153. -------------------------------------------------------------------------------
  154. Returns 1 if the double-precision floating-point value `a' is less than
  155. the corresponding value `b', and 0 otherwise. The comparison is performed
  156. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  157. -------------------------------------------------------------------------------
  158. *}
  159. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  160. {*
  161. -------------------------------------------------------------------------------
  162. Returns 1 if the double-precision floating-point value `a' is less than
  163. or equal to the corresponding value `b', and 0 otherwise. The comparison
  164. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  165. Arithmetic.
  166. -------------------------------------------------------------------------------
  167. *}
  168. Function float64_le(a: float64;b: float64): flag; compilerproc;
  169. {*
  170. -------------------------------------------------------------------------------
  171. Returns 1 if the double-precision floating-point value `a' is equal to
  172. the corresponding value `b', and 0 otherwise. The comparison is performed
  173. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  174. -------------------------------------------------------------------------------
  175. *}
  176. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  177. {*
  178. -------------------------------------------------------------------------------
  179. Returns the square root of the double-precision floating-point value `a'.
  180. The operation is performed according to the IEC/IEEE Standard for Binary
  181. Floating-Point Arithmetic.
  182. -------------------------------------------------------------------------------
  183. *}
  184. function float64_sqrt( a: float64 ): float64; compilerproc;
  185. {*
  186. -------------------------------------------------------------------------------
  187. Returns the remainder of the double-precision floating-point value `a'
  188. with respect to the corresponding value `b'. The operation is performed
  189. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  190. -------------------------------------------------------------------------------
  191. *}
  192. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  193. {*
  194. -------------------------------------------------------------------------------
  195. Returns the result of dividing the double-precision floating-point value `a'
  196. by the corresponding value `b'. The operation is performed according to the
  197. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  198. -------------------------------------------------------------------------------
  199. *}
  200. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  201. {*
  202. -------------------------------------------------------------------------------
  203. Returns the result of multiplying the double-precision floating-point values
  204. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  205. for Binary Floating-Point Arithmetic.
  206. -------------------------------------------------------------------------------
  207. *}
  208. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  209. {*
  210. -------------------------------------------------------------------------------
  211. Returns the result of subtracting the double-precision floating-point values
  212. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  213. for Binary Floating-Point Arithmetic.
  214. -------------------------------------------------------------------------------
  215. *}
  216. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  217. {*
  218. -------------------------------------------------------------------------------
  219. Returns the result of adding the double-precision floating-point values `a'
  220. and `b'. The operation is performed according to the IEC/IEEE Standard for
  221. Binary Floating-Point Arithmetic.
  222. -------------------------------------------------------------------------------
  223. *}
  224. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  225. {*
  226. -------------------------------------------------------------------------------
  227. Rounds the double-precision floating-point value `a' to an integer,
  228. and returns the result as a double-precision floating-point value. The
  229. operation is performed according to the IEC/IEEE Standard for Binary
  230. Floating-Point Arithmetic.
  231. -------------------------------------------------------------------------------
  232. *}
  233. Function float64_round_to_int(a: float64) : float64; compilerproc;
  234. {*
  235. -------------------------------------------------------------------------------
  236. Returns the result of converting the double-precision floating-point value
  237. `a' to the single-precision floating-point format. The conversion is
  238. performed according to the IEC/IEEE Standard for Binary Floating-Point
  239. Arithmetic.
  240. -------------------------------------------------------------------------------
  241. *}
  242. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  243. {*
  244. -------------------------------------------------------------------------------
  245. Returns the result of converting the double-precision floating-point value
  246. `a' to the 32-bit two's complement integer format. The conversion is
  247. performed according to the IEC/IEEE Standard for Binary Floating-Point
  248. Arithmetic, except that the conversion is always rounded toward zero.
  249. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  250. the conversion overflows, the largest integer with the same sign as `a' is
  251. returned.
  252. -------------------------------------------------------------------------------
  253. *}
  254. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  255. {*
  256. -------------------------------------------------------------------------------
  257. Returns the result of converting the double-precision floating-point value
  258. `a' to the 32-bit two's complement integer format. The conversion is
  259. performed according to the IEC/IEEE Standard for Binary Floating-Point
  260. Arithmetic---which means in particular that the conversion is rounded
  261. according to the current rounding mode. If `a' is a NaN, the largest
  262. positive integer is returned. Otherwise, if the conversion overflows, the
  263. largest integer with the same sign as `a' is returned.
  264. -------------------------------------------------------------------------------
  265. *}
  266. Function float64_to_int32(a: float64): int32; compilerproc;
  267. {*
  268. -------------------------------------------------------------------------------
  269. Returns 1 if the single-precision floating-point value `a' is less than
  270. the corresponding value `b', and 0 otherwise. The comparison is performed
  271. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  272. -------------------------------------------------------------------------------
  273. *}
  274. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  275. {*
  276. -------------------------------------------------------------------------------
  277. Returns 1 if the single-precision floating-point value `a' is less than
  278. or equal to the corresponding value `b', and 0 otherwise. The comparison
  279. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  280. Arithmetic.
  281. -------------------------------------------------------------------------------
  282. *}
  283. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  284. {*
  285. -------------------------------------------------------------------------------
  286. Returns 1 if the single-precision floating-point value `a' is equal to
  287. the corresponding value `b', and 0 otherwise. The comparison is performed
  288. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  289. -------------------------------------------------------------------------------
  290. *}
  291. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  292. {*
  293. -------------------------------------------------------------------------------
  294. Returns the square root of the single-precision floating-point value `a'.
  295. The operation is performed according to the IEC/IEEE Standard for Binary
  296. Floating-Point Arithmetic.
  297. -------------------------------------------------------------------------------
  298. *}
  299. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  300. {*
  301. -------------------------------------------------------------------------------
  302. Returns the remainder of the single-precision floating-point value `a'
  303. with respect to the corresponding value `b'. The operation is performed
  304. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  305. -------------------------------------------------------------------------------
  306. *}
  307. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  308. {*
  309. -------------------------------------------------------------------------------
  310. Returns the result of dividing the single-precision floating-point value `a'
  311. by the corresponding value `b'. The operation is performed according to the
  312. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  313. -------------------------------------------------------------------------------
  314. *}
  315. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  316. {*
  317. -------------------------------------------------------------------------------
  318. Returns the result of multiplying the single-precision floating-point values
  319. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  320. for Binary Floating-Point Arithmetic.
  321. -------------------------------------------------------------------------------
  322. *}
  323. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  324. {*
  325. -------------------------------------------------------------------------------
  326. Returns the result of subtracting the single-precision floating-point values
  327. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  328. for Binary Floating-Point Arithmetic.
  329. -------------------------------------------------------------------------------
  330. *}
  331. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  332. {*
  333. -------------------------------------------------------------------------------
  334. Returns the result of adding the single-precision floating-point values `a'
  335. and `b'. The operation is performed according to the IEC/IEEE Standard for
  336. Binary Floating-Point Arithmetic.
  337. -------------------------------------------------------------------------------
  338. *}
  339. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  340. {*
  341. -------------------------------------------------------------------------------
  342. Rounds the single-precision floating-point value `a' to an integer,
  343. and returns the result as a single-precision floating-point value. The
  344. operation is performed according to the IEC/IEEE Standard for Binary
  345. Floating-Point Arithmetic.
  346. -------------------------------------------------------------------------------
  347. *}
  348. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  349. {*
  350. -------------------------------------------------------------------------------
  351. Returns the result of converting the single-precision floating-point value
  352. `a' to the double-precision floating-point format. The conversion is
  353. performed according to the IEC/IEEE Standard for Binary Floating-Point
  354. Arithmetic.
  355. -------------------------------------------------------------------------------
  356. *}
  357. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  358. {*
  359. -------------------------------------------------------------------------------
  360. Returns the result of converting the single-precision floating-point value
  361. `a' to the 32-bit two's complement integer format. The conversion is
  362. performed according to the IEC/IEEE Standard for Binary Floating-Point
  363. Arithmetic, except that the conversion is always rounded toward zero.
  364. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  365. the conversion overflows, the largest integer with the same sign as `a' is
  366. returned.
  367. -------------------------------------------------------------------------------
  368. *}
  369. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  370. {*
  371. -------------------------------------------------------------------------------
  372. Returns the result of converting the single-precision floating-point value
  373. `a' to the 32-bit two's complement integer format. The conversion is
  374. performed according to the IEC/IEEE Standard for Binary Floating-Point
  375. Arithmetic---which means in particular that the conversion is rounded
  376. according to the current rounding mode. If `a' is a NaN, the largest
  377. positive integer is returned. Otherwise, if the conversion overflows, the
  378. largest integer with the same sign as `a' is returned.
  379. -------------------------------------------------------------------------------
  380. *}
  381. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  382. {*
  383. -------------------------------------------------------------------------------
  384. Returns the result of converting the 32-bit two's complement integer `a' to
  385. the double-precision floating-point format. The conversion is performed
  386. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. -------------------------------------------------------------------------------
  388. *}
  389. Function int32_to_float64( a: int32) : float64; compilerproc;
  390. {*
  391. -------------------------------------------------------------------------------
  392. Returns the result of converting the 32-bit two's complement integer `a' to
  393. the single-precision floating-point format. The conversion is performed
  394. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  395. -------------------------------------------------------------------------------
  396. *}
  397. Function int32_to_float32( a: int32): float32rec; compilerproc;
  398. {*----------------------------------------------------------------------------
  399. | Returns the result of converting the 64-bit two's complement integer `a'
  400. | to the double-precision floating-point format. The conversion is performed
  401. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  402. *----------------------------------------------------------------------------*}
  403. Function int64_to_float64( a: int64 ): float64; compilerproc;
  404. Function qword_to_float64( a: qword ): float64; compilerproc;
  405. {*----------------------------------------------------------------------------
  406. | Returns the result of converting the 64-bit two's complement integer `a'
  407. | to the single-precision floating-point format. The conversion is performed
  408. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  409. *----------------------------------------------------------------------------*}
  410. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  411. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  412. // +++
  413. function float32_to_int64( a: float32 ): int64;
  414. function float32_to_int64_round_to_zero( a: float32 ): int64;
  415. function float32_eq_signaling( a: float32; b: float32) : flag;
  416. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  417. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  418. function float32_is_signaling_nan( a : float32 ): flag;
  419. function float32_is_nan( a : float32 ): flag;
  420. function float64_to_int64( a: float64 ): int64;
  421. function float64_to_int64_round_to_zero( a: float64 ): int64;
  422. function float64_eq_signaling( a: float64; b: float64): flag;
  423. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  424. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  425. function float64_is_signaling_nan( a : float64 ): flag;
  426. function float64_is_nan( a : float64 ): flag;
  427. // ===
  428. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  429. {*----------------------------------------------------------------------------
  430. | Extended double-precision rounding precision
  431. *----------------------------------------------------------------------------*}
  432. var // threadvar!?
  433. floatx80_rounding_precision : int8 = 80;
  434. function int32_to_floatx80( a: int32 ): floatx80;
  435. function int64_to_floatx80( a: int64 ): floatx80;
  436. function qword_to_floatx80( a: qword ): floatx80;
  437. function float32_to_floatx80( a: float32 ): floatx80;
  438. function float64_to_floatx80( a: float64 ): floatx80;
  439. function floatx80_to_int32( a: floatx80 ): int32;
  440. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  441. function floatx80_to_int64( a: floatx80 ): int64;
  442. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  443. function floatx80_to_float32( a: floatx80 ): float32;
  444. function floatx80_to_float64( a: floatx80 ): float64;
  445. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  446. function floatx80_to_float128( a: floatx80 ): float128;
  447. {$endif FPC_SOFTFLOAT_FLOAT128}
  448. function floatx80_round_to_int( a: floatx80 ): floatx80;
  449. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  450. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  451. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  452. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  453. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  454. function floatx80_sqrt( a: floatx80 ): floatx80;
  455. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  456. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  457. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  458. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  459. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  460. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  461. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  462. function floatx80_is_nan(a : floatx80 ): flag;
  463. {$endif FPC_SOFTFLOAT_FLOATX80}
  464. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  465. function int32_to_float128( a: int32 ): float128;
  466. function int64_to_float128( a: int64 ): float128;
  467. function qword_to_float128( a: qword ): float128;
  468. function float32_to_float128( a: float32 ): float128;
  469. function float128_is_nan( a : float128): flag;
  470. function float128_is_signaling_nan( a : float128): flag;
  471. function float128_to_int32(a: float128): int32;
  472. function float128_to_int32_round_to_zero(a: float128): int32;
  473. function float128_to_int64(a: float128): int64;
  474. function float128_to_int64_round_to_zero(a: float128): int64;
  475. function float128_to_float32(a: float128): float32;
  476. function float128_to_float64(a: float128): float64;
  477. function float64_to_float128( a : float64) : float128;
  478. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  479. function float128_to_floatx80(a: float128): floatx80;
  480. {$endif FPC_SOFTFLOAT_FLOATX80}
  481. function float128_round_to_int(a: float128): float128;
  482. function float128_add(a: float128; b: float128): float128;
  483. function float128_sub(a: float128; b: float128): float128;
  484. function float128_mul(a: float128; b: float128): float128;
  485. function float128_div(a: float128; b: float128): float128;
  486. function float128_rem(a: float128; b: float128): float128;
  487. function float128_sqrt(a: float128): float128;
  488. function float128_eq(a: float128; b: float128): flag;
  489. function float128_le(a: float128; b: float128): flag;
  490. function float128_lt(a: float128; b: float128): flag;
  491. function float128_eq_signaling(a: float128; b: float128): flag;
  492. function float128_le_quiet(a: float128; b: float128): flag;
  493. function float128_lt_quiet(a: float128; b: float128): flag;
  494. {$endif FPC_SOFTFLOAT_FLOAT128}
  495. CONST
  496. {-------------------------------------------------------------------------------
  497. Software IEC/IEEE floating-point underflow tininess-detection mode.
  498. -------------------------------------------------------------------------------
  499. *}
  500. float_tininess_after_rounding = 0;
  501. float_tininess_before_rounding = 1;
  502. {*
  503. -------------------------------------------------------------------------------
  504. Underflow tininess-detection mode, statically initialized to default value.
  505. (The declaration in `softfloat.h' must match the `int8' type here.)
  506. -------------------------------------------------------------------------------
  507. *}
  508. var // threadvar!?
  509. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  510. {$endif not(defined(fpc_softfpu_implementation))}
  511. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  512. implementation
  513. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  514. {$if not(defined(fpc_softfpu_interface))}
  515. {$ifdef FPC}
  516. { disable range and overflow checking explicitly }
  517. { This might be more essential for x80 and 128-bit
  518. floating point types and could, maybe be
  519. restricted to code handle floatx80 and float128 }
  520. {$push}
  521. {$R-}
  522. {$Q-}
  523. {$endif FPC}
  524. (*****************************************************************************)
  525. (*----------------------------------------------------------------------------*)
  526. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  527. (* division and square root approximations. (Can be specialized to target if *)
  528. (* desired.) *)
  529. (* ---------------------------------------------------------------------------*)
  530. (*****************************************************************************)
  531. { This procedure serves as a single access point to softfloat_exception_flags.
  532. It also helps to reduce code size a bit because softfloat_exception_flags is
  533. a threadvar. }
  534. procedure set_inexact_flag;
  535. begin
  536. include(softfloat_exception_flags,float_flag_inexact);
  537. end;
  538. {*----------------------------------------------------------------------------
  539. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  540. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  541. | input. If `zSign' is 1, the input is negated before being converted to an
  542. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  543. | is simply rounded to an integer, with the inexact exception raised if the
  544. | input cannot be represented exactly as an integer. However, if the fixed-
  545. | point input is too large, the invalid exception is raised and the largest
  546. | positive or negative integer is returned.
  547. *----------------------------------------------------------------------------*}
  548. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  549. var
  550. roundingMode: TFPURoundingMode;
  551. roundNearestEven: boolean;
  552. roundIncrement, roundBits: int8;
  553. z: int32;
  554. begin
  555. roundingMode := softfloat_rounding_mode;
  556. roundNearestEven := (roundingMode = float_round_nearest_even);
  557. roundIncrement := $40;
  558. if not roundNearestEven then
  559. begin
  560. if ( roundingMode = float_round_to_zero ) then
  561. begin
  562. roundIncrement := 0;
  563. end
  564. else begin
  565. roundIncrement := $7F;
  566. if ( zSign<>0 ) then
  567. begin
  568. if ( roundingMode = float_round_up ) then
  569. roundIncrement := 0;
  570. end
  571. else begin
  572. if ( roundingMode = float_round_down ) then
  573. roundIncrement := 0;
  574. end;
  575. end;
  576. end;
  577. roundBits := lo(absZ) and $7F;
  578. absZ := ( absZ + roundIncrement ) shr 7;
  579. absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
  580. z := absZ;
  581. if ( zSign<>0 ) then
  582. z := - z;
  583. if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  584. begin
  585. float_raise( float_flag_invalid );
  586. if zSign<>0 then
  587. result:=sbits32($80000000)
  588. else
  589. result:=$7FFFFFFF;
  590. exit;
  591. end;
  592. if ( roundBits<>0 ) then
  593. set_inexact_flag;
  594. result:=z;
  595. end;
  596. {*----------------------------------------------------------------------------
  597. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  598. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  599. | and returns the properly rounded 64-bit integer corresponding to the input.
  600. | If `zSign' is 1, the input is negated before being converted to an integer.
  601. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  602. | the inexact exception raised if the input cannot be represented exactly as
  603. | an integer. However, if the fixed-point input is too large, the invalid
  604. | exception is raised and the largest positive or negative integer is
  605. | returned.
  606. *----------------------------------------------------------------------------*}
  607. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  608. var
  609. roundingMode: TFPURoundingMode;
  610. roundNearestEven, increment: flag;
  611. z: int64;
  612. label
  613. overflow;
  614. begin
  615. roundingMode := softfloat_rounding_mode;
  616. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  617. increment := ord( sbits64(absZ1) < 0 );
  618. if ( roundNearestEven=0 ) then
  619. begin
  620. if ( roundingMode = float_round_to_zero ) then
  621. begin
  622. increment := 0;
  623. end
  624. else begin
  625. if ( zSign<>0 ) then
  626. begin
  627. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  628. end
  629. else begin
  630. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  631. end;
  632. end;
  633. end;
  634. if ( increment<>0 ) then
  635. begin
  636. inc(absZ0);
  637. if ( absZ0 = 0 ) then
  638. goto overflow;
  639. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  640. end;
  641. z := absZ0;
  642. if ( zSign<>0 ) then
  643. z := - z;
  644. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  645. begin
  646. overflow:
  647. float_raise( float_flag_invalid );
  648. if zSign<>0 then
  649. result:=int64($8000000000000000)
  650. else
  651. result:=int64($7FFFFFFFFFFFFFFF);
  652. exit;
  653. end;
  654. if ( absZ1<>0 ) then
  655. set_inexact_flag;
  656. result:=z;
  657. end;
  658. {*
  659. -------------------------------------------------------------------------------
  660. Shifts `a' right by the number of bits given in `count'. If any nonzero
  661. bits are shifted off, they are ``jammed'' into the least significant bit of
  662. the result by setting the least significant bit to 1. The value of `count'
  663. can be arbitrarily large; in particular, if `count' is greater than 32, the
  664. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  665. The result is stored in the location pointed to by `zPtr'.
  666. -------------------------------------------------------------------------------
  667. *}
  668. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  669. var
  670. z: Bits32;
  671. Begin
  672. if ( count = 0 ) then
  673. z := a
  674. else
  675. if ( count < 32 ) then
  676. Begin
  677. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  678. End
  679. else
  680. Begin
  681. z := bits32( a <> 0 );
  682. End;
  683. zPtr := z;
  684. End;
  685. {*----------------------------------------------------------------------------
  686. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  687. | number of bits given in `count'. Any bits shifted off are lost. The value
  688. | of `count' can be arbitrarily large; in particular, if `count' is greater
  689. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  690. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  691. *----------------------------------------------------------------------------*}
  692. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  693. var
  694. z0, z1: bits64;
  695. negCount: int8;
  696. begin
  697. negCount := ( - count ) and 63;
  698. if ( count = 0 ) then
  699. begin
  700. z1 := a1;
  701. z0 := a0;
  702. end
  703. else if ( count < 64 ) then
  704. begin
  705. z1 := ( a0 shl negCount ) or ( a1 shr count );
  706. z0 := a0 shr count;
  707. end
  708. else
  709. begin
  710. if ( count < 128 ) then
  711. z1 := a0 shr ( count and 63 )
  712. else
  713. z1 := 0;
  714. z0 := 0;
  715. end;
  716. z1Ptr := z1;
  717. z0Ptr := z0;
  718. end;
  719. {*----------------------------------------------------------------------------
  720. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  721. | number of bits given in `count'. If any nonzero bits are shifted off, they
  722. | are ``jammed'' into the least significant bit of the result by setting the
  723. | least significant bit to 1. The value of `count' can be arbitrarily large;
  724. | in particular, if `count' is greater than 128, the result will be either
  725. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  726. | nonzero. The result is broken into two 64-bit pieces which are stored at
  727. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  728. *----------------------------------------------------------------------------*}
  729. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  730. var
  731. z0,z1 : bits64;
  732. negCount : int8;
  733. begin
  734. negCount := ( - count ) and 63;
  735. if ( count = 0 ) then begin
  736. z1 := a1;
  737. z0 := a0;
  738. end
  739. else if ( count < 64 ) then begin
  740. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  741. z0 := a0 shr count;
  742. end
  743. else begin
  744. if ( count = 64 ) then begin
  745. z1 := a0 or ord( a1 <> 0 );
  746. end
  747. else if ( count < 128 ) then begin
  748. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  749. end
  750. else begin
  751. z1 := ord( ( a0 or a1 ) <> 0 );
  752. end;
  753. z0 := 0;
  754. end;
  755. z1Ptr := z1;
  756. z0Ptr := z0;
  757. end;
  758. {*
  759. -------------------------------------------------------------------------------
  760. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  761. number of bits given in `count'. Any bits shifted off are lost. The value
  762. of `count' can be arbitrarily large; in particular, if `count' is greater
  763. than 64, the result will be 0. The result is broken into two 32-bit pieces
  764. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  765. -------------------------------------------------------------------------------
  766. *}
  767. Procedure
  768. shift64Right(
  769. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  770. Var
  771. z0, z1: bits32;
  772. negCount : int8;
  773. Begin
  774. negCount := ( - count ) AND 31;
  775. if ( count = 0 ) then
  776. Begin
  777. z1 := a1;
  778. z0 := a0;
  779. End
  780. else if ( count < 32 ) then
  781. Begin
  782. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  783. z0 := a0 shr count;
  784. End
  785. else
  786. Begin
  787. if (count < 64) then
  788. z1 := ( a0 shr ( count AND 31 ) )
  789. else
  790. z1 := 0;
  791. z0 := 0;
  792. End;
  793. z1Ptr := z1;
  794. z0Ptr := z0;
  795. End;
  796. {*
  797. -------------------------------------------------------------------------------
  798. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  799. number of bits given in `count'. If any nonzero bits are shifted off, they
  800. are ``jammed'' into the least significant bit of the result by setting the
  801. least significant bit to 1. The value of `count' can be arbitrarily large;
  802. in particular, if `count' is greater than 64, the result will be either 0
  803. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  804. nonzero. The result is broken into two 32-bit pieces which are stored at
  805. the locations pointed to by `z0Ptr' and `z1Ptr'.
  806. -------------------------------------------------------------------------------
  807. *}
  808. Procedure
  809. shift64RightJamming(
  810. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  811. VAR
  812. z0, z1 : bits32;
  813. negCount : int8;
  814. Begin
  815. negCount := ( - count ) AND 31;
  816. if ( count = 0 ) then
  817. Begin
  818. z1 := a1;
  819. z0 := a0;
  820. End
  821. else
  822. if ( count < 32 ) then
  823. Begin
  824. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  825. z0 := a0 shr count;
  826. End
  827. else
  828. Begin
  829. if ( count = 32 ) then
  830. Begin
  831. z1 := a0 OR bits32( a1 <> 0 );
  832. End
  833. else
  834. if ( count < 64 ) Then
  835. Begin
  836. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  837. End
  838. else
  839. Begin
  840. z1 := bits32( ( a0 OR a1 ) <> 0 );
  841. End;
  842. z0 := 0;
  843. End;
  844. z1Ptr := z1;
  845. z0Ptr := z0;
  846. End;
  847. {*----------------------------------------------------------------------------
  848. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  849. | bits are shifted off, they are ``jammed'' into the least significant bit of
  850. | the result by setting the least significant bit to 1. The value of `count'
  851. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  852. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  853. | The result is stored in the location pointed to by `zPtr'.
  854. *----------------------------------------------------------------------------*}
  855. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  856. var
  857. z: bits64;
  858. begin
  859. if ( count = 0 ) then
  860. begin
  861. z := a;
  862. end
  863. else if ( count < 64 ) then
  864. begin
  865. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  866. end
  867. else
  868. begin
  869. z := ord( a <> 0 );
  870. end;
  871. zPtr := z;
  872. end;
  873. {$if not defined(shift64ExtraRightJamming)}
  874. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  875. overload;
  876. forward;
  877. {$endif}
  878. {*
  879. -------------------------------------------------------------------------------
  880. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  881. by 32 _plus_ the number of bits given in `count'. The shifted result is
  882. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  883. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  884. off form a third 32-bit result as follows: The _last_ bit shifted off is
  885. the most-significant bit of the extra result, and the other 31 bits of the
  886. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  887. were all zero. This extra result is stored in the location pointed to by
  888. `z2Ptr'. The value of `count' can be arbitrarily large.
  889. (This routine makes more sense if `a0', `a1', and `a2' are considered
  890. to form a fixed-point value with binary point between `a1' and `a2'. This
  891. fixed-point value is shifted right by the number of bits given in `count',
  892. and the integer part of the result is returned at the locations pointed to
  893. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  894. corrupted as described above, and is returned at the location pointed to by
  895. `z2Ptr'.)
  896. -------------------------------------------------------------------------------
  897. }
  898. Procedure
  899. shift64ExtraRightJamming(
  900. a0: bits32;
  901. a1: bits32;
  902. a2: bits32;
  903. count: int16;
  904. VAR z0Ptr: bits32;
  905. VAR z1Ptr: bits32;
  906. VAR z2Ptr: bits32
  907. ); overload;
  908. Var
  909. z0, z1, z2: bits32;
  910. negCount : int8;
  911. Begin
  912. negCount := ( - count ) AND 31;
  913. if ( count = 0 ) then
  914. Begin
  915. z2 := a2;
  916. z1 := a1;
  917. z0 := a0;
  918. End
  919. else
  920. Begin
  921. if ( count < 32 ) Then
  922. Begin
  923. z2 := a1 shl negCount;
  924. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  925. z0 := a0 shr count;
  926. End
  927. else
  928. Begin
  929. if ( count = 32 ) then
  930. Begin
  931. z2 := a1;
  932. z1 := a0;
  933. End
  934. else
  935. Begin
  936. a2 := a2 or a1;
  937. if ( count < 64 ) then
  938. Begin
  939. z2 := a0 shl negCount;
  940. z1 := a0 shr ( count AND 31 );
  941. End
  942. else
  943. Begin
  944. if count = 64 then
  945. z2 := a0
  946. else
  947. z2 := bits32(a0 <> 0);
  948. z1 := 0;
  949. End;
  950. End;
  951. z0 := 0;
  952. End;
  953. z2 := z2 or bits32( a2 <> 0 );
  954. End;
  955. z2Ptr := z2;
  956. z1Ptr := z1;
  957. z0Ptr := z0;
  958. End;
  959. {*
  960. -------------------------------------------------------------------------------
  961. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  962. number of bits given in `count'. Any bits shifted off are lost. The value
  963. of `count' must be less than 32. The result is broken into two 32-bit
  964. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  965. -------------------------------------------------------------------------------
  966. *}
  967. Procedure
  968. shortShift64Left(
  969. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  970. Begin
  971. z1Ptr := a1 shl count;
  972. if count = 0 then
  973. z0Ptr := a0
  974. else
  975. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  976. End;
  977. {*
  978. -------------------------------------------------------------------------------
  979. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  980. by the number of bits given in `count'. Any bits shifted off are lost.
  981. The value of `count' must be less than 32. The result is broken into three
  982. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  983. `z1Ptr', and `z2Ptr'.
  984. -------------------------------------------------------------------------------
  985. *}
  986. Procedure
  987. shortShift96Left(
  988. a0: bits32;
  989. a1: bits32;
  990. a2: bits32;
  991. count: int16;
  992. VAR z0Ptr: bits32;
  993. VAR z1Ptr: bits32;
  994. VAR z2Ptr: bits32
  995. );
  996. Var
  997. z0, z1, z2: bits32;
  998. negCount: int8;
  999. Begin
  1000. z2 := a2 shl count;
  1001. z1 := a1 shl count;
  1002. z0 := a0 shl count;
  1003. if ( 0 < count ) then
  1004. Begin
  1005. negCount := ( ( - count ) AND 31 );
  1006. z1 := z1 or (a2 shr negCount);
  1007. z0 := z0 or (a1 shr negCount);
  1008. End;
  1009. z2Ptr := z2;
  1010. z1Ptr := z1;
  1011. z0Ptr := z0;
  1012. End;
  1013. {*----------------------------------------------------------------------------
  1014. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1015. | number of bits given in `count'. Any bits shifted off are lost. The value
  1016. | of `count' must be less than 64. The result is broken into two 64-bit
  1017. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1018. *----------------------------------------------------------------------------*}
  1019. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1020. begin
  1021. z1Ptr := a1 shl count;
  1022. if count=0 then
  1023. z0Ptr:=a0
  1024. else
  1025. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1026. end;
  1027. {*
  1028. -------------------------------------------------------------------------------
  1029. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1030. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1031. any carry out is lost. The result is broken into two 32-bit pieces which
  1032. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1033. -------------------------------------------------------------------------------
  1034. *}
  1035. Procedure
  1036. add64(
  1037. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1038. Var
  1039. z1: bits32;
  1040. Begin
  1041. z1 := a1 + b1;
  1042. z1Ptr := z1;
  1043. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1044. End;
  1045. {*
  1046. -------------------------------------------------------------------------------
  1047. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1048. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1049. modulo 2^96, so any carry out is lost. The result is broken into three
  1050. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1051. `z1Ptr', and `z2Ptr'.
  1052. -------------------------------------------------------------------------------
  1053. *}
  1054. Procedure
  1055. add96(
  1056. a0: bits32;
  1057. a1: bits32;
  1058. a2: bits32;
  1059. b0: bits32;
  1060. b1: bits32;
  1061. b2: bits32;
  1062. VAR z0Ptr: bits32;
  1063. VAR z1Ptr: bits32;
  1064. VAR z2Ptr: bits32
  1065. );
  1066. var
  1067. z0, z1, z2: bits32;
  1068. carry0, carry1: int8;
  1069. Begin
  1070. z2 := a2 + b2;
  1071. carry1 := int8( z2 < a2 );
  1072. z1 := a1 + b1;
  1073. carry0 := int8( z1 < a1 );
  1074. z0 := a0 + b0;
  1075. z1 := z1 + carry1;
  1076. z0 := z0 + bits32( z1 < carry1 );
  1077. z0 := z0 + carry0;
  1078. z2Ptr := z2;
  1079. z1Ptr := z1;
  1080. z0Ptr := z0;
  1081. End;
  1082. {*----------------------------------------------------------------------------
  1083. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1084. | by the number of bits given in `count'. Any bits shifted off are lost.
  1085. | The value of `count' must be less than 64. The result is broken into three
  1086. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1087. | `z1Ptr', and `z2Ptr'.
  1088. *----------------------------------------------------------------------------*}
  1089. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1090. var
  1091. z0, z1, z2 : bits64;
  1092. negCount : int8;
  1093. begin
  1094. z2 := a2 shl count;
  1095. z1 := a1 shl count;
  1096. z0 := a0 shl count;
  1097. if ( 0 < count ) then
  1098. begin
  1099. negCount := ( ( - count ) and 63 );
  1100. z1 := z1 or (a2 shr negCount);
  1101. z0 := z0 or (a1 shr negCount);
  1102. end;
  1103. z2Ptr := z2;
  1104. z1Ptr := z1;
  1105. z0Ptr := z0;
  1106. end;
  1107. {*----------------------------------------------------------------------------
  1108. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1109. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1110. | any carry out is lost. The result is broken into two 64-bit pieces which
  1111. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1112. *----------------------------------------------------------------------------*}
  1113. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1114. var
  1115. z1 : bits64;
  1116. begin
  1117. z1 := a1 + b1;
  1118. z1Ptr := z1;
  1119. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1120. end;
  1121. {*----------------------------------------------------------------------------
  1122. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1123. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1124. | modulo 2^192, so any carry out is lost. The result is broken into three
  1125. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1126. | `z1Ptr', and `z2Ptr'.
  1127. *----------------------------------------------------------------------------*}
  1128. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1129. var
  1130. z0, z1, z2 : bits64;
  1131. carry0, carry1 : int8;
  1132. begin
  1133. z2 := a2 + b2;
  1134. carry1 := ord( z2 < a2 );
  1135. z1 := a1 + b1;
  1136. carry0 := ord( z1 < a1 );
  1137. z0 := a0 + b0;
  1138. inc(z1, carry1);
  1139. inc(z0, ord( z1 < carry1 ));
  1140. inc(z0, carry0);
  1141. z2Ptr := z2;
  1142. z1Ptr := z1;
  1143. z0Ptr := z0;
  1144. end;
  1145. {*
  1146. -------------------------------------------------------------------------------
  1147. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1148. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1149. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1150. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1151. `z1Ptr'.
  1152. -------------------------------------------------------------------------------
  1153. *}
  1154. Procedure
  1155. sub64(
  1156. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1157. Begin
  1158. z1Ptr := a1 - b1;
  1159. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1160. End;
  1161. {*
  1162. -------------------------------------------------------------------------------
  1163. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1164. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1165. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1166. into three 32-bit pieces which are stored at the locations pointed to by
  1167. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1168. -------------------------------------------------------------------------------
  1169. *}
  1170. Procedure
  1171. sub96(
  1172. a0:bits32;
  1173. a1:bits32;
  1174. a2:bits32;
  1175. b0:bits32;
  1176. b1:bits32;
  1177. b2:bits32;
  1178. VAR z0Ptr:bits32;
  1179. VAR z1Ptr:bits32;
  1180. VAR z2Ptr:bits32
  1181. );
  1182. Var
  1183. z0, z1, z2: bits32;
  1184. borrow0, borrow1: int8;
  1185. Begin
  1186. z2 := a2 - b2;
  1187. borrow1 := int8( a2 < b2 );
  1188. z1 := a1 - b1;
  1189. borrow0 := int8( a1 < b1 );
  1190. z0 := a0 - b0;
  1191. z0 := z0 - bits32( z1 < borrow1 );
  1192. z1 := z1 - borrow1;
  1193. z0 := z0 -borrow0;
  1194. z2Ptr := z2;
  1195. z1Ptr := z1;
  1196. z0Ptr := z0;
  1197. End;
  1198. {*----------------------------------------------------------------------------
  1199. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1200. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1201. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1202. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1203. | `z1Ptr'.
  1204. *----------------------------------------------------------------------------*}
  1205. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1206. begin
  1207. z1Ptr := a1 - b1;
  1208. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1209. end;
  1210. {*----------------------------------------------------------------------------
  1211. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1212. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1213. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1214. | result is broken into three 64-bit pieces which are stored at the locations
  1215. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1216. *----------------------------------------------------------------------------*}
  1217. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1218. var
  1219. z0, z1, z2 : bits64;
  1220. borrow0, borrow1 : int8;
  1221. begin
  1222. z2 := a2 - b2;
  1223. borrow1 := ord( a2 < b2 );
  1224. z1 := a1 - b1;
  1225. borrow0 := ord( a1 < b1 );
  1226. z0 := a0 - b0;
  1227. dec(z0, ord( z1 < borrow1 ));
  1228. dec(z1, borrow1);
  1229. dec(z0, borrow0);
  1230. z2Ptr := z2;
  1231. z1Ptr := z1;
  1232. z0Ptr := z0;
  1233. end;
  1234. {*
  1235. -------------------------------------------------------------------------------
  1236. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1237. into two 32-bit pieces which are stored at the locations pointed to by
  1238. `z0Ptr' and `z1Ptr'.
  1239. -------------------------------------------------------------------------------
  1240. *}
  1241. {$IFDEF SOFTFPU_COMPILER_MUL32TO64}
  1242. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1243. var
  1244. tmp: qword;
  1245. begin
  1246. tmp:=qword(a) * b;
  1247. z0ptr:=hi(tmp);
  1248. z1ptr:=lo(tmp);
  1249. end;
  1250. {$ELSE}
  1251. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1252. :bits32 );
  1253. Var
  1254. aHigh, aLow, bHigh, bLow: bits16;
  1255. z0, zMiddleA, zMiddleB, z1: bits32;
  1256. Begin
  1257. aLow := bits16(a);
  1258. aHigh := a shr 16;
  1259. bLow := bits16(b);
  1260. bHigh := b shr 16;
  1261. z1 := ( bits32( aLow) ) * bLow;
  1262. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1263. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1264. z0 := ( bits32 (aHigh) ) * bHigh;
  1265. zMiddleA := zMiddleA + zMiddleB;
  1266. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1267. zMiddleA := zmiddleA shl 16;
  1268. z1 := z1 + zMiddleA;
  1269. z0 := z0 + bits32( z1 < zMiddleA );
  1270. z1Ptr := z1;
  1271. z0Ptr := z0;
  1272. End;
  1273. {$ENDIF}
  1274. {*
  1275. -------------------------------------------------------------------------------
  1276. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1277. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1278. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1279. `z2Ptr'.
  1280. -------------------------------------------------------------------------------
  1281. *}
  1282. Procedure
  1283. mul64By32To96(
  1284. a0:bits32;
  1285. a1:bits32;
  1286. b:bits32;
  1287. VAR z0Ptr:bits32;
  1288. VAR z1Ptr:bits32;
  1289. VAR z2Ptr:bits32
  1290. );
  1291. Var
  1292. z0, z1, z2, more1: bits32;
  1293. Begin
  1294. mul32To64( a1, b, z1, z2 );
  1295. mul32To64( a0, b, z0, more1 );
  1296. add64( z0, more1, 0, z1, z0, z1 );
  1297. z2Ptr := z2;
  1298. z1Ptr := z1;
  1299. z0Ptr := z0;
  1300. End;
  1301. {*
  1302. -------------------------------------------------------------------------------
  1303. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1304. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1305. product. The product is broken into four 32-bit pieces which are stored at
  1306. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1307. -------------------------------------------------------------------------------
  1308. *}
  1309. Procedure
  1310. mul64To128(
  1311. a0:bits32;
  1312. a1:bits32;
  1313. b0:bits32;
  1314. b1:bits32;
  1315. VAR z0Ptr:bits32;
  1316. VAR z1Ptr:bits32;
  1317. VAR z2Ptr:bits32;
  1318. VAR z3Ptr:bits32
  1319. );
  1320. Var
  1321. z0, z1, z2, z3: bits32;
  1322. more1, more2: bits32;
  1323. Begin
  1324. mul32To64( a1, b1, z2, z3 );
  1325. mul32To64( a1, b0, z1, more2 );
  1326. add64( z1, more2, 0, z2, z1, z2 );
  1327. mul32To64( a0, b0, z0, more1 );
  1328. add64( z0, more1, 0, z1, z0, z1 );
  1329. mul32To64( a0, b1, more1, more2 );
  1330. add64( more1, more2, 0, z2, more1, z2 );
  1331. add64( z0, z1, 0, more1, z0, z1 );
  1332. z3Ptr := z3;
  1333. z2Ptr := z2;
  1334. z1Ptr := z1;
  1335. z0Ptr := z0;
  1336. End;
  1337. {*----------------------------------------------------------------------------
  1338. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1339. | into two 64-bit pieces which are stored at the locations pointed to by
  1340. | `z0Ptr' and `z1Ptr'.
  1341. *----------------------------------------------------------------------------*}
  1342. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1343. var
  1344. aHigh, aLow, bHigh, bLow : bits32;
  1345. z0, zMiddleA, zMiddleB, z1 : bits64;
  1346. begin
  1347. aLow := a;
  1348. aHigh := a shr 32;
  1349. bLow := b;
  1350. bHigh := b shr 32;
  1351. z1 := ( bits64(aLow) ) * bLow;
  1352. zMiddleA := ( bits64( aLow )) * bHigh;
  1353. zMiddleB := ( bits64( aHigh )) * bLow;
  1354. z0 := ( bits64(aHigh) ) * bHigh;
  1355. inc(zMiddleA, zMiddleB);
  1356. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1357. zMiddleA := zMiddleA shl 32;
  1358. inc(z1, zMiddleA);
  1359. inc(z0, ord( z1 < zMiddleA ));
  1360. z1Ptr := z1;
  1361. z0Ptr := z0;
  1362. end;
  1363. {*----------------------------------------------------------------------------
  1364. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1365. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1366. | product. The product is broken into four 64-bit pieces which are stored at
  1367. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1368. *----------------------------------------------------------------------------*}
  1369. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1370. var
  1371. z0,z1,z2,z3,more1,more2 : bits64;
  1372. begin
  1373. mul64To128( a1, b1, z2, z3 );
  1374. mul64To128( a1, b0, z1, more2 );
  1375. add128( z1, more2, 0, z2, z1, z2 );
  1376. mul64To128( a0, b0, z0, more1 );
  1377. add128( z0, more1, 0, z1, z0, z1 );
  1378. mul64To128( a0, b1, more1, more2 );
  1379. add128( more1, more2, 0, z2, more1, z2 );
  1380. add128( z0, z1, 0, more1, z0, z1 );
  1381. z3Ptr := z3;
  1382. z2Ptr := z2;
  1383. z1Ptr := z1;
  1384. z0Ptr := z0;
  1385. end;
  1386. {*----------------------------------------------------------------------------
  1387. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1388. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1389. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1390. | `z2Ptr'.
  1391. *----------------------------------------------------------------------------*}
  1392. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1393. var
  1394. z0, z1, z2, more1 : bits64;
  1395. begin
  1396. mul64To128( a1, b, z1, z2 );
  1397. mul64To128( a0, b, z0, more1 );
  1398. add128( z0, more1, 0, z1, z0, z1 );
  1399. z2Ptr := z2;
  1400. z1Ptr := z1;
  1401. z0Ptr := z0;
  1402. end;
  1403. {*----------------------------------------------------------------------------
  1404. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1405. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1406. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1407. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1408. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1409. | unsigned integer is returned.
  1410. *----------------------------------------------------------------------------*}
  1411. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1412. var
  1413. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1414. begin
  1415. if ( b <= a0 ) then
  1416. begin
  1417. result:=qword( $FFFFFFFFFFFFFFFF );
  1418. exit;
  1419. end;
  1420. b0 := b shr 32;
  1421. if ( b0 shl 32 <= a0 ) then
  1422. z:=qword( $FFFFFFFF00000000 )
  1423. else
  1424. z:=( a0 div b0 ) shl 32;
  1425. mul64To128( b, z, term0, term1 );
  1426. sub128( a0, a1, term0, term1, rem0, rem1 );
  1427. while ( ( sbits64(rem0) ) < 0 ) do begin
  1428. dec(z,qword( $100000000 ));
  1429. b1 := b shl 32;
  1430. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1431. end;
  1432. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1433. if ( b0 shl 32 <= rem0 ) then
  1434. z:=z or $FFFFFFFF
  1435. else
  1436. z:=z or rem0 div b0;
  1437. result:=z;
  1438. end;
  1439. {*
  1440. -------------------------------------------------------------------------------
  1441. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1442. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1443. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1444. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1445. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1446. unsigned integer is returned.
  1447. -------------------------------------------------------------------------------
  1448. *}
  1449. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1450. Var
  1451. b0, b1: bits32;
  1452. rem0, rem1, term0, term1: bits32;
  1453. z: bits32;
  1454. Begin
  1455. if ( b <= a0 ) then
  1456. Begin
  1457. estimateDiv64To32 := $FFFFFFFF;
  1458. exit;
  1459. End;
  1460. b0 := b shr 16;
  1461. if ( b0 shl 16 <= a0 ) then
  1462. z:= $FFFF0000
  1463. else
  1464. z:= ( a0 div b0 ) shl 16;
  1465. mul32To64( b, z, term0, term1 );
  1466. sub64( a0, a1, term0, term1, rem0, rem1 );
  1467. while ( ( sbits32 (rem0) ) < 0 ) do
  1468. Begin
  1469. z := z - $10000;
  1470. b1 := b shl 16;
  1471. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1472. End;
  1473. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1474. if ( b0 shl 16 <= rem0 ) then
  1475. z := z or $FFFF
  1476. else
  1477. z := z or (rem0 div b0);
  1478. estimateDiv64To32 := z;
  1479. End;
  1480. {*
  1481. -------------------------------------------------------------------------------
  1482. Returns an approximation to the square root of the 32-bit significand given
  1483. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1484. `aExp' (the least significant bit) is 1, the integer returned approximates
  1485. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1486. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1487. case, the approximation returned lies strictly within +/-2 of the exact
  1488. value.
  1489. -------------------------------------------------------------------------------
  1490. *}
  1491. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1492. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1493. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1494. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1495. );
  1496. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1497. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1498. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1499. );
  1500. Var
  1501. index: int8;
  1502. z: bits32;
  1503. Begin
  1504. index := ( a shr 27 ) AND 15;
  1505. if ( aExp AND 1 ) <> 0 then
  1506. Begin
  1507. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1508. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1509. a := a shr 1;
  1510. End
  1511. else
  1512. Begin
  1513. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1514. z := a div z + z;
  1515. if ( $20000 <= z ) then
  1516. z := $FFFF8000
  1517. else
  1518. z := ( z shl 15 );
  1519. if ( z <= a ) then
  1520. Begin
  1521. estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
  1522. exit;
  1523. End;
  1524. End;
  1525. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1526. End;
  1527. {*
  1528. -------------------------------------------------------------------------------
  1529. Returns the number of leading 0 bits before the most-significant 1 bit of
  1530. `a'. If `a' is zero, 32 is returned.
  1531. -------------------------------------------------------------------------------
  1532. *}
  1533. Function countLeadingZeros32( a:bits32 ): int8;
  1534. const countLeadingZerosHigh:array[0..255] of int8 = (
  1535. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1536. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1537. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1538. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1539. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1540. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1541. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1542. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1543. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1544. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1545. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1546. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1547. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1548. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1549. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1550. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1551. );
  1552. Var
  1553. shiftCount: int8;
  1554. Begin
  1555. shiftCount := 0;
  1556. if ( a < $10000 ) then
  1557. Begin
  1558. shiftCount := shiftcount + 16;
  1559. a := a shl 16;
  1560. End;
  1561. if ( a < $1000000 ) then
  1562. Begin
  1563. shiftCount := shiftcount + 8;
  1564. a := a shl 8;
  1565. end;
  1566. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1567. countLeadingZeros32:= shiftCount;
  1568. End;
  1569. {*----------------------------------------------------------------------------
  1570. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1571. | `a'. If `a' is zero, 64 is returned.
  1572. *----------------------------------------------------------------------------*}
  1573. function countLeadingZeros64( a : bits64): int8;
  1574. var
  1575. shiftcount : int8;
  1576. Begin
  1577. shiftCount := 0;
  1578. if ( a < bits64(bits64(1) shl 32 )) then
  1579. shiftCount := shiftcount + 32
  1580. else
  1581. a := a shr 32;
  1582. shiftCount := shiftCount + countLeadingZeros32( a );
  1583. countLeadingZeros64:= shiftCount;
  1584. End;
  1585. {*
  1586. -------------------------------------------------------------------------------
  1587. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1588. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1589. Otherwise, returns 0.
  1590. -------------------------------------------------------------------------------
  1591. *}
  1592. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1593. Begin
  1594. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1595. End;
  1596. {*
  1597. -------------------------------------------------------------------------------
  1598. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1599. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1600. returns 0.
  1601. -------------------------------------------------------------------------------
  1602. *}
  1603. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1604. Begin
  1605. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1606. End;
  1607. const
  1608. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1609. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1610. (*****************************************************************************)
  1611. (* End Low-Level arithmetic *)
  1612. (*****************************************************************************)
  1613. {*----------------------------------------------------------------------------
  1614. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1615. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1616. | returns 0.
  1617. *----------------------------------------------------------------------------*}
  1618. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1619. begin
  1620. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1621. end;
  1622. {*
  1623. -------------------------------------------------------------------------------
  1624. Functions and definitions to determine: (1) whether tininess for underflow
  1625. is detected before or after rounding by default, (2) what (if anything)
  1626. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1627. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1628. are propagated from function inputs to output. These details are ENDIAN
  1629. specific
  1630. -------------------------------------------------------------------------------
  1631. *}
  1632. {$IFDEF ENDIAN_LITTLE}
  1633. {*
  1634. -------------------------------------------------------------------------------
  1635. Internal canonical NaN format.
  1636. -------------------------------------------------------------------------------
  1637. *}
  1638. TYPE
  1639. commonNaNT = record
  1640. high, low : bits32;
  1641. sign: flag;
  1642. end;
  1643. {*
  1644. -------------------------------------------------------------------------------
  1645. The pattern for a default generated single-precision NaN.
  1646. -------------------------------------------------------------------------------
  1647. *}
  1648. const float32_default_nan = $FFC00000;
  1649. {*
  1650. -------------------------------------------------------------------------------
  1651. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1652. otherwise returns 0.
  1653. -------------------------------------------------------------------------------
  1654. *}
  1655. Function float32_is_nan( a : float32 ): flag;
  1656. Begin
  1657. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1658. End;
  1659. {*
  1660. -------------------------------------------------------------------------------
  1661. Returns 1 if the single-precision floating-point value `a' is a signaling
  1662. NaN; otherwise returns 0.
  1663. -------------------------------------------------------------------------------
  1664. *}
  1665. Function float32_is_signaling_nan( a : float32 ): flag;
  1666. Begin
  1667. float32_is_signaling_nan := flag
  1668. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1669. End;
  1670. {*
  1671. -------------------------------------------------------------------------------
  1672. Returns the result of converting the single-precision floating-point NaN
  1673. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1674. exception is raised.
  1675. -------------------------------------------------------------------------------
  1676. *}
  1677. function float32ToCommonNaN(a: float32) : commonNaNT;
  1678. var
  1679. z : commonNaNT ;
  1680. Begin
  1681. if ( float32_is_signaling_nan( a ) <> 0) then
  1682. float_raise( float_flag_invalid );
  1683. z.sign := a shr 31;
  1684. z.low := 0;
  1685. z.high := a shl 9;
  1686. result := z;
  1687. End;
  1688. {*
  1689. -------------------------------------------------------------------------------
  1690. Returns the result of converting the canonical NaN `a' to the single-
  1691. precision floating-point format.
  1692. -------------------------------------------------------------------------------
  1693. *}
  1694. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1695. Begin
  1696. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1697. End;
  1698. {*
  1699. -------------------------------------------------------------------------------
  1700. Takes two single-precision floating-point values `a' and `b', one of which
  1701. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1702. signaling NaN, the invalid exception is raised.
  1703. -------------------------------------------------------------------------------
  1704. *}
  1705. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1706. Var
  1707. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1708. label returnLargerSignificand;
  1709. Begin
  1710. aIsNaN := float32_is_nan( a );
  1711. aIsSignalingNaN := float32_is_signaling_nan( a );
  1712. bIsNaN := float32_is_nan( b );
  1713. bIsSignalingNaN := float32_is_signaling_nan( b );
  1714. a := a or $00400000;
  1715. b := b or $00400000;
  1716. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1717. float_raise( float_flag_invalid );
  1718. if ( aIsSignalingNaN )<> 0 then
  1719. Begin
  1720. if ( bIsSignalingNaN ) <> 0 then
  1721. goto returnLargerSignificand;
  1722. if bIsNan <> 0 then
  1723. propagateFloat32NaN := b
  1724. else
  1725. propagateFloat32NaN := a;
  1726. exit;
  1727. End
  1728. else if ( aIsNaN <> 0) then
  1729. Begin
  1730. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1731. Begin
  1732. propagateFloat32NaN := a;
  1733. exit;
  1734. End;
  1735. returnLargerSignificand:
  1736. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1737. Begin
  1738. propagateFloat32NaN := b;
  1739. exit;
  1740. End;
  1741. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1742. Begin
  1743. propagateFloat32NaN := a;
  1744. End;
  1745. if a < b then
  1746. propagateFloat32NaN := a
  1747. else
  1748. propagateFloat32NaN := b;
  1749. exit;
  1750. End
  1751. else
  1752. Begin
  1753. propagateFloat32NaN := b;
  1754. exit;
  1755. End;
  1756. End;
  1757. {*
  1758. -------------------------------------------------------------------------------
  1759. The pattern for a default generated double-precision NaN. The `high' and
  1760. `low' values hold the most- and least-significant bits, respectively.
  1761. -------------------------------------------------------------------------------
  1762. *}
  1763. const
  1764. float64_default_nan_high = $FFF80000;
  1765. float64_default_nan_low = $00000000;
  1766. {*
  1767. -------------------------------------------------------------------------------
  1768. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1769. otherwise returns 0.
  1770. -------------------------------------------------------------------------------
  1771. *}
  1772. Function float64_is_nan( a : float64 ) : flag;
  1773. Begin
  1774. float64_is_nan :=
  1775. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1776. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1777. End;
  1778. {*
  1779. -------------------------------------------------------------------------------
  1780. Returns 1 if the double-precision floating-point value `a' is a signaling
  1781. NaN; otherwise returns 0.
  1782. -------------------------------------------------------------------------------
  1783. *}
  1784. Function float64_is_signaling_nan( a : float64 ): flag;
  1785. Begin
  1786. float64_is_signaling_nan :=
  1787. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1788. and ( a.low or ( a.high and $0007FFFF ) );
  1789. End;
  1790. {*
  1791. -------------------------------------------------------------------------------
  1792. Returns the result of converting the double-precision floating-point NaN
  1793. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1794. exception is raised.
  1795. -------------------------------------------------------------------------------
  1796. *}
  1797. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1798. Var
  1799. z : commonNaNT;
  1800. Begin
  1801. if ( float64_is_signaling_nan( a )<>0 ) then
  1802. float_raise( float_flag_invalid );
  1803. z.sign := a.high shr 31;
  1804. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1805. result := z;
  1806. End;
  1807. {*
  1808. -------------------------------------------------------------------------------
  1809. Returns the result of converting the canonical NaN `a' to the double-
  1810. precision floating-point format.
  1811. -------------------------------------------------------------------------------
  1812. *}
  1813. function commonNaNToFloat64( a : commonNaNT) : float64;
  1814. Var
  1815. z: float64;
  1816. Begin
  1817. shift64Right( a.high, a.low, 12, z.high, z.low );
  1818. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1819. result := z;
  1820. End;
  1821. {*
  1822. -------------------------------------------------------------------------------
  1823. Takes two double-precision floating-point values `a' and `b', one of which
  1824. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1825. signaling NaN, the invalid exception is raised.
  1826. -------------------------------------------------------------------------------
  1827. *}
  1828. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1829. Var
  1830. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1831. label returnLargerSignificand;
  1832. Begin
  1833. aIsNaN := float64_is_nan( a );
  1834. aIsSignalingNaN := float64_is_signaling_nan( a );
  1835. bIsNaN := float64_is_nan( b );
  1836. bIsSignalingNaN := float64_is_signaling_nan( b );
  1837. a.high := a.high or $00080000;
  1838. b.high := b.high or $00080000;
  1839. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1840. float_raise( float_flag_invalid );
  1841. if ( aIsSignalingNaN )<>0 then
  1842. Begin
  1843. if ( bIsSignalingNaN )<>0 then
  1844. goto returnLargerSignificand;
  1845. if bIsNan <> 0 then
  1846. c := b
  1847. else
  1848. c := a;
  1849. exit;
  1850. End
  1851. else if ( aIsNaN )<> 0 then
  1852. Begin
  1853. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1854. Begin
  1855. c := a;
  1856. exit;
  1857. End;
  1858. returnLargerSignificand:
  1859. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1860. Begin
  1861. c := b;
  1862. exit;
  1863. End;
  1864. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1865. Begin
  1866. c := a;
  1867. exit;
  1868. End;
  1869. if a.high < b.high then
  1870. c := a
  1871. else
  1872. c := b;
  1873. exit;
  1874. End
  1875. else
  1876. Begin
  1877. c := b;
  1878. exit;
  1879. End;
  1880. End;
  1881. {*----------------------------------------------------------------------------
  1882. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1883. | otherwise returns 0.
  1884. *----------------------------------------------------------------------------*}
  1885. function float128_is_nan( a : float128): flag;
  1886. begin
  1887. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1888. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1889. end;
  1890. {*----------------------------------------------------------------------------
  1891. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1892. | signaling NaN; otherwise returns 0.
  1893. *----------------------------------------------------------------------------*}
  1894. function float128_is_signaling_nan( a : float128): flag;
  1895. begin
  1896. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1897. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1898. end;
  1899. {*----------------------------------------------------------------------------
  1900. | Returns the result of converting the quadruple-precision floating-point NaN
  1901. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1902. | exception is raised.
  1903. *----------------------------------------------------------------------------*}
  1904. function float128ToCommonNaN( a : float128): commonNaNT;
  1905. var
  1906. z: commonNaNT;
  1907. qhigh,qlow : qword;
  1908. begin
  1909. if ( float128_is_signaling_nan( a )<>0) then
  1910. float_raise( float_flag_invalid );
  1911. z.sign := a.high shr 63;
  1912. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1913. z.high:=qhigh shr 32;
  1914. z.low:=qhigh and $ffffffff;
  1915. result:=z;
  1916. end;
  1917. {*----------------------------------------------------------------------------
  1918. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1919. | precision floating-point format.
  1920. *----------------------------------------------------------------------------*}
  1921. function commonNaNToFloat128( a : commonNaNT): float128;
  1922. var
  1923. z: float128;
  1924. begin
  1925. shift128Right( a.high, a.low, 16, z.high, z.low );
  1926. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1927. result:=z;
  1928. end;
  1929. {*----------------------------------------------------------------------------
  1930. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1931. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1932. | `b' is a signaling NaN, the invalid exception is raised.
  1933. *----------------------------------------------------------------------------*}
  1934. function propagateFloat128NaN( a: float128; b : float128): float128;
  1935. var
  1936. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1937. label
  1938. returnLargerSignificand;
  1939. begin
  1940. aIsNaN := float128_is_nan( a );
  1941. aIsSignalingNaN := float128_is_signaling_nan( a );
  1942. bIsNaN := float128_is_nan( b );
  1943. bIsSignalingNaN := float128_is_signaling_nan( b );
  1944. a.high := a.high or int64( $0000800000000000 );
  1945. b.high := b.high or int64( $0000800000000000 );
  1946. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1947. float_raise( float_flag_invalid );
  1948. if ( aIsSignalingNaN )<>0 then
  1949. begin
  1950. if ( bIsSignalingNaN )<>0 then
  1951. goto returnLargerSignificand;
  1952. if bIsNaN<>0 then
  1953. result := b
  1954. else
  1955. result := a;
  1956. exit;
  1957. end
  1958. else if ( aIsNaN )<>0 then
  1959. begin
  1960. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1961. begin
  1962. result := a;
  1963. exit;
  1964. end;
  1965. returnLargerSignificand:
  1966. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1967. begin
  1968. result := b;
  1969. exit;
  1970. end;
  1971. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1972. begin
  1973. result := a;
  1974. exit
  1975. end;
  1976. if ( a.high < b.high ) then
  1977. result := a
  1978. else
  1979. result := b;
  1980. exit;
  1981. end
  1982. else
  1983. result:=b;
  1984. end;
  1985. {$ELSE}
  1986. { Big endian code }
  1987. (*----------------------------------------------------------------------------
  1988. | Internal canonical NaN format.
  1989. *----------------------------------------------------------------------------*)
  1990. type
  1991. commonNANT = record
  1992. high, low : bits32;
  1993. sign : flag;
  1994. end;
  1995. (*----------------------------------------------------------------------------
  1996. | The pattern for a default generated single-precision NaN.
  1997. *----------------------------------------------------------------------------*)
  1998. const float32_default_nan = $7FFFFFFF;
  1999. (*----------------------------------------------------------------------------
  2000. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  2001. | otherwise returns 0.
  2002. *----------------------------------------------------------------------------*)
  2003. function float32_is_nan(a: float32): flag;
  2004. begin
  2005. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2006. end;
  2007. (*----------------------------------------------------------------------------
  2008. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2009. | NaN; otherwise returns 0.
  2010. *----------------------------------------------------------------------------*)
  2011. function float32_is_signaling_nan(a: float32):flag;
  2012. begin
  2013. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2014. end;
  2015. (*----------------------------------------------------------------------------
  2016. | Returns the result of converting the single-precision floating-point NaN
  2017. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2018. | exception is raised.
  2019. *----------------------------------------------------------------------------*)
  2020. function float32ToCommonNaN( a: float32) : commonNaNT;
  2021. var
  2022. z: commonNANT;
  2023. begin
  2024. if float32_is_signaling_nan(a)<>0 then
  2025. float_raise(float_flag_invalid);
  2026. z.sign := a shr 31;
  2027. z.low := 0;
  2028. z.high := a shl 9;
  2029. result:=z;
  2030. end;
  2031. (*----------------------------------------------------------------------------
  2032. | Returns the result of converting the canonical NaN `a' to the single-
  2033. | precision floating-point format.
  2034. *----------------------------------------------------------------------------*)
  2035. function CommonNanToFloat32(a : CommonNaNT): float32;
  2036. begin
  2037. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2038. end;
  2039. (*----------------------------------------------------------------------------
  2040. | Takes two single-precision floating-point values `a' and `b', one of which
  2041. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2042. | signaling NaN, the invalid exception is raised.
  2043. *----------------------------------------------------------------------------*)
  2044. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2045. var
  2046. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2047. begin
  2048. aIsNaN := float32_is_nan( a );
  2049. aIsSignalingNaN := float32_is_signaling_nan( a );
  2050. bIsNaN := float32_is_nan( b );
  2051. bIsSignalingNaN := float32_is_signaling_nan( b );
  2052. a := a or $00400000;
  2053. b := b or $00400000;
  2054. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2055. float_raise( float_flag_invalid );
  2056. if bIsSignalingNaN<>0 then
  2057. propagateFloat32Nan := b
  2058. else if aIsSignalingNan<>0 then
  2059. propagateFloat32Nan := a
  2060. else if bIsNan<>0 then
  2061. propagateFloat32Nan := b
  2062. else
  2063. propagateFloat32Nan := a;
  2064. end;
  2065. (*----------------------------------------------------------------------------
  2066. | The pattern for a default generated double-precision NaN. The `high' and
  2067. | `low' values hold the most- and least-significant bits, respectively.
  2068. *----------------------------------------------------------------------------*)
  2069. const
  2070. float64_default_nan_high = $7FFFFFFF;
  2071. float64_default_nan_low = $FFFFFFFF;
  2072. (*----------------------------------------------------------------------------
  2073. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2074. | otherwise returns 0.
  2075. *----------------------------------------------------------------------------*)
  2076. function float64_is_nan(a: float64): flag;
  2077. begin
  2078. float64_is_nan := flag (
  2079. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2080. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2081. end;
  2082. (*----------------------------------------------------------------------------
  2083. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2084. | NaN; otherwise returns 0.
  2085. *----------------------------------------------------------------------------*)
  2086. function float64_is_signaling_nan( a:float64): flag;
  2087. begin
  2088. float64_is_signaling_nan := flag(
  2089. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2090. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2091. end;
  2092. (*----------------------------------------------------------------------------
  2093. | Returns the result of converting the double-precision floating-point NaN
  2094. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2095. | exception is raised.
  2096. *----------------------------------------------------------------------------*)
  2097. function float64ToCommonNaN( a : float64) : commonNaNT;
  2098. var
  2099. z : commonNaNT;
  2100. begin
  2101. if ( float64_is_signaling_nan( a )<>0 ) then
  2102. float_raise( float_flag_invalid );
  2103. z.sign := a.high shr 31;
  2104. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2105. result:=z;
  2106. end;
  2107. (*----------------------------------------------------------------------------
  2108. | Returns the result of converting the canonical NaN `a' to the double-
  2109. | precision floating-point format.
  2110. *----------------------------------------------------------------------------*)
  2111. function commonNaNToFloat64( a : commonNaNT): float64;
  2112. var
  2113. z: float64;
  2114. begin
  2115. shift64Right( a.high, a.low, 12, z.high, z.low );
  2116. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2117. result:=z;
  2118. end;
  2119. (*----------------------------------------------------------------------------
  2120. | Takes two double-precision floating-point values `a' and `b', one of which
  2121. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2122. | signaling NaN, the invalid exception is raised.
  2123. *----------------------------------------------------------------------------*)
  2124. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2125. var
  2126. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2127. begin
  2128. aIsNaN := float64_is_nan( a );
  2129. aIsSignalingNaN := float64_is_signaling_nan( a );
  2130. bIsNaN := float64_is_nan( b );
  2131. bIsSignalingNaN := float64_is_signaling_nan( b );
  2132. a.high := a.high or $00080000;
  2133. b.high := b.high or $00080000;
  2134. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2135. float_raise( float_flag_invalid );
  2136. if bIsSignalingNaN<>0 then
  2137. c := b
  2138. else if aIsSignalingNan<>0 then
  2139. c := a
  2140. else if bIsNan<>0 then
  2141. c := b
  2142. else
  2143. c := a;
  2144. end;
  2145. {*----------------------------------------------------------------------------
  2146. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  2147. | otherwise returns 0.
  2148. *----------------------------------------------------------------------------*}
  2149. function float128_is_nan( a : float128): flag;
  2150. begin
  2151. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  2152. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  2153. end;
  2154. {*----------------------------------------------------------------------------
  2155. | Returns 1 if the quadruple-precision floating-point value `a' is a
  2156. | signaling NaN; otherwise returns 0.
  2157. *----------------------------------------------------------------------------*}
  2158. function float128_is_signaling_nan( a : float128): flag;
  2159. begin
  2160. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  2161. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  2162. end;
  2163. {*----------------------------------------------------------------------------
  2164. | Returns the result of converting the quadruple-precision floating-point NaN
  2165. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2166. | exception is raised.
  2167. *----------------------------------------------------------------------------*}
  2168. function float128ToCommonNaN( a : float128): commonNaNT;
  2169. var
  2170. z: commonNaNT;
  2171. qhigh,qlow : qword;
  2172. begin
  2173. if ( float128_is_signaling_nan( a )<>0) then
  2174. float_raise( float_flag_invalid );
  2175. z.sign := a.high shr 63;
  2176. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  2177. z.high:=qhigh shr 32;
  2178. z.low:=qhigh and $ffffffff;
  2179. result:=z;
  2180. end;
  2181. {*----------------------------------------------------------------------------
  2182. | Returns the result of converting the canonical NaN `a' to the quadruple-
  2183. | precision floating-point format.
  2184. *----------------------------------------------------------------------------*}
  2185. function commonNaNToFloat128( a : commonNaNT): float128;
  2186. var
  2187. z: float128;
  2188. begin
  2189. shift128Right( a.high, a.low, 16, z.high, z.low );
  2190. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  2191. result:=z;
  2192. end;
  2193. {*----------------------------------------------------------------------------
  2194. | Takes two quadruple-precision floating-point values `a' and `b', one of
  2195. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  2196. | `b' is a signaling NaN, the invalid exception is raised.
  2197. *----------------------------------------------------------------------------*}
  2198. function propagateFloat128NaN( a: float128; b : float128): float128;
  2199. var
  2200. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2201. label
  2202. returnLargerSignificand;
  2203. begin
  2204. aIsNaN := float128_is_nan( a );
  2205. aIsSignalingNaN := float128_is_signaling_nan( a );
  2206. bIsNaN := float128_is_nan( b );
  2207. bIsSignalingNaN := float128_is_signaling_nan( b );
  2208. a.high := a.high or int64( $0000800000000000 );
  2209. b.high := b.high or int64( $0000800000000000 );
  2210. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2211. float_raise( float_flag_invalid );
  2212. if ( aIsSignalingNaN )<>0 then
  2213. begin
  2214. if ( bIsSignalingNaN )<>0 then
  2215. goto returnLargerSignificand;
  2216. if bIsNaN<>0 then
  2217. result := b
  2218. else
  2219. result := a;
  2220. exit;
  2221. end
  2222. else if ( aIsNaN )<>0 then
  2223. begin
  2224. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  2225. begin
  2226. result := a;
  2227. exit;
  2228. end;
  2229. returnLargerSignificand:
  2230. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  2231. begin
  2232. result := b;
  2233. exit;
  2234. end;
  2235. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  2236. begin
  2237. result := a;
  2238. exit
  2239. end;
  2240. if ( a.high < b.high ) then
  2241. result := a
  2242. else
  2243. result := b;
  2244. exit;
  2245. end
  2246. else
  2247. result:=b;
  2248. end;
  2249. {$ENDIF}
  2250. (****************************************************************************)
  2251. (* END ENDIAN SPECIFIC CODE *)
  2252. (****************************************************************************)
  2253. {*
  2254. -------------------------------------------------------------------------------
  2255. Returns the fraction bits of the single-precision floating-point value `a'.
  2256. -------------------------------------------------------------------------------
  2257. *}
  2258. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2259. Begin
  2260. ExtractFloat32Frac := A AND $007FFFFF;
  2261. End;
  2262. {*
  2263. -------------------------------------------------------------------------------
  2264. Returns the exponent bits of the single-precision floating-point value `a'.
  2265. -------------------------------------------------------------------------------
  2266. *}
  2267. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2268. Begin
  2269. extractFloat32Exp := (a shr 23) AND $FF;
  2270. End;
  2271. {*
  2272. -------------------------------------------------------------------------------
  2273. Returns the sign bit of the single-precision floating-point value `a'.
  2274. -------------------------------------------------------------------------------
  2275. *}
  2276. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2277. Begin
  2278. extractFloat32Sign := a shr 31;
  2279. End;
  2280. {*
  2281. -------------------------------------------------------------------------------
  2282. Normalizes the subnormal single-precision floating-point value represented
  2283. by the denormalized significand `aSig'. The normalized exponent and
  2284. significand are stored at the locations pointed to by `zExpPtr' and
  2285. `zSigPtr', respectively.
  2286. -------------------------------------------------------------------------------
  2287. *}
  2288. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2289. Var
  2290. ShiftCount : BYTE;
  2291. Begin
  2292. shiftCount := countLeadingZeros32( aSig ) - 8;
  2293. zSigPtr := aSig shl shiftCount;
  2294. zExpPtr := 1 - shiftCount;
  2295. End;
  2296. {*
  2297. -------------------------------------------------------------------------------
  2298. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2299. single-precision floating-point value, returning the result. After being
  2300. shifted into the proper positions, the three fields are simply added
  2301. together to form the result. This means that any integer portion of `zSig'
  2302. will be added into the exponent. Since a properly normalized significand
  2303. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2304. than the desired result exponent whenever `zSig' is a complete, normalized
  2305. significand.
  2306. -------------------------------------------------------------------------------
  2307. *}
  2308. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2309. Begin
  2310. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2311. + zSig;
  2312. End;
  2313. {*
  2314. -------------------------------------------------------------------------------
  2315. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2316. and significand `zSig', and returns the proper single-precision floating-
  2317. point value corresponding to the abstract input. Ordinarily, the abstract
  2318. value is simply rounded and packed into the single-precision format, with
  2319. the inexact exception raised if the abstract input cannot be represented
  2320. exactly. However, if the abstract value is too large, the overflow and
  2321. inexact exceptions are raised and an infinity or maximal finite value is
  2322. returned. If the abstract value is too small, the input value is rounded to
  2323. a subnormal number, and the underflow and inexact exceptions are raised if
  2324. the abstract input cannot be represented exactly as a subnormal single-
  2325. precision floating-point number.
  2326. The input significand `zSig' has its binary point between bits 30
  2327. and 29, which is 7 bits to the left of the usual location. This shifted
  2328. significand must be normalized or smaller. If `zSig' is not normalized,
  2329. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2330. and it must not require rounding. In the usual case that `zSig' is
  2331. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2332. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2333. Binary Floating-Point Arithmetic.
  2334. -------------------------------------------------------------------------------
  2335. *}
  2336. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2337. Var
  2338. roundingMode : TFPURoundingMode;
  2339. roundNearestEven : boolean;
  2340. roundIncrement, roundBits : BYTE;
  2341. IsTiny : boolean;
  2342. Begin
  2343. roundingMode := softfloat_rounding_mode;
  2344. roundNearestEven := (roundingMode = float_round_nearest_even);
  2345. roundIncrement := $40;
  2346. if not roundNearestEven then
  2347. Begin
  2348. if ( roundingMode = float_round_to_zero ) Then
  2349. Begin
  2350. roundIncrement := 0;
  2351. End
  2352. else
  2353. Begin
  2354. roundIncrement := $7F;
  2355. if ( zSign <> 0 ) then
  2356. Begin
  2357. if roundingMode = float_round_up then roundIncrement := 0;
  2358. End
  2359. else
  2360. Begin
  2361. if roundingMode = float_round_down then roundIncrement := 0;
  2362. End;
  2363. End
  2364. End;
  2365. roundBits := zSig AND $7F;
  2366. if ($FD <= bits16 (zExp) ) then
  2367. Begin
  2368. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2369. Begin
  2370. float_raise( [float_flag_overflow,float_flag_inexact] );
  2371. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2372. exit;
  2373. End;
  2374. if ( zExp < 0 ) then
  2375. Begin
  2376. isTiny :=
  2377. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2378. OR ( zExp < -1 )
  2379. OR ( (zSig + roundIncrement) < $80000000 );
  2380. shift32RightJamming( zSig, - zExp, zSig );
  2381. zExp := 0;
  2382. roundBits := zSig AND $7F;
  2383. if ( isTiny and (roundBits<>0) ) then
  2384. float_raise( float_flag_underflow );
  2385. End;
  2386. End;
  2387. if ( roundBits )<> 0 then
  2388. set_inexact_flag;
  2389. zSig := ( zSig + roundIncrement ) shr 7;
  2390. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2391. if ( zSig = 0 ) then zExp := 0;
  2392. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2393. End;
  2394. {*
  2395. -------------------------------------------------------------------------------
  2396. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2397. and significand `zSig', and returns the proper single-precision floating-
  2398. point value corresponding to the abstract input. This routine is just like
  2399. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2400. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2401. floating-point exponent.
  2402. -------------------------------------------------------------------------------
  2403. *}
  2404. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2405. Var
  2406. ShiftCount : int8;
  2407. Begin
  2408. shiftCount := countLeadingZeros32( zSig ) - 1;
  2409. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2410. End;
  2411. {*
  2412. -------------------------------------------------------------------------------
  2413. Returns the most-significant 20 fraction bits of the double-precision
  2414. floating-point value `a'.
  2415. -------------------------------------------------------------------------------
  2416. *}
  2417. Function extractFloat64Frac0(a: float64): bits32; inline;
  2418. Begin
  2419. extractFloat64Frac0 := a.high and $000FFFFF;
  2420. End;
  2421. {*
  2422. -------------------------------------------------------------------------------
  2423. Returns the least-significant 32 fraction bits of the double-precision
  2424. floating-point value `a'.
  2425. -------------------------------------------------------------------------------
  2426. *}
  2427. Function extractFloat64Frac1(a: float64): bits32; inline;
  2428. Begin
  2429. extractFloat64Frac1 := a.low;
  2430. End;
  2431. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2432. Function extractFloat64Frac(a: float64): bits64; inline;
  2433. Begin
  2434. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2435. End;
  2436. {*
  2437. -------------------------------------------------------------------------------
  2438. Returns the exponent bits of the double-precision floating-point value `a'.
  2439. -------------------------------------------------------------------------------
  2440. *}
  2441. Function extractFloat64Exp(a: float64): int16; inline;
  2442. Begin
  2443. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2444. End;
  2445. {*
  2446. -------------------------------------------------------------------------------
  2447. Returns the sign bit of the double-precision floating-point value `a'.
  2448. -------------------------------------------------------------------------------
  2449. *}
  2450. Function extractFloat64Sign(a: float64) : flag; inline;
  2451. Begin
  2452. extractFloat64Sign := a.high shr 31;
  2453. End;
  2454. {*
  2455. -------------------------------------------------------------------------------
  2456. Normalizes the subnormal double-precision floating-point value represented
  2457. by the denormalized significand formed by the concatenation of `aSig0' and
  2458. `aSig1'. The normalized exponent is stored at the location pointed to by
  2459. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2460. stored at the location pointed to by `zSig0Ptr', and the least significant
  2461. 32 bits of the normalized significand are stored at the location pointed to
  2462. by `zSig1Ptr'.
  2463. -------------------------------------------------------------------------------
  2464. *}
  2465. Procedure normalizeFloat64Subnormal(
  2466. aSig0: bits32;
  2467. aSig1: bits32;
  2468. VAR zExpPtr : Int16;
  2469. VAR zSig0Ptr : Bits32;
  2470. VAR zSig1Ptr : Bits32
  2471. );
  2472. Var
  2473. ShiftCount : Int8;
  2474. Begin
  2475. if ( aSig0 = 0 ) then
  2476. Begin
  2477. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2478. if ( shiftCount < 0 ) then
  2479. Begin
  2480. zSig0Ptr := aSig1 shr ( - shiftCount );
  2481. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2482. End
  2483. else
  2484. Begin
  2485. zSig0Ptr := aSig1 shl shiftCount;
  2486. zSig1Ptr := 0;
  2487. End;
  2488. zExpPtr := - shiftCount - 31;
  2489. End
  2490. else
  2491. Begin
  2492. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2493. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2494. zExpPtr := 1 - shiftCount;
  2495. End;
  2496. End;
  2497. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2498. var
  2499. shiftCount : int8;
  2500. begin
  2501. shiftCount := countLeadingZeros64( aSig ) - 11;
  2502. zSigPtr := aSig shl shiftCount;
  2503. zExpPtr := 1 - shiftCount;
  2504. end;
  2505. {*
  2506. -------------------------------------------------------------------------------
  2507. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2508. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2509. point value, returning the result. After being shifted into the proper
  2510. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2511. together to form the most significant 32 bits of the result. This means
  2512. that any integer portion of `zSig0' will be added into the exponent. Since
  2513. a properly normalized significand will have an integer portion equal to 1,
  2514. the `zExp' input should be 1 less than the desired result exponent whenever
  2515. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2516. -------------------------------------------------------------------------------
  2517. *}
  2518. Procedure
  2519. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2520. var
  2521. z: Float64;
  2522. Begin
  2523. z.low := zSig1;
  2524. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2525. c := z;
  2526. End;
  2527. {*----------------------------------------------------------------------------
  2528. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2529. | double-precision floating-point value, returning the result. After being
  2530. | shifted into the proper positions, the three fields are simply added
  2531. | together to form the result. This means that any integer portion of `zSig'
  2532. | will be added into the exponent. Since a properly normalized significand
  2533. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2534. | than the desired result exponent whenever `zSig' is a complete, normalized
  2535. | significand.
  2536. *----------------------------------------------------------------------------*}
  2537. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2538. begin
  2539. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2540. end;
  2541. {*
  2542. -------------------------------------------------------------------------------
  2543. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2544. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2545. and `zSig2', and returns the proper double-precision floating-point value
  2546. corresponding to the abstract input. Ordinarily, the abstract value is
  2547. simply rounded and packed into the double-precision format, with the inexact
  2548. exception raised if the abstract input cannot be represented exactly.
  2549. However, if the abstract value is too large, the overflow and inexact
  2550. exceptions are raised and an infinity or maximal finite value is returned.
  2551. If the abstract value is too small, the input value is rounded to a
  2552. subnormal number, and the underflow and inexact exceptions are raised if the
  2553. abstract input cannot be represented exactly as a subnormal double-precision
  2554. floating-point number.
  2555. The input significand must be normalized or smaller. If the input
  2556. significand is not normalized, `zExp' must be 0; in that case, the result
  2557. returned is a subnormal number, and it must not require rounding. In the
  2558. usual case that the input significand is normalized, `zExp' must be 1 less
  2559. than the ``true'' floating-point exponent. The handling of underflow and
  2560. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2561. -------------------------------------------------------------------------------
  2562. *}
  2563. Procedure
  2564. roundAndPackFloat64(
  2565. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2566. Var
  2567. roundingMode : TFPURoundingMode;
  2568. roundNearestEven, increment, isTiny : Flag;
  2569. Begin
  2570. roundingMode := softfloat_rounding_mode;
  2571. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2572. increment := flag( sbits32 (zSig2) < 0 );
  2573. if ( roundNearestEven = flag(FALSE) ) then
  2574. Begin
  2575. if ( roundingMode = float_round_to_zero ) then
  2576. increment := 0
  2577. else
  2578. Begin
  2579. if ( zSign )<> 0 then
  2580. Begin
  2581. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2582. End
  2583. else
  2584. Begin
  2585. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2586. End
  2587. End
  2588. End;
  2589. if ( $7FD <= bits16 (zExp) ) then
  2590. Begin
  2591. if (( $7FD < zExp )
  2592. or (( zExp = $7FD )
  2593. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2594. and (increment<>0)
  2595. )
  2596. ) then
  2597. Begin
  2598. float_raise( [float_flag_overflow,float_flag_inexact] );
  2599. if (( roundingMode = float_round_to_zero )
  2600. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2601. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2602. ) then
  2603. Begin
  2604. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2605. exit;
  2606. End;
  2607. packFloat64( zSign, $7FF, 0, 0, c );
  2608. exit;
  2609. End;
  2610. if ( zExp < 0 ) then
  2611. Begin
  2612. isTiny :=
  2613. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2614. or flag( zExp < -1 )
  2615. or flag(increment = 0)
  2616. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2617. shift64ExtraRightJamming(
  2618. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2619. zExp := 0;
  2620. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2621. if ( roundNearestEven )<>0 then
  2622. Begin
  2623. increment := flag( sbits32 (zSig2) < 0 );
  2624. End
  2625. else
  2626. Begin
  2627. if ( zSign )<>0 then
  2628. Begin
  2629. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2630. End
  2631. else
  2632. Begin
  2633. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2634. End
  2635. End;
  2636. End;
  2637. End;
  2638. if ( zSig2 )<>0 then
  2639. set_inexact_flag;
  2640. if ( increment )<>0 then
  2641. Begin
  2642. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2643. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2644. End
  2645. else
  2646. Begin
  2647. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2648. End;
  2649. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2650. End;
  2651. {*----------------------------------------------------------------------------
  2652. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2653. | and significand `zSig', and returns the proper double-precision floating-
  2654. | point value corresponding to the abstract input. Ordinarily, the abstract
  2655. | value is simply rounded and packed into the double-precision format, with
  2656. | the inexact exception raised if the abstract input cannot be represented
  2657. | exactly. However, if the abstract value is too large, the overflow and
  2658. | inexact exceptions are raised and an infinity or maximal finite value is
  2659. | returned. If the abstract value is too small, the input value is rounded
  2660. | to a subnormal number, and the underflow and inexact exceptions are raised
  2661. | if the abstract input cannot be represented exactly as a subnormal double-
  2662. | precision floating-point number.
  2663. | The input significand `zSig' has its binary point between bits 62
  2664. | and 61, which is 10 bits to the left of the usual location. This shifted
  2665. | significand must be normalized or smaller. If `zSig' is not normalized,
  2666. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2667. | and it must not require rounding. In the usual case that `zSig' is
  2668. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2669. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2670. | Binary Floating-Point Arithmetic.
  2671. *----------------------------------------------------------------------------*}
  2672. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2673. var
  2674. roundingMode: TFPURoundingMode;
  2675. roundNearestEven: flag;
  2676. roundIncrement, roundBits: int16;
  2677. isTiny: flag;
  2678. begin
  2679. roundingMode := softfloat_rounding_mode;
  2680. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2681. roundIncrement := $200;
  2682. if ( roundNearestEven=0 ) then
  2683. begin
  2684. if ( roundingMode = float_round_to_zero ) then
  2685. begin
  2686. roundIncrement := 0;
  2687. end
  2688. else begin
  2689. roundIncrement := $3FF;
  2690. if ( zSign<>0 ) then
  2691. begin
  2692. if ( roundingMode = float_round_up ) then
  2693. roundIncrement := 0;
  2694. end
  2695. else begin
  2696. if ( roundingMode = float_round_down ) then
  2697. roundIncrement := 0;
  2698. end
  2699. end
  2700. end;
  2701. roundBits := zSig and $3FF;
  2702. if ( $7FD <= bits16(zExp) ) then
  2703. begin
  2704. if ( ( $7FD < zExp )
  2705. or ( ( zExp = $7FD )
  2706. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2707. ) then
  2708. begin
  2709. float_raise( [float_flag_overflow,float_flag_inexact] );
  2710. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2711. exit;
  2712. end;
  2713. if ( zExp < 0 ) then
  2714. begin
  2715. isTiny := ord(
  2716. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2717. or ( zExp < -1 )
  2718. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2719. shift64RightJamming( zSig, - zExp, zSig );
  2720. zExp := 0;
  2721. roundBits := zSig and $3FF;
  2722. if ( isTiny and roundBits )<>0 then
  2723. float_raise( float_flag_underflow );
  2724. end
  2725. end;
  2726. if ( roundBits<>0 ) then
  2727. set_inexact_flag;
  2728. zSig := ( zSig + roundIncrement ) shr 10;
  2729. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2730. if ( zSig = 0 ) then
  2731. zExp := 0;
  2732. result:=packFloat64( zSign, zExp, zSig );
  2733. end;
  2734. {*
  2735. -------------------------------------------------------------------------------
  2736. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2737. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2738. returns the proper double-precision floating-point value corresponding
  2739. to the abstract input. This routine is just like `roundAndPackFloat64'
  2740. except that the input significand has fewer bits and does not have to be
  2741. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2742. point exponent.
  2743. -------------------------------------------------------------------------------
  2744. *}
  2745. Procedure
  2746. normalizeRoundAndPackFloat64(
  2747. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2748. Var
  2749. shiftCount : int8;
  2750. zSig2 : bits32;
  2751. Begin
  2752. if ( zSig0 = 0 ) then
  2753. Begin
  2754. zSig0 := zSig1;
  2755. zSig1 := 0;
  2756. zExp := zExp -32;
  2757. End;
  2758. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2759. if ( 0 <= shiftCount ) then
  2760. Begin
  2761. zSig2 := 0;
  2762. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2763. End
  2764. else
  2765. Begin
  2766. shift64ExtraRightJamming
  2767. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2768. End;
  2769. zExp := zExp - shiftCount;
  2770. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2771. End;
  2772. {*
  2773. ----------------------------------------------------------------------------
  2774. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2775. and significand `zSig', and returns the proper double-precision floating-
  2776. point value corresponding to the abstract input. This routine is just like
  2777. `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  2778. Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2779. floating-point exponent.
  2780. ----------------------------------------------------------------------------
  2781. *}
  2782. function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  2783. var
  2784. shiftCount: int8;
  2785. begin
  2786. shiftCount := countLeadingZeros64( zSig ) - 1;
  2787. result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  2788. end;
  2789. {*
  2790. -------------------------------------------------------------------------------
  2791. Returns the result of converting the 32-bit two's complement integer `a' to
  2792. the single-precision floating-point format. The conversion is performed
  2793. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2794. -------------------------------------------------------------------------------
  2795. *}
  2796. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2797. Var
  2798. zSign : Flag;
  2799. Begin
  2800. if ( a = 0 ) then
  2801. Begin
  2802. int32_to_float32.float32 := 0;
  2803. exit;
  2804. End;
  2805. if ( a = sbits32 ($80000000) ) then
  2806. Begin
  2807. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2808. exit;
  2809. end;
  2810. zSign := flag( a < 0 );
  2811. If zSign<>0 then
  2812. a := -a;
  2813. int32_to_float32.float32:=
  2814. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2815. End;
  2816. {*
  2817. -------------------------------------------------------------------------------
  2818. Returns the result of converting the 32-bit two's complement integer `a' to
  2819. the double-precision floating-point format. The conversion is performed
  2820. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2821. -------------------------------------------------------------------------------
  2822. *}
  2823. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2824. var
  2825. zSign : flag;
  2826. absA : bits32;
  2827. shiftCount : int8;
  2828. zSig0, zSig1 : bits32;
  2829. Begin
  2830. if ( a = 0 ) then
  2831. Begin
  2832. packFloat64( 0, 0, 0, 0, result );
  2833. exit;
  2834. end;
  2835. zSign := flag( a < 0 );
  2836. if ZSign<>0 then
  2837. AbsA := -a
  2838. else
  2839. AbsA := a;
  2840. shiftCount := countLeadingZeros32( absA ) - 11;
  2841. if ( 0 <= shiftCount ) then
  2842. Begin
  2843. zSig0 := absA shl shiftCount;
  2844. zSig1 := 0;
  2845. End
  2846. else
  2847. Begin
  2848. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2849. End;
  2850. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2851. End;
  2852. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2853. {$if not defined(packFloatx80)}
  2854. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2855. forward;
  2856. {$endif}
  2857. {*----------------------------------------------------------------------------
  2858. | Returns the result of converting the 32-bit two's complement integer `a'
  2859. | to the extended double-precision floating-point format. The conversion
  2860. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2861. | Arithmetic.
  2862. *----------------------------------------------------------------------------*}
  2863. function int32_to_floatx80( a: int32 ): floatx80;
  2864. var
  2865. zSign: flag;
  2866. absA: uint32;
  2867. shiftCount: int8;
  2868. zSig: bits64;
  2869. begin
  2870. if ( a = 0 ) then begin
  2871. result := packFloatx80( 0, 0, 0 );
  2872. exit;
  2873. end;
  2874. zSign := ord( a < 0 );
  2875. if zSign <> 0 then absA := - a else absA := a;
  2876. shiftCount := countLeadingZeros32( absA ) + 32;
  2877. zSig := absA;
  2878. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2879. end;
  2880. {$endif FPC_SOFTFLOAT_FLOATX80}
  2881. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2882. {$if not defined(packFloat128)}
  2883. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2884. forward;
  2885. {$endif}
  2886. {*----------------------------------------------------------------------------
  2887. | Returns the result of converting the 32-bit two's complement integer `a' to
  2888. | the quadruple-precision floating-point format. The conversion is performed
  2889. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2890. *----------------------------------------------------------------------------*}
  2891. function int32_to_float128( a: int32 ): float128;
  2892. var
  2893. zSign: flag;
  2894. absA: uint32;
  2895. shiftCount: int8;
  2896. zSig0: bits64;
  2897. begin
  2898. if ( a = 0 ) then begin
  2899. result := packFloat128( 0, 0, 0, 0 );
  2900. exit;
  2901. end;
  2902. zSign := ord( a < 0 );
  2903. if zSign <> 0 then absA := - a else absA := a;
  2904. shiftCount := countLeadingZeros32( absA ) + 17;
  2905. zSig0 := absA;
  2906. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2907. end;
  2908. {$endif FPC_SOFTFLOAT_FLOAT128}
  2909. {*
  2910. -------------------------------------------------------------------------------
  2911. Returns the result of converting the single-precision floating-point value
  2912. `a' to the 32-bit two's complement integer format. The conversion is
  2913. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2914. Arithmetic---which means in particular that the conversion is rounded
  2915. according to the current rounding mode. If `a' is a NaN, the largest
  2916. positive integer is returned. Otherwise, if the conversion overflows, the
  2917. largest integer with the same sign as `a' is returned.
  2918. -------------------------------------------------------------------------------
  2919. *}
  2920. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2921. Var
  2922. aSign: flag;
  2923. aExp, shiftCount: int16;
  2924. aSig, aSigExtra: bits32;
  2925. z: int32;
  2926. roundingMode: TFPURoundingMode;
  2927. Begin
  2928. aSig := extractFloat32Frac( a.float32 );
  2929. aExp := extractFloat32Exp( a.float32 );
  2930. aSign := extractFloat32Sign( a.float32 );
  2931. shiftCount := aExp - $96;
  2932. if ( 0 <= shiftCount ) then
  2933. Begin
  2934. if ( $9E <= aExp ) then
  2935. Begin
  2936. if ( a.float32 <> $CF000000 ) then
  2937. Begin
  2938. float_raise( float_flag_invalid );
  2939. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2940. Begin
  2941. float32_to_int32 := $7FFFFFFF;
  2942. exit;
  2943. End;
  2944. End;
  2945. float32_to_int32 := sbits32 ($80000000);
  2946. exit;
  2947. End;
  2948. z := ( aSig or $00800000 ) shl shiftCount;
  2949. if ( aSign<>0 ) then z := - z;
  2950. End
  2951. else
  2952. Begin
  2953. if ( aExp < $7E ) then
  2954. Begin
  2955. aSigExtra := aExp OR aSig;
  2956. z := 0;
  2957. End
  2958. else
  2959. Begin
  2960. aSig := aSig OR $00800000;
  2961. aSigExtra := aSig shl ( shiftCount and 31 );
  2962. z := aSig shr ( - shiftCount );
  2963. End;
  2964. if ( aSigExtra<>0 ) then
  2965. set_inexact_flag;
  2966. roundingMode := softfloat_rounding_mode;
  2967. if ( roundingMode = float_round_nearest_even ) then
  2968. Begin
  2969. if ( sbits32 (aSigExtra) < 0 ) then
  2970. Begin
  2971. Inc(z);
  2972. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2973. z := z and not 1;
  2974. End;
  2975. if ( aSign<>0 ) then
  2976. z := - z;
  2977. End
  2978. else
  2979. Begin
  2980. aSigExtra := flag( aSigExtra <> 0 );
  2981. if ( aSign<>0 ) then
  2982. Begin
  2983. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2984. z := - z;
  2985. End
  2986. else
  2987. Begin
  2988. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2989. End
  2990. End;
  2991. End;
  2992. float32_to_int32 := z;
  2993. End;
  2994. {*
  2995. -------------------------------------------------------------------------------
  2996. Returns the result of converting the single-precision floating-point value
  2997. `a' to the 32-bit two's complement integer format. The conversion is
  2998. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2999. Arithmetic, except that the conversion is always rounded toward zero.
  3000. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3001. the conversion overflows, the largest integer with the same sign as `a' is
  3002. returned.
  3003. -------------------------------------------------------------------------------
  3004. *}
  3005. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  3006. Var
  3007. aSign : flag;
  3008. aExp, shiftCount : int16;
  3009. aSig : bits32;
  3010. z : int32;
  3011. Begin
  3012. aSig := extractFloat32Frac( a.float32 );
  3013. aExp := extractFloat32Exp( a.float32 );
  3014. aSign := extractFloat32Sign( a.float32 );
  3015. shiftCount := aExp - $9E;
  3016. if ( 0 <= shiftCount ) then
  3017. Begin
  3018. if ( a.float32 <> $CF000000 ) then
  3019. Begin
  3020. float_raise( float_flag_invalid );
  3021. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  3022. Begin
  3023. float32_to_int32_round_to_zero := $7FFFFFFF;
  3024. exit;
  3025. end;
  3026. End;
  3027. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  3028. exit;
  3029. End
  3030. else
  3031. if ( aExp <= $7E ) then
  3032. Begin
  3033. if ( aExp or aSig )<>0 then
  3034. set_inexact_flag;
  3035. float32_to_int32_round_to_zero := 0;
  3036. exit;
  3037. End;
  3038. aSig := ( aSig or $00800000 ) shl 8;
  3039. z := aSig shr ( - shiftCount );
  3040. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  3041. Begin
  3042. set_inexact_flag;
  3043. End;
  3044. if ( aSign<>0 ) then z := - z;
  3045. float32_to_int32_round_to_zero := z;
  3046. End;
  3047. {*----------------------------------------------------------------------------
  3048. | Returns the result of converting the single-precision floating-point value
  3049. | `a' to the 64-bit two's complement integer format. The conversion is
  3050. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3051. | Arithmetic---which means in particular that the conversion is rounded
  3052. | according to the current rounding mode. If `a' is a NaN, the largest
  3053. | positive integer is returned. Otherwise, if the conversion overflows, the
  3054. | largest integer with the same sign as `a' is returned.
  3055. *----------------------------------------------------------------------------*}
  3056. function float32_to_int64( a: float32 ): int64;
  3057. var
  3058. aSign: flag;
  3059. aExp, shiftCount: int16;
  3060. aSig: bits32;
  3061. aSig64, aSigExtra: bits64;
  3062. begin
  3063. aSig := extractFloat32Frac( a );
  3064. aExp := extractFloat32Exp( a );
  3065. aSign := extractFloat32Sign( a );
  3066. shiftCount := $BE - aExp;
  3067. if ( shiftCount < 0 ) then begin
  3068. float_raise( float_flag_invalid );
  3069. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3070. result := $7FFFFFFFFFFFFFFF;
  3071. exit;
  3072. end;
  3073. result := $8000000000000000;
  3074. exit;
  3075. end;
  3076. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  3077. aSig64 := aSig;
  3078. aSig64 := aSig64 shl 40;
  3079. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  3080. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  3081. end;
  3082. {*----------------------------------------------------------------------------
  3083. | Returns the result of converting the single-precision floating-point value
  3084. | `a' to the 64-bit two's complement integer format. The conversion is
  3085. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3086. | Arithmetic, except that the conversion is always rounded toward zero. If
  3087. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  3088. | conversion overflows, the largest integer with the same sign as `a' is
  3089. | returned.
  3090. *----------------------------------------------------------------------------*}
  3091. function float32_to_int64_round_to_zero( a: float32 ): int64;
  3092. var
  3093. aSign: flag;
  3094. aExp, shiftCount: int16;
  3095. aSig: bits32;
  3096. aSig64: bits64;
  3097. z: int64;
  3098. begin
  3099. aSig := extractFloat32Frac( a );
  3100. aExp := extractFloat32Exp( a );
  3101. aSign := extractFloat32Sign( a );
  3102. shiftCount := aExp - $BE;
  3103. if ( 0 <= shiftCount ) then begin
  3104. if ( a <> $DF000000 ) then begin
  3105. float_raise( float_flag_invalid );
  3106. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3107. result := $7FFFFFFFFFFFFFFF;
  3108. exit;
  3109. end;
  3110. end;
  3111. result := $8000000000000000;
  3112. exit;
  3113. end
  3114. else if ( aExp <= $7E ) then begin
  3115. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3116. result := 0;
  3117. exit;
  3118. end;
  3119. aSig64 := aSig or $00800000;
  3120. aSig64 := aSig64 shl 40;
  3121. z := aSig64 shr ( - shiftCount );
  3122. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3123. set_inexact_flag;
  3124. if ( aSign <> 0 ) then z := - z;
  3125. result := z;
  3126. end;
  3127. {*
  3128. -------------------------------------------------------------------------------
  3129. Returns the result of converting the single-precision floating-point value
  3130. `a' to the double-precision floating-point format. The conversion is
  3131. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3132. Arithmetic.
  3133. -------------------------------------------------------------------------------
  3134. *}
  3135. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  3136. Var
  3137. aSign : flag;
  3138. aExp : int16;
  3139. aSig, zSig0, zSig1: bits32;
  3140. tmp : CommonNanT;
  3141. Begin
  3142. aSig := extractFloat32Frac( a.float32 );
  3143. aExp := extractFloat32Exp( a.float32 );
  3144. aSign := extractFloat32Sign( a.float32 );
  3145. if ( aExp = $FF ) then
  3146. Begin
  3147. if ( aSig<>0 ) then
  3148. Begin
  3149. tmp:=float32ToCommonNaN(a.float32);
  3150. result:=commonNaNToFloat64(tmp);
  3151. exit;
  3152. End;
  3153. packFloat64( aSign, $7FF, 0, 0, result);
  3154. exit;
  3155. End;
  3156. if ( aExp = 0 ) then
  3157. Begin
  3158. if ( aSig = 0 ) then
  3159. Begin
  3160. packFloat64( aSign, 0, 0, 0, result );
  3161. exit;
  3162. end;
  3163. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3164. Dec(aExp);
  3165. End;
  3166. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3167. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3168. End;
  3169. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3170. {*----------------------------------------------------------------------------
  3171. | Returns the result of converting the canonical NaN `a' to the extended
  3172. | double-precision floating-point format.
  3173. *----------------------------------------------------------------------------*}
  3174. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3175. var
  3176. z : floatx80;
  3177. begin
  3178. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3179. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3180. result := z;
  3181. end;
  3182. {*----------------------------------------------------------------------------
  3183. | Returns the result of converting the single-precision floating-point value
  3184. | `a' to the extended double-precision floating-point format. The conversion
  3185. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3186. | Arithmetic.
  3187. *----------------------------------------------------------------------------*}
  3188. function float32_to_floatx80( a: float32 ): floatx80;
  3189. var
  3190. aSign: flag;
  3191. aExp: int16;
  3192. aSig: bits32;
  3193. tmp: commonNaNT;
  3194. begin
  3195. aSig := extractFloat32Frac( a );
  3196. aExp := extractFloat32Exp( a );
  3197. aSign := extractFloat32Sign( a );
  3198. if ( aExp = $FF ) then begin
  3199. if ( aSig <> 0 ) then begin
  3200. tmp:=float32ToCommonNaN(a);
  3201. result := commonNaNToFloatx80( tmp );
  3202. exit;
  3203. end;
  3204. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3205. exit;
  3206. end;
  3207. if ( aExp = 0 ) then begin
  3208. if ( aSig = 0 ) then begin
  3209. result := packFloatx80( aSign, 0, 0 );
  3210. exit;
  3211. end;
  3212. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3213. end;
  3214. aSig := aSig or $00800000;
  3215. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3216. end;
  3217. {$endif FPC_SOFTFLOAT_FLOATX80}
  3218. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3219. {*----------------------------------------------------------------------------
  3220. | Returns the result of converting the single-precision floating-point value
  3221. | `a' to the double-precision floating-point format. The conversion is
  3222. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3223. | Arithmetic.
  3224. *----------------------------------------------------------------------------*}
  3225. function float32_to_float128( a: float32 ): float128;
  3226. var
  3227. aSign: flag;
  3228. aExp: int16;
  3229. aSig: bits32;
  3230. tmp: commonNaNT;
  3231. begin
  3232. aSig := extractFloat32Frac( a );
  3233. aExp := extractFloat32Exp( a );
  3234. aSign := extractFloat32Sign( a );
  3235. if ( aExp = $FF ) then begin
  3236. if ( aSig <> 0 ) then begin
  3237. tmp:=float32ToCommonNaN(a);
  3238. result := commonNaNToFloat128( tmp );
  3239. exit;
  3240. end;
  3241. result := packFloat128( aSign, $7FFF, 0, 0 );
  3242. exit;
  3243. end;
  3244. if ( aExp = 0 ) then begin
  3245. if ( aSig = 0 ) then begin
  3246. result := packFloat128( aSign, 0, 0, 0 );
  3247. exit;
  3248. end;
  3249. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3250. dec( aExp );
  3251. end;
  3252. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3253. end;
  3254. {$endif FPC_SOFTFLOAT_FLOAT128}
  3255. {*
  3256. -------------------------------------------------------------------------------
  3257. Rounds the single-precision floating-point value `a' to an integer,
  3258. and returns the result as a single-precision floating-point value. The
  3259. operation is performed according to the IEC/IEEE Standard for Binary
  3260. Floating-Point Arithmetic.
  3261. -------------------------------------------------------------------------------
  3262. *}
  3263. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  3264. Var
  3265. aSign: flag;
  3266. aExp: int16;
  3267. lastBitMask, roundBitsMask: bits32;
  3268. roundingMode: TFPURoundingMode;
  3269. z: float32;
  3270. Begin
  3271. aExp := extractFloat32Exp( a.float32 );
  3272. if ( $96 <= aExp ) then
  3273. Begin
  3274. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3275. Begin
  3276. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3277. exit;
  3278. End;
  3279. float32_round_to_int:=a;
  3280. exit;
  3281. End;
  3282. if ( aExp <= $7E ) then
  3283. Begin
  3284. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3285. Begin
  3286. float32_round_to_int:=a;
  3287. exit;
  3288. end;
  3289. set_inexact_flag;
  3290. aSign := extractFloat32Sign( a.float32 );
  3291. case ( softfloat_rounding_mode ) of
  3292. float_round_nearest_even:
  3293. Begin
  3294. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3295. Begin
  3296. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3297. exit;
  3298. End;
  3299. End;
  3300. float_round_down:
  3301. Begin
  3302. if aSign <> 0 then
  3303. float32_round_to_int.float32 := $BF800000
  3304. else
  3305. float32_round_to_int.float32 := 0;
  3306. exit;
  3307. End;
  3308. float_round_up:
  3309. Begin
  3310. if aSign <> 0 then
  3311. float32_round_to_int.float32 := $80000000
  3312. else
  3313. float32_round_to_int.float32 := $3F800000;
  3314. exit;
  3315. End;
  3316. end;
  3317. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3318. exit;
  3319. End;
  3320. lastBitMask := 1;
  3321. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3322. lastBitMask := lastBitMask shl ($96 - aExp);
  3323. roundBitsMask := lastBitMask - 1;
  3324. z := a.float32;
  3325. roundingMode := softfloat_rounding_mode;
  3326. if ( roundingMode = float_round_nearest_even ) then
  3327. Begin
  3328. z := z + (lastBitMask shr 1);
  3329. if ( ( z and roundBitsMask ) = 0 ) then
  3330. z := z and not lastBitMask;
  3331. End
  3332. else if ( roundingMode <> float_round_to_zero ) then
  3333. Begin
  3334. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3335. Begin
  3336. z := z + roundBitsMask;
  3337. End;
  3338. End;
  3339. z := z and not roundBitsMask;
  3340. if ( z <> a.float32 ) then
  3341. set_inexact_flag;
  3342. float32_round_to_int.float32 := z;
  3343. End;
  3344. {*
  3345. -------------------------------------------------------------------------------
  3346. Returns the result of adding the absolute values of the single-precision
  3347. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3348. before being returned. `zSign' is ignored if the result is a NaN.
  3349. The addition is performed according to the IEC/IEEE Standard for Binary
  3350. Floating-Point Arithmetic.
  3351. -------------------------------------------------------------------------------
  3352. *}
  3353. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3354. Var
  3355. aExp, bExp, zExp: int16;
  3356. aSig, bSig, zSig: bits32;
  3357. expDiff: int16;
  3358. label roundAndPack;
  3359. Begin
  3360. aSig:=extractFloat32Frac( a );
  3361. aExp:=extractFloat32Exp( a );
  3362. bSig:=extractFloat32Frac( b );
  3363. bExp := extractFloat32Exp( b );
  3364. expDiff := aExp - bExp;
  3365. aSig := aSig shl 6;
  3366. bSig := bSig shl 6;
  3367. if ( 0 < expDiff ) then
  3368. Begin
  3369. if ( aExp = $FF ) then
  3370. Begin
  3371. if ( aSig <> 0) then
  3372. Begin
  3373. addFloat32Sigs := propagateFloat32NaN( a, b );
  3374. exit;
  3375. End;
  3376. addFloat32Sigs := a;
  3377. exit;
  3378. End;
  3379. if ( bExp = 0 ) then
  3380. Begin
  3381. Dec(expDiff);
  3382. End
  3383. else
  3384. Begin
  3385. bSig := bSig or $20000000;
  3386. End;
  3387. shift32RightJamming( bSig, expDiff, bSig );
  3388. zExp := aExp;
  3389. End
  3390. else
  3391. If ( expDiff < 0 ) then
  3392. Begin
  3393. if ( bExp = $FF ) then
  3394. Begin
  3395. if ( bSig<>0 ) then
  3396. Begin
  3397. addFloat32Sigs := propagateFloat32NaN( a, b );
  3398. exit;
  3399. end;
  3400. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3401. exit;
  3402. End;
  3403. if ( aExp = 0 ) then
  3404. Begin
  3405. Inc(expDiff);
  3406. End
  3407. else
  3408. Begin
  3409. aSig := aSig OR $20000000;
  3410. End;
  3411. shift32RightJamming( aSig, - expDiff, aSig );
  3412. zExp := bExp;
  3413. End
  3414. else
  3415. Begin
  3416. if ( aExp = $FF ) then
  3417. Begin
  3418. if ( aSig OR bSig )<> 0 then
  3419. Begin
  3420. addFloat32Sigs := propagateFloat32NaN( a, b );
  3421. exit;
  3422. end;
  3423. addFloat32Sigs := a;
  3424. exit;
  3425. End;
  3426. if ( aExp = 0 ) then
  3427. Begin
  3428. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3429. exit;
  3430. end;
  3431. zSig := $40000000 + aSig + bSig;
  3432. zExp := aExp;
  3433. goto roundAndPack;
  3434. End;
  3435. aSig := aSig OR $20000000;
  3436. zSig := ( aSig + bSig ) shl 1;
  3437. Dec(zExp);
  3438. if ( sbits32 (zSig) < 0 ) then
  3439. Begin
  3440. zSig := aSig + bSig;
  3441. Inc(zExp);
  3442. End;
  3443. roundAndPack:
  3444. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3445. End;
  3446. {*
  3447. -------------------------------------------------------------------------------
  3448. Returns the result of subtracting the absolute values of the single-
  3449. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3450. difference is negated before being returned. `zSign' is ignored if the
  3451. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3452. Standard for Binary Floating-Point Arithmetic.
  3453. -------------------------------------------------------------------------------
  3454. *}
  3455. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3456. Var
  3457. aExp, bExp, zExp: int16;
  3458. aSig, bSig, zSig: bits32;
  3459. expDiff : int16;
  3460. label aExpBigger;
  3461. label bExpBigger;
  3462. label aBigger;
  3463. label bBigger;
  3464. label normalizeRoundAndPack;
  3465. Begin
  3466. aSig := extractFloat32Frac( a );
  3467. aExp := extractFloat32Exp( a );
  3468. bSig := extractFloat32Frac( b );
  3469. bExp := extractFloat32Exp( b );
  3470. expDiff := aExp - bExp;
  3471. aSig := aSig shl 7;
  3472. bSig := bSig shl 7;
  3473. if ( 0 < expDiff ) then goto aExpBigger;
  3474. if ( expDiff < 0 ) then goto bExpBigger;
  3475. if ( aExp = $FF ) then
  3476. Begin
  3477. if ( aSig OR bSig )<> 0 then
  3478. Begin
  3479. subFloat32Sigs := propagateFloat32NaN( a, b );
  3480. exit;
  3481. End;
  3482. float_raise( float_flag_invalid );
  3483. subFloat32Sigs := float32_default_nan;
  3484. exit;
  3485. End;
  3486. if ( aExp = 0 ) then
  3487. Begin
  3488. aExp := 1;
  3489. bExp := 1;
  3490. End;
  3491. if ( bSig < aSig ) Then goto aBigger;
  3492. if ( aSig < bSig ) Then goto bBigger;
  3493. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3494. exit;
  3495. bExpBigger:
  3496. if ( bExp = $FF ) then
  3497. Begin
  3498. if ( bSig<>0 ) then
  3499. Begin
  3500. subFloat32Sigs := propagateFloat32NaN( a, b );
  3501. exit;
  3502. End;
  3503. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3504. exit;
  3505. End;
  3506. if ( aExp = 0 ) then
  3507. Begin
  3508. Inc(expDiff);
  3509. End
  3510. else
  3511. Begin
  3512. aSig := aSig OR $40000000;
  3513. End;
  3514. shift32RightJamming( aSig, - expDiff, aSig );
  3515. bSig := bSig OR $40000000;
  3516. bBigger:
  3517. zSig := bSig - aSig;
  3518. zExp := bExp;
  3519. zSign := zSign xor 1;
  3520. goto normalizeRoundAndPack;
  3521. aExpBigger:
  3522. if ( aExp = $FF ) then
  3523. Begin
  3524. if ( aSig <> 0) then
  3525. Begin
  3526. subFloat32Sigs := propagateFloat32NaN( a, b );
  3527. exit;
  3528. End;
  3529. subFloat32Sigs := a;
  3530. exit;
  3531. End;
  3532. if ( bExp = 0 ) then
  3533. Begin
  3534. Dec(expDiff);
  3535. End
  3536. else
  3537. Begin
  3538. bSig := bSig OR $40000000;
  3539. End;
  3540. shift32RightJamming( bSig, expDiff, bSig );
  3541. aSig := aSig OR $40000000;
  3542. aBigger:
  3543. zSig := aSig - bSig;
  3544. zExp := aExp;
  3545. normalizeRoundAndPack:
  3546. Dec(zExp);
  3547. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3548. End;
  3549. {*
  3550. -------------------------------------------------------------------------------
  3551. Returns the result of adding the single-precision floating-point values `a'
  3552. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3553. Binary Floating-Point Arithmetic.
  3554. -------------------------------------------------------------------------------
  3555. *}
  3556. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3557. Var
  3558. aSign, bSign: Flag;
  3559. Begin
  3560. aSign := extractFloat32Sign( a.float32 );
  3561. bSign := extractFloat32Sign( b.float32 );
  3562. if ( aSign = bSign ) then
  3563. Begin
  3564. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3565. End
  3566. else
  3567. Begin
  3568. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3569. End;
  3570. End;
  3571. {*
  3572. -------------------------------------------------------------------------------
  3573. Returns the result of subtracting the single-precision floating-point values
  3574. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3575. for Binary Floating-Point Arithmetic.
  3576. -------------------------------------------------------------------------------
  3577. *}
  3578. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3579. Var
  3580. aSign, bSign: flag;
  3581. Begin
  3582. aSign := extractFloat32Sign( a.float32 );
  3583. bSign := extractFloat32Sign( b.float32 );
  3584. if ( aSign = bSign ) then
  3585. Begin
  3586. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3587. End
  3588. else
  3589. Begin
  3590. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3591. End;
  3592. End;
  3593. {*
  3594. -------------------------------------------------------------------------------
  3595. Returns the result of multiplying the single-precision floating-point values
  3596. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3597. for Binary Floating-Point Arithmetic.
  3598. -------------------------------------------------------------------------------
  3599. *}
  3600. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3601. Var
  3602. aSign, bSign, zSign: flag;
  3603. aExp, bExp, zExp : int16;
  3604. aSig, bSig, zSig0, zSig1: bits32;
  3605. Begin
  3606. aSig := extractFloat32Frac( a.float32 );
  3607. aExp := extractFloat32Exp( a.float32 );
  3608. aSign := extractFloat32Sign( a.float32 );
  3609. bSig := extractFloat32Frac( b.float32 );
  3610. bExp := extractFloat32Exp( b.float32 );
  3611. bSign := extractFloat32Sign( b.float32 );
  3612. zSign := aSign xor bSign;
  3613. if ( aExp = $FF ) then
  3614. Begin
  3615. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3616. Begin
  3617. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3618. exit;
  3619. End;
  3620. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3621. Begin
  3622. float_raise( float_flag_invalid );
  3623. float32_mul.float32 := float32_default_nan;
  3624. exit;
  3625. End;
  3626. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3627. exit;
  3628. End;
  3629. if ( bExp = $FF ) then
  3630. Begin
  3631. if ( bSig <> 0 ) then
  3632. Begin
  3633. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3634. exit;
  3635. End;
  3636. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3637. Begin
  3638. float_raise( float_flag_invalid );
  3639. float32_mul.float32 := float32_default_nan;
  3640. exit;
  3641. End;
  3642. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3643. exit;
  3644. End;
  3645. if ( aExp = 0 ) then
  3646. Begin
  3647. if ( aSig = 0 ) then
  3648. Begin
  3649. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3650. exit;
  3651. End;
  3652. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3653. End;
  3654. if ( bExp = 0 ) then
  3655. Begin
  3656. if ( bSig = 0 ) then
  3657. Begin
  3658. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3659. exit;
  3660. End;
  3661. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3662. End;
  3663. zExp := aExp + bExp - $7F;
  3664. aSig := ( aSig OR $00800000 ) shl 7;
  3665. bSig := ( bSig OR $00800000 ) shl 8;
  3666. mul32To64( aSig, bSig, zSig0, zSig1 );
  3667. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3668. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3669. Begin
  3670. zSig0 := zSig0 shl 1;
  3671. Dec(zExp);
  3672. End;
  3673. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3674. End;
  3675. {*
  3676. -------------------------------------------------------------------------------
  3677. Returns the result of dividing the single-precision floating-point value `a'
  3678. by the corresponding value `b'. The operation is performed according to the
  3679. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3680. -------------------------------------------------------------------------------
  3681. *}
  3682. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3683. Var
  3684. aSign, bSign, zSign: flag;
  3685. aExp, bExp, zExp: int16;
  3686. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3687. Begin
  3688. aSig := extractFloat32Frac( a.float32 );
  3689. aExp := extractFloat32Exp( a.float32 );
  3690. aSign := extractFloat32Sign( a.float32 );
  3691. bSig := extractFloat32Frac( b.float32 );
  3692. bExp := extractFloat32Exp( b.float32 );
  3693. bSign := extractFloat32Sign( b.float32 );
  3694. zSign := aSign xor bSign;
  3695. if ( aExp = $FF ) then
  3696. Begin
  3697. if ( aSig <> 0 ) then
  3698. Begin
  3699. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3700. exit;
  3701. End;
  3702. if ( bExp = $FF ) then
  3703. Begin
  3704. if ( bSig <> 0) then
  3705. Begin
  3706. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3707. exit;
  3708. End;
  3709. float_raise( float_flag_invalid );
  3710. float32_div.float32 := float32_default_nan;
  3711. exit;
  3712. End;
  3713. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3714. exit;
  3715. End;
  3716. if ( bExp = $FF ) then
  3717. Begin
  3718. if ( bSig <> 0) then
  3719. Begin
  3720. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3721. exit;
  3722. End;
  3723. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3724. exit;
  3725. End;
  3726. if ( bExp = 0 ) Then
  3727. Begin
  3728. if ( bSig = 0 ) Then
  3729. Begin
  3730. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3731. Begin
  3732. float_raise( float_flag_invalid );
  3733. float32_div.float32 := float32_default_nan;
  3734. exit;
  3735. End;
  3736. float_raise( float_flag_divbyzero );
  3737. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3738. exit;
  3739. End;
  3740. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3741. End;
  3742. if ( aExp = 0 ) Then
  3743. Begin
  3744. if ( aSig = 0 ) Then
  3745. Begin
  3746. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3747. exit;
  3748. End;
  3749. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3750. End;
  3751. zExp := aExp - bExp + $7D;
  3752. aSig := ( aSig OR $00800000 ) shl 7;
  3753. bSig := ( bSig OR $00800000 ) shl 8;
  3754. if ( bSig <= ( aSig + aSig ) ) then
  3755. Begin
  3756. aSig := aSig shr 1;
  3757. Inc(zExp);
  3758. End;
  3759. zSig := estimateDiv64To32( aSig, 0, bSig );
  3760. if ( ( zSig and $3F ) <= 2 ) then
  3761. Begin
  3762. mul32To64( bSig, zSig, term0, term1 );
  3763. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3764. while ( sbits32 (rem0) < 0 ) do
  3765. Begin
  3766. Dec(zSig);
  3767. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3768. End;
  3769. zSig := zSig or bits32( rem1 <> 0 );
  3770. End;
  3771. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3772. End;
  3773. {*
  3774. -------------------------------------------------------------------------------
  3775. Returns the remainder of the single-precision floating-point value `a'
  3776. with respect to the corresponding value `b'. The operation is performed
  3777. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3778. -------------------------------------------------------------------------------
  3779. *}
  3780. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3781. Var
  3782. aSign, zSign: flag;
  3783. aExp, bExp, expDiff: int16;
  3784. aSig, bSig, q, alternateASig: bits32;
  3785. sigMean: sbits32;
  3786. Begin
  3787. aSig := extractFloat32Frac( a.float32 );
  3788. aExp := extractFloat32Exp( a.float32 );
  3789. aSign := extractFloat32Sign( a.float32 );
  3790. bSig := extractFloat32Frac( b.float32 );
  3791. bExp := extractFloat32Exp( b.float32 );
  3792. if ( aExp = $FF ) then
  3793. Begin
  3794. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3795. Begin
  3796. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3797. exit;
  3798. End;
  3799. float_raise( float_flag_invalid );
  3800. float32_rem.float32 := float32_default_nan;
  3801. exit;
  3802. End;
  3803. if ( bExp = $FF ) then
  3804. Begin
  3805. if ( bSig <> 0 ) then
  3806. Begin
  3807. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3808. exit;
  3809. End;
  3810. float32_rem := a;
  3811. exit;
  3812. End;
  3813. if ( bExp = 0 ) then
  3814. Begin
  3815. if ( bSig = 0 ) then
  3816. Begin
  3817. float_raise( float_flag_invalid );
  3818. float32_rem.float32 := float32_default_nan;
  3819. exit;
  3820. End;
  3821. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3822. End;
  3823. if ( aExp = 0 ) then
  3824. Begin
  3825. if ( aSig = 0 ) then
  3826. Begin
  3827. float32_rem := a;
  3828. exit;
  3829. End;
  3830. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3831. End;
  3832. expDiff := aExp - bExp;
  3833. aSig := ( aSig OR $00800000 ) shl 8;
  3834. bSig := ( bSig OR $00800000 ) shl 8;
  3835. if ( expDiff < 0 ) then
  3836. Begin
  3837. if ( expDiff < -1 ) then
  3838. Begin
  3839. float32_rem := a;
  3840. exit;
  3841. End;
  3842. aSig := aSig shr 1;
  3843. End;
  3844. q := bits32( bSig <= aSig );
  3845. if ( q <> 0) then
  3846. aSig := aSig - bSig;
  3847. expDiff := expDiff - 32;
  3848. while ( 0 < expDiff ) do
  3849. Begin
  3850. q := estimateDiv64To32( aSig, 0, bSig );
  3851. if (2 < q) then
  3852. q := q - 2
  3853. else
  3854. q := 0;
  3855. aSig := - ( ( bSig shr 2 ) * q );
  3856. expDiff := expDiff - 30;
  3857. End;
  3858. expDiff := expDiff + 32;
  3859. if ( 0 < expDiff ) then
  3860. Begin
  3861. q := estimateDiv64To32( aSig, 0, bSig );
  3862. if (2 < q) then
  3863. q := q - 2
  3864. else
  3865. q := 0;
  3866. q := q shr (32 - expDiff);
  3867. bSig := bSig shr 2;
  3868. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3869. End
  3870. else
  3871. Begin
  3872. aSig := aSig shr 2;
  3873. bSig := bSig shr 2;
  3874. End;
  3875. Repeat
  3876. alternateASig := aSig;
  3877. Inc(q);
  3878. aSig := aSig - bSig;
  3879. Until not ( 0 <= sbits32 (aSig) );
  3880. sigMean := aSig + alternateASig;
  3881. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3882. Begin
  3883. aSig := alternateASig;
  3884. End;
  3885. zSign := flag( sbits32 (aSig) < 0 );
  3886. if ( zSign<>0 ) then
  3887. aSig := - aSig;
  3888. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3889. End;
  3890. {*
  3891. -------------------------------------------------------------------------------
  3892. Returns the square root of the single-precision floating-point value `a'.
  3893. The operation is performed according to the IEC/IEEE Standard for Binary
  3894. Floating-Point Arithmetic.
  3895. -------------------------------------------------------------------------------
  3896. *}
  3897. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3898. Var
  3899. aSign : flag;
  3900. aExp, zExp : int16;
  3901. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3902. label roundAndPack;
  3903. Begin
  3904. aSig := extractFloat32Frac( a.float32 );
  3905. aExp := extractFloat32Exp( a.float32 );
  3906. aSign := extractFloat32Sign( a.float32 );
  3907. if ( aExp = $FF ) then
  3908. Begin
  3909. if ( aSig <> 0) then
  3910. Begin
  3911. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3912. exit;
  3913. End;
  3914. if ( aSign = 0) then
  3915. Begin
  3916. float32_sqrt := a;
  3917. exit;
  3918. End;
  3919. float_raise( float_flag_invalid );
  3920. float32_sqrt.float32 := float32_default_nan;
  3921. exit;
  3922. End;
  3923. if ( aSign <> 0) then
  3924. Begin
  3925. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3926. Begin
  3927. float32_sqrt := a;
  3928. exit;
  3929. End;
  3930. float_raise( float_flag_invalid );
  3931. float32_sqrt.float32 := float32_default_nan;
  3932. exit;
  3933. End;
  3934. if ( aExp = 0 ) then
  3935. Begin
  3936. if ( aSig = 0 ) then
  3937. Begin
  3938. float32_sqrt.float32 := 0;
  3939. exit;
  3940. End;
  3941. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3942. End;
  3943. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3944. aSig := ( aSig OR $00800000 ) shl 8;
  3945. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3946. if ( ( zSig and $7F ) <= 5 ) then
  3947. Begin
  3948. if ( zSig < 2 ) then
  3949. Begin
  3950. zSig := $7FFFFFFF;
  3951. goto roundAndPack;
  3952. End
  3953. else
  3954. Begin
  3955. aSig := aSig shr (aExp and 1);
  3956. mul32To64( zSig, zSig, term0, term1 );
  3957. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3958. while ( sbits32 (rem0) < 0 ) do
  3959. Begin
  3960. Dec(zSig);
  3961. shortShift64Left( 0, zSig, 1, term0, term1 );
  3962. term1 := term1 or 1;
  3963. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3964. End;
  3965. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3966. End;
  3967. End;
  3968. shift32RightJamming( zSig, 1, zSig );
  3969. roundAndPack:
  3970. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3971. End;
  3972. {*
  3973. -------------------------------------------------------------------------------
  3974. Returns 1 if the single-precision floating-point value `a' is equal to
  3975. the corresponding value `b', and 0 otherwise. The comparison is performed
  3976. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3977. -------------------------------------------------------------------------------
  3978. *}
  3979. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3980. Begin
  3981. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3982. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3983. ) then
  3984. Begin
  3985. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3986. Begin
  3987. float_raise( float_flag_invalid );
  3988. End;
  3989. float32_eq := 0;
  3990. exit;
  3991. End;
  3992. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3993. End;
  3994. {*
  3995. -------------------------------------------------------------------------------
  3996. Returns 1 if the single-precision floating-point value `a' is less than
  3997. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3998. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3999. Arithmetic.
  4000. -------------------------------------------------------------------------------
  4001. *}
  4002. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  4003. var
  4004. aSign, bSign: flag;
  4005. Begin
  4006. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  4007. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  4008. ) then
  4009. Begin
  4010. float_raise( float_flag_invalid );
  4011. float32_le := 0;
  4012. exit;
  4013. End;
  4014. aSign := extractFloat32Sign( a.float32 );
  4015. bSign := extractFloat32Sign( b.float32 );
  4016. if ( aSign <> bSign ) then
  4017. Begin
  4018. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  4019. exit;
  4020. End;
  4021. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  4022. End;
  4023. {*
  4024. -------------------------------------------------------------------------------
  4025. Returns 1 if the single-precision floating-point value `a' is less than
  4026. the corresponding value `b', and 0 otherwise. The comparison is performed
  4027. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4028. -------------------------------------------------------------------------------
  4029. *}
  4030. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  4031. var
  4032. aSign, bSign: flag;
  4033. Begin
  4034. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  4035. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  4036. ) then
  4037. Begin
  4038. float_raise( float_flag_invalid );
  4039. float32_lt :=0;
  4040. exit;
  4041. End;
  4042. aSign := extractFloat32Sign( a.float32 );
  4043. bSign := extractFloat32Sign( b.float32 );
  4044. if ( aSign <> bSign ) then
  4045. Begin
  4046. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  4047. exit;
  4048. End;
  4049. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  4050. End;
  4051. {*
  4052. -------------------------------------------------------------------------------
  4053. Returns 1 if the single-precision floating-point value `a' is equal to
  4054. the corresponding value `b', and 0 otherwise. The invalid exception is
  4055. raised if either operand is a NaN. Otherwise, the comparison is performed
  4056. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4057. -------------------------------------------------------------------------------
  4058. *}
  4059. Function float32_eq_signaling( a: float32; b: float32) : flag;
  4060. Begin
  4061. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  4062. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  4063. ) then
  4064. Begin
  4065. float_raise( float_flag_invalid );
  4066. float32_eq_signaling := 0;
  4067. exit;
  4068. End;
  4069. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  4070. End;
  4071. {*
  4072. -------------------------------------------------------------------------------
  4073. Returns 1 if the single-precision floating-point value `a' is less than or
  4074. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4075. cause an exception. Otherwise, the comparison is performed according to the
  4076. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4077. -------------------------------------------------------------------------------
  4078. *}
  4079. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  4080. Var
  4081. aSign, bSign: flag;
  4082. Begin
  4083. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4084. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4085. ) then
  4086. Begin
  4087. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4088. Begin
  4089. float_raise( float_flag_invalid );
  4090. End;
  4091. float32_le_quiet := 0;
  4092. exit;
  4093. End;
  4094. aSign := extractFloat32Sign( a );
  4095. bSign := extractFloat32Sign( b );
  4096. if ( aSign <> bSign ) then
  4097. Begin
  4098. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  4099. exit;
  4100. End;
  4101. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  4102. End;
  4103. {*
  4104. -------------------------------------------------------------------------------
  4105. Returns 1 if the single-precision floating-point value `a' is less than
  4106. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4107. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4108. Standard for Binary Floating-Point Arithmetic.
  4109. -------------------------------------------------------------------------------
  4110. *}
  4111. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4112. Var
  4113. aSign, bSign: flag;
  4114. Begin
  4115. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4116. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4117. ) then
  4118. Begin
  4119. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4120. Begin
  4121. float_raise( float_flag_invalid );
  4122. End;
  4123. float32_lt_quiet := 0;
  4124. exit;
  4125. End;
  4126. aSign := extractFloat32Sign( a );
  4127. bSign := extractFloat32Sign( b );
  4128. if ( aSign <> bSign ) then
  4129. Begin
  4130. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4131. exit;
  4132. End;
  4133. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4134. End;
  4135. {*
  4136. -------------------------------------------------------------------------------
  4137. Returns the result of converting the double-precision floating-point value
  4138. `a' to the 32-bit two's complement integer format. The conversion is
  4139. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4140. Arithmetic---which means in particular that the conversion is rounded
  4141. according to the current rounding mode. If `a' is a NaN, the largest
  4142. positive integer is returned. Otherwise, if the conversion overflows, the
  4143. largest integer with the same sign as `a' is returned.
  4144. -------------------------------------------------------------------------------
  4145. *}
  4146. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4147. var
  4148. aSign: flag;
  4149. aExp, shiftCount: int16;
  4150. aSig0, aSig1, absZ, aSigExtra: bits32;
  4151. z: int32;
  4152. roundingMode: TFPURoundingMode;
  4153. label invalid;
  4154. Begin
  4155. aSig1 := extractFloat64Frac1( a );
  4156. aSig0 := extractFloat64Frac0( a );
  4157. aExp := extractFloat64Exp( a );
  4158. aSign := extractFloat64Sign( a );
  4159. shiftCount := aExp - $413;
  4160. if ( 0 <= shiftCount ) then
  4161. Begin
  4162. if ( $41E < aExp ) then
  4163. Begin
  4164. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4165. aSign := 0;
  4166. goto invalid;
  4167. End;
  4168. shortShift64Left(
  4169. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4170. if ( $80000000 < absZ ) then
  4171. goto invalid;
  4172. End
  4173. else
  4174. Begin
  4175. aSig1 := flag( aSig1 <> 0 );
  4176. if ( aExp < $3FE ) then
  4177. Begin
  4178. aSigExtra := aExp OR aSig0 OR aSig1;
  4179. absZ := 0;
  4180. End
  4181. else
  4182. Begin
  4183. aSig0 := aSig0 OR $00100000;
  4184. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4185. absZ := aSig0 shr ( - shiftCount );
  4186. End;
  4187. End;
  4188. roundingMode := softfloat_rounding_mode;
  4189. if ( roundingMode = float_round_nearest_even ) then
  4190. Begin
  4191. if ( sbits32(aSigExtra) < 0 ) then
  4192. Begin
  4193. Inc(absZ);
  4194. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4195. absZ := absZ and not 1;
  4196. End;
  4197. if aSign <> 0 then
  4198. z := - absZ
  4199. else
  4200. z := absZ;
  4201. End
  4202. else
  4203. Begin
  4204. aSigExtra := bits32( aSigExtra <> 0 );
  4205. if ( aSign <> 0) then
  4206. Begin
  4207. z := - ( absZ
  4208. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4209. End
  4210. else
  4211. Begin
  4212. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4213. End
  4214. End;
  4215. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4216. Begin
  4217. invalid:
  4218. float_raise( float_flag_invalid );
  4219. if (aSign <> 0 ) then
  4220. float64_to_int32 := sbits32 ($80000000)
  4221. else
  4222. float64_to_int32 := $7FFFFFFF;
  4223. exit;
  4224. End;
  4225. if ( aSigExtra <> 0) then
  4226. set_inexact_flag;
  4227. float64_to_int32 := z;
  4228. End;
  4229. {*
  4230. -------------------------------------------------------------------------------
  4231. Returns the result of converting the double-precision floating-point value
  4232. `a' to the 32-bit two's complement integer format. The conversion is
  4233. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4234. Arithmetic, except that the conversion is always rounded toward zero.
  4235. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4236. the conversion overflows, the largest integer with the same sign as `a' is
  4237. returned.
  4238. -------------------------------------------------------------------------------
  4239. *}
  4240. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4241. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4242. Var
  4243. aSign: flag;
  4244. aExp, shiftCount: int16;
  4245. aSig0, aSig1, absZ, aSigExtra: bits32;
  4246. z: int32;
  4247. label invalid;
  4248. Begin
  4249. aSig1 := extractFloat64Frac1( a );
  4250. aSig0 := extractFloat64Frac0( a );
  4251. aExp := extractFloat64Exp( a );
  4252. aSign := extractFloat64Sign( a );
  4253. shiftCount := aExp - $413;
  4254. if ( 0 <= shiftCount ) then
  4255. Begin
  4256. if ( $41E < aExp ) then
  4257. Begin
  4258. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4259. aSign := 0;
  4260. goto invalid;
  4261. End;
  4262. shortShift64Left(
  4263. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4264. End
  4265. else
  4266. Begin
  4267. if ( aExp < $3FF ) then
  4268. Begin
  4269. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4270. Begin
  4271. set_inexact_flag;
  4272. End;
  4273. float64_to_int32_round_to_zero := 0;
  4274. exit;
  4275. End;
  4276. aSig0 := aSig0 or $00100000;
  4277. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4278. absZ := aSig0 shr ( - shiftCount );
  4279. End;
  4280. if aSign <> 0 then
  4281. z := - absZ
  4282. else
  4283. z := absZ;
  4284. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4285. Begin
  4286. invalid:
  4287. float_raise( float_flag_invalid );
  4288. if (aSign <> 0) then
  4289. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4290. else
  4291. float64_to_int32_round_to_zero := $7FFFFFFF;
  4292. exit;
  4293. End;
  4294. if ( aSigExtra <> 0) then
  4295. set_inexact_flag;
  4296. float64_to_int32_round_to_zero := z;
  4297. End;
  4298. {*----------------------------------------------------------------------------
  4299. | Returns the result of converting the double-precision floating-point value
  4300. | `a' to the 64-bit two's complement integer format. The conversion is
  4301. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4302. | Arithmetic---which means in particular that the conversion is rounded
  4303. | according to the current rounding mode. If `a' is a NaN, the largest
  4304. | positive integer is returned. Otherwise, if the conversion overflows, the
  4305. | largest integer with the same sign as `a' is returned.
  4306. *----------------------------------------------------------------------------*}
  4307. function float64_to_int64( a: float64 ): int64;
  4308. var
  4309. aSign: flag;
  4310. aExp, shiftCount: int16;
  4311. aSig, aSigExtra: bits64;
  4312. begin
  4313. aSig := extractFloat64Frac( a );
  4314. aExp := extractFloat64Exp( a );
  4315. aSign := extractFloat64Sign( a );
  4316. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4317. shiftCount := $433 - aExp;
  4318. if ( shiftCount <= 0 ) then begin
  4319. if ( $43E < aExp ) then begin
  4320. float_raise( float_flag_invalid );
  4321. if ( ( aSign = 0 )
  4322. or ( ( aExp = $7FF )
  4323. and ( aSig <> $0010000000000000 ) )
  4324. ) then begin
  4325. result := $7FFFFFFFFFFFFFFF;
  4326. exit;
  4327. end;
  4328. result := $8000000000000000;
  4329. exit;
  4330. end;
  4331. aSigExtra := 0;
  4332. aSig := aSig shl ( - shiftCount );
  4333. end
  4334. else
  4335. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4336. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4337. end;
  4338. {*----------------------------------------------------------------------------
  4339. | Returns the result of converting the double-precision floating-point value
  4340. | `a' to the 64-bit two's complement integer format. The conversion is
  4341. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4342. | Arithmetic, except that the conversion is always rounded toward zero.
  4343. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4344. | the conversion overflows, the largest integer with the same sign as `a' is
  4345. | returned.
  4346. *----------------------------------------------------------------------------*}
  4347. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4348. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4349. var
  4350. aSign: flag;
  4351. aExp, shiftCount: int16;
  4352. aSig: bits64;
  4353. z: int64;
  4354. begin
  4355. aSig := extractFloat64Frac( a );
  4356. aExp := extractFloat64Exp( a );
  4357. aSign := extractFloat64Sign( a );
  4358. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4359. shiftCount := aExp - $433;
  4360. if ( 0 <= shiftCount ) then begin
  4361. if ( $43E <= aExp ) then begin
  4362. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4363. float_raise( float_flag_invalid );
  4364. if ( ( aSign = 0 )
  4365. or ( ( aExp = $7FF )
  4366. and ( aSig <> $0010000000000000 ) )
  4367. ) then begin
  4368. result := $7FFFFFFFFFFFFFFF;
  4369. exit;
  4370. end;
  4371. end;
  4372. result := $8000000000000000;
  4373. exit;
  4374. end;
  4375. z := aSig shl shiftCount;
  4376. end
  4377. else begin
  4378. if ( aExp < $3FE ) then begin
  4379. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4380. result := 0;
  4381. exit;
  4382. end;
  4383. z := aSig shr ( - shiftCount );
  4384. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4385. set_inexact_flag;
  4386. end;
  4387. if ( aSign <> 0 ) then z := - z;
  4388. result := z;
  4389. end;
  4390. {*
  4391. -------------------------------------------------------------------------------
  4392. Returns the result of converting the double-precision floating-point value
  4393. `a' to the single-precision floating-point format. The conversion is
  4394. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4395. Arithmetic.
  4396. -------------------------------------------------------------------------------
  4397. *}
  4398. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  4399. Var
  4400. aSign: flag;
  4401. aExp: int16;
  4402. aSig0, aSig1, zSig: bits32;
  4403. allZero: bits32;
  4404. tmp : CommonNanT;
  4405. Begin
  4406. aSig1 := extractFloat64Frac1( a );
  4407. aSig0 := extractFloat64Frac0( a );
  4408. aExp := extractFloat64Exp( a );
  4409. aSign := extractFloat64Sign( a );
  4410. if ( aExp = $7FF ) then
  4411. Begin
  4412. if ( aSig0 OR aSig1 ) <> 0 then
  4413. Begin
  4414. tmp:=float64ToCommonNaN(a);
  4415. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4416. exit;
  4417. End;
  4418. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4419. exit;
  4420. End;
  4421. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4422. if ( aExp <> 0) then
  4423. zSig := zSig OR $40000000;
  4424. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4425. End;
  4426. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4427. {*----------------------------------------------------------------------------
  4428. | Returns the result of converting the double-precision floating-point value
  4429. | `a' to the extended double-precision floating-point format. The conversion
  4430. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4431. | Arithmetic.
  4432. *----------------------------------------------------------------------------*}
  4433. function float64_to_floatx80( a: float64 ): floatx80;
  4434. var
  4435. aSign: flag;
  4436. aExp: int16;
  4437. aSig: bits64;
  4438. begin
  4439. aSig := extractFloat64Frac( a );
  4440. aExp := extractFloat64Exp( a );
  4441. aSign := extractFloat64Sign( a );
  4442. if ( aExp = $7FF ) then begin
  4443. if ( aSig <> 0 ) then begin
  4444. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4445. exit;
  4446. end;
  4447. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4448. exit;
  4449. end;
  4450. if ( aExp = 0 ) then begin
  4451. if ( aSig = 0 ) then begin
  4452. result := packFloatx80( aSign, 0, 0 );
  4453. exit;
  4454. end;
  4455. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4456. end;
  4457. result :=
  4458. packFloatx80(
  4459. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4460. end;
  4461. {$endif FPC_SOFTFLOAT_FLOATX80}
  4462. {*
  4463. -------------------------------------------------------------------------------
  4464. Rounds the double-precision floating-point value `a' to an integer,
  4465. and returns the result as a double-precision floating-point value. The
  4466. operation is performed according to the IEC/IEEE Standard for Binary
  4467. Floating-Point Arithmetic.
  4468. -------------------------------------------------------------------------------
  4469. *}
  4470. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4471. Var
  4472. aSign: flag;
  4473. aExp: int16;
  4474. lastBitMask, roundBitsMask: bits32;
  4475. roundingMode: TFPURoundingMode;
  4476. z: float64;
  4477. Begin
  4478. aExp := extractFloat64Exp( a );
  4479. if ( $413 <= aExp ) then
  4480. Begin
  4481. if ( $433 <= aExp ) then
  4482. Begin
  4483. if ( ( aExp = $7FF )
  4484. AND
  4485. (
  4486. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4487. ) <>0)
  4488. ) then
  4489. Begin
  4490. propagateFloat64NaN( a, a, result );
  4491. exit;
  4492. End;
  4493. result := a;
  4494. exit;
  4495. End;
  4496. lastBitMask := 1;
  4497. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4498. roundBitsMask := lastBitMask - 1;
  4499. z := a;
  4500. roundingMode := softfloat_rounding_mode;
  4501. if ( roundingMode = float_round_nearest_even ) then
  4502. Begin
  4503. if ( lastBitMask <> 0) then
  4504. Begin
  4505. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4506. if ( ( z.low and roundBitsMask ) = 0 ) then
  4507. z.low := z.low and not lastBitMask;
  4508. End
  4509. else
  4510. Begin
  4511. if ( sbits32 (z.low) < 0 ) then
  4512. Begin
  4513. Inc(z.high);
  4514. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4515. z.high := z.high and not 1;
  4516. End;
  4517. End;
  4518. End
  4519. else if ( roundingMode <> float_round_to_zero ) then
  4520. Begin
  4521. if ( extractFloat64Sign( z )
  4522. xor flag( roundingMode = float_round_up ) )<> 0 then
  4523. Begin
  4524. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4525. End;
  4526. End;
  4527. z.low := z.low and not roundBitsMask;
  4528. End
  4529. else
  4530. Begin
  4531. if ( aExp <= $3FE ) then
  4532. Begin
  4533. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4534. Begin
  4535. result := a;
  4536. exit;
  4537. End;
  4538. set_inexact_flag;
  4539. aSign := extractFloat64Sign( a );
  4540. case ( softfloat_rounding_mode ) of
  4541. float_round_nearest_even:
  4542. Begin
  4543. if ( ( aExp = $3FE )
  4544. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4545. ) then
  4546. Begin
  4547. packFloat64( aSign, $3FF, 0, 0, result );
  4548. exit;
  4549. End;
  4550. End;
  4551. float_round_down:
  4552. Begin
  4553. if aSign<>0 then
  4554. packFloat64( 1, $3FF, 0, 0, result )
  4555. else
  4556. packFloat64( 0, 0, 0, 0, result );
  4557. exit;
  4558. End;
  4559. float_round_up:
  4560. Begin
  4561. if aSign <> 0 then
  4562. packFloat64( 1, 0, 0, 0, result )
  4563. else
  4564. packFloat64( 0, $3FF, 0, 0, result );
  4565. exit;
  4566. End;
  4567. end;
  4568. packFloat64( aSign, 0, 0, 0, result );
  4569. exit;
  4570. End;
  4571. lastBitMask := 1;
  4572. lastBitMask := lastBitMask shl ($413 - aExp);
  4573. roundBitsMask := lastBitMask - 1;
  4574. z.low := 0;
  4575. z.high := a.high;
  4576. roundingMode := softfloat_rounding_mode;
  4577. if ( roundingMode = float_round_nearest_even ) then
  4578. Begin
  4579. z.high := z.high + lastBitMask shr 1;
  4580. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4581. Begin
  4582. z.high := z.high and not lastBitMask;
  4583. End;
  4584. End
  4585. else if ( roundingMode <> float_round_to_zero ) then
  4586. Begin
  4587. if ( extractFloat64Sign( z )
  4588. xor flag( roundingMode = float_round_up ) )<> 0 then
  4589. Begin
  4590. z.high := z.high or bits32( a.low <> 0 );
  4591. z.high := z.high + roundBitsMask;
  4592. End;
  4593. End;
  4594. z.high := z.high and not roundBitsMask;
  4595. End;
  4596. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4597. Begin
  4598. set_inexact_flag;
  4599. End;
  4600. result := z;
  4601. End;
  4602. {*
  4603. -------------------------------------------------------------------------------
  4604. Returns the result of adding the absolute values of the double-precision
  4605. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4606. before being returned. `zSign' is ignored if the result is a NaN.
  4607. The addition is performed according to the IEC/IEEE Standard for Binary
  4608. Floating-Point Arithmetic.
  4609. -------------------------------------------------------------------------------
  4610. *}
  4611. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4612. Var
  4613. aExp, bExp, zExp: int16;
  4614. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4615. expDiff: int16;
  4616. label shiftRight1;
  4617. label roundAndPack;
  4618. Begin
  4619. aSig1 := extractFloat64Frac1( a );
  4620. aSig0 := extractFloat64Frac0( a );
  4621. aExp := extractFloat64Exp( a );
  4622. bSig1 := extractFloat64Frac1( b );
  4623. bSig0 := extractFloat64Frac0( b );
  4624. bExp := extractFloat64Exp( b );
  4625. expDiff := aExp - bExp;
  4626. if ( 0 < expDiff ) then
  4627. Begin
  4628. if ( aExp = $7FF ) then
  4629. Begin
  4630. if ( aSig0 OR aSig1 ) <> 0 then
  4631. Begin
  4632. propagateFloat64NaN( a, b, out );
  4633. exit;
  4634. end;
  4635. out := a;
  4636. exit;
  4637. End;
  4638. if ( bExp = 0 ) then
  4639. Begin
  4640. Dec(expDiff);
  4641. End
  4642. else
  4643. Begin
  4644. bSig0 := bSig0 or $00100000;
  4645. End;
  4646. shift64ExtraRightJamming(
  4647. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4648. zExp := aExp;
  4649. End
  4650. else if ( expDiff < 0 ) then
  4651. Begin
  4652. if ( bExp = $7FF ) then
  4653. Begin
  4654. if ( bSig0 OR bSig1 ) <> 0 then
  4655. Begin
  4656. propagateFloat64NaN( a, b, out );
  4657. exit;
  4658. End;
  4659. packFloat64( zSign, $7FF, 0, 0, out );
  4660. exit;
  4661. End;
  4662. if ( aExp = 0 ) then
  4663. Begin
  4664. Inc(expDiff);
  4665. End
  4666. else
  4667. Begin
  4668. aSig0 := aSig0 or $00100000;
  4669. End;
  4670. shift64ExtraRightJamming(
  4671. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4672. zExp := bExp;
  4673. End
  4674. else
  4675. Begin
  4676. if ( aExp = $7FF ) then
  4677. Begin
  4678. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4679. Begin
  4680. propagateFloat64NaN( a, b, out );
  4681. exit;
  4682. End;
  4683. out := a;
  4684. exit;
  4685. End;
  4686. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4687. if ( aExp = 0 ) then
  4688. Begin
  4689. packFloat64( zSign, 0, zSig0, zSig1, out );
  4690. exit;
  4691. End;
  4692. zSig2 := 0;
  4693. zSig0 := zSig0 or $00200000;
  4694. zExp := aExp;
  4695. goto shiftRight1;
  4696. End;
  4697. aSig0 := aSig0 or $00100000;
  4698. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4699. Dec(zExp);
  4700. if ( zSig0 < $00200000 ) then
  4701. goto roundAndPack;
  4702. Inc(zExp);
  4703. shiftRight1:
  4704. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4705. roundAndPack:
  4706. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4707. End;
  4708. {*
  4709. -------------------------------------------------------------------------------
  4710. Returns the result of subtracting the absolute values of the double-
  4711. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4712. difference is negated before being returned. `zSign' is ignored if the
  4713. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4714. Standard for Binary Floating-Point Arithmetic.
  4715. -------------------------------------------------------------------------------
  4716. *}
  4717. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4718. Var
  4719. aExp, bExp, zExp: int16;
  4720. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4721. expDiff: int16;
  4722. z: float64;
  4723. label aExpBigger;
  4724. label bExpBigger;
  4725. label aBigger;
  4726. label bBigger;
  4727. label normalizeRoundAndPack;
  4728. Begin
  4729. aSig1 := extractFloat64Frac1( a );
  4730. aSig0 := extractFloat64Frac0( a );
  4731. aExp := extractFloat64Exp( a );
  4732. bSig1 := extractFloat64Frac1( b );
  4733. bSig0 := extractFloat64Frac0( b );
  4734. bExp := extractFloat64Exp( b );
  4735. expDiff := aExp - bExp;
  4736. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4737. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4738. if ( 0 < expDiff ) then goto aExpBigger;
  4739. if ( expDiff < 0 ) then goto bExpBigger;
  4740. if ( aExp = $7FF ) then
  4741. Begin
  4742. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4743. Begin
  4744. propagateFloat64NaN( a, b, out );
  4745. exit;
  4746. End;
  4747. float_raise( float_flag_invalid );
  4748. z.low := float64_default_nan_low;
  4749. z.high := float64_default_nan_high;
  4750. out := z;
  4751. exit;
  4752. End;
  4753. if ( aExp = 0 ) then
  4754. Begin
  4755. aExp := 1;
  4756. bExp := 1;
  4757. End;
  4758. if ( bSig0 < aSig0 ) then goto aBigger;
  4759. if ( aSig0 < bSig0 ) then goto bBigger;
  4760. if ( bSig1 < aSig1 ) then goto aBigger;
  4761. if ( aSig1 < bSig1 ) then goto bBigger;
  4762. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4763. exit;
  4764. bExpBigger:
  4765. if ( bExp = $7FF ) then
  4766. Begin
  4767. if ( bSig0 OR bSig1 ) <> 0 then
  4768. Begin
  4769. propagateFloat64NaN( a, b, out );
  4770. exit;
  4771. End;
  4772. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4773. exit;
  4774. End;
  4775. if ( aExp = 0 ) then
  4776. Begin
  4777. Inc(expDiff);
  4778. End
  4779. else
  4780. Begin
  4781. aSig0 := aSig0 or $40000000;
  4782. End;
  4783. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4784. bSig0 := bSig0 or $40000000;
  4785. bBigger:
  4786. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4787. zExp := bExp;
  4788. zSign := zSign xor 1;
  4789. goto normalizeRoundAndPack;
  4790. aExpBigger:
  4791. if ( aExp = $7FF ) then
  4792. Begin
  4793. if ( aSig0 OR aSig1 ) <> 0 then
  4794. Begin
  4795. propagateFloat64NaN( a, b, out );
  4796. exit;
  4797. End;
  4798. out := a;
  4799. exit;
  4800. End;
  4801. if ( bExp = 0 ) then
  4802. Begin
  4803. Dec(expDiff);
  4804. End
  4805. else
  4806. Begin
  4807. bSig0 := bSig0 or $40000000;
  4808. End;
  4809. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4810. aSig0 := aSig0 or $40000000;
  4811. aBigger:
  4812. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4813. zExp := aExp;
  4814. normalizeRoundAndPack:
  4815. Dec(zExp);
  4816. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4817. End;
  4818. {*
  4819. -------------------------------------------------------------------------------
  4820. Returns the result of adding the double-precision floating-point values `a'
  4821. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4822. Binary Floating-Point Arithmetic.
  4823. -------------------------------------------------------------------------------
  4824. *}
  4825. Function float64_add( a: float64; b : float64) : Float64;
  4826. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4827. Var
  4828. aSign, bSign: flag;
  4829. Begin
  4830. aSign := extractFloat64Sign( a );
  4831. bSign := extractFloat64Sign( b );
  4832. if ( aSign = bSign ) then
  4833. Begin
  4834. addFloat64Sigs( a, b, aSign, result );
  4835. End
  4836. else
  4837. Begin
  4838. subFloat64Sigs( a, b, aSign, result );
  4839. End;
  4840. End;
  4841. {*
  4842. -------------------------------------------------------------------------------
  4843. Returns the result of subtracting the double-precision floating-point values
  4844. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4845. for Binary Floating-Point Arithmetic.
  4846. -------------------------------------------------------------------------------
  4847. *}
  4848. Function float64_sub(a: float64; b : float64) : Float64;
  4849. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4850. Var
  4851. aSign, bSign: flag;
  4852. Begin
  4853. aSign := extractFloat64Sign( a );
  4854. bSign := extractFloat64Sign( b );
  4855. if ( aSign = bSign ) then
  4856. Begin
  4857. subFloat64Sigs( a, b, aSign, result );
  4858. End
  4859. else
  4860. Begin
  4861. addFloat64Sigs( a, b, aSign, result );
  4862. End;
  4863. End;
  4864. {*
  4865. -------------------------------------------------------------------------------
  4866. Returns the result of multiplying the double-precision floating-point values
  4867. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4868. for Binary Floating-Point Arithmetic.
  4869. -------------------------------------------------------------------------------
  4870. *}
  4871. Function float64_mul( a: float64; b:float64) : Float64;
  4872. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4873. Var
  4874. aSign, bSign, zSign: flag;
  4875. aExp, bExp, zExp: int16;
  4876. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4877. z: float64;
  4878. label invalid;
  4879. Begin
  4880. aSig1 := extractFloat64Frac1( a );
  4881. aSig0 := extractFloat64Frac0( a );
  4882. aExp := extractFloat64Exp( a );
  4883. aSign := extractFloat64Sign( a );
  4884. bSig1 := extractFloat64Frac1( b );
  4885. bSig0 := extractFloat64Frac0( b );
  4886. bExp := extractFloat64Exp( b );
  4887. bSign := extractFloat64Sign( b );
  4888. zSign := aSign xor bSign;
  4889. if ( aExp = $7FF ) then
  4890. Begin
  4891. if ( (( aSig0 OR aSig1 ) <>0)
  4892. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4893. Begin
  4894. propagateFloat64NaN( a, b, result );
  4895. exit;
  4896. End;
  4897. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4898. packFloat64( zSign, $7FF, 0, 0, result );
  4899. exit;
  4900. End;
  4901. if ( bExp = $7FF ) then
  4902. Begin
  4903. if ( bSig0 OR bSig1 )<> 0 then
  4904. Begin
  4905. propagateFloat64NaN( a, b, result );
  4906. exit;
  4907. End;
  4908. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4909. Begin
  4910. invalid:
  4911. float_raise( float_flag_invalid );
  4912. z.low := float64_default_nan_low;
  4913. z.high := float64_default_nan_high;
  4914. result := z;
  4915. exit;
  4916. End;
  4917. packFloat64( zSign, $7FF, 0, 0, result );
  4918. exit;
  4919. End;
  4920. if ( aExp = 0 ) then
  4921. Begin
  4922. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4923. Begin
  4924. packFloat64( zSign, 0, 0, 0, result );
  4925. exit;
  4926. End;
  4927. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4928. End;
  4929. if ( bExp = 0 ) then
  4930. Begin
  4931. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4932. Begin
  4933. packFloat64( zSign, 0, 0, 0, result );
  4934. exit;
  4935. End;
  4936. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4937. End;
  4938. zExp := aExp + bExp - $400;
  4939. aSig0 := aSig0 or $00100000;
  4940. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4941. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4942. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4943. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4944. if ( $00200000 <= zSig0 ) then
  4945. Begin
  4946. shift64ExtraRightJamming(
  4947. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4948. Inc(zExp);
  4949. End;
  4950. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4951. End;
  4952. {*
  4953. -------------------------------------------------------------------------------
  4954. Returns the result of dividing the double-precision floating-point value `a'
  4955. by the corresponding value `b'. The operation is performed according to the
  4956. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4957. -------------------------------------------------------------------------------
  4958. *}
  4959. Function float64_div(a: float64; b : float64) : Float64;
  4960. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4961. Var
  4962. aSign, bSign, zSign: flag;
  4963. aExp, bExp, zExp: int16;
  4964. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4965. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4966. z: float64;
  4967. label invalid;
  4968. Begin
  4969. aSig1 := extractFloat64Frac1( a );
  4970. aSig0 := extractFloat64Frac0( a );
  4971. aExp := extractFloat64Exp( a );
  4972. aSign := extractFloat64Sign( a );
  4973. bSig1 := extractFloat64Frac1( b );
  4974. bSig0 := extractFloat64Frac0( b );
  4975. bExp := extractFloat64Exp( b );
  4976. bSign := extractFloat64Sign( b );
  4977. zSign := aSign xor bSign;
  4978. if ( aExp = $7FF ) then
  4979. Begin
  4980. if ( aSig0 OR aSig1 )<> 0 then
  4981. Begin
  4982. propagateFloat64NaN( a, b, result );
  4983. exit;
  4984. end;
  4985. if ( bExp = $7FF ) then
  4986. Begin
  4987. if ( bSig0 OR bSig1 )<>0 then
  4988. Begin
  4989. propagateFloat64NaN( a, b, result );
  4990. exit;
  4991. End;
  4992. goto invalid;
  4993. End;
  4994. packFloat64( zSign, $7FF, 0, 0, result );
  4995. exit;
  4996. End;
  4997. if ( bExp = $7FF ) then
  4998. Begin
  4999. if ( bSig0 OR bSig1 )<> 0 then
  5000. Begin
  5001. propagateFloat64NaN( a, b, result );
  5002. exit;
  5003. End;
  5004. packFloat64( zSign, 0, 0, 0, result );
  5005. exit;
  5006. End;
  5007. if ( bExp = 0 ) then
  5008. Begin
  5009. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5010. Begin
  5011. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5012. Begin
  5013. invalid:
  5014. float_raise( float_flag_invalid );
  5015. z.low := float64_default_nan_low;
  5016. z.high := float64_default_nan_high;
  5017. result := z;
  5018. exit;
  5019. End;
  5020. float_raise( float_flag_divbyzero );
  5021. packFloat64( zSign, $7FF, 0, 0, result );
  5022. exit;
  5023. End;
  5024. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5025. End;
  5026. if ( aExp = 0 ) then
  5027. Begin
  5028. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5029. Begin
  5030. packFloat64( zSign, 0, 0, 0, result );
  5031. exit;
  5032. End;
  5033. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5034. End;
  5035. zExp := aExp - bExp + $3FD;
  5036. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  5037. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5038. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  5039. Begin
  5040. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  5041. Inc(zExp);
  5042. End;
  5043. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5044. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  5045. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  5046. while ( sbits32 (rem0) < 0 ) do
  5047. Begin
  5048. Dec(zSig0);
  5049. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  5050. End;
  5051. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  5052. if ( ( zSig1 and $3FF ) <= 4 ) then
  5053. Begin
  5054. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  5055. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  5056. while ( sbits32 (rem1) < 0 ) do
  5057. Begin
  5058. Dec(zSig1);
  5059. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  5060. End;
  5061. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5062. End;
  5063. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  5064. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  5065. End;
  5066. {*
  5067. -------------------------------------------------------------------------------
  5068. Returns the remainder of the double-precision floating-point value `a'
  5069. with respect to the corresponding value `b'. The operation is performed
  5070. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5071. -------------------------------------------------------------------------------
  5072. *}
  5073. Function float64_rem(a: float64; b : float64) : float64;
  5074. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  5075. Var
  5076. aSign, zSign: flag;
  5077. aExp, bExp, expDiff: int16;
  5078. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  5079. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  5080. sigMean0: sbits32;
  5081. z: float64;
  5082. label invalid;
  5083. Begin
  5084. aSig1 := extractFloat64Frac1( a );
  5085. aSig0 := extractFloat64Frac0( a );
  5086. aExp := extractFloat64Exp( a );
  5087. aSign := extractFloat64Sign( a );
  5088. bSig1 := extractFloat64Frac1( b );
  5089. bSig0 := extractFloat64Frac0( b );
  5090. bExp := extractFloat64Exp( b );
  5091. if ( aExp = $7FF ) then
  5092. Begin
  5093. if ((( aSig0 OR aSig1 )<>0)
  5094. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  5095. Begin
  5096. propagateFloat64NaN( a, b, result );
  5097. exit;
  5098. End;
  5099. goto invalid;
  5100. End;
  5101. if ( bExp = $7FF ) then
  5102. Begin
  5103. if ( bSig0 OR bSig1 ) <> 0 then
  5104. Begin
  5105. propagateFloat64NaN( a, b, result );
  5106. exit;
  5107. End;
  5108. result := a;
  5109. exit;
  5110. End;
  5111. if ( bExp = 0 ) then
  5112. Begin
  5113. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5114. Begin
  5115. invalid:
  5116. float_raise( float_flag_invalid );
  5117. z.low := float64_default_nan_low;
  5118. z.high := float64_default_nan_high;
  5119. result := z;
  5120. exit;
  5121. End;
  5122. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5123. End;
  5124. if ( aExp = 0 ) then
  5125. Begin
  5126. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5127. Begin
  5128. result := a;
  5129. exit;
  5130. End;
  5131. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5132. End;
  5133. expDiff := aExp - bExp;
  5134. if ( expDiff < -1 ) then
  5135. Begin
  5136. result := a;
  5137. exit;
  5138. End;
  5139. shortShift64Left(
  5140. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5141. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5142. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5143. if ( q )<>0 then
  5144. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5145. expDiff := expDiff - 32;
  5146. while ( 0 < expDiff ) do
  5147. Begin
  5148. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5149. if 4 < q then
  5150. q:= q - 4
  5151. else
  5152. q := 0;
  5153. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5154. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5155. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5156. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5157. expDiff := expDiff - 29;
  5158. End;
  5159. if ( -32 < expDiff ) then
  5160. Begin
  5161. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5162. if 4 < q then
  5163. q := q - 4
  5164. else
  5165. q := 0;
  5166. q := q shr (- expDiff);
  5167. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5168. expDiff := expDiff + 24;
  5169. if ( expDiff < 0 ) then
  5170. Begin
  5171. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5172. End
  5173. else
  5174. Begin
  5175. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5176. End;
  5177. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5178. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5179. End
  5180. else
  5181. Begin
  5182. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5183. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5184. End;
  5185. Repeat
  5186. alternateASig0 := aSig0;
  5187. alternateASig1 := aSig1;
  5188. Inc(q);
  5189. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5190. Until not ( 0 <= sbits32 (aSig0) );
  5191. add64(
  5192. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5193. if ( ( sigMean0 < 0 )
  5194. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5195. Begin
  5196. aSig0 := alternateASig0;
  5197. aSig1 := alternateASig1;
  5198. End;
  5199. zSign := flag( sbits32 (aSig0) < 0 );
  5200. if ( zSign <> 0 ) then
  5201. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5202. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5203. End;
  5204. {*
  5205. -------------------------------------------------------------------------------
  5206. Returns the square root of the double-precision floating-point value `a'.
  5207. The operation is performed according to the IEC/IEEE Standard for Binary
  5208. Floating-Point Arithmetic.
  5209. -------------------------------------------------------------------------------
  5210. *}
  5211. function float64_sqrt( a: float64 ): float64;
  5212. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5213. Var
  5214. aSign: flag;
  5215. aExp, zExp: int16;
  5216. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5217. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5218. label invalid;
  5219. Begin
  5220. aSig1 := extractFloat64Frac1( a );
  5221. aSig0 := extractFloat64Frac0( a );
  5222. aExp := extractFloat64Exp( a );
  5223. aSign := extractFloat64Sign( a );
  5224. if ( aExp = $7FF ) then
  5225. Begin
  5226. if ( aSig0 OR aSig1 ) <> 0 then
  5227. Begin
  5228. propagateFloat64NaN( a, a, result );
  5229. exit;
  5230. End;
  5231. if ( aSign = 0) then
  5232. Begin
  5233. result := a;
  5234. exit;
  5235. End;
  5236. goto invalid;
  5237. End;
  5238. if ( aSign <> 0 ) then
  5239. Begin
  5240. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5241. Begin
  5242. result := a;
  5243. exit;
  5244. End;
  5245. invalid:
  5246. float_raise( float_flag_invalid );
  5247. result.low := float64_default_nan_low;
  5248. result.high := float64_default_nan_high;
  5249. exit;
  5250. End;
  5251. if ( aExp = 0 ) then
  5252. Begin
  5253. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5254. Begin
  5255. packFloat64( 0, 0, 0, 0, result );
  5256. exit;
  5257. End;
  5258. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5259. End;
  5260. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5261. aSig0 := aSig0 or $00100000;
  5262. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5263. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5264. if ( zSig0 = 0 ) then
  5265. zSig0 := $7FFFFFFF;
  5266. doubleZSig0 := zSig0 + zSig0;
  5267. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5268. mul32To64( zSig0, zSig0, term0, term1 );
  5269. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5270. while ( sbits32 (rem0) < 0 ) do
  5271. Begin
  5272. Dec(zSig0);
  5273. doubleZSig0 := doubleZSig0 - 2;
  5274. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5275. End;
  5276. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5277. if ( ( zSig1 and $1FF ) <= 5 ) then
  5278. Begin
  5279. if ( zSig1 = 0 ) then
  5280. zSig1 := 1;
  5281. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5282. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5283. mul32To64( zSig1, zSig1, term2, term3 );
  5284. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5285. while ( sbits32 (rem1) < 0 ) do
  5286. Begin
  5287. Dec(zSig1);
  5288. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5289. term3 := term3 or 1;
  5290. term2 := term2 or doubleZSig0;
  5291. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5292. End;
  5293. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5294. End;
  5295. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5296. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5297. End;
  5298. {*
  5299. -------------------------------------------------------------------------------
  5300. Returns 1 if the double-precision floating-point value `a' is equal to
  5301. the corresponding value `b', and 0 otherwise. The comparison is performed
  5302. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5303. -------------------------------------------------------------------------------
  5304. *}
  5305. Function float64_eq(a: float64; b: float64): flag;
  5306. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5307. Begin
  5308. if
  5309. (
  5310. ( extractFloat64Exp( a ) = $7FF )
  5311. AND
  5312. (
  5313. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5314. )
  5315. )
  5316. OR (
  5317. ( extractFloat64Exp( b ) = $7FF )
  5318. AND (
  5319. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5320. )
  5321. )
  5322. ) then
  5323. Begin
  5324. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5325. float_raise( float_flag_invalid );
  5326. float64_eq := 0;
  5327. exit;
  5328. End;
  5329. float64_eq := flag(
  5330. ( a.low = b.low )
  5331. AND ( ( a.high = b.high )
  5332. OR ( ( a.low = 0 )
  5333. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5334. ));
  5335. End;
  5336. {*
  5337. -------------------------------------------------------------------------------
  5338. Returns 1 if the double-precision floating-point value `a' is less than
  5339. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5340. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5341. Arithmetic.
  5342. -------------------------------------------------------------------------------
  5343. *}
  5344. Function float64_le(a: float64;b: float64): flag;
  5345. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5346. Var
  5347. aSign, bSign: flag;
  5348. Begin
  5349. if
  5350. (
  5351. ( extractFloat64Exp( a ) = $7FF )
  5352. AND
  5353. (
  5354. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5355. )
  5356. )
  5357. OR (
  5358. ( extractFloat64Exp( b ) = $7FF )
  5359. AND (
  5360. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5361. )
  5362. )
  5363. ) then
  5364. Begin
  5365. float_raise( float_flag_invalid );
  5366. float64_le := 0;
  5367. exit;
  5368. End;
  5369. aSign := extractFloat64Sign( a );
  5370. bSign := extractFloat64Sign( b );
  5371. if ( aSign <> bSign ) then
  5372. Begin
  5373. float64_le := flag(
  5374. (aSign <> 0)
  5375. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5376. = 0 ));
  5377. exit;
  5378. End;
  5379. if aSign <> 0 then
  5380. float64_le := le64( b.high, b.low, a.high, a.low )
  5381. else
  5382. float64_le := le64( a.high, a.low, b.high, b.low );
  5383. End;
  5384. {*
  5385. -------------------------------------------------------------------------------
  5386. Returns 1 if the double-precision floating-point value `a' is less than
  5387. the corresponding value `b', and 0 otherwise. The comparison is performed
  5388. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5389. -------------------------------------------------------------------------------
  5390. *}
  5391. Function float64_lt(a: float64;b: float64): flag;
  5392. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5393. Var
  5394. aSign, bSign: flag;
  5395. Begin
  5396. if
  5397. (
  5398. ( extractFloat64Exp( a ) = $7FF )
  5399. AND
  5400. (
  5401. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5402. )
  5403. )
  5404. OR (
  5405. ( extractFloat64Exp( b ) = $7FF )
  5406. AND (
  5407. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5408. )
  5409. )
  5410. ) then
  5411. Begin
  5412. float_raise( float_flag_invalid );
  5413. float64_lt := 0;
  5414. exit;
  5415. End;
  5416. aSign := extractFloat64Sign( a );
  5417. bSign := extractFloat64Sign( b );
  5418. if ( aSign <> bSign ) then
  5419. Begin
  5420. float64_lt := flag(
  5421. (aSign <> 0)
  5422. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5423. <> 0 ));
  5424. exit;
  5425. End;
  5426. if aSign <> 0 then
  5427. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5428. else
  5429. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5430. End;
  5431. {*
  5432. -------------------------------------------------------------------------------
  5433. Returns 1 if the double-precision floating-point value `a' is equal to
  5434. the corresponding value `b', and 0 otherwise. The invalid exception is
  5435. raised if either operand is a NaN. Otherwise, the comparison is performed
  5436. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5437. -------------------------------------------------------------------------------
  5438. *}
  5439. Function float64_eq_signaling( a: float64; b: float64): flag;
  5440. Begin
  5441. if
  5442. (
  5443. ( extractFloat64Exp( a ) = $7FF )
  5444. AND
  5445. (
  5446. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5447. )
  5448. )
  5449. OR (
  5450. ( extractFloat64Exp( b ) = $7FF )
  5451. AND (
  5452. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5453. )
  5454. )
  5455. ) then
  5456. Begin
  5457. float_raise( float_flag_invalid );
  5458. float64_eq_signaling := 0;
  5459. exit;
  5460. End;
  5461. float64_eq_signaling := flag(
  5462. ( a.low = b.low )
  5463. AND ( ( a.high = b.high )
  5464. OR ( ( a.low = 0 )
  5465. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5466. ));
  5467. End;
  5468. {*
  5469. -------------------------------------------------------------------------------
  5470. Returns 1 if the double-precision floating-point value `a' is less than or
  5471. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5472. cause an exception. Otherwise, the comparison is performed according to the
  5473. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5474. -------------------------------------------------------------------------------
  5475. *}
  5476. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5477. Var
  5478. aSign, bSign : flag;
  5479. Begin
  5480. if
  5481. (
  5482. ( extractFloat64Exp( a ) = $7FF )
  5483. AND
  5484. (
  5485. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5486. )
  5487. )
  5488. OR (
  5489. ( extractFloat64Exp( b ) = $7FF )
  5490. AND (
  5491. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5492. )
  5493. )
  5494. ) then
  5495. Begin
  5496. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5497. float_raise( float_flag_invalid );
  5498. float64_le_quiet := 0;
  5499. exit;
  5500. End;
  5501. aSign := extractFloat64Sign( a );
  5502. bSign := extractFloat64Sign( b );
  5503. if ( aSign <> bSign ) then
  5504. Begin
  5505. float64_le_quiet := flag
  5506. ((aSign <> 0)
  5507. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5508. = 0 ));
  5509. exit;
  5510. End;
  5511. if aSign <> 0 then
  5512. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5513. else
  5514. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5515. End;
  5516. {*
  5517. -------------------------------------------------------------------------------
  5518. Returns 1 if the double-precision floating-point value `a' is less than
  5519. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5520. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5521. Standard for Binary Floating-Point Arithmetic.
  5522. -------------------------------------------------------------------------------
  5523. *}
  5524. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5525. Var
  5526. aSign, bSign: flag;
  5527. Begin
  5528. if
  5529. (
  5530. ( extractFloat64Exp( a ) = $7FF )
  5531. AND
  5532. (
  5533. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5534. )
  5535. )
  5536. OR (
  5537. ( extractFloat64Exp( b ) = $7FF )
  5538. AND (
  5539. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5540. )
  5541. )
  5542. ) then
  5543. Begin
  5544. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5545. float_raise( float_flag_invalid );
  5546. float64_lt_quiet := 0;
  5547. exit;
  5548. End;
  5549. aSign := extractFloat64Sign( a );
  5550. bSign := extractFloat64Sign( b );
  5551. if ( aSign <> bSign ) then
  5552. Begin
  5553. float64_lt_quiet := flag(
  5554. (aSign<>0)
  5555. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5556. <> 0 ));
  5557. exit;
  5558. End;
  5559. If aSign <> 0 then
  5560. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5561. else
  5562. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5563. End;
  5564. {*----------------------------------------------------------------------------
  5565. | Returns the result of converting the 64-bit two's complement integer `a'
  5566. | to the single-precision floating-point format. The conversion is performed
  5567. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5568. *----------------------------------------------------------------------------*}
  5569. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5570. var
  5571. zSign : flag;
  5572. absA : uint64;
  5573. shiftCount: int8;
  5574. Begin
  5575. if ( a = 0 ) then
  5576. begin
  5577. int64_to_float32.float32 := 0;
  5578. exit;
  5579. end;
  5580. if a < 0 then
  5581. zSign := flag(TRUE)
  5582. else
  5583. zSign := flag(FALSE);
  5584. if zSign<>0 then
  5585. absA := -a
  5586. else
  5587. absA := a;
  5588. shiftCount := countLeadingZeros64( absA ) - 40;
  5589. if ( 0 <= shiftCount ) then
  5590. begin
  5591. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5592. end
  5593. else
  5594. begin
  5595. shiftCount := shiftCount + 7;
  5596. if ( shiftCount < 0 ) then
  5597. shift64RightJamming( absA, - shiftCount, absA )
  5598. else
  5599. absA := absA shl shiftCount;
  5600. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5601. end;
  5602. End;
  5603. {*----------------------------------------------------------------------------
  5604. | Returns the result of converting the 64-bit two's complement integer `a'
  5605. | to the single-precision floating-point format. The conversion is performed
  5606. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5607. | Unisgned version.
  5608. *----------------------------------------------------------------------------*}
  5609. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5610. var
  5611. absA : uint64;
  5612. shiftCount: int8;
  5613. Begin
  5614. if ( a = 0 ) then
  5615. begin
  5616. qword_to_float32.float32 := 0;
  5617. exit;
  5618. end;
  5619. absA := a;
  5620. shiftCount := countLeadingZeros64( absA ) - 40;
  5621. if ( 0 <= shiftCount ) then
  5622. begin
  5623. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5624. end
  5625. else
  5626. begin
  5627. shiftCount := shiftCount + 7;
  5628. if ( shiftCount < 0 ) then
  5629. shift64RightJamming( absA, - shiftCount, absA )
  5630. else
  5631. absA := absA shl shiftCount;
  5632. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5633. end;
  5634. End;
  5635. {*----------------------------------------------------------------------------
  5636. | Returns the result of converting the 64-bit two's complement integer `a'
  5637. | to the double-precision floating-point format. The conversion is performed
  5638. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5639. *----------------------------------------------------------------------------*}
  5640. function qword_to_float64( a: qword ): float64;
  5641. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5642. var
  5643. shiftCount: int8;
  5644. Begin
  5645. if ( a = 0 ) then
  5646. result := packFloat64( 0, 0, 0 )
  5647. else
  5648. begin
  5649. shiftCount := countLeadingZeros64(a) - 1;
  5650. { numbers with <= 53 significant bits are converted exactly }
  5651. if (shiftCount > 9) then
  5652. result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
  5653. else if (shiftCount>=0) then
  5654. result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
  5655. else
  5656. begin
  5657. { the only possible negative value is -1, in case bit 63 of 'a' is set }
  5658. shift64RightJamming(a, 1, a);
  5659. result := roundAndPackFloat64(0, $43d, a);
  5660. end;
  5661. end;
  5662. End;
  5663. {*----------------------------------------------------------------------------
  5664. | Returns the result of converting the 64-bit two's complement integer `a'
  5665. | to the double-precision floating-point format. The conversion is performed
  5666. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5667. *----------------------------------------------------------------------------*}
  5668. function int64_to_float64( a: int64 ): float64;
  5669. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5670. Begin
  5671. if ( a = 0 ) then
  5672. result := packFloat64( 0, 0, 0 )
  5673. else if (a = int64($8000000000000000)) then
  5674. result := packFloat64( 1, $43e, 0 )
  5675. else if (a < 0) then
  5676. result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  5677. else
  5678. result := normalizeRoundAndPackFloat64( 0, $43c, a );
  5679. End;
  5680. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5681. {*----------------------------------------------------------------------------
  5682. | Returns the result of converting the 64-bit two's complement integer `a'
  5683. | to the extended double-precision floating-point format. The conversion
  5684. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5685. | Arithmetic.
  5686. *----------------------------------------------------------------------------*}
  5687. function int64_to_floatx80( a: int64 ): floatx80;
  5688. var
  5689. zSign: flag;
  5690. absA: uint64;
  5691. shiftCount: int8;
  5692. begin
  5693. if ( a = 0 ) then begin
  5694. result := packFloatx80( 0, 0, 0 );
  5695. exit;
  5696. end;
  5697. zSign := ord( a < 0 );
  5698. if zSign <> 0 then absA := - a else absA := a;
  5699. shiftCount := countLeadingZeros64( absA );
  5700. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5701. end;
  5702. {*----------------------------------------------------------------------------
  5703. | Returns the result of converting the 64-bit two's complement integer `a'
  5704. | to the extended double-precision floating-point format. The conversion
  5705. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5706. | Arithmetic.
  5707. | Unsigned version.
  5708. *----------------------------------------------------------------------------*}
  5709. function qword_to_floatx80( a: qword ): floatx80;
  5710. var
  5711. absA: bits64;
  5712. shiftCount: int8;
  5713. begin
  5714. if ( a = 0 ) then begin
  5715. result := packFloatx80( 0, 0, 0 );
  5716. exit;
  5717. end;
  5718. absA := a;
  5719. shiftCount := countLeadingZeros64( absA );
  5720. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5721. end;
  5722. {$endif FPC_SOFTFLOAT_FLOATX80}
  5723. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5724. {*----------------------------------------------------------------------------
  5725. | Returns the result of converting the 64-bit two's complement integer `a' to
  5726. | the quadruple-precision floating-point format. The conversion is performed
  5727. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5728. *----------------------------------------------------------------------------*}
  5729. function int64_to_float128( a: int64 ): float128;
  5730. var
  5731. zSign: flag;
  5732. absA: uint64;
  5733. shiftCount: int8;
  5734. zExp: int32;
  5735. zSig0, zSig1: bits64;
  5736. begin
  5737. if ( a = 0 ) then begin
  5738. result := packFloat128( 0, 0, 0, 0 );
  5739. exit;
  5740. end;
  5741. zSign := ord( a < 0 );
  5742. if zSign <> 0 then absA := - a else absA := a;
  5743. shiftCount := countLeadingZeros64( absA ) + 49;
  5744. zExp := $406E - shiftCount;
  5745. if ( 64 <= shiftCount ) then begin
  5746. zSig1 := 0;
  5747. zSig0 := absA;
  5748. dec( shiftCount, 64 );
  5749. end
  5750. else begin
  5751. zSig1 := absA;
  5752. zSig0 := 0;
  5753. end;
  5754. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5755. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5756. end;
  5757. {*----------------------------------------------------------------------------
  5758. | Returns the result of converting the 64-bit two's complement integer `a' to
  5759. | the quadruple-precision floating-point format. The conversion is performed
  5760. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5761. | Unsigned version.
  5762. *----------------------------------------------------------------------------*}
  5763. function qword_to_float128( a: qword ): float128;
  5764. var
  5765. absA: bits64;
  5766. shiftCount: int8;
  5767. zExp: int32;
  5768. zSig0, zSig1: bits64;
  5769. begin
  5770. if ( a = 0 ) then begin
  5771. result := packFloat128( 0, 0, 0, 0 );
  5772. exit;
  5773. end;
  5774. absA := a;
  5775. shiftCount := countLeadingZeros64( absA ) + 49;
  5776. zExp := $406E - shiftCount;
  5777. if ( 64 <= shiftCount ) then begin
  5778. zSig1 := 0;
  5779. zSig0 := absA;
  5780. dec( shiftCount, 64 );
  5781. end
  5782. else begin
  5783. zSig1 := absA;
  5784. zSig0 := 0;
  5785. end;
  5786. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5787. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5788. end;
  5789. {$endif FPC_SOFTFLOAT_FLOAT128}
  5790. {*----------------------------------------------------------------------------
  5791. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5792. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5793. | Otherwise, returns 0.
  5794. *----------------------------------------------------------------------------*}
  5795. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5796. begin
  5797. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5798. end;
  5799. {*----------------------------------------------------------------------------
  5800. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5801. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5802. | Otherwise, returns 0.
  5803. *----------------------------------------------------------------------------*}
  5804. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5805. begin
  5806. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5807. end;
  5808. {*----------------------------------------------------------------------------
  5809. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5810. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5811. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5812. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5813. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5814. | the most-significant bit of the extra result, and the other 63 bits of the
  5815. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5816. | were all zero. This extra result is stored in the location pointed to by
  5817. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5818. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5819. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5820. | fixed-point value is shifted right by the number of bits given in `count',
  5821. | and the integer part of the result is returned at the locations pointed to
  5822. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5823. | corrupted as described above, and is returned at the location pointed to by
  5824. | `z2Ptr'.)
  5825. *----------------------------------------------------------------------------*}
  5826. procedure shift128ExtraRightJamming(
  5827. a0: bits64;
  5828. a1: bits64;
  5829. a2: bits64;
  5830. count: int16;
  5831. var z0Ptr: bits64;
  5832. var z1Ptr: bits64;
  5833. var z2Ptr: bits64);
  5834. var
  5835. z0, z1, z2: bits64;
  5836. negCount: int8;
  5837. begin
  5838. negCount := ( - count ) and 63;
  5839. if ( count = 0 ) then
  5840. begin
  5841. z2 := a2;
  5842. z1 := a1;
  5843. z0 := a0;
  5844. end
  5845. else begin
  5846. if ( count < 64 ) then
  5847. begin
  5848. z2 := a1 shl negCount;
  5849. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5850. z0 := a0 shr count;
  5851. end
  5852. else begin
  5853. if ( count = 64 ) then
  5854. begin
  5855. z2 := a1;
  5856. z1 := a0;
  5857. end
  5858. else begin
  5859. a2 := a2 or a1;
  5860. if ( count < 128 ) then
  5861. begin
  5862. z2 := a0 shl negCount;
  5863. z1 := a0 shr ( count and 63 );
  5864. end
  5865. else begin
  5866. if ( count = 128 ) then
  5867. z2 := a0
  5868. else
  5869. z2 := ord( a0 <> 0 );
  5870. z1 := 0;
  5871. end;
  5872. end;
  5873. z0 := 0;
  5874. end;
  5875. z2 := z2 or ord( a2 <> 0 );
  5876. end;
  5877. z2Ptr := z2;
  5878. z1Ptr := z1;
  5879. z0Ptr := z0;
  5880. end;
  5881. {*----------------------------------------------------------------------------
  5882. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5883. | _plus_ the number of bits given in `count'. The shifted result is at most
  5884. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5885. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5886. | shifted off is the most-significant bit of the extra result, and the other
  5887. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5888. | bits shifted off were all zero. This extra result is stored in the location
  5889. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5890. | (This routine makes more sense if `a0' and `a1' are considered to form
  5891. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5892. | point value is shifted right by the number of bits given in `count', and
  5893. | the integer part of the result is returned at the location pointed to by
  5894. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5895. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5896. *----------------------------------------------------------------------------*}
  5897. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5898. var
  5899. z0, z1: bits64;
  5900. negCount: int8;
  5901. begin
  5902. negCount := ( - count ) and 63;
  5903. if ( count = 0 ) then
  5904. begin
  5905. z1 := a1;
  5906. z0 := a0;
  5907. end
  5908. else if ( count < 64 ) then
  5909. begin
  5910. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5911. z0 := a0 shr count;
  5912. end
  5913. else begin
  5914. if ( count = 64 ) then
  5915. begin
  5916. z1 := a0 or ord( a1 <> 0 );
  5917. end
  5918. else begin
  5919. z1 := ord( ( a0 or a1 ) <> 0 );
  5920. end;
  5921. z0 := 0;
  5922. end;
  5923. z1Ptr := z1;
  5924. z0Ptr := z0;
  5925. end;
  5926. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5927. {*----------------------------------------------------------------------------
  5928. | Returns the fraction bits of the extended double-precision floating-point
  5929. | value `a'.
  5930. *----------------------------------------------------------------------------*}
  5931. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5932. begin
  5933. result:=a.low;
  5934. end;
  5935. {*----------------------------------------------------------------------------
  5936. | Returns the exponent bits of the extended double-precision floating-point
  5937. | value `a'.
  5938. *----------------------------------------------------------------------------*}
  5939. function extractFloatx80Exp(a : floatx80): int32;inline;
  5940. begin
  5941. result:=a.high and $7FFF;
  5942. end;
  5943. {*----------------------------------------------------------------------------
  5944. | Returns the sign bit of the extended double-precision floating-point value
  5945. | `a'.
  5946. *----------------------------------------------------------------------------*}
  5947. function extractFloatx80Sign(a : floatx80): flag;inline;
  5948. begin
  5949. result:=a.high shr 15;
  5950. end;
  5951. {*----------------------------------------------------------------------------
  5952. | Normalizes the subnormal extended double-precision floating-point value
  5953. | represented by the denormalized significand `aSig'. The normalized exponent
  5954. | and significand are stored at the locations pointed to by `zExpPtr' and
  5955. | `zSigPtr', respectively.
  5956. *----------------------------------------------------------------------------*}
  5957. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5958. var
  5959. shiftCount: int8;
  5960. begin
  5961. shiftCount := countLeadingZeros64( aSig );
  5962. zSigPtr := aSig shl shiftCount;
  5963. zExpPtr := 1 - shiftCount;
  5964. end;
  5965. {*----------------------------------------------------------------------------
  5966. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5967. | extended double-precision floating-point value, returning the result.
  5968. *----------------------------------------------------------------------------*}
  5969. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5970. var
  5971. z: floatx80;
  5972. begin
  5973. z.low := zSig;
  5974. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5975. result:=z;
  5976. end;
  5977. {*----------------------------------------------------------------------------
  5978. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5979. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5980. | and returns the proper extended double-precision floating-point value
  5981. | corresponding to the abstract input. Ordinarily, the abstract value is
  5982. | rounded and packed into the extended double-precision format, with the
  5983. | inexact exception raised if the abstract input cannot be represented
  5984. | exactly. However, if the abstract value is too large, the overflow and
  5985. | inexact exceptions are raised and an infinity or maximal finite value is
  5986. | returned. If the abstract value is too small, the input value is rounded to
  5987. | a subnormal number, and the underflow and inexact exceptions are raised if
  5988. | the abstract input cannot be represented exactly as a subnormal extended
  5989. | double-precision floating-point number.
  5990. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5991. | number of bits as single or double precision, respectively. Otherwise, the
  5992. | result is rounded to the full precision of the extended double-precision
  5993. | format.
  5994. | The input significand must be normalized or smaller. If the input
  5995. | significand is not normalized, `zExp' must be 0; in that case, the result
  5996. | returned is a subnormal number, and it must not require rounding. The
  5997. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5998. | Floating-Point Arithmetic.
  5999. *----------------------------------------------------------------------------*}
  6000. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6001. var
  6002. roundingMode: TFPURoundingMode;
  6003. roundNearestEven, increment, isTiny: flag;
  6004. roundIncrement, roundMask, roundBits: int64;
  6005. label
  6006. precision80, overflow;
  6007. begin
  6008. roundingMode := softfloat_rounding_mode;
  6009. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  6010. if ( roundingPrecision = 80 ) then
  6011. goto precision80;
  6012. if ( roundingPrecision = 64 ) then
  6013. begin
  6014. roundIncrement := int64( $0000000000000400 );
  6015. roundMask := int64( $00000000000007FF );
  6016. end
  6017. else if ( roundingPrecision = 32 ) then
  6018. begin
  6019. roundIncrement := int64( $0000008000000000 );
  6020. roundMask := int64( $000000FFFFFFFFFF );
  6021. end
  6022. else begin
  6023. goto precision80;
  6024. end;
  6025. zSig0 := zSig0 or ord( zSig1 <> 0 );
  6026. if ( not (roundNearestEven<>0) ) then
  6027. begin
  6028. if ( roundingMode = float_round_to_zero ) then
  6029. begin
  6030. roundIncrement := 0;
  6031. end
  6032. else begin
  6033. roundIncrement := roundMask;
  6034. if ( zSign<>0 ) then
  6035. begin
  6036. if ( roundingMode = float_round_up ) then
  6037. roundIncrement := 0;
  6038. end
  6039. else begin
  6040. if ( roundingMode = float_round_down ) then
  6041. roundIncrement := 0;
  6042. end;
  6043. end;
  6044. end;
  6045. roundBits := zSig0 and roundMask;
  6046. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6047. if ( ( $7FFE < zExp )
  6048. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  6049. ) then begin
  6050. goto overflow;
  6051. end;
  6052. if ( zExp <= 0 ) then begin
  6053. isTiny := ord (
  6054. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6055. or ( zExp < 0 )
  6056. or ( zSig0 <= zSig0 + roundIncrement ) );
  6057. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  6058. zExp := 0;
  6059. roundBits := zSig0 and roundMask;
  6060. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6061. if ( roundBits <> 0 ) then set_inexact_flag;
  6062. inc( zSig0, roundIncrement );
  6063. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6064. roundIncrement := roundMask + 1;
  6065. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6066. roundMask := roundMask or roundIncrement;
  6067. end;
  6068. zSig0 := zSig0 and not roundMask;
  6069. result:=packFloatx80( zSign, zExp, zSig0 );
  6070. exit;
  6071. end;
  6072. end;
  6073. if ( roundBits <> 0 ) then set_inexact_flag;
  6074. inc( zSig0, roundIncrement );
  6075. if ( zSig0 < roundIncrement ) then begin
  6076. inc(zExp);
  6077. zSig0 := bits64( $8000000000000000 );
  6078. end;
  6079. roundIncrement := roundMask + 1;
  6080. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6081. roundMask := roundMask or roundIncrement;
  6082. end;
  6083. zSig0 := zSig0 and not roundMask;
  6084. if ( zSig0 = 0 ) then zExp := 0;
  6085. result:=packFloatx80( zSign, zExp, zSig0 );
  6086. exit;
  6087. precision80:
  6088. increment := ord ( sbits64( zSig1 ) < 0 );
  6089. if ( roundNearestEven = 0 ) then begin
  6090. if ( roundingMode = float_round_to_zero ) then begin
  6091. increment := 0;
  6092. end
  6093. else begin
  6094. if ( zSign <> 0 ) then begin
  6095. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6096. end
  6097. else begin
  6098. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6099. end;
  6100. end;
  6101. end;
  6102. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6103. if ( ( $7FFE < zExp )
  6104. or ( ( zExp = $7FFE )
  6105. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6106. and ( increment <> 0 )
  6107. )
  6108. ) then begin
  6109. roundMask := 0;
  6110. overflow:
  6111. float_raise( [float_flag_overflow,float_flag_inexact] );
  6112. if ( ( roundingMode = float_round_to_zero )
  6113. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6114. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6115. ) then begin
  6116. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6117. exit;
  6118. end;
  6119. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6120. exit;
  6121. end;
  6122. if ( zExp <= 0 ) then begin
  6123. isTiny := ord(
  6124. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6125. or ( zExp < 0 )
  6126. or ( increment = 0 )
  6127. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6128. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6129. zExp := 0;
  6130. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6131. if ( zSig1 <> 0 ) then set_inexact_flag;
  6132. if ( roundNearestEven <> 0 ) then begin
  6133. increment := ord( sbits64( zSig1 ) < 0 );
  6134. end
  6135. else begin
  6136. if ( zSign <> 0 ) then begin
  6137. increment := ord( roundingMode = float_round_down ) and zSig1;
  6138. end
  6139. else begin
  6140. increment := ord( roundingMode = float_round_up ) and zSig1;
  6141. end;
  6142. end;
  6143. if ( increment <> 0 ) then begin
  6144. inc(zSig0);
  6145. zSig0 :=
  6146. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6147. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6148. end;
  6149. result:=packFloatx80( zSign, zExp, zSig0 );
  6150. exit;
  6151. end;
  6152. end;
  6153. if ( zSig1 <> 0 ) then set_inexact_flag;
  6154. if ( increment <> 0 ) then begin
  6155. inc(zSig0);
  6156. if ( zSig0 = 0 ) then begin
  6157. inc(zExp);
  6158. zSig0 := bits64( $8000000000000000 );
  6159. end
  6160. else begin
  6161. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6162. end;
  6163. end
  6164. else begin
  6165. if ( zSig0 = 0 ) then zExp := 0;
  6166. end;
  6167. result:=packFloatx80( zSign, zExp, zSig0 );
  6168. end;
  6169. {*----------------------------------------------------------------------------
  6170. | Takes an abstract floating-point value having sign `zSign', exponent
  6171. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6172. | and returns the proper extended double-precision floating-point value
  6173. | corresponding to the abstract input. This routine is just like
  6174. | `roundAndPackFloatx80' except that the input significand does not have to be
  6175. | normalized.
  6176. *----------------------------------------------------------------------------*}
  6177. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6178. var
  6179. shiftCount: int8;
  6180. begin
  6181. if ( zSig0 = 0 ) then begin
  6182. zSig0 := zSig1;
  6183. zSig1 := 0;
  6184. dec( zExp, 64 );
  6185. end;
  6186. shiftCount := countLeadingZeros64( zSig0 );
  6187. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6188. zExp := zExp - shiftCount;
  6189. result :=
  6190. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6191. end;
  6192. {*----------------------------------------------------------------------------
  6193. | Returns the result of converting the extended double-precision floating-
  6194. | point value `a' to the 32-bit two's complement integer format. The
  6195. | conversion is performed according to the IEC/IEEE Standard for Binary
  6196. | Floating-Point Arithmetic---which means in particular that the conversion
  6197. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6198. | largest positive integer is returned. Otherwise, if the conversion
  6199. | overflows, the largest integer with the same sign as `a' is returned.
  6200. *----------------------------------------------------------------------------*}
  6201. function floatx80_to_int32(a: floatx80): int32;
  6202. var
  6203. aSign: flag;
  6204. aExp, shiftCount: int32;
  6205. aSig: bits64;
  6206. begin
  6207. aSig := extractFloatx80Frac( a );
  6208. aExp := extractFloatx80Exp( a );
  6209. aSign := extractFloatx80Sign( a );
  6210. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6211. shiftCount := $4037 - aExp;
  6212. if ( shiftCount <= 0 ) then shiftCount := 1;
  6213. shift64RightJamming( aSig, shiftCount, aSig );
  6214. result := roundAndPackInt32( aSign, aSig );
  6215. end;
  6216. {*----------------------------------------------------------------------------
  6217. | Returns the result of converting the extended double-precision floating-
  6218. | point value `a' to the 32-bit two's complement integer format. The
  6219. | conversion is performed according to the IEC/IEEE Standard for Binary
  6220. | Floating-Point Arithmetic, except that the conversion is always rounded
  6221. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6222. | Otherwise, if the conversion overflows, the largest integer with the same
  6223. | sign as `a' is returned.
  6224. *----------------------------------------------------------------------------*}
  6225. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6226. var
  6227. aSign: flag;
  6228. aExp, shiftCount: int32;
  6229. aSig, savedASig: bits64;
  6230. z: int32;
  6231. label
  6232. invalid;
  6233. begin
  6234. aSig := extractFloatx80Frac( a );
  6235. aExp := extractFloatx80Exp( a );
  6236. aSign := extractFloatx80Sign( a );
  6237. if ( $401E < aExp ) then begin
  6238. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6239. goto invalid;
  6240. end
  6241. else if ( aExp < $3FFF ) then begin
  6242. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6243. result := 0;
  6244. exit;
  6245. end;
  6246. shiftCount := $403E - aExp;
  6247. savedASig := aSig;
  6248. aSig := aSig shr shiftCount;
  6249. z := aSig;
  6250. if ( aSign <> 0 ) then z := - z;
  6251. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6252. invalid:
  6253. float_raise( float_flag_invalid );
  6254. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6255. exit;
  6256. end;
  6257. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6258. set_inexact_flag;
  6259. end;
  6260. result := z;
  6261. end;
  6262. {*----------------------------------------------------------------------------
  6263. | Returns the result of converting the extended double-precision floating-
  6264. | point value `a' to the 64-bit two's complement integer format. The
  6265. | conversion is performed according to the IEC/IEEE Standard for Binary
  6266. | Floating-Point Arithmetic---which means in particular that the conversion
  6267. | is rounded according to the current rounding mode. If `a' is a NaN,
  6268. | the largest positive integer is returned. Otherwise, if the conversion
  6269. | overflows, the largest integer with the same sign as `a' is returned.
  6270. *----------------------------------------------------------------------------*}
  6271. function floatx80_to_int64(a: floatx80): int64;
  6272. var
  6273. aSign: flag;
  6274. aExp, shiftCount: int32;
  6275. aSig, aSigExtra: bits64;
  6276. begin
  6277. aSig := extractFloatx80Frac( a );
  6278. aExp := extractFloatx80Exp( a );
  6279. aSign := extractFloatx80Sign( a );
  6280. shiftCount := $403E - aExp;
  6281. if ( shiftCount <= 0 ) then begin
  6282. if ( shiftCount <> 0 ) then begin
  6283. float_raise( float_flag_invalid );
  6284. if ( ( aSign = 0 )
  6285. or ( ( aExp = $7FFF )
  6286. and ( aSig <> bits64( $8000000000000000 ) ) )
  6287. ) then begin
  6288. result := $7FFFFFFFFFFFFFFF;
  6289. exit;
  6290. end;
  6291. result := $8000000000000000;
  6292. exit;
  6293. end;
  6294. aSigExtra := 0;
  6295. end
  6296. else begin
  6297. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6298. end;
  6299. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6300. end;
  6301. {*----------------------------------------------------------------------------
  6302. | Returns the result of converting the extended double-precision floating-
  6303. | point value `a' to the 64-bit two's complement integer format. The
  6304. | conversion is performed according to the IEC/IEEE Standard for Binary
  6305. | Floating-Point Arithmetic, except that the conversion is always rounded
  6306. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6307. | Otherwise, if the conversion overflows, the largest integer with the same
  6308. | sign as `a' is returned.
  6309. *----------------------------------------------------------------------------*}
  6310. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6311. var
  6312. aSign: flag;
  6313. aExp, shiftCount: int32;
  6314. aSig: bits64;
  6315. z: int64;
  6316. begin
  6317. aSig := extractFloatx80Frac( a );
  6318. aExp := extractFloatx80Exp( a );
  6319. aSign := extractFloatx80Sign( a );
  6320. shiftCount := aExp - $403E;
  6321. if ( 0 <= shiftCount ) then begin
  6322. aSig := $7FFFFFFFFFFFFFFF;
  6323. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6324. float_raise( float_flag_invalid );
  6325. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6326. result := $7FFFFFFFFFFFFFFF;
  6327. exit;
  6328. end;
  6329. end;
  6330. result := $8000000000000000;
  6331. exit;
  6332. end
  6333. else if ( aExp < $3FFF ) then begin
  6334. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6335. result := 0;
  6336. exit;
  6337. end;
  6338. z := aSig shr ( - shiftCount );
  6339. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6340. set_inexact_flag;
  6341. end;
  6342. if ( aSign <> 0 ) then z := - z;
  6343. result := z;
  6344. end;
  6345. {*----------------------------------------------------------------------------
  6346. | The pattern for a default generated extended double-precision NaN. The
  6347. | `high' and `low' values hold the most- and least-significant bits,
  6348. | respectively.
  6349. *----------------------------------------------------------------------------*}
  6350. const
  6351. floatx80_default_nan_high = $FFFF;
  6352. floatx80_default_nan_low = bits64( $C000000000000000 );
  6353. {*----------------------------------------------------------------------------
  6354. | Returns 1 if the extended double-precision floating-point value `a' is a
  6355. | signaling NaN; otherwise returns 0.
  6356. *----------------------------------------------------------------------------*}
  6357. function floatx80_is_signaling_nan(a : floatx80): flag;
  6358. var
  6359. aLow: bits64;
  6360. begin
  6361. aLow := a.low and not $4000000000000000;
  6362. result := ord(
  6363. ( a.high and $7FFF = $7FFF )
  6364. and ( bits64( aLow shl 1 ) <> 0 )
  6365. and ( a.low = aLow ) );
  6366. end;
  6367. {*----------------------------------------------------------------------------
  6368. | Returns the result of converting the extended double-precision floating-
  6369. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6370. | invalid exception is raised.
  6371. *----------------------------------------------------------------------------*}
  6372. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6373. var
  6374. z: commonNaNT;
  6375. begin
  6376. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6377. z.sign := a.high shr 15;
  6378. z.low := 0;
  6379. z.high := a.low shl 1;
  6380. result := z;
  6381. end;
  6382. {*----------------------------------------------------------------------------
  6383. | Returns 1 if the extended double-precision floating-point value `a' is a
  6384. | NaN; otherwise returns 0.
  6385. *----------------------------------------------------------------------------*}
  6386. function floatx80_is_nan(a : floatx80 ): flag;
  6387. begin
  6388. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6389. end;
  6390. {*----------------------------------------------------------------------------
  6391. | Takes two extended double-precision floating-point values `a' and `b', one
  6392. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6393. | `b' is a signaling NaN, the invalid exception is raised.
  6394. *----------------------------------------------------------------------------*}
  6395. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6396. var
  6397. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6398. label
  6399. returnLargerSignificand;
  6400. begin
  6401. aIsNaN := floatx80_is_nan( a );
  6402. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6403. bIsNaN := floatx80_is_nan( b );
  6404. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6405. a.low := a.low or $C000000000000000;
  6406. b.low := b.low or $C000000000000000;
  6407. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6408. if aIsSignalingNaN <> 0 then begin
  6409. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6410. if bIsNaN <> 0 then result := b else result := a;
  6411. exit;
  6412. end
  6413. else if aIsNaN <>0 then begin
  6414. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6415. result := a;
  6416. exit;
  6417. end;
  6418. returnLargerSignificand:
  6419. if ( a.low < b.low ) then begin
  6420. result := b;
  6421. exit;
  6422. end;
  6423. if ( b.low < a.low ) then begin
  6424. result := a;
  6425. exit;
  6426. end;
  6427. if a.high < b.high then result := a else result := b;
  6428. exit;
  6429. end
  6430. else
  6431. result := b;
  6432. end;
  6433. {*----------------------------------------------------------------------------
  6434. | Returns the result of converting the extended double-precision floating-
  6435. | point value `a' to the single-precision floating-point format. The
  6436. | conversion is performed according to the IEC/IEEE Standard for Binary
  6437. | Floating-Point Arithmetic.
  6438. *----------------------------------------------------------------------------*}
  6439. function floatx80_to_float32(a: floatx80): float32;
  6440. var
  6441. aSign: flag;
  6442. aExp: int32;
  6443. aSig: bits64;
  6444. begin
  6445. aSig := extractFloatx80Frac( a );
  6446. aExp := extractFloatx80Exp( a );
  6447. aSign := extractFloatx80Sign( a );
  6448. if ( aExp = $7FFF ) then begin
  6449. if bits64( aSig shl 1 ) <> 0 then begin
  6450. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6451. exit;
  6452. end;
  6453. result := packFloat32( aSign, $FF, 0 );
  6454. exit;
  6455. end;
  6456. shift64RightJamming( aSig, 33, aSig );
  6457. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6458. result := roundAndPackFloat32( aSign, aExp, aSig );
  6459. end;
  6460. {*----------------------------------------------------------------------------
  6461. | Returns the result of converting the extended double-precision floating-
  6462. | point value `a' to the double-precision floating-point format. The
  6463. | conversion is performed according to the IEC/IEEE Standard for Binary
  6464. | Floating-Point Arithmetic.
  6465. *----------------------------------------------------------------------------*}
  6466. function floatx80_to_float64(a: floatx80): float64;
  6467. var
  6468. aSign: flag;
  6469. aExp: int32;
  6470. aSig, zSig: bits64;
  6471. begin
  6472. aSig := extractFloatx80Frac( a );
  6473. aExp := extractFloatx80Exp( a );
  6474. aSign := extractFloatx80Sign( a );
  6475. if ( aExp = $7FFF ) then begin
  6476. if bits64( aSig shl 1 ) <> 0 then begin
  6477. result:=commonNaNToFloat64(floatx80ToCommonNaN(a));
  6478. exit;
  6479. end;
  6480. result := packFloat64( aSign, $7FF, 0 );
  6481. exit;
  6482. end;
  6483. shift64RightJamming( aSig, 1, zSig );
  6484. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6485. result := roundAndPackFloat64( aSign, aExp, zSig );
  6486. end;
  6487. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6488. {*----------------------------------------------------------------------------
  6489. | Returns the result of converting the extended double-precision floating-
  6490. | point value `a' to the quadruple-precision floating-point format. The
  6491. | conversion is performed according to the IEC/IEEE Standard for Binary
  6492. | Floating-Point Arithmetic.
  6493. *----------------------------------------------------------------------------*}
  6494. function floatx80_to_float128(a: floatx80): float128;
  6495. var
  6496. aSign: flag;
  6497. aExp: int16;
  6498. aSig, zSig0, zSig1: bits64;
  6499. begin
  6500. aSig := extractFloatx80Frac( a );
  6501. aExp := extractFloatx80Exp( a );
  6502. aSign := extractFloatx80Sign( a );
  6503. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6504. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6505. exit;
  6506. end;
  6507. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6508. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6509. end;
  6510. {$endif FPC_SOFTFLOAT_FLOAT128}
  6511. {*----------------------------------------------------------------------------
  6512. | Rounds the extended double-precision floating-point value `a' to an integer,
  6513. | and Returns the result as an extended quadruple-precision floating-point
  6514. | value. The operation is performed according to the IEC/IEEE Standard for
  6515. | Binary Floating-Point Arithmetic.
  6516. *----------------------------------------------------------------------------*}
  6517. function floatx80_round_to_int(a: floatx80): floatx80;
  6518. var
  6519. aSign: flag;
  6520. aExp: int32;
  6521. lastBitMask, roundBitsMask: bits64;
  6522. roundingMode: TFPURoundingMode;
  6523. z: floatx80;
  6524. begin
  6525. aExp := extractFloatx80Exp( a );
  6526. if ( $403E <= aExp ) then begin
  6527. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6528. result := propagateFloatx80NaN( a, a );
  6529. exit;
  6530. end;
  6531. result := a;
  6532. exit;
  6533. end;
  6534. if ( aExp < $3FFF ) then begin
  6535. if ( ( aExp = 0 )
  6536. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6537. result := a;
  6538. exit;
  6539. end;
  6540. set_inexact_flag;
  6541. aSign := extractFloatx80Sign( a );
  6542. case softfloat_rounding_mode of
  6543. float_round_nearest_even:
  6544. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6545. ) then begin
  6546. result :=
  6547. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6548. exit;
  6549. end;
  6550. float_round_down: begin
  6551. if aSign <> 0 then
  6552. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6553. else
  6554. result := packFloatx80( 0, 0, 0 );
  6555. exit;
  6556. end;
  6557. float_round_up: begin
  6558. if aSign <> 0 then
  6559. result := packFloatx80( 1, 0, 0 )
  6560. else
  6561. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6562. exit;
  6563. end;
  6564. end;
  6565. result := packFloatx80( aSign, 0, 0 );
  6566. exit;
  6567. end;
  6568. lastBitMask := 1;
  6569. lastBitMask := lastBitMask shl ( $403E - aExp );
  6570. roundBitsMask := lastBitMask - 1;
  6571. z := a;
  6572. roundingMode := softfloat_rounding_mode;
  6573. if ( roundingMode = float_round_nearest_even ) then begin
  6574. inc( z.low, lastBitMask shr 1 );
  6575. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6576. end
  6577. else if ( roundingMode <> float_round_to_zero ) then begin
  6578. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6579. inc( z.low, roundBitsMask );
  6580. end;
  6581. end;
  6582. z.low := z.low and not roundBitsMask;
  6583. if ( z.low = 0 ) then begin
  6584. inc(z.high);
  6585. z.low := bits64( $8000000000000000 );
  6586. end;
  6587. if ( z.low <> a.low ) then set_inexact_flag;
  6588. result := z;
  6589. end;
  6590. {*----------------------------------------------------------------------------
  6591. | Returns the result of adding the absolute values of the extended double-
  6592. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6593. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6594. | The addition is performed according to the IEC/IEEE Standard for Binary
  6595. | Floating-Point Arithmetic.
  6596. *----------------------------------------------------------------------------*}
  6597. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6598. var
  6599. aExp, bExp, zExp: int32;
  6600. aSig, bSig, zSig0, zSig1: bits64;
  6601. expDiff: int32;
  6602. label
  6603. shiftRight1, roundAndPack;
  6604. begin
  6605. aSig := extractFloatx80Frac( a );
  6606. aExp := extractFloatx80Exp( a );
  6607. bSig := extractFloatx80Frac( b );
  6608. bExp := extractFloatx80Exp( b );
  6609. expDiff := aExp - bExp;
  6610. if ( 0 < expDiff ) then begin
  6611. if ( aExp = $7FFF ) then begin
  6612. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6613. result := propagateFloatx80NaN( a, b );
  6614. exit;
  6615. end;
  6616. result := a;
  6617. exit;
  6618. end;
  6619. if ( bExp = 0 ) then dec(expDiff);
  6620. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6621. zExp := aExp;
  6622. end
  6623. else if ( expDiff < 0 ) then begin
  6624. if ( bExp = $7FFF ) then begin
  6625. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6626. result := propagateFloatx80NaN( a, b );
  6627. exit;
  6628. end;
  6629. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6630. exit;
  6631. end;
  6632. if ( aExp = 0 ) then inc(expDiff);
  6633. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6634. zExp := bExp;
  6635. end
  6636. else begin
  6637. if ( aExp = $7FFF ) then begin
  6638. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6639. result := propagateFloatx80NaN( a, b );
  6640. exit;
  6641. end;
  6642. result := a;
  6643. exit;
  6644. end;
  6645. zSig1 := 0;
  6646. zSig0 := aSig + bSig;
  6647. if ( aExp = 0 ) then begin
  6648. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6649. goto roundAndPack;
  6650. end;
  6651. zExp := aExp;
  6652. goto shiftRight1;
  6653. end;
  6654. zSig0 := aSig + bSig;
  6655. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6656. shiftRight1:
  6657. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6658. zSig0 := zSig0 or $8000000000000000;
  6659. inc(zExp);
  6660. roundAndPack:
  6661. result :=
  6662. roundAndPackFloatx80(
  6663. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6664. end;
  6665. {*----------------------------------------------------------------------------
  6666. | Returns the result of subtracting the absolute values of the extended
  6667. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6668. | difference is negated before being returned. `zSign' is ignored if the
  6669. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6670. | Standard for Binary Floating-Point Arithmetic.
  6671. *----------------------------------------------------------------------------*}
  6672. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6673. var
  6674. aExp, bExp, zExp: int32;
  6675. aSig, bSig, zSig0, zSig1: bits64;
  6676. expDiff: int32;
  6677. z: floatx80;
  6678. label
  6679. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6680. begin
  6681. aSig := extractFloatx80Frac( a );
  6682. aExp := extractFloatx80Exp( a );
  6683. bSig := extractFloatx80Frac( b );
  6684. bExp := extractFloatx80Exp( b );
  6685. expDiff := aExp - bExp;
  6686. if ( 0 < expDiff ) then goto aExpBigger;
  6687. if ( expDiff < 0 ) then goto bExpBigger;
  6688. if ( aExp = $7FFF ) then begin
  6689. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6690. result := propagateFloatx80NaN( a, b );
  6691. exit;
  6692. end;
  6693. float_raise( float_flag_invalid );
  6694. z.low := floatx80_default_nan_low;
  6695. z.high := floatx80_default_nan_high;
  6696. result := z;
  6697. exit;
  6698. end;
  6699. if ( aExp = 0 ) then begin
  6700. aExp := 1;
  6701. bExp := 1;
  6702. end;
  6703. zSig1 := 0;
  6704. if ( bSig < aSig ) then goto aBigger;
  6705. if ( aSig < bSig ) then goto bBigger;
  6706. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6707. exit;
  6708. bExpBigger:
  6709. if ( bExp = $7FFF ) then begin
  6710. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6711. result := propagateFloatx80NaN( a, b );
  6712. exit;
  6713. end;
  6714. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6715. exit;
  6716. end;
  6717. if ( aExp = 0 ) then inc(expDiff);
  6718. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6719. bBigger:
  6720. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6721. zExp := bExp;
  6722. zSign := zSign xor 1;
  6723. goto normalizeRoundAndPack;
  6724. aExpBigger:
  6725. if ( aExp = $7FFF ) then begin
  6726. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6727. result := propagateFloatx80NaN( a, b );
  6728. exit;
  6729. end;
  6730. result := a;
  6731. exit;
  6732. end;
  6733. if ( bExp = 0 ) then dec(expDiff);
  6734. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6735. aBigger:
  6736. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6737. zExp := aExp;
  6738. normalizeRoundAndPack:
  6739. result :=
  6740. normalizeRoundAndPackFloatx80(
  6741. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6742. end;
  6743. {*----------------------------------------------------------------------------
  6744. | Returns the result of adding the extended double-precision floating-point
  6745. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6746. | Standard for Binary Floating-Point Arithmetic.
  6747. *----------------------------------------------------------------------------*}
  6748. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6749. var
  6750. aSign, bSign: flag;
  6751. begin
  6752. aSign := extractFloatx80Sign( a );
  6753. bSign := extractFloatx80Sign( b );
  6754. if ( aSign = bSign ) then begin
  6755. result := addFloatx80Sigs( a, b, aSign );
  6756. end
  6757. else begin
  6758. result := subFloatx80Sigs( a, b, aSign );
  6759. end;
  6760. end;
  6761. {*----------------------------------------------------------------------------
  6762. | Returns the result of subtracting the extended double-precision floating-
  6763. | point values `a' and `b'. The operation is performed according to the
  6764. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6765. *----------------------------------------------------------------------------*}
  6766. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6767. var
  6768. aSign, bSign: flag;
  6769. begin
  6770. aSign := extractFloatx80Sign( a );
  6771. bSign := extractFloatx80Sign( b );
  6772. if ( aSign = bSign ) then begin
  6773. result := subFloatx80Sigs( a, b, aSign );
  6774. end
  6775. else begin
  6776. result := addFloatx80Sigs( a, b, aSign );
  6777. end;
  6778. end;
  6779. {*----------------------------------------------------------------------------
  6780. | Returns the result of multiplying the extended double-precision floating-
  6781. | point values `a' and `b'. The operation is performed according to the
  6782. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6783. *----------------------------------------------------------------------------*}
  6784. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6785. var
  6786. aSign, bSign, zSign: flag;
  6787. aExp, bExp, zExp: int32;
  6788. aSig, bSig, zSig0, zSig1: bits64;
  6789. z: floatx80;
  6790. label
  6791. invalid;
  6792. begin
  6793. aSig := extractFloatx80Frac( a );
  6794. aExp := extractFloatx80Exp( a );
  6795. aSign := extractFloatx80Sign( a );
  6796. bSig := extractFloatx80Frac( b );
  6797. bExp := extractFloatx80Exp( b );
  6798. bSign := extractFloatx80Sign( b );
  6799. zSign := aSign xor bSign;
  6800. if ( aExp = $7FFF ) then begin
  6801. if ( bits64( aSig shl 1 ) <> 0 )
  6802. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6803. result := propagateFloatx80NaN( a, b );
  6804. exit;
  6805. end;
  6806. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6807. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6808. exit;
  6809. end;
  6810. if ( bExp = $7FFF ) then begin
  6811. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6812. result := propagateFloatx80NaN( a, b );
  6813. exit;
  6814. end;
  6815. if ( ( aExp or aSig ) = 0 ) then begin
  6816. invalid:
  6817. float_raise( float_flag_invalid );
  6818. z.low := floatx80_default_nan_low;
  6819. z.high := floatx80_default_nan_high;
  6820. result := z;
  6821. exit;
  6822. end;
  6823. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6824. exit;
  6825. end;
  6826. if ( aExp = 0 ) then begin
  6827. if ( aSig = 0 ) then begin
  6828. result := packFloatx80( zSign, 0, 0 );
  6829. exit;
  6830. end;
  6831. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6832. end;
  6833. if ( bExp = 0 ) then begin
  6834. if ( bSig = 0 ) then begin
  6835. result := packFloatx80( zSign, 0, 0 );
  6836. exit;
  6837. end;
  6838. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6839. end;
  6840. zExp := aExp + bExp - $3FFE;
  6841. mul64To128( aSig, bSig, zSig0, zSig1 );
  6842. if 0 < sbits64( zSig0 ) then begin
  6843. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6844. dec(zExp);
  6845. end;
  6846. result :=
  6847. roundAndPackFloatx80(
  6848. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6849. end;
  6850. {*----------------------------------------------------------------------------
  6851. | Returns the result of dividing the extended double-precision floating-point
  6852. | value `a' by the corresponding value `b'. The operation is performed
  6853. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6854. *----------------------------------------------------------------------------*}
  6855. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6856. var
  6857. aSign, bSign, zSign: flag;
  6858. aExp, bExp, zExp: int32;
  6859. aSig, bSig, zSig0, zSig1: bits64;
  6860. rem0, rem1, rem2, term0, term1, term2: bits64;
  6861. z: floatx80;
  6862. label
  6863. invalid;
  6864. begin
  6865. aSig := extractFloatx80Frac( a );
  6866. aExp := extractFloatx80Exp( a );
  6867. aSign := extractFloatx80Sign( a );
  6868. bSig := extractFloatx80Frac( b );
  6869. bExp := extractFloatx80Exp( b );
  6870. bSign := extractFloatx80Sign( b );
  6871. zSign := aSign xor bSign;
  6872. if ( aExp = $7FFF ) then begin
  6873. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6874. result := propagateFloatx80NaN( a, b );
  6875. exit;
  6876. end;
  6877. if ( bExp = $7FFF ) then begin
  6878. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6879. result := propagateFloatx80NaN( a, b );
  6880. exit;
  6881. end;
  6882. goto invalid;
  6883. end;
  6884. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6885. exit;
  6886. end;
  6887. if ( bExp = $7FFF ) then begin
  6888. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6889. result := propagateFloatx80NaN( a, b );
  6890. exit;
  6891. end;
  6892. result := packFloatx80( zSign, 0, 0 );
  6893. exit;
  6894. end;
  6895. if ( bExp = 0 ) then begin
  6896. if ( bSig = 0 ) then begin
  6897. if ( ( aExp or aSig ) = 0 ) then begin
  6898. invalid:
  6899. float_raise( float_flag_invalid );
  6900. z.low := floatx80_default_nan_low;
  6901. z.high := floatx80_default_nan_high;
  6902. result := z;
  6903. exit;
  6904. end;
  6905. float_raise( float_flag_divbyzero );
  6906. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6907. exit;
  6908. end;
  6909. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6910. end;
  6911. if ( aExp = 0 ) then begin
  6912. if ( aSig = 0 ) then begin
  6913. result := packFloatx80( zSign, 0, 0 );
  6914. exit;
  6915. end;
  6916. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6917. end;
  6918. zExp := aExp - bExp + $3FFE;
  6919. rem1 := 0;
  6920. if ( bSig <= aSig ) then begin
  6921. shift128Right( aSig, 0, 1, aSig, rem1 );
  6922. inc(zExp);
  6923. end;
  6924. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6925. mul64To128( bSig, zSig0, term0, term1 );
  6926. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6927. while ( sbits64( rem0 ) < 0 ) do begin
  6928. dec(zSig0);
  6929. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6930. end;
  6931. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6932. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6933. mul64To128( bSig, zSig1, term1, term2 );
  6934. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6935. while ( sbits64( rem1 ) < 0 ) do begin
  6936. dec(zSig1);
  6937. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6938. end;
  6939. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6940. end;
  6941. result :=
  6942. roundAndPackFloatx80(
  6943. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6944. end;
  6945. {*----------------------------------------------------------------------------
  6946. | Returns the remainder of the extended double-precision floating-point value
  6947. | `a' with respect to the corresponding value `b'. The operation is performed
  6948. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6949. *----------------------------------------------------------------------------*}
  6950. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6951. var
  6952. aSign, zSign: flag;
  6953. aExp, bExp, expDiff: int32;
  6954. aSig0, aSig1, bSig: bits64;
  6955. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6956. z: floatx80;
  6957. label
  6958. invalid;
  6959. begin
  6960. aSig0 := extractFloatx80Frac( a );
  6961. aExp := extractFloatx80Exp( a );
  6962. aSign := extractFloatx80Sign( a );
  6963. bSig := extractFloatx80Frac( b );
  6964. bExp := extractFloatx80Exp( b );
  6965. if ( aExp = $7FFF ) then begin
  6966. if ( bits64( aSig0 shl 1 ) <> 0 )
  6967. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6968. result := propagateFloatx80NaN( a, b );
  6969. exit;
  6970. end;
  6971. goto invalid;
  6972. end;
  6973. if ( bExp = $7FFF ) then begin
  6974. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6975. result := propagateFloatx80NaN( a, b );
  6976. exit;
  6977. end;
  6978. result := a;
  6979. exit;
  6980. end;
  6981. if ( bExp = 0 ) then begin
  6982. if ( bSig = 0 ) then begin
  6983. invalid:
  6984. float_raise( float_flag_invalid );
  6985. z.low := floatx80_default_nan_low;
  6986. z.high := floatx80_default_nan_high;
  6987. result := z;
  6988. exit;
  6989. end;
  6990. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6991. end;
  6992. if ( aExp = 0 ) then begin
  6993. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6994. result := a;
  6995. exit;
  6996. end;
  6997. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6998. end;
  6999. bSig := bSig or $8000000000000000;
  7000. zSign := aSign;
  7001. expDiff := aExp - bExp;
  7002. aSig1 := 0;
  7003. if ( expDiff < 0 ) then begin
  7004. if ( expDiff < -1 ) then begin
  7005. result := a;
  7006. exit;
  7007. end;
  7008. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  7009. expDiff := 0;
  7010. end;
  7011. q := ord( bSig <= aSig0 );
  7012. if ( q <> 0 ) then dec( aSig0, bSig );
  7013. dec( expDiff, 64 );
  7014. while ( 0 < expDiff ) do begin
  7015. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7016. if ( 2 < q ) then q := q - 2 else q := 0;
  7017. mul64To128( bSig, q, term0, term1 );
  7018. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7019. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  7020. dec( expDiff, 62 );
  7021. end;
  7022. inc( expDiff, 64 );
  7023. if ( 0 < expDiff ) then begin
  7024. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7025. if ( 2 < q ) then q:= q - 2 else q := 0;
  7026. q := q shr ( 64 - expDiff );
  7027. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  7028. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7029. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  7030. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  7031. inc(q);
  7032. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7033. end;
  7034. end
  7035. else begin
  7036. term1 := 0;
  7037. term0 := bSig;
  7038. end;
  7039. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  7040. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7041. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7042. and ( q and 1 <> 0 ) )
  7043. then begin
  7044. aSig0 := alternateASig0;
  7045. aSig1 := alternateASig1;
  7046. zSign := ord( zSign = 0 );
  7047. end;
  7048. result :=
  7049. normalizeRoundAndPackFloatx80(
  7050. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  7051. end;
  7052. {*----------------------------------------------------------------------------
  7053. | Returns the square root of the extended double-precision floating-point
  7054. | value `a'. The operation is performed according to the IEC/IEEE Standard
  7055. | for Binary Floating-Point Arithmetic.
  7056. *----------------------------------------------------------------------------*}
  7057. function floatx80_sqrt(a: floatx80): floatx80;
  7058. var
  7059. aSign: flag;
  7060. aExp, zExp: int32;
  7061. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7062. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7063. z: floatx80;
  7064. label
  7065. invalid;
  7066. begin
  7067. aSig0 := extractFloatx80Frac( a );
  7068. aExp := extractFloatx80Exp( a );
  7069. aSign := extractFloatx80Sign( a );
  7070. if ( aExp = $7FFF ) then begin
  7071. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7072. result := propagateFloatx80NaN( a, a );
  7073. exit;
  7074. end;
  7075. if ( aSign = 0 ) then begin
  7076. result := a;
  7077. exit;
  7078. end;
  7079. goto invalid;
  7080. end;
  7081. if ( aSign <> 0 ) then begin
  7082. if ( ( aExp or aSig0 ) = 0 ) then begin
  7083. result := a;
  7084. exit;
  7085. end;
  7086. invalid:
  7087. float_raise( float_flag_invalid );
  7088. z.low := floatx80_default_nan_low;
  7089. z.high := floatx80_default_nan_high;
  7090. result := z;
  7091. exit;
  7092. end;
  7093. if ( aExp = 0 ) then begin
  7094. if ( aSig0 = 0 ) then begin
  7095. result := packFloatx80( 0, 0, 0 );
  7096. exit;
  7097. end;
  7098. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7099. end;
  7100. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  7101. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  7102. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7103. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7104. doubleZSig0 := zSig0 shl 1;
  7105. mul64To128( zSig0, zSig0, term0, term1 );
  7106. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7107. while ( sbits64( rem0 ) < 0 ) do begin
  7108. dec(zSig0);
  7109. dec( doubleZSig0, 2 );
  7110. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7111. end;
  7112. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7113. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7114. if ( zSig1 = 0 ) then zSig1 := 1;
  7115. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7116. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7117. mul64To128( zSig1, zSig1, term2, term3 );
  7118. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7119. while ( sbits64( rem1 ) < 0 ) do begin
  7120. dec(zSig1);
  7121. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7122. term3 := term3 or 1;
  7123. term2 := term2 or doubleZSig0;
  7124. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7125. end;
  7126. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7127. end;
  7128. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7129. zSig0 := zSig0 or doubleZSig0;
  7130. result :=
  7131. roundAndPackFloatx80(
  7132. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7133. end;
  7134. {*----------------------------------------------------------------------------
  7135. | Returns 1 if the extended double-precision floating-point value `a' is
  7136. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7137. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7138. | Arithmetic.
  7139. *----------------------------------------------------------------------------*}
  7140. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7141. begin
  7142. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7143. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7144. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7145. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7146. ) then begin
  7147. if ( floatx80_is_signaling_nan( a )
  7148. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7149. float_raise( float_flag_invalid );
  7150. end;
  7151. result := 0;
  7152. exit;
  7153. end;
  7154. result := ord(
  7155. ( a.low = b.low )
  7156. and ( ( a.high = b.high )
  7157. or ( ( a.low = 0 )
  7158. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7159. ) );
  7160. end;
  7161. {*----------------------------------------------------------------------------
  7162. | Returns 1 if the extended double-precision floating-point value `a' is
  7163. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7164. | comparison is performed according to the IEC/IEEE Standard for Binary
  7165. | Floating-Point Arithmetic.
  7166. *----------------------------------------------------------------------------*}
  7167. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7168. var
  7169. aSign, bSign: flag;
  7170. begin
  7171. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7172. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7173. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7174. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7175. then begin
  7176. float_raise( float_flag_invalid );
  7177. result := 0;
  7178. exit;
  7179. end;
  7180. aSign := extractFloatx80Sign( a );
  7181. bSign := extractFloatx80Sign( b );
  7182. if ( aSign <> bSign ) then begin
  7183. result := ord(
  7184. ( aSign <> 0 )
  7185. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7186. exit;
  7187. end;
  7188. if aSign<>0 then
  7189. result := le128( b.high, b.low, a.high, a.low )
  7190. else
  7191. result := le128( a.high, a.low, b.high, b.low );
  7192. end;
  7193. {*----------------------------------------------------------------------------
  7194. | Returns 1 if the extended double-precision floating-point value `a' is
  7195. | less than the corresponding value `b', and 0 otherwise. The comparison
  7196. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7197. | Arithmetic.
  7198. *----------------------------------------------------------------------------*}
  7199. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7200. var
  7201. aSign, bSign: flag;
  7202. begin
  7203. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7204. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7205. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7206. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7207. then begin
  7208. float_raise( float_flag_invalid );
  7209. result := 0;
  7210. exit;
  7211. end;
  7212. aSign := extractFloatx80Sign( a );
  7213. bSign := extractFloatx80Sign( b );
  7214. if ( aSign <> bSign ) then begin
  7215. result := ord(
  7216. ( aSign <> 0 )
  7217. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7218. exit;
  7219. end;
  7220. if aSign <> 0 then
  7221. result := lt128( b.high, b.low, a.high, a.low )
  7222. else
  7223. result := lt128( a.high, a.low, b.high, b.low );
  7224. end;
  7225. {*----------------------------------------------------------------------------
  7226. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7227. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7228. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7229. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7230. *----------------------------------------------------------------------------*}
  7231. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7232. begin
  7233. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7234. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7235. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7236. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7237. then begin
  7238. float_raise( float_flag_invalid );
  7239. result := 0;
  7240. exit;
  7241. end;
  7242. result := ord(
  7243. ( a.low = b.low )
  7244. and ( ( a.high = b.high )
  7245. or ( ( a.low = 0 )
  7246. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7247. ) );
  7248. end;
  7249. {*----------------------------------------------------------------------------
  7250. | Returns 1 if the extended double-precision floating-point value `a' is less
  7251. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7252. | do not cause an exception. Otherwise, the comparison is performed according
  7253. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7254. *----------------------------------------------------------------------------*}
  7255. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7256. var
  7257. aSign, bSign: flag;
  7258. begin
  7259. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7260. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7261. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7262. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7263. then begin
  7264. if ( floatx80_is_signaling_nan( a )
  7265. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7266. float_raise( float_flag_invalid );
  7267. end;
  7268. result := 0;
  7269. exit;
  7270. end;
  7271. aSign := extractFloatx80Sign( a );
  7272. bSign := extractFloatx80Sign( b );
  7273. if ( aSign <> bSign ) then begin
  7274. result := ord(
  7275. ( aSign <> 0 )
  7276. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7277. exit;
  7278. end;
  7279. if aSign <> 0 then
  7280. result := le128( b.high, b.low, a.high, a.low )
  7281. else
  7282. result := le128( a.high, a.low, b.high, b.low );
  7283. end;
  7284. {*----------------------------------------------------------------------------
  7285. | Returns 1 if the extended double-precision floating-point value `a' is less
  7286. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7287. | an exception. Otherwise, the comparison is performed according to the
  7288. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7289. *----------------------------------------------------------------------------*}
  7290. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7291. var
  7292. aSign, bSign: flag;
  7293. begin
  7294. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7295. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7296. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7297. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7298. then begin
  7299. if ( floatx80_is_signaling_nan( a )
  7300. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7301. float_raise( float_flag_invalid );
  7302. end;
  7303. result := 0;
  7304. exit;
  7305. end;
  7306. aSign := extractFloatx80Sign( a );
  7307. bSign := extractFloatx80Sign( b );
  7308. if ( aSign <> bSign ) then begin
  7309. result := ord(
  7310. ( aSign <> 0 )
  7311. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7312. exit;
  7313. end;
  7314. if aSign <> 0 then
  7315. result := lt128( b.high, b.low, a.high, a.low )
  7316. else
  7317. result := lt128( a.high, a.low, b.high, b.low );
  7318. end;
  7319. {$endif FPC_SOFTFLOAT_FLOATX80}
  7320. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7321. {*----------------------------------------------------------------------------
  7322. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7323. | floating-point value `a'.
  7324. *----------------------------------------------------------------------------*}
  7325. function extractFloat128Frac1(a : float128): bits64;
  7326. begin
  7327. result:=a.low;
  7328. end;
  7329. {*----------------------------------------------------------------------------
  7330. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7331. | floating-point value `a'.
  7332. *----------------------------------------------------------------------------*}
  7333. function extractFloat128Frac0(a : float128): bits64;
  7334. begin
  7335. result:=a.high and int64($0000FFFFFFFFFFFF);
  7336. end;
  7337. {*----------------------------------------------------------------------------
  7338. | Returns the exponent bits of the quadruple-precision floating-point value
  7339. | `a'.
  7340. *----------------------------------------------------------------------------*}
  7341. function extractFloat128Exp(a : float128): int32;
  7342. begin
  7343. result:=( a.high shr 48 ) and $7FFF;
  7344. end;
  7345. {*----------------------------------------------------------------------------
  7346. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7347. *----------------------------------------------------------------------------*}
  7348. function extractFloat128Sign(a : float128): flag;
  7349. begin
  7350. result:=a.high shr 63;
  7351. end;
  7352. {*----------------------------------------------------------------------------
  7353. | Normalizes the subnormal quadruple-precision floating-point value
  7354. | represented by the denormalized significand formed by the concatenation of
  7355. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7356. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7357. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7358. | least significant 64 bits of the normalized significand are stored at the
  7359. | location pointed to by `zSig1Ptr'.
  7360. *----------------------------------------------------------------------------*}
  7361. procedure normalizeFloat128Subnormal(
  7362. aSig0: bits64;
  7363. aSig1: bits64;
  7364. var zExpPtr: int32;
  7365. var zSig0Ptr: bits64;
  7366. var zSig1Ptr: bits64);
  7367. var
  7368. shiftCount: int8;
  7369. begin
  7370. if ( aSig0 = 0 ) then
  7371. begin
  7372. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7373. if ( shiftCount < 0 ) then
  7374. begin
  7375. zSig0Ptr := aSig1 shr ( - shiftCount );
  7376. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7377. end
  7378. else begin
  7379. zSig0Ptr := aSig1 shl shiftCount;
  7380. zSig1Ptr := 0;
  7381. end;
  7382. zExpPtr := - shiftCount - 63;
  7383. end
  7384. else begin
  7385. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7386. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7387. zExpPtr := 1 - shiftCount;
  7388. end;
  7389. end;
  7390. {*----------------------------------------------------------------------------
  7391. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7392. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7393. | floating-point value, returning the result. After being shifted into the
  7394. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7395. | added together to form the most significant 32 bits of the result. This
  7396. | means that any integer portion of `zSig0' will be added into the exponent.
  7397. | Since a properly normalized significand will have an integer portion equal
  7398. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7399. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7400. | significand.
  7401. *----------------------------------------------------------------------------*}
  7402. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7403. var
  7404. z: float128;
  7405. begin
  7406. z.low := zSig1;
  7407. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7408. result:=z;
  7409. end;
  7410. {*----------------------------------------------------------------------------
  7411. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7412. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7413. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7414. | corresponding to the abstract input. Ordinarily, the abstract value is
  7415. | simply rounded and packed into the quadruple-precision format, with the
  7416. | inexact exception raised if the abstract input cannot be represented
  7417. | exactly. However, if the abstract value is too large, the overflow and
  7418. | inexact exceptions are raised and an infinity or maximal finite value is
  7419. | returned. If the abstract value is too small, the input value is rounded to
  7420. | a subnormal number, and the underflow and inexact exceptions are raised if
  7421. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7422. | precision floating-point number.
  7423. | The input significand must be normalized or smaller. If the input
  7424. | significand is not normalized, `zExp' must be 0; in that case, the result
  7425. | returned is a subnormal number, and it must not require rounding. In the
  7426. | usual case that the input significand is normalized, `zExp' must be 1 less
  7427. | than the ``true'' floating-point exponent. The handling of underflow and
  7428. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7429. *----------------------------------------------------------------------------*}
  7430. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7431. var
  7432. roundingMode: TFPURoundingMode;
  7433. roundNearestEven, increment, isTiny: flag;
  7434. begin
  7435. roundingMode := softfloat_rounding_mode;
  7436. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7437. increment := ord( sbits64(zSig2) < 0 );
  7438. if ( roundNearestEven=0 ) then
  7439. begin
  7440. if ( roundingMode = float_round_to_zero ) then
  7441. begin
  7442. increment := 0;
  7443. end
  7444. else begin
  7445. if ( zSign<>0 ) then
  7446. begin
  7447. increment := ord( roundingMode = float_round_down ) and zSig2;
  7448. end
  7449. else begin
  7450. increment := ord( roundingMode = float_round_up ) and zSig2;
  7451. end;
  7452. end;
  7453. end;
  7454. if ( $7FFD <= bits32(zExp) ) then
  7455. begin
  7456. if ( ord( $7FFD < zExp )
  7457. or ( ord( zExp = $7FFD )
  7458. and eq128(
  7459. int64( $0001FFFFFFFFFFFF ),
  7460. bits64( $FFFFFFFFFFFFFFFF ),
  7461. zSig0,
  7462. zSig1
  7463. )
  7464. and increment
  7465. )
  7466. )<>0 then
  7467. begin
  7468. float_raise( [float_flag_overflow,float_flag_inexact] );
  7469. if ( ord( roundingMode = float_round_to_zero )
  7470. or ( zSign and ord( roundingMode = float_round_up ) )
  7471. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7472. )<>0 then
  7473. begin
  7474. result :=
  7475. packFloat128(
  7476. zSign,
  7477. $7FFE,
  7478. int64( $0000FFFFFFFFFFFF ),
  7479. bits64( $FFFFFFFFFFFFFFFF )
  7480. );
  7481. exit;
  7482. end;
  7483. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7484. exit;
  7485. end;
  7486. if ( zExp < 0 ) then
  7487. begin
  7488. isTiny :=
  7489. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7490. or ( zExp < -1 )
  7491. or not( increment<>0 )
  7492. or boolean(lt128(
  7493. zSig0,
  7494. zSig1,
  7495. int64( $0001FFFFFFFFFFFF ),
  7496. bits64( $FFFFFFFFFFFFFFFF )
  7497. )));
  7498. shift128ExtraRightJamming(
  7499. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7500. zExp := 0;
  7501. if ( isTiny and zSig2 )<>0 then
  7502. float_raise( float_flag_underflow );
  7503. if ( roundNearestEven<>0 ) then
  7504. begin
  7505. increment := ord( sbits64(zSig2) < 0 );
  7506. end
  7507. else begin
  7508. if ( zSign<>0 ) then
  7509. begin
  7510. increment := ord( roundingMode = float_round_down ) and zSig2;
  7511. end
  7512. else begin
  7513. increment := ord( roundingMode = float_round_up ) and zSig2;
  7514. end;
  7515. end;
  7516. end;
  7517. end;
  7518. if ( zSig2<>0 ) then
  7519. set_inexact_flag;
  7520. if ( increment<>0 ) then
  7521. begin
  7522. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7523. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7524. end
  7525. else begin
  7526. if ( ( zSig0 or zSig1 ) = 0 ) then
  7527. zExp := 0;
  7528. end;
  7529. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7530. end;
  7531. {*----------------------------------------------------------------------------
  7532. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7533. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7534. | returns the proper quadruple-precision floating-point value corresponding
  7535. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7536. | except that the input significand has fewer bits and does not have to be
  7537. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7538. | point exponent.
  7539. *----------------------------------------------------------------------------*}
  7540. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7541. var
  7542. shiftCount: int8;
  7543. zSig2: bits64;
  7544. begin
  7545. if ( zSig0 = 0 ) then
  7546. begin
  7547. zSig0 := zSig1;
  7548. zSig1 := 0;
  7549. dec(zExp, 64);
  7550. end;
  7551. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7552. if ( 0 <= shiftCount ) then
  7553. begin
  7554. zSig2 := 0;
  7555. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7556. end
  7557. else begin
  7558. shift128ExtraRightJamming(
  7559. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7560. end;
  7561. dec(zExp, shiftCount);
  7562. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7563. end;
  7564. {*----------------------------------------------------------------------------
  7565. | Returns the result of converting the quadruple-precision floating-point
  7566. | value `a' to the 32-bit two's complement integer format. The conversion
  7567. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7568. | Arithmetic---which means in particular that the conversion is rounded
  7569. | according to the current rounding mode. If `a' is a NaN, the largest
  7570. | positive integer is returned. Otherwise, if the conversion overflows, the
  7571. | largest integer with the same sign as `a' is returned.
  7572. *----------------------------------------------------------------------------*}
  7573. function float128_to_int32(a: float128): int32;
  7574. var
  7575. aSign: flag;
  7576. aExp, shiftCount: int32;
  7577. aSig0, aSig1: bits64;
  7578. begin
  7579. aSig1 := extractFloat128Frac1( a );
  7580. aSig0 := extractFloat128Frac0( a );
  7581. aExp := extractFloat128Exp( a );
  7582. aSign := extractFloat128Sign( a );
  7583. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7584. aSign := 0;
  7585. if ( aExp<>0 ) then
  7586. aSig0 := aSig0 or int64( $0001000000000000 );
  7587. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7588. shiftCount := $4028 - aExp;
  7589. if ( 0 < shiftCount ) then
  7590. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7591. result := roundAndPackInt32( aSign, aSig0 );
  7592. end;
  7593. {*----------------------------------------------------------------------------
  7594. | Returns the result of converting the quadruple-precision floating-point
  7595. | value `a' to the 32-bit two's complement integer format. The conversion
  7596. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7597. | Arithmetic, except that the conversion is always rounded toward zero. If
  7598. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7599. | conversion overflows, the largest integer with the same sign as `a' is
  7600. | returned.
  7601. *----------------------------------------------------------------------------*}
  7602. function float128_to_int32_round_to_zero(a: float128): int32;
  7603. var
  7604. aSign: flag;
  7605. aExp, shiftCount: int32;
  7606. aSig0, aSig1, savedASig: bits64;
  7607. z: int32;
  7608. label
  7609. invalid;
  7610. begin
  7611. aSig1 := extractFloat128Frac1( a );
  7612. aSig0 := extractFloat128Frac0( a );
  7613. aExp := extractFloat128Exp( a );
  7614. aSign := extractFloat128Sign( a );
  7615. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7616. if ( $401E < aExp ) then
  7617. begin
  7618. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7619. aSign := 0;
  7620. goto invalid;
  7621. end
  7622. else if ( aExp < $3FFF ) then
  7623. begin
  7624. if ( aExp or aSig0 )<>0 then
  7625. set_inexact_flag;
  7626. result := 0;
  7627. exit;
  7628. end;
  7629. aSig0 := aSig0 or int64( $0001000000000000 );
  7630. shiftCount := $402F - aExp;
  7631. savedASig := aSig0;
  7632. aSig0 := aSig0 shr shiftCount;
  7633. z := aSig0;
  7634. if ( aSign )<>0 then
  7635. z := - z;
  7636. if ( ord( z < 0 ) xor aSign )<>0 then
  7637. begin
  7638. invalid:
  7639. float_raise( float_flag_invalid );
  7640. if aSign<>0 then
  7641. result:= int32( $80000000 )
  7642. else
  7643. result:=$7FFFFFFF;
  7644. exit;
  7645. end;
  7646. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7647. begin
  7648. set_inexact_flag;
  7649. end;
  7650. result := z;
  7651. end;
  7652. {*----------------------------------------------------------------------------
  7653. | Returns the result of converting the quadruple-precision floating-point
  7654. | value `a' to the 64-bit two's complement integer format. The conversion
  7655. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7656. | Arithmetic---which means in particular that the conversion is rounded
  7657. | according to the current rounding mode. If `a' is a NaN, the largest
  7658. | positive integer is returned. Otherwise, if the conversion overflows, the
  7659. | largest integer with the same sign as `a' is returned.
  7660. *----------------------------------------------------------------------------*}
  7661. function float128_to_int64(a: float128): int64;
  7662. var
  7663. aSign: flag;
  7664. aExp, shiftCount: int32;
  7665. aSig0, aSig1: bits64;
  7666. begin
  7667. aSig1 := extractFloat128Frac1( a );
  7668. aSig0 := extractFloat128Frac0( a );
  7669. aExp := extractFloat128Exp( a );
  7670. aSign := extractFloat128Sign( a );
  7671. if ( aExp<>0 ) then
  7672. aSig0 := aSig0 or int64( $0001000000000000 );
  7673. shiftCount := $402F - aExp;
  7674. if ( shiftCount <= 0 ) then
  7675. begin
  7676. if ( $403E < aExp ) then
  7677. begin
  7678. float_raise( float_flag_invalid );
  7679. if ( (aSign=0)
  7680. or ( ( aExp = $7FFF )
  7681. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7682. )
  7683. ) then
  7684. begin
  7685. result := int64( $7FFFFFFFFFFFFFFF );
  7686. exit;
  7687. end;
  7688. result := int64( $8000000000000000 );
  7689. exit;
  7690. end;
  7691. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7692. end
  7693. else begin
  7694. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7695. end;
  7696. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7697. end;
  7698. {*----------------------------------------------------------------------------
  7699. | Returns the result of converting the quadruple-precision floating-point
  7700. | value `a' to the 64-bit two's complement integer format. The conversion
  7701. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7702. | Arithmetic, except that the conversion is always rounded toward zero.
  7703. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7704. | the conversion overflows, the largest integer with the same sign as `a' is
  7705. | returned.
  7706. *----------------------------------------------------------------------------*}
  7707. function float128_to_int64_round_to_zero(a: float128): int64;
  7708. var
  7709. aSign: flag;
  7710. aExp, shiftCount: int32;
  7711. aSig0, aSig1: bits64;
  7712. z: int64;
  7713. begin
  7714. aSig1 := extractFloat128Frac1( a );
  7715. aSig0 := extractFloat128Frac0( a );
  7716. aExp := extractFloat128Exp( a );
  7717. aSign := extractFloat128Sign( a );
  7718. if ( aExp<>0 ) then
  7719. aSig0 := aSig0 or int64( $0001000000000000 );
  7720. shiftCount := aExp - $402F;
  7721. if ( 0 < shiftCount ) then
  7722. begin
  7723. if ( $403E <= aExp ) then
  7724. begin
  7725. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7726. if ( ( a.high = bits64( $C03E000000000000 ) )
  7727. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7728. begin
  7729. if ( aSig1<>0 ) then
  7730. set_inexact_flag;
  7731. end
  7732. else begin
  7733. float_raise( float_flag_invalid );
  7734. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7735. begin
  7736. result := int64( $7FFFFFFFFFFFFFFF );
  7737. exit;
  7738. end;
  7739. end;
  7740. result := int64( $8000000000000000 );
  7741. exit;
  7742. end;
  7743. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7744. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7745. begin
  7746. set_inexact_flag;
  7747. end;
  7748. end
  7749. else begin
  7750. if ( aExp < $3FFF ) then
  7751. begin
  7752. if ( aExp or aSig0 or aSig1 )<>0 then
  7753. begin
  7754. set_inexact_flag;
  7755. end;
  7756. result := 0;
  7757. exit;
  7758. end;
  7759. z := aSig0 shr ( - shiftCount );
  7760. if ( (aSig1<>0)
  7761. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7762. begin
  7763. set_inexact_flag;
  7764. end;
  7765. end;
  7766. if ( aSign<>0 ) then
  7767. z := - z;
  7768. result := z;
  7769. end;
  7770. {*----------------------------------------------------------------------------
  7771. | Returns the result of converting the quadruple-precision floating-point
  7772. | value `a' to the single-precision floating-point format. The conversion
  7773. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7774. | Arithmetic.
  7775. *----------------------------------------------------------------------------*}
  7776. function float128_to_float32(a: float128): float32;
  7777. var
  7778. aSign: flag;
  7779. aExp: int32;
  7780. aSig0, aSig1: bits64;
  7781. zSig: bits32;
  7782. begin
  7783. aSig1 := extractFloat128Frac1( a );
  7784. aSig0 := extractFloat128Frac0( a );
  7785. aExp := extractFloat128Exp( a );
  7786. aSign := extractFloat128Sign( a );
  7787. if ( aExp = $7FFF ) then
  7788. begin
  7789. if ( aSig0 or aSig1 )<>0 then
  7790. begin
  7791. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7792. exit;
  7793. end;
  7794. result := packFloat32( aSign, $FF, 0 );
  7795. exit;
  7796. end;
  7797. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7798. shift64RightJamming( aSig0, 18, aSig0 );
  7799. zSig := aSig0;
  7800. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7801. begin
  7802. zSig := zSig or $40000000;
  7803. dec(aExp,$3F81);
  7804. end;
  7805. result := roundAndPackFloat32( aSign, aExp, zSig );
  7806. end;
  7807. {*----------------------------------------------------------------------------
  7808. | Returns the result of converting the quadruple-precision floating-point
  7809. | value `a' to the double-precision floating-point format. The conversion
  7810. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7811. | Arithmetic.
  7812. *----------------------------------------------------------------------------*}
  7813. function float128_to_float64(a: float128): float64;
  7814. var
  7815. aSign: flag;
  7816. aExp: int32;
  7817. aSig0, aSig1: bits64;
  7818. begin
  7819. aSig1 := extractFloat128Frac1( a );
  7820. aSig0 := extractFloat128Frac0( a );
  7821. aExp := extractFloat128Exp( a );
  7822. aSign := extractFloat128Sign( a );
  7823. if ( aExp = $7FFF ) then
  7824. begin
  7825. if ( aSig0 or aSig1 )<>0 then
  7826. begin
  7827. result:=commonNaNToFloat64(float128ToCommonNaN(a));
  7828. exit;
  7829. end;
  7830. result:=packFloat64( aSign, $7FF, 0);
  7831. exit;
  7832. end;
  7833. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7834. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7835. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7836. begin
  7837. aSig0 := aSig0 or int64( $4000000000000000 );
  7838. dec(aExp,$3C01);
  7839. end;
  7840. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7841. end;
  7842. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7843. {*----------------------------------------------------------------------------
  7844. | Returns the result of converting the quadruple-precision floating-point
  7845. | value `a' to the extended double-precision floating-point format. The
  7846. | conversion is performed according to the IEC/IEEE Standard for Binary
  7847. | Floating-Point Arithmetic.
  7848. *----------------------------------------------------------------------------*}
  7849. function float128_to_floatx80(a: float128): floatx80;
  7850. var
  7851. aSign: flag;
  7852. aExp: int32;
  7853. aSig0, aSig1: bits64;
  7854. begin
  7855. aSig1 := extractFloat128Frac1( a );
  7856. aSig0 := extractFloat128Frac0( a );
  7857. aExp := extractFloat128Exp( a );
  7858. aSign := extractFloat128Sign( a );
  7859. if ( aExp = $7FFF ) then begin
  7860. if ( aSig0 or aSig1 <> 0 ) then begin
  7861. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7862. exit;
  7863. end;
  7864. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7865. exit;
  7866. end;
  7867. if ( aExp = 0 ) then begin
  7868. if ( ( aSig0 or aSig1 ) = 0 ) then
  7869. begin
  7870. result := packFloatx80( aSign, 0, 0 );
  7871. exit;
  7872. end;
  7873. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7874. end
  7875. else begin
  7876. aSig0 := aSig0 or int64( $0001000000000000 );
  7877. end;
  7878. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7879. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7880. end;
  7881. {$endif FPC_SOFTFLOAT_FLOATX80}
  7882. {*----------------------------------------------------------------------------
  7883. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7884. | Returns the result as a quadruple-precision floating-point value. The
  7885. | operation is performed according to the IEC/IEEE Standard for Binary
  7886. | Floating-Point Arithmetic.
  7887. *----------------------------------------------------------------------------*}
  7888. function float128_round_to_int(a: float128): float128;
  7889. var
  7890. aSign: flag;
  7891. aExp: int32;
  7892. lastBitMask, roundBitsMask: bits64;
  7893. roundingMode: TFPURoundingMode;
  7894. z: float128;
  7895. begin
  7896. aExp := extractFloat128Exp( a );
  7897. if ( $402F <= aExp ) then
  7898. begin
  7899. if ( $406F <= aExp ) then
  7900. begin
  7901. if ( ( aExp = $7FFF )
  7902. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7903. ) then
  7904. begin
  7905. result := propagateFloat128NaN( a, a );
  7906. exit;
  7907. end;
  7908. result := a;
  7909. exit;
  7910. end;
  7911. lastBitMask := 1;
  7912. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7913. roundBitsMask := lastBitMask - 1;
  7914. z := a;
  7915. roundingMode := softfloat_rounding_mode;
  7916. if ( roundingMode = float_round_nearest_even ) then
  7917. begin
  7918. if ( lastBitMask )<>0 then
  7919. begin
  7920. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7921. if ( ( z.low and roundBitsMask ) = 0 ) then
  7922. z.low := z.low and not(lastBitMask);
  7923. end
  7924. else begin
  7925. if ( sbits64(z.low) < 0 ) then
  7926. begin
  7927. inc(z.high);
  7928. if ( bits64( z.low shl 1 ) = 0 ) then
  7929. z.high := z.high and not bits64( 1 );
  7930. end;
  7931. end;
  7932. end
  7933. else if ( roundingMode <> float_round_to_zero ) then
  7934. begin
  7935. if ( extractFloat128Sign( z )
  7936. xor ord( roundingMode = float_round_up ) )<>0 then
  7937. begin
  7938. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7939. end;
  7940. end;
  7941. z.low := z.low and not(roundBitsMask);
  7942. end
  7943. else begin
  7944. if ( aExp < $3FFF ) then
  7945. begin
  7946. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7947. begin
  7948. result := a;
  7949. exit;
  7950. end;
  7951. set_inexact_flag;
  7952. aSign := extractFloat128Sign( a );
  7953. case softfloat_rounding_mode of
  7954. float_round_nearest_even:
  7955. if ( ( aExp = $3FFE )
  7956. and ( (extractFloat128Frac0( a )<>0)
  7957. or (extractFloat128Frac1( a )<>0) )
  7958. ) then begin
  7959. begin
  7960. result := packFloat128( aSign, $3FFF, 0, 0 );
  7961. exit;
  7962. end;
  7963. end;
  7964. float_round_down:
  7965. begin
  7966. if aSign<>0 then
  7967. result:=packFloat128( 1, $3FFF, 0, 0 )
  7968. else
  7969. result:=packFloat128( 0, 0, 0, 0 );
  7970. exit;
  7971. end;
  7972. float_round_up:
  7973. begin
  7974. if aSign<>0 then
  7975. result := packFloat128( 1, 0, 0, 0 )
  7976. else
  7977. result:=packFloat128( 0, $3FFF, 0, 0 );
  7978. exit;
  7979. end;
  7980. end;
  7981. result := packFloat128( aSign, 0, 0, 0 );
  7982. exit;
  7983. end;
  7984. lastBitMask := 1;
  7985. lastBitMask := lastBitMask shl ($402F - aExp);
  7986. roundBitsMask := lastBitMask - 1;
  7987. z.low := 0;
  7988. z.high := a.high;
  7989. roundingMode := softfloat_rounding_mode;
  7990. if ( roundingMode = float_round_nearest_even ) then begin
  7991. inc(z.high,lastBitMask shr 1);
  7992. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7993. z.high := z.high and not(lastBitMask);
  7994. end;
  7995. end
  7996. else if ( roundingMode <> float_round_to_zero ) then begin
  7997. if ( (extractFloat128Sign( z )<>0)
  7998. xor ( roundingMode = float_round_up ) ) then begin
  7999. z.high := z.high or ord( a.low <> 0 );
  8000. z.high := z.high+roundBitsMask;
  8001. end;
  8002. end;
  8003. z.high := z.high and not(roundBitsMask);
  8004. end;
  8005. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  8006. set_inexact_flag;
  8007. end;
  8008. result := z;
  8009. end;
  8010. {*----------------------------------------------------------------------------
  8011. | Returns the result of adding the absolute values of the quadruple-precision
  8012. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  8013. | before being returned. `zSign' is ignored if the result is a NaN.
  8014. | The addition is performed according to the IEC/IEEE Standard for Binary
  8015. | Floating-Point Arithmetic.
  8016. *----------------------------------------------------------------------------*}
  8017. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  8018. var
  8019. aExp, bExp, zExp: int32;
  8020. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8021. expDiff: int32;
  8022. label
  8023. shiftRight1,roundAndPack;
  8024. begin
  8025. aSig1 := extractFloat128Frac1( a );
  8026. aSig0 := extractFloat128Frac0( a );
  8027. aExp := extractFloat128Exp( a );
  8028. bSig1 := extractFloat128Frac1( b );
  8029. bSig0 := extractFloat128Frac0( b );
  8030. bExp := extractFloat128Exp( b );
  8031. expDiff := aExp - bExp;
  8032. if ( 0 < expDiff ) then begin
  8033. if ( aExp = $7FFF ) then begin
  8034. if ( aSig0 or aSig1 )<>0 then
  8035. begin
  8036. result := propagateFloat128NaN( a, b );
  8037. exit;
  8038. end;
  8039. result := a;
  8040. exit;
  8041. end;
  8042. if ( bExp = 0 ) then begin
  8043. dec(expDiff);
  8044. end
  8045. else begin
  8046. bSig0 := bSig0 or int64( $0001000000000000 );
  8047. end;
  8048. shift128ExtraRightJamming(
  8049. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  8050. zExp := aExp;
  8051. end
  8052. else if ( expDiff < 0 ) then begin
  8053. if ( bExp = $7FFF ) then begin
  8054. if ( bSig0 or bSig1 )<>0 then
  8055. begin
  8056. result := propagateFloat128NaN( a, b );
  8057. exit;
  8058. end;
  8059. result := packFloat128( zSign, $7FFF, 0, 0 );
  8060. exit;
  8061. end;
  8062. if ( aExp = 0 ) then begin
  8063. inc(expDiff);
  8064. end
  8065. else begin
  8066. aSig0 := aSig0 or int64( $0001000000000000 );
  8067. end;
  8068. shift128ExtraRightJamming(
  8069. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8070. zExp := bExp;
  8071. end
  8072. else begin
  8073. if ( aExp = $7FFF ) then begin
  8074. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8075. result := propagateFloat128NaN( a, b );
  8076. exit;
  8077. end;
  8078. result := a;
  8079. exit;
  8080. end;
  8081. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8082. if ( aExp = 0 ) then
  8083. begin
  8084. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8085. exit;
  8086. end;
  8087. zSig2 := 0;
  8088. zSig0 := zSig0 or int64( $0002000000000000 );
  8089. zExp := aExp;
  8090. goto shiftRight1;
  8091. end;
  8092. aSig0 := aSig0 or int64( $0001000000000000 );
  8093. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8094. dec(zExp);
  8095. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8096. inc(zExp);
  8097. shiftRight1:
  8098. shift128ExtraRightJamming(
  8099. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8100. roundAndPack:
  8101. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8102. end;
  8103. {*----------------------------------------------------------------------------
  8104. | Returns the result of subtracting the absolute values of the quadruple-
  8105. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8106. | difference is negated before being returned. `zSign' is ignored if the
  8107. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8108. | Standard for Binary Floating-Point Arithmetic.
  8109. *----------------------------------------------------------------------------*}
  8110. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8111. var
  8112. aExp, bExp, zExp: int32;
  8113. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8114. expDiff: int32;
  8115. z: float128;
  8116. label
  8117. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8118. begin
  8119. aSig1 := extractFloat128Frac1( a );
  8120. aSig0 := extractFloat128Frac0( a );
  8121. aExp := extractFloat128Exp( a );
  8122. bSig1 := extractFloat128Frac1( b );
  8123. bSig0 := extractFloat128Frac0( b );
  8124. bExp := extractFloat128Exp( b );
  8125. expDiff := aExp - bExp;
  8126. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8127. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8128. if ( 0 < expDiff ) then goto aExpBigger;
  8129. if ( expDiff < 0 ) then goto bExpBigger;
  8130. if ( aExp = $7FFF ) then begin
  8131. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8132. result := propagateFloat128NaN( a, b );
  8133. exit;
  8134. end;
  8135. float_raise( float_flag_invalid );
  8136. z.low := float128_default_nan_low;
  8137. z.high := float128_default_nan_high;
  8138. result := z;
  8139. exit;
  8140. end;
  8141. if ( aExp = 0 ) then begin
  8142. aExp := 1;
  8143. bExp := 1;
  8144. end;
  8145. if ( bSig0 < aSig0 ) then goto aBigger;
  8146. if ( aSig0 < bSig0 ) then goto bBigger;
  8147. if ( bSig1 < aSig1 ) then goto aBigger;
  8148. if ( aSig1 < bSig1 ) then goto bBigger;
  8149. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8150. exit;
  8151. bExpBigger:
  8152. if ( bExp = $7FFF ) then begin
  8153. if ( bSig0 or bSig1 )<>0 then
  8154. begin
  8155. result := propagateFloat128NaN( a, b );
  8156. exit;
  8157. end;
  8158. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8159. exit;
  8160. end;
  8161. if ( aExp = 0 ) then begin
  8162. inc(expDiff);
  8163. end
  8164. else begin
  8165. aSig0 := aSig0 or int64( $4000000000000000 );
  8166. end;
  8167. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8168. bSig0 := bSig0 or int64( $4000000000000000 );
  8169. bBigger:
  8170. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8171. zExp := bExp;
  8172. zSign := zSign xor 1;
  8173. goto normalizeRoundAndPack;
  8174. aExpBigger:
  8175. if ( aExp = $7FFF ) then begin
  8176. if ( aSig0 or aSig1 )<>0 then
  8177. begin
  8178. result := propagateFloat128NaN( a, b );
  8179. exit;
  8180. end;
  8181. result := a;
  8182. exit;
  8183. end;
  8184. if ( bExp = 0 ) then begin
  8185. dec(expDiff);
  8186. end
  8187. else begin
  8188. bSig0 := bSig0 or int64( $4000000000000000 );
  8189. end;
  8190. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8191. aSig0 := aSig0 or int64( $4000000000000000 );
  8192. aBigger:
  8193. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8194. zExp := aExp;
  8195. normalizeRoundAndPack:
  8196. dec(zExp);
  8197. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8198. end;
  8199. {*----------------------------------------------------------------------------
  8200. | Returns the result of adding the quadruple-precision floating-point values
  8201. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8202. | for Binary Floating-Point Arithmetic.
  8203. *----------------------------------------------------------------------------*}
  8204. function float128_add(a: float128; b: float128): float128;
  8205. var
  8206. aSign, bSign: flag;
  8207. begin
  8208. aSign := extractFloat128Sign( a );
  8209. bSign := extractFloat128Sign( b );
  8210. if ( aSign = bSign ) then begin
  8211. result := addFloat128Sigs( a, b, aSign );
  8212. end
  8213. else begin
  8214. result := subFloat128Sigs( a, b, aSign );
  8215. end;
  8216. end;
  8217. {*----------------------------------------------------------------------------
  8218. | Returns the result of subtracting the quadruple-precision floating-point
  8219. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8220. | Standard for Binary Floating-Point Arithmetic.
  8221. *----------------------------------------------------------------------------*}
  8222. function float128_sub(a: float128; b: float128): float128;
  8223. var
  8224. aSign, bSign: flag;
  8225. begin
  8226. aSign := extractFloat128Sign( a );
  8227. bSign := extractFloat128Sign( b );
  8228. if ( aSign = bSign ) then begin
  8229. result := subFloat128Sigs( a, b, aSign );
  8230. end
  8231. else begin
  8232. result := addFloat128Sigs( a, b, aSign );
  8233. end;
  8234. end;
  8235. {*----------------------------------------------------------------------------
  8236. | Returns the result of multiplying the quadruple-precision floating-point
  8237. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8238. | Standard for Binary Floating-Point Arithmetic.
  8239. *----------------------------------------------------------------------------*}
  8240. function float128_mul(a: float128; b: float128): float128;
  8241. var
  8242. aSign, bSign, zSign: flag;
  8243. aExp, bExp, zExp: int32;
  8244. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8245. z: float128;
  8246. label
  8247. invalid;
  8248. begin
  8249. aSig1 := extractFloat128Frac1( a );
  8250. aSig0 := extractFloat128Frac0( a );
  8251. aExp := extractFloat128Exp( a );
  8252. aSign := extractFloat128Sign( a );
  8253. bSig1 := extractFloat128Frac1( b );
  8254. bSig0 := extractFloat128Frac0( b );
  8255. bExp := extractFloat128Exp( b );
  8256. bSign := extractFloat128Sign( b );
  8257. zSign := aSign xor bSign;
  8258. if ( aExp = $7FFF ) then begin
  8259. if ( (( aSig0 or aSig1 )<>0)
  8260. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8261. result := propagateFloat128NaN( a, b );
  8262. exit;
  8263. end;
  8264. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8265. result := packFloat128( zSign, $7FFF, 0, 0 );
  8266. exit;
  8267. end;
  8268. if ( bExp = $7FFF ) then begin
  8269. if ( bSig0 or bSig1 )<>0 then
  8270. begin
  8271. result := propagateFloat128NaN( a, b );
  8272. exit;
  8273. end;
  8274. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8275. invalid:
  8276. float_raise( float_flag_invalid );
  8277. z.low := float128_default_nan_low;
  8278. z.high := float128_default_nan_high;
  8279. result := z;
  8280. exit;
  8281. end;
  8282. result := packFloat128( zSign, $7FFF, 0, 0 );
  8283. exit;
  8284. end;
  8285. if ( aExp = 0 ) then begin
  8286. if ( ( aSig0 or aSig1 ) = 0 ) then
  8287. begin
  8288. result := packFloat128( zSign, 0, 0, 0 );
  8289. exit;
  8290. end;
  8291. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8292. end;
  8293. if ( bExp = 0 ) then begin
  8294. if ( ( bSig0 or bSig1 ) = 0 ) then
  8295. begin
  8296. result := packFloat128( zSign, 0, 0, 0 );
  8297. exit;
  8298. end;
  8299. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8300. end;
  8301. zExp := aExp + bExp - $4000;
  8302. aSig0 := aSig0 or int64( $0001000000000000 );
  8303. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8304. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8305. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8306. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8307. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8308. shift128ExtraRightJamming(
  8309. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8310. inc(zExp);
  8311. end;
  8312. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8313. end;
  8314. {*----------------------------------------------------------------------------
  8315. | Returns the result of dividing the quadruple-precision floating-point value
  8316. | `a' by the corresponding value `b'. The operation is performed according to
  8317. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8318. *----------------------------------------------------------------------------*}
  8319. function float128_div(a: float128; b: float128): float128;
  8320. var
  8321. aSign, bSign, zSign: flag;
  8322. aExp, bExp, zExp: int32;
  8323. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8324. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8325. z: float128;
  8326. label
  8327. invalid;
  8328. begin
  8329. aSig1 := extractFloat128Frac1( a );
  8330. aSig0 := extractFloat128Frac0( a );
  8331. aExp := extractFloat128Exp( a );
  8332. aSign := extractFloat128Sign( a );
  8333. bSig1 := extractFloat128Frac1( b );
  8334. bSig0 := extractFloat128Frac0( b );
  8335. bExp := extractFloat128Exp( b );
  8336. bSign := extractFloat128Sign( b );
  8337. zSign := aSign xor bSign;
  8338. if ( aExp = $7FFF ) then begin
  8339. if ( aSig0 or aSig1 )<>0 then
  8340. begin
  8341. result := propagateFloat128NaN( a, b );
  8342. exit;
  8343. end;
  8344. if ( bExp = $7FFF ) then begin
  8345. if ( bSig0 or bSig1 )<>0 then
  8346. begin
  8347. result := propagateFloat128NaN( a, b );
  8348. exit;
  8349. end;
  8350. goto invalid;
  8351. end;
  8352. result := packFloat128( zSign, $7FFF, 0, 0 );
  8353. exit;
  8354. end;
  8355. if ( bExp = $7FFF ) then begin
  8356. if ( bSig0 or bSig1 )<>0 then
  8357. begin
  8358. result := propagateFloat128NaN( a, b );
  8359. exit;
  8360. end;
  8361. result := packFloat128( zSign, 0, 0, 0 );
  8362. exit;
  8363. end;
  8364. if ( bExp = 0 ) then begin
  8365. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8366. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8367. invalid:
  8368. float_raise( float_flag_invalid );
  8369. z.low := float128_default_nan_low;
  8370. z.high := float128_default_nan_high;
  8371. result := z;
  8372. exit;
  8373. end;
  8374. float_raise( float_flag_divbyzero );
  8375. result := packFloat128( zSign, $7FFF, 0, 0 );
  8376. exit;
  8377. end;
  8378. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8379. end;
  8380. if ( aExp = 0 ) then begin
  8381. if ( ( aSig0 or aSig1 ) = 0 ) then
  8382. begin
  8383. result := packFloat128( zSign, 0, 0, 0 );
  8384. exit;
  8385. end;
  8386. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8387. end;
  8388. zExp := aExp - bExp + $3FFD;
  8389. shortShift128Left(
  8390. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8391. shortShift128Left(
  8392. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8393. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8394. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8395. inc(zExp);
  8396. end;
  8397. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8398. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8399. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8400. while ( sbits64(rem0) < 0 ) do begin
  8401. dec(zSig0);
  8402. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8403. end;
  8404. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8405. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8406. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8407. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8408. while ( sbits64(rem1) < 0 ) do begin
  8409. dec(zSig1);
  8410. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8411. end;
  8412. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8413. end;
  8414. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8415. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8416. end;
  8417. {*----------------------------------------------------------------------------
  8418. | Returns the remainder of the quadruple-precision floating-point value `a'
  8419. | with respect to the corresponding value `b'. The operation is performed
  8420. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8421. *----------------------------------------------------------------------------*}
  8422. function float128_rem(a: float128; b: float128): float128;
  8423. var
  8424. aSign, zSign: flag;
  8425. aExp, bExp, expDiff: int32;
  8426. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8427. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8428. sigMean0: sbits64;
  8429. z: float128;
  8430. label
  8431. invalid;
  8432. begin
  8433. aSig1 := extractFloat128Frac1( a );
  8434. aSig0 := extractFloat128Frac0( a );
  8435. aExp := extractFloat128Exp( a );
  8436. aSign := extractFloat128Sign( a );
  8437. bSig1 := extractFloat128Frac1( b );
  8438. bSig0 := extractFloat128Frac0( b );
  8439. bExp := extractFloat128Exp( b );
  8440. if ( aExp = $7FFF ) then begin
  8441. if ( (( aSig0 or aSig1 )<>0)
  8442. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8443. result := propagateFloat128NaN( a, b );
  8444. exit;
  8445. end;
  8446. goto invalid;
  8447. end;
  8448. if ( bExp = $7FFF ) then begin
  8449. if ( bSig0 or bSig1 )<>0 then
  8450. begin
  8451. result := propagateFloat128NaN( a, b );
  8452. exit;
  8453. end;
  8454. result := a;
  8455. exit;
  8456. end;
  8457. if ( bExp = 0 ) then begin
  8458. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8459. invalid:
  8460. float_raise( float_flag_invalid );
  8461. z.low := float128_default_nan_low;
  8462. z.high := float128_default_nan_high;
  8463. result := z;
  8464. exit;
  8465. end;
  8466. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8467. end;
  8468. if ( aExp = 0 ) then begin
  8469. if ( ( aSig0 or aSig1 ) = 0 ) then
  8470. begin
  8471. result := a;
  8472. exit;
  8473. end;
  8474. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8475. end;
  8476. expDiff := aExp - bExp;
  8477. if ( expDiff < -1 ) then
  8478. begin
  8479. result := a;
  8480. exit;
  8481. end;
  8482. shortShift128Left(
  8483. aSig0 or int64( $0001000000000000 ),
  8484. aSig1,
  8485. 15 - ord( expDiff < 0 ),
  8486. aSig0,
  8487. aSig1
  8488. );
  8489. shortShift128Left(
  8490. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8491. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8492. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8493. dec(expDiff,64);
  8494. while ( 0 < expDiff ) do begin
  8495. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8496. if ( 4 < q ) then
  8497. q := q - 4
  8498. else
  8499. q := 0;
  8500. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8501. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8502. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8503. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8504. dec(expDiff,61);
  8505. end;
  8506. if ( -64 < expDiff ) then begin
  8507. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8508. if ( 4 < q ) then
  8509. q := q - 4
  8510. else
  8511. q := 0;
  8512. q := q shr (- expDiff);
  8513. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8514. inc(expDiff,52);
  8515. if ( expDiff < 0 ) then begin
  8516. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8517. end
  8518. else begin
  8519. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8520. end;
  8521. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8522. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8523. end
  8524. else begin
  8525. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8526. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8527. end;
  8528. repeat
  8529. alternateASig0 := aSig0;
  8530. alternateASig1 := aSig1;
  8531. inc(q);
  8532. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8533. until not( 0 <= sbits64(aSig0) );
  8534. add128(
  8535. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8536. if ( ( sigMean0 < 0 )
  8537. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8538. aSig0 := alternateASig0;
  8539. aSig1 := alternateASig1;
  8540. end;
  8541. zSign := ord( sbits64(aSig0) < 0 );
  8542. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8543. result :=
  8544. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8545. end;
  8546. {*----------------------------------------------------------------------------
  8547. | Returns the square root of the quadruple-precision floating-point value `a'.
  8548. | The operation is performed according to the IEC/IEEE Standard for Binary
  8549. | Floating-Point Arithmetic.
  8550. *----------------------------------------------------------------------------*}
  8551. function float128_sqrt(a: float128): float128;
  8552. var
  8553. aSign: flag;
  8554. aExp, zExp: int32;
  8555. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8556. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8557. z: float128;
  8558. label
  8559. invalid;
  8560. begin
  8561. aSig1 := extractFloat128Frac1( a );
  8562. aSig0 := extractFloat128Frac0( a );
  8563. aExp := extractFloat128Exp( a );
  8564. aSign := extractFloat128Sign( a );
  8565. if ( aExp = $7FFF ) then begin
  8566. if ( aSig0 or aSig1 )<>0 then
  8567. begin
  8568. result := propagateFloat128NaN( a, a );
  8569. exit;
  8570. end;
  8571. if ( aSign=0 ) then
  8572. begin
  8573. result := a;
  8574. exit;
  8575. end;
  8576. goto invalid;
  8577. end;
  8578. if ( aSign<>0 ) then begin
  8579. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8580. begin
  8581. result := a;
  8582. exit;
  8583. end;
  8584. invalid:
  8585. float_raise( float_flag_invalid );
  8586. z.low := float128_default_nan_low;
  8587. z.high := float128_default_nan_high;
  8588. result := z;
  8589. exit;
  8590. end;
  8591. if ( aExp = 0 ) then begin
  8592. if ( ( aSig0 or aSig1 ) = 0 ) then
  8593. begin
  8594. result := packFloat128( 0, 0, 0, 0 );
  8595. exit;
  8596. end;
  8597. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8598. end;
  8599. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8600. aSig0 := aSig0 or int64( $0001000000000000 );
  8601. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8602. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8603. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8604. doubleZSig0 := zSig0 shl 1;
  8605. mul64To128( zSig0, zSig0, term0, term1 );
  8606. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8607. while ( sbits64(rem0) < 0 ) do begin
  8608. dec(zSig0);
  8609. dec(doubleZSig0,2);
  8610. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8611. end;
  8612. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8613. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8614. if ( zSig1 = 0 ) then zSig1 := 1;
  8615. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8616. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8617. mul64To128( zSig1, zSig1, term2, term3 );
  8618. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8619. while ( sbits64(rem1) < 0 ) do begin
  8620. dec(zSig1);
  8621. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8622. term3 := term3 or 1;
  8623. term2 := term2 or doubleZSig0;
  8624. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8625. end;
  8626. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8627. end;
  8628. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8629. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8630. end;
  8631. {*----------------------------------------------------------------------------
  8632. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8633. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8634. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8635. *----------------------------------------------------------------------------*}
  8636. function float128_eq(a: float128; b: float128): flag;
  8637. begin
  8638. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8639. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8640. or ( ( extractFloat128Exp( b ) = $7FFF )
  8641. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8642. ) then begin
  8643. if ( (float128_is_signaling_nan( a )<>0)
  8644. or (float128_is_signaling_nan( b )<>0) ) then begin
  8645. float_raise( float_flag_invalid );
  8646. end;
  8647. result := 0;
  8648. exit;
  8649. end;
  8650. result := ord(
  8651. ( a.low = b.low )
  8652. and ( ( a.high = b.high )
  8653. or ( ( a.low = 0 )
  8654. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8655. ));
  8656. end;
  8657. {*----------------------------------------------------------------------------
  8658. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8659. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8660. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8661. | Arithmetic.
  8662. *----------------------------------------------------------------------------*}
  8663. function float128_le(a: float128; b: float128): flag;
  8664. var
  8665. aSign, bSign: flag;
  8666. begin
  8667. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8668. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8669. or ( ( extractFloat128Exp( b ) = $7FFF )
  8670. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8671. ) then begin
  8672. float_raise( float_flag_invalid );
  8673. result := 0;
  8674. exit;
  8675. end;
  8676. aSign := extractFloat128Sign( a );
  8677. bSign := extractFloat128Sign( b );
  8678. if ( aSign <> bSign ) then begin
  8679. result := ord(
  8680. (aSign<>0)
  8681. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8682. = 0 ));
  8683. exit;
  8684. end;
  8685. if aSign<>0 then
  8686. result := le128( b.high, b.low, a.high, a.low )
  8687. else
  8688. result := le128( a.high, a.low, b.high, b.low );
  8689. end;
  8690. {*----------------------------------------------------------------------------
  8691. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8692. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8693. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8694. *----------------------------------------------------------------------------*}
  8695. function float128_lt(a: float128; b: float128): flag;
  8696. var
  8697. aSign, bSign: flag;
  8698. begin
  8699. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8700. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8701. or ( ( extractFloat128Exp( b ) = $7FFF )
  8702. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8703. ) then begin
  8704. float_raise( float_flag_invalid );
  8705. result := 0;
  8706. exit;
  8707. end;
  8708. aSign := extractFloat128Sign( a );
  8709. bSign := extractFloat128Sign( b );
  8710. if ( aSign <> bSign ) then begin
  8711. result := ord(
  8712. (aSign<>0)
  8713. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8714. <> 0 ));
  8715. exit;
  8716. end;
  8717. if aSign<>0 then
  8718. result := lt128( b.high, b.low, a.high, a.low )
  8719. else
  8720. result := lt128( a.high, a.low, b.high, b.low );
  8721. end;
  8722. {*----------------------------------------------------------------------------
  8723. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8724. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8725. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8726. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8727. *----------------------------------------------------------------------------*}
  8728. function float128_eq_signaling(a: float128; b: float128): flag;
  8729. begin
  8730. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8731. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8732. or ( ( extractFloat128Exp( b ) = $7FFF )
  8733. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8734. ) then begin
  8735. float_raise( float_flag_invalid );
  8736. result := 0;
  8737. exit;
  8738. end;
  8739. result := ord(
  8740. ( a.low = b.low )
  8741. and ( ( a.high = b.high )
  8742. or ( ( a.low = 0 )
  8743. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8744. ));
  8745. end;
  8746. {*----------------------------------------------------------------------------
  8747. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8748. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8749. | cause an exception. Otherwise, the comparison is performed according to the
  8750. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8751. *----------------------------------------------------------------------------*}
  8752. function float128_le_quiet(a: float128; b: float128): flag;
  8753. var
  8754. aSign, bSign: flag;
  8755. begin
  8756. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8757. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8758. or ( ( extractFloat128Exp( b ) = $7FFF )
  8759. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8760. ) then begin
  8761. if ( (float128_is_signaling_nan( a )<>0)
  8762. or (float128_is_signaling_nan( b )<>0) ) then begin
  8763. float_raise( float_flag_invalid );
  8764. end;
  8765. result := 0;
  8766. exit;
  8767. end;
  8768. aSign := extractFloat128Sign( a );
  8769. bSign := extractFloat128Sign( b );
  8770. if ( aSign <> bSign ) then begin
  8771. result := ord(
  8772. (aSign<>0)
  8773. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8774. = 0 ));
  8775. exit;
  8776. end;
  8777. if aSign<>0 then
  8778. result := le128( b.high, b.low, a.high, a.low )
  8779. else
  8780. result := le128( a.high, a.low, b.high, b.low );
  8781. end;
  8782. {*----------------------------------------------------------------------------
  8783. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8784. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8785. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8786. | Standard for Binary Floating-Point Arithmetic.
  8787. *----------------------------------------------------------------------------*}
  8788. function float128_lt_quiet(a: float128; b: float128): flag;
  8789. var
  8790. aSign, bSign: flag;
  8791. begin
  8792. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8793. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8794. or ( ( extractFloat128Exp( b ) = $7FFF )
  8795. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8796. ) then begin
  8797. if ( (float128_is_signaling_nan( a )<>0)
  8798. or (float128_is_signaling_nan( b )<>0) ) then begin
  8799. float_raise( float_flag_invalid );
  8800. end;
  8801. result := 0;
  8802. exit;
  8803. end;
  8804. aSign := extractFloat128Sign( a );
  8805. bSign := extractFloat128Sign( b );
  8806. if ( aSign <> bSign ) then begin
  8807. result := ord(
  8808. (aSign<>0)
  8809. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8810. <> 0 ));
  8811. exit;
  8812. end;
  8813. if aSign<>0 then
  8814. result:=lt128( b.high, b.low, a.high, a.low )
  8815. else
  8816. result:=lt128( a.high, a.low, b.high, b.low );
  8817. end;
  8818. {----------------------------------------------------------------------------
  8819. | Returns the result of converting the double-precision floating-point value
  8820. | `a' to the quadruple-precision floating-point format. The conversion is
  8821. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8822. | Arithmetic.
  8823. *----------------------------------------------------------------------------}
  8824. function float64_to_float128( a : float64) : float128;
  8825. var
  8826. aSign : flag;
  8827. aExp : int16;
  8828. aSig, zSig0, zSig1 : bits64;
  8829. begin
  8830. aSig := extractFloat64Frac( a );
  8831. aExp := extractFloat64Exp( a );
  8832. aSign := extractFloat64Sign( a );
  8833. if ( aExp = $7FF ) then begin
  8834. if ( aSig<>0 ) then begin
  8835. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8836. exit;
  8837. end;
  8838. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8839. exit;
  8840. end;
  8841. if ( aExp = 0 ) then begin
  8842. if ( aSig = 0 ) then
  8843. begin
  8844. result:=packFloat128( aSign, 0, 0, 0 );
  8845. exit;
  8846. end;
  8847. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8848. dec(aExp);
  8849. end;
  8850. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8851. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8852. end;
  8853. {$endif FPC_SOFTFLOAT_FLOAT128}
  8854. {$endif not(defined(fpc_softfpu_interface))}
  8855. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8856. end.
  8857. {$ifdef FPC}
  8858. { restore context modified at implmentation start
  8859. to possibly re-enable range and overflow checking explicitly}
  8860. {$pop}
  8861. {$endif FPC}
  8862. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}