softfpu.pp 295 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768277827882798280828182828283828482858286828782888289829082918292829382948295829682978298829983008301830283038304830583068307830883098310831183128313831483158316831783188319832083218322832383248325832683278328832983308331833283338334833583368337833883398340834183428343834483458346834783488349835083518352835383548355835683578358835983608361836283638364836583668367836883698370837183728373837483758376837783788379838083818382838383848385838683878388838983908391839283938394839583968397839883998400840184028403840484058406840784088409841084118412841384148415841684178418841984208421842284238424842584268427842884298430843184328433843484358436843784388439844084418442844384448445844684478448844984508451845284538454845584568457845884598460846184628463846484658466846784688469847084718472847384748475847684778478847984808481848284838484848584868487848884898490849184928493849484958496849784988499850085018502850385048505850685078508850985108511851285138514851585168517851885198520852185228523
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. {$define FPC_SYSTEM_HAS_float32}
  79. { we use here a record in the function header because
  80. the record allows bitwise conversion to single }
  81. float32rec = record
  82. float32 : float32;
  83. end;
  84. flag = byte;
  85. uint8 = byte;
  86. int8 = shortint;
  87. uint16 = word;
  88. int16 = smallint;
  89. uint32 = longword;
  90. int32 = longint;
  91. bits8 = byte;
  92. sbits8 = shortint;
  93. bits16 = word;
  94. sbits16 = smallint;
  95. sbits32 = longint;
  96. bits32 = longword;
  97. {$ifndef fpc}
  98. qword = int64;
  99. {$endif}
  100. { now part of the system unit
  101. uint64 = qword;
  102. }
  103. bits64 = qword;
  104. sbits64 = int64;
  105. {$ifdef ENDIAN_LITTLE}
  106. float64 = packed record
  107. low: bits32;
  108. high: bits32;
  109. end;
  110. int64rec = packed record
  111. low: bits32;
  112. high: bits32;
  113. end;
  114. floatx80 = packed record
  115. low : qword;
  116. high : word;
  117. end;
  118. float128 = packed record
  119. low : qword;
  120. high : qword;
  121. end;
  122. {$else}
  123. float64 = packed record
  124. high,low : bits32;
  125. end;
  126. int64rec = packed record
  127. high,low : bits32;
  128. end;
  129. floatx80 = packed record
  130. high : word;
  131. low : qword;
  132. end;
  133. float128 = packed record
  134. high : qword;
  135. low : qword;
  136. end;
  137. {$endif}
  138. {$define FPC_SYSTEM_HAS_float64}
  139. {*
  140. -------------------------------------------------------------------------------
  141. Returns 1 if the double-precision floating-point value `a' is less than
  142. the corresponding value `b', and 0 otherwise. The comparison is performed
  143. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  144. -------------------------------------------------------------------------------
  145. *}
  146. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  147. {*
  148. -------------------------------------------------------------------------------
  149. Returns 1 if the double-precision floating-point value `a' is less than
  150. or equal to the corresponding value `b', and 0 otherwise. The comparison
  151. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  152. Arithmetic.
  153. -------------------------------------------------------------------------------
  154. *}
  155. Function float64_le(a: float64;b: float64): flag; compilerproc;
  156. {*
  157. -------------------------------------------------------------------------------
  158. Returns 1 if the double-precision floating-point value `a' is equal to
  159. the corresponding value `b', and 0 otherwise. The comparison is performed
  160. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  161. -------------------------------------------------------------------------------
  162. *}
  163. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  164. {*
  165. -------------------------------------------------------------------------------
  166. Returns the square root of the double-precision floating-point value `a'.
  167. The operation is performed according to the IEC/IEEE Standard for Binary
  168. Floating-Point Arithmetic.
  169. -------------------------------------------------------------------------------
  170. *}
  171. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  172. {*
  173. -------------------------------------------------------------------------------
  174. Returns the remainder of the double-precision floating-point value `a'
  175. with respect to the corresponding value `b'. The operation is performed
  176. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  177. -------------------------------------------------------------------------------
  178. *}
  179. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  180. {*
  181. -------------------------------------------------------------------------------
  182. Returns the result of dividing the double-precision floating-point value `a'
  183. by the corresponding value `b'. The operation is performed according to the
  184. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  185. -------------------------------------------------------------------------------
  186. *}
  187. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  188. {*
  189. -------------------------------------------------------------------------------
  190. Returns the result of multiplying the double-precision floating-point values
  191. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  192. for Binary Floating-Point Arithmetic.
  193. -------------------------------------------------------------------------------
  194. *}
  195. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  196. {*
  197. -------------------------------------------------------------------------------
  198. Returns the result of subtracting the double-precision floating-point values
  199. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  200. for Binary Floating-Point Arithmetic.
  201. -------------------------------------------------------------------------------
  202. *}
  203. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  204. {*
  205. -------------------------------------------------------------------------------
  206. Returns the result of adding the double-precision floating-point values `a'
  207. and `b'. The operation is performed according to the IEC/IEEE Standard for
  208. Binary Floating-Point Arithmetic.
  209. -------------------------------------------------------------------------------
  210. *}
  211. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  212. {*
  213. -------------------------------------------------------------------------------
  214. Rounds the double-precision floating-point value `a' to an integer,
  215. and returns the result as a double-precision floating-point value. The
  216. operation is performed according to the IEC/IEEE Standard for Binary
  217. Floating-Point Arithmetic.
  218. -------------------------------------------------------------------------------
  219. *}
  220. Function float64_round_to_int(a: float64) : float64; compilerproc;
  221. {*
  222. -------------------------------------------------------------------------------
  223. Returns the result of converting the double-precision floating-point value
  224. `a' to the single-precision floating-point format. The conversion is
  225. performed according to the IEC/IEEE Standard for Binary Floating-Point
  226. Arithmetic.
  227. -------------------------------------------------------------------------------
  228. *}
  229. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  230. {*
  231. -------------------------------------------------------------------------------
  232. Returns the result of converting the double-precision floating-point value
  233. `a' to the 32-bit two's complement integer format. The conversion is
  234. performed according to the IEC/IEEE Standard for Binary Floating-Point
  235. Arithmetic, except that the conversion is always rounded toward zero.
  236. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  237. the conversion overflows, the largest integer with the same sign as `a' is
  238. returned.
  239. -------------------------------------------------------------------------------
  240. *}
  241. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  242. {*
  243. -------------------------------------------------------------------------------
  244. Returns the result of converting the double-precision floating-point value
  245. `a' to the 32-bit two's complement integer format. The conversion is
  246. performed according to the IEC/IEEE Standard for Binary Floating-Point
  247. Arithmetic---which means in particular that the conversion is rounded
  248. according to the current rounding mode. If `a' is a NaN, the largest
  249. positive integer is returned. Otherwise, if the conversion overflows, the
  250. largest integer with the same sign as `a' is returned.
  251. -------------------------------------------------------------------------------
  252. *}
  253. Function float64_to_int32(a: float64): int32; compilerproc;
  254. {*
  255. -------------------------------------------------------------------------------
  256. Returns 1 if the single-precision floating-point value `a' is less than
  257. the corresponding value `b', and 0 otherwise. The comparison is performed
  258. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  259. -------------------------------------------------------------------------------
  260. *}
  261. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  262. {*
  263. -------------------------------------------------------------------------------
  264. Returns 1 if the single-precision floating-point value `a' is less than
  265. or equal to the corresponding value `b', and 0 otherwise. The comparison
  266. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  267. Arithmetic.
  268. -------------------------------------------------------------------------------
  269. *}
  270. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  271. {*
  272. -------------------------------------------------------------------------------
  273. Returns 1 if the single-precision floating-point value `a' is equal to
  274. the corresponding value `b', and 0 otherwise. The comparison is performed
  275. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  276. -------------------------------------------------------------------------------
  277. *}
  278. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  279. {*
  280. -------------------------------------------------------------------------------
  281. Returns the square root of the single-precision floating-point value `a'.
  282. The operation is performed according to the IEC/IEEE Standard for Binary
  283. Floating-Point Arithmetic.
  284. -------------------------------------------------------------------------------
  285. *}
  286. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  287. {*
  288. -------------------------------------------------------------------------------
  289. Returns the remainder of the single-precision floating-point value `a'
  290. with respect to the corresponding value `b'. The operation is performed
  291. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  292. -------------------------------------------------------------------------------
  293. *}
  294. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  295. {*
  296. -------------------------------------------------------------------------------
  297. Returns the result of dividing the single-precision floating-point value `a'
  298. by the corresponding value `b'. The operation is performed according to the
  299. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  300. -------------------------------------------------------------------------------
  301. *}
  302. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  303. {*
  304. -------------------------------------------------------------------------------
  305. Returns the result of multiplying the single-precision floating-point values
  306. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  307. for Binary Floating-Point Arithmetic.
  308. -------------------------------------------------------------------------------
  309. *}
  310. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  311. {*
  312. -------------------------------------------------------------------------------
  313. Returns the result of subtracting the single-precision floating-point values
  314. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  315. for Binary Floating-Point Arithmetic.
  316. -------------------------------------------------------------------------------
  317. *}
  318. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  319. {*
  320. -------------------------------------------------------------------------------
  321. Returns the result of adding the single-precision floating-point values `a'
  322. and `b'. The operation is performed according to the IEC/IEEE Standard for
  323. Binary Floating-Point Arithmetic.
  324. -------------------------------------------------------------------------------
  325. *}
  326. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  327. {*
  328. -------------------------------------------------------------------------------
  329. Rounds the single-precision floating-point value `a' to an integer,
  330. and returns the result as a single-precision floating-point value. The
  331. operation is performed according to the IEC/IEEE Standard for Binary
  332. Floating-Point Arithmetic.
  333. -------------------------------------------------------------------------------
  334. *}
  335. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  336. {*
  337. -------------------------------------------------------------------------------
  338. Returns the result of converting the single-precision floating-point value
  339. `a' to the double-precision floating-point format. The conversion is
  340. performed according to the IEC/IEEE Standard for Binary Floating-Point
  341. Arithmetic.
  342. -------------------------------------------------------------------------------
  343. *}
  344. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  345. {*
  346. -------------------------------------------------------------------------------
  347. Returns the result of converting the single-precision floating-point value
  348. `a' to the 32-bit two's complement integer format. The conversion is
  349. performed according to the IEC/IEEE Standard for Binary Floating-Point
  350. Arithmetic, except that the conversion is always rounded toward zero.
  351. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  352. the conversion overflows, the largest integer with the same sign as `a' is
  353. returned.
  354. -------------------------------------------------------------------------------
  355. *}
  356. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  357. {*
  358. -------------------------------------------------------------------------------
  359. Returns the result of converting the single-precision floating-point value
  360. `a' to the 32-bit two's complement integer format. The conversion is
  361. performed according to the IEC/IEEE Standard for Binary Floating-Point
  362. Arithmetic---which means in particular that the conversion is rounded
  363. according to the current rounding mode. If `a' is a NaN, the largest
  364. positive integer is returned. Otherwise, if the conversion overflows, the
  365. largest integer with the same sign as `a' is returned.
  366. -------------------------------------------------------------------------------
  367. *}
  368. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  369. {*
  370. -------------------------------------------------------------------------------
  371. Returns the result of converting the 32-bit two's complement integer `a' to
  372. the double-precision floating-point format. The conversion is performed
  373. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  374. -------------------------------------------------------------------------------
  375. *}
  376. Function int32_to_float64( a: int32) : float64; compilerproc;
  377. {*
  378. -------------------------------------------------------------------------------
  379. Returns the result of converting the 32-bit two's complement integer `a' to
  380. the single-precision floating-point format. The conversion is performed
  381. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  382. -------------------------------------------------------------------------------
  383. *}
  384. Function int32_to_float32( a: int32): float32rec; compilerproc;
  385. {*----------------------------------------------------------------------------
  386. | Returns the result of converting the 64-bit two's complement integer `a'
  387. | to the double-precision floating-point format. The conversion is performed
  388. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  389. *----------------------------------------------------------------------------*}
  390. Function int64_to_float64( a: int64 ): float64; compilerproc;
  391. Function qword_to_float64( a: qword ): float64; compilerproc;
  392. {*----------------------------------------------------------------------------
  393. | Returns the result of converting the 64-bit two's complement integer `a'
  394. | to the single-precision floating-point format. The conversion is performed
  395. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  396. *----------------------------------------------------------------------------*}
  397. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  398. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  399. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  400. function float128_is_nan( a : float128): flag;
  401. function float128_is_signaling_nan( a : float128): flag;
  402. function float128_to_int32(a: float128): int32;
  403. function float128_to_int32_round_to_zero(a: float128): int32;
  404. function float128_to_int64(a: float128): int64;
  405. function float128_to_int64_round_to_zero(a: float128): int64;
  406. function float128_to_float32(a: float128): float32;
  407. function float128_to_float64(a: float128): float64;
  408. function float64_to_float128( a : float64) : float128;
  409. {$ifdef FPC_SOFTFLOAT_FLOAT80}
  410. function float128_to_floatx80(a: float128): floatx80;
  411. {$endif FPC_SOFTFLOAT_FLOAT80}
  412. function float128_round_to_int(a: float128): float128;
  413. function float128_add(a: float128; b: float128): float128;
  414. function float128_sub(a: float128; b: float128): float128;
  415. function float128_mul(a: float128; b: float128): float128;
  416. function float128_div(a: float128; b: float128): float128;
  417. function float128_rem(a: float128; b: float128): float128;
  418. function float128_sqrt(a: float128): float128;
  419. function float128_eq(a: float128; b: float128): flag;
  420. function float128_le(a: float128; b: float128): flag;
  421. function float128_lt(a: float128; b: float128): flag;
  422. function float128_eq_signaling(a: float128; b: float128): flag;
  423. function float128_le_quiet(a: float128; b: float128): flag;
  424. function float128_lt_quiet(a: float128; b: float128): flag;
  425. {$endif FPC_SOFTFLOAT_FLOAT128}
  426. CONST
  427. {-------------------------------------------------------------------------------
  428. Software IEC/IEEE floating-point underflow tininess-detection mode.
  429. -------------------------------------------------------------------------------
  430. *}
  431. float_tininess_after_rounding = 0;
  432. float_tininess_before_rounding = 1;
  433. {*
  434. -------------------------------------------------------------------------------
  435. Underflow tininess-detection mode, statically initialized to default value.
  436. (The declaration in `softfloat.h' must match the `int8' type here.)
  437. -------------------------------------------------------------------------------
  438. *}
  439. const float_detect_tininess: int8 = float_tininess_after_rounding;
  440. {$endif not(defined(fpc_softfpu_implementation))}
  441. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  442. implementation
  443. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  444. {$if not(defined(fpc_softfpu_interface))}
  445. (*****************************************************************************)
  446. (*----------------------------------------------------------------------------*)
  447. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  448. (* division and square root approximations. (Can be specialized to target if *)
  449. (* desired.) *)
  450. (* ---------------------------------------------------------------------------*)
  451. (*****************************************************************************)
  452. {*----------------------------------------------------------------------------
  453. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  454. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  455. | input. If `zSign' is 1, the input is negated before being converted to an
  456. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  457. | is simply rounded to an integer, with the inexact exception raised if the
  458. | input cannot be represented exactly as an integer. However, if the fixed-
  459. | point input is too large, the invalid exception is raised and the largest
  460. | positive or negative integer is returned.
  461. *----------------------------------------------------------------------------*}
  462. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  463. var
  464. roundingMode: int8;
  465. roundNearestEven: flag;
  466. roundIncrement, roundBits: int8;
  467. z: int32;
  468. begin
  469. roundingMode := softfloat_rounding_mode;
  470. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  471. roundIncrement := $40;
  472. if ( roundNearestEven=0 ) then
  473. begin
  474. if ( roundingMode = float_round_to_zero ) then
  475. begin
  476. roundIncrement := 0;
  477. end
  478. else begin
  479. roundIncrement := $7F;
  480. if ( zSign<>0 ) then
  481. begin
  482. if ( roundingMode = float_round_up ) then
  483. roundIncrement := 0;
  484. end
  485. else begin
  486. if ( roundingMode = float_round_down ) then
  487. roundIncrement := 0;
  488. end;
  489. end;
  490. end;
  491. roundBits := absZ and $7F;
  492. absZ := ( absZ + roundIncrement ) shr 7;
  493. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  494. z := absZ;
  495. if ( zSign<>0 ) then
  496. z := - z;
  497. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  498. begin
  499. float_raise( float_flag_invalid );
  500. if zSign<>0 then
  501. result:=sbits32($80000000)
  502. else
  503. result:=$7FFFFFFF;
  504. exit;
  505. end;
  506. if ( roundBits<>0 ) then
  507. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  508. result:=z;
  509. end;
  510. {*----------------------------------------------------------------------------
  511. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  512. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  513. | and returns the properly rounded 64-bit integer corresponding to the input.
  514. | If `zSign' is 1, the input is negated before being converted to an integer.
  515. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  516. | the inexact exception raised if the input cannot be represented exactly as
  517. | an integer. However, if the fixed-point input is too large, the invalid
  518. | exception is raised and the largest positive or negative integer is
  519. | returned.
  520. *----------------------------------------------------------------------------*}
  521. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  522. var
  523. roundingMode: int8;
  524. roundNearestEven, increment: flag;
  525. z: int64;
  526. label
  527. overflow;
  528. begin
  529. roundingMode := softfloat_rounding_mode;
  530. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  531. increment := ord( sbits64(absZ1) < 0 );
  532. if ( roundNearestEven=0 ) then
  533. begin
  534. if ( roundingMode = float_round_to_zero ) then
  535. begin
  536. increment := 0;
  537. end
  538. else begin
  539. if ( zSign<>0 ) then
  540. begin
  541. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  542. end
  543. else begin
  544. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  545. end;
  546. end;
  547. end;
  548. if ( increment<>0 ) then
  549. begin
  550. inc(absZ0);
  551. if ( absZ0 = 0 ) then
  552. goto overflow;
  553. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  554. end;
  555. z := absZ0;
  556. if ( zSign<>0 ) then
  557. z := - z;
  558. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  559. begin
  560. overflow:
  561. float_raise( float_flag_invalid );
  562. if zSign<>0 then
  563. result:=int64($8000000000000000)
  564. else
  565. result:=int64($7FFFFFFFFFFFFFFF);
  566. end;
  567. if ( absZ1<>0 ) then
  568. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  569. result:=z;
  570. end;
  571. {*
  572. -------------------------------------------------------------------------------
  573. Shifts `a' right by the number of bits given in `count'. If any nonzero
  574. bits are shifted off, they are ``jammed'' into the least significant bit of
  575. the result by setting the least significant bit to 1. The value of `count'
  576. can be arbitrarily large; in particular, if `count' is greater than 32, the
  577. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  578. The result is stored in the location pointed to by `zPtr'.
  579. -------------------------------------------------------------------------------
  580. *}
  581. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  582. var
  583. z: Bits32;
  584. Begin
  585. if ( count = 0 ) then
  586. z := a
  587. else
  588. if ( count < 32 ) then
  589. Begin
  590. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  591. End
  592. else
  593. Begin
  594. z := bits32( a <> 0 );
  595. End;
  596. zPtr := z;
  597. End;
  598. {*----------------------------------------------------------------------------
  599. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  600. | number of bits given in `count'. Any bits shifted off are lost. The value
  601. | of `count' can be arbitrarily large; in particular, if `count' is greater
  602. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  603. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  604. *----------------------------------------------------------------------------*}
  605. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  606. var
  607. z0, z1: bits64;
  608. negCount: int8;
  609. begin
  610. negCount := ( - count ) and 63;
  611. if ( count = 0 ) then
  612. begin
  613. z1 := a1;
  614. z0 := a0;
  615. end
  616. else if ( count < 64 ) then
  617. begin
  618. z1 := ( a0 shl negCount ) or ( a1 shr count );
  619. z0 := a0 shr count;
  620. end
  621. else
  622. begin
  623. if ( count shl 64 )<>0 then
  624. z1 := a0 shr ( count and 63 )
  625. else
  626. z1 := 0;
  627. z0 := 0;
  628. end;
  629. z1Ptr := z1;
  630. z0Ptr := z0;
  631. end;
  632. {*----------------------------------------------------------------------------
  633. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  634. | number of bits given in `count'. If any nonzero bits are shifted off, they
  635. | are ``jammed'' into the least significant bit of the result by setting the
  636. | least significant bit to 1. The value of `count' can be arbitrarily large;
  637. | in particular, if `count' is greater than 128, the result will be either
  638. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  639. | nonzero. The result is broken into two 64-bit pieces which are stored at
  640. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  641. *----------------------------------------------------------------------------*}
  642. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  643. var
  644. z0,z1 : bits64;
  645. negCount : int8;
  646. begin
  647. negCount := ( - count ) and 63;
  648. if ( count = 0 ) then begin
  649. z1 := a1;
  650. z0 := a0;
  651. end
  652. else if ( count < 64 ) then begin
  653. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  654. z0 := a0>>count;
  655. end
  656. else begin
  657. if ( count = 64 ) then begin
  658. z1 := a0 or ord( a1 <> 0 );
  659. end
  660. else if ( count < 128 ) then begin
  661. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  662. end
  663. else begin
  664. z1 := ord( ( a0 or a1 ) <> 0 );
  665. end;
  666. z0 := 0;
  667. end;
  668. z1Ptr := z1;
  669. z0Ptr := z0;
  670. end;
  671. {*
  672. -------------------------------------------------------------------------------
  673. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  674. number of bits given in `count'. Any bits shifted off are lost. The value
  675. of `count' can be arbitrarily large; in particular, if `count' is greater
  676. than 64, the result will be 0. The result is broken into two 32-bit pieces
  677. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  678. -------------------------------------------------------------------------------
  679. *}
  680. Procedure
  681. shift64Right(
  682. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  683. Var
  684. z0, z1: bits32;
  685. negCount : int8;
  686. Begin
  687. negCount := ( - count ) AND 31;
  688. if ( count = 0 ) then
  689. Begin
  690. z1 := a1;
  691. z0 := a0;
  692. End
  693. else if ( count < 32 ) then
  694. Begin
  695. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  696. z0 := a0 shr count;
  697. End
  698. else
  699. Begin
  700. if (count < 64) then
  701. z1 := ( a0 shr ( count AND 31 ) )
  702. else
  703. z1 := 0;
  704. z0 := 0;
  705. End;
  706. z1Ptr := z1;
  707. z0Ptr := z0;
  708. End;
  709. {*
  710. -------------------------------------------------------------------------------
  711. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  712. number of bits given in `count'. If any nonzero bits are shifted off, they
  713. are ``jammed'' into the least significant bit of the result by setting the
  714. least significant bit to 1. The value of `count' can be arbitrarily large;
  715. in particular, if `count' is greater than 64, the result will be either 0
  716. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  717. nonzero. The result is broken into two 32-bit pieces which are stored at
  718. the locations pointed to by `z0Ptr' and `z1Ptr'.
  719. -------------------------------------------------------------------------------
  720. *}
  721. Procedure
  722. shift64RightJamming(
  723. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  724. VAR
  725. z0, z1 : bits32;
  726. negCount : int8;
  727. Begin
  728. negCount := ( - count ) AND 31;
  729. if ( count = 0 ) then
  730. Begin
  731. z1 := a1;
  732. z0 := a0;
  733. End
  734. else
  735. if ( count < 32 ) then
  736. Begin
  737. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  738. z0 := a0 shr count;
  739. End
  740. else
  741. Begin
  742. if ( count = 32 ) then
  743. Begin
  744. z1 := a0 OR bits32( a1 <> 0 );
  745. End
  746. else
  747. if ( count < 64 ) Then
  748. Begin
  749. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  750. End
  751. else
  752. Begin
  753. z1 := bits32( ( a0 OR a1 ) <> 0 );
  754. End;
  755. z0 := 0;
  756. End;
  757. z1Ptr := z1;
  758. z0Ptr := z0;
  759. End;
  760. {*----------------------------------------------------------------------------
  761. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  762. | bits are shifted off, they are ``jammed'' into the least significant bit of
  763. | the result by setting the least significant bit to 1. The value of `count'
  764. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  765. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  766. | The result is stored in the location pointed to by `zPtr'.
  767. *----------------------------------------------------------------------------*}
  768. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  769. var
  770. z: bits64;
  771. begin
  772. if ( count = 0 ) then
  773. begin
  774. z := a;
  775. end
  776. else if ( count < 64 ) then
  777. begin
  778. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  779. end
  780. else
  781. begin
  782. z := ord( a <> 0 );
  783. end;
  784. zPtr := z;
  785. end;
  786. {*
  787. -------------------------------------------------------------------------------
  788. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  789. by 32 _plus_ the number of bits given in `count'. The shifted result is
  790. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  791. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  792. off form a third 32-bit result as follows: The _last_ bit shifted off is
  793. the most-significant bit of the extra result, and the other 31 bits of the
  794. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  795. were all zero. This extra result is stored in the location pointed to by
  796. `z2Ptr'. The value of `count' can be arbitrarily large.
  797. (This routine makes more sense if `a0', `a1', and `a2' are considered
  798. to form a fixed-point value with binary point between `a1' and `a2'. This
  799. fixed-point value is shifted right by the number of bits given in `count',
  800. and the integer part of the result is returned at the locations pointed to
  801. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  802. corrupted as described above, and is returned at the location pointed to by
  803. `z2Ptr'.)
  804. -------------------------------------------------------------------------------
  805. }
  806. Procedure
  807. shift64ExtraRightJamming(
  808. a0: bits32;
  809. a1: bits32;
  810. a2: bits32;
  811. count: int16;
  812. VAR z0Ptr: bits32;
  813. VAR z1Ptr: bits32;
  814. VAR z2Ptr: bits32
  815. );
  816. Var
  817. z0, z1, z2: bits32;
  818. negCount : int8;
  819. Begin
  820. negCount := ( - count ) AND 31;
  821. if ( count = 0 ) then
  822. Begin
  823. z2 := a2;
  824. z1 := a1;
  825. z0 := a0;
  826. End
  827. else
  828. Begin
  829. if ( count < 32 ) Then
  830. Begin
  831. z2 := a1 shl negCount;
  832. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  833. z0 := a0 shr count;
  834. End
  835. else
  836. Begin
  837. if ( count = 32 ) then
  838. Begin
  839. z2 := a1;
  840. z1 := a0;
  841. End
  842. else
  843. Begin
  844. a2 := a2 or a1;
  845. if ( count < 64 ) then
  846. Begin
  847. z2 := a0 shl negCount;
  848. z1 := a0 shr ( count AND 31 );
  849. End
  850. else
  851. Begin
  852. if count = 64 then
  853. z2 := a0
  854. else
  855. z2 := bits32(a0 <> 0);
  856. z1 := 0;
  857. End;
  858. End;
  859. z0 := 0;
  860. End;
  861. z2 := z2 or bits32( a2 <> 0 );
  862. End;
  863. z2Ptr := z2;
  864. z1Ptr := z1;
  865. z0Ptr := z0;
  866. End;
  867. {*
  868. -------------------------------------------------------------------------------
  869. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  870. number of bits given in `count'. Any bits shifted off are lost. The value
  871. of `count' must be less than 32. The result is broken into two 32-bit
  872. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  873. -------------------------------------------------------------------------------
  874. *}
  875. Procedure
  876. shortShift64Left(
  877. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  878. Begin
  879. z1Ptr := a1 shl count;
  880. if count = 0 then
  881. z0Ptr := a0
  882. else
  883. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  884. End;
  885. {*
  886. -------------------------------------------------------------------------------
  887. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  888. by the number of bits given in `count'. Any bits shifted off are lost.
  889. The value of `count' must be less than 32. The result is broken into three
  890. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  891. `z1Ptr', and `z2Ptr'.
  892. -------------------------------------------------------------------------------
  893. *}
  894. Procedure
  895. shortShift96Left(
  896. a0: bits32;
  897. a1: bits32;
  898. a2: bits32;
  899. count: int16;
  900. VAR z0Ptr: bits32;
  901. VAR z1Ptr: bits32;
  902. VAR z2Ptr: bits32
  903. );
  904. Var
  905. z0, z1, z2: bits32;
  906. negCount: int8;
  907. Begin
  908. z2 := a2 shl count;
  909. z1 := a1 shl count;
  910. z0 := a0 shl count;
  911. if ( 0 < count ) then
  912. Begin
  913. negCount := ( ( - count ) AND 31 );
  914. z1 := z1 or (a2 shr negCount);
  915. z0 := z0 or (a1 shr negCount);
  916. End;
  917. z2Ptr := z2;
  918. z1Ptr := z1;
  919. z0Ptr := z0;
  920. End;
  921. {*----------------------------------------------------------------------------
  922. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  923. | number of bits given in `count'. Any bits shifted off are lost. The value
  924. | of `count' must be less than 64. The result is broken into two 64-bit
  925. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  926. *----------------------------------------------------------------------------*}
  927. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  928. begin
  929. z1Ptr := a1 shl count;
  930. if count=0 then
  931. z0Ptr:=a0
  932. else
  933. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  934. end;
  935. {*
  936. -------------------------------------------------------------------------------
  937. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  938. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  939. any carry out is lost. The result is broken into two 32-bit pieces which
  940. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  941. -------------------------------------------------------------------------------
  942. *}
  943. Procedure
  944. add64(
  945. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  946. Var
  947. z1: bits32;
  948. Begin
  949. z1 := a1 + b1;
  950. z1Ptr := z1;
  951. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  952. End;
  953. {*
  954. -------------------------------------------------------------------------------
  955. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  956. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  957. modulo 2^96, so any carry out is lost. The result is broken into three
  958. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  959. `z1Ptr', and `z2Ptr'.
  960. -------------------------------------------------------------------------------
  961. *}
  962. Procedure
  963. add96(
  964. a0: bits32;
  965. a1: bits32;
  966. a2: bits32;
  967. b0: bits32;
  968. b1: bits32;
  969. b2: bits32;
  970. VAR z0Ptr: bits32;
  971. VAR z1Ptr: bits32;
  972. VAR z2Ptr: bits32
  973. );
  974. var
  975. z0, z1, z2: bits32;
  976. carry0, carry1: int8;
  977. Begin
  978. z2 := a2 + b2;
  979. carry1 := int8( z2 < a2 );
  980. z1 := a1 + b1;
  981. carry0 := int8( z1 < a1 );
  982. z0 := a0 + b0;
  983. z1 := z1 + carry1;
  984. z0 := z0 + bits32( z1 < carry1 );
  985. z0 := z0 + carry0;
  986. z2Ptr := z2;
  987. z1Ptr := z1;
  988. z0Ptr := z0;
  989. End;
  990. {*----------------------------------------------------------------------------
  991. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  992. | by the number of bits given in `count'. Any bits shifted off are lost.
  993. | The value of `count' must be less than 64. The result is broken into three
  994. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  995. | `z1Ptr', and `z2Ptr'.
  996. *----------------------------------------------------------------------------*}
  997. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  998. var
  999. z0, z1, z2 : bits64;
  1000. negCount : int8;
  1001. begin
  1002. z2 := a2 shl count;
  1003. z1 := a1 shl count;
  1004. z0 := a0 shl count;
  1005. if ( 0 < count ) then
  1006. begin
  1007. negCount := ( ( - count ) and 63 );
  1008. z1 := z1 or (a2 shr negCount);
  1009. z0 := z0 or (a1 shr negCount);
  1010. end;
  1011. z2Ptr := z2;
  1012. z1Ptr := z1;
  1013. z0Ptr := z0;
  1014. end;
  1015. {*----------------------------------------------------------------------------
  1016. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1017. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1018. | any carry out is lost. The result is broken into two 64-bit pieces which
  1019. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1020. *----------------------------------------------------------------------------*}
  1021. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1022. var
  1023. z1 : bits64;
  1024. begin
  1025. z1 := a1 + b1;
  1026. z1Ptr := z1;
  1027. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1028. end;
  1029. {*----------------------------------------------------------------------------
  1030. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1031. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1032. | modulo 2^192, so any carry out is lost. The result is broken into three
  1033. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1034. | `z1Ptr', and `z2Ptr'.
  1035. *----------------------------------------------------------------------------*}
  1036. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1037. var
  1038. z0, z1, z2 : bits64;
  1039. carry0, carry1 : int8;
  1040. begin
  1041. z2 := a2 + b2;
  1042. carry1 := ord( z2 < a2 );
  1043. z1 := a1 + b1;
  1044. carry0 := ord( z1 < a1 );
  1045. z0 := a0 + b0;
  1046. inc(z1, carry1);
  1047. inc(z0, ord( z1 < carry1 ));
  1048. inc(z0, carry0);
  1049. z2Ptr := z2;
  1050. z1Ptr := z1;
  1051. z0Ptr := z0;
  1052. end;
  1053. {*
  1054. -------------------------------------------------------------------------------
  1055. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1056. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1057. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1058. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1059. `z1Ptr'.
  1060. -------------------------------------------------------------------------------
  1061. *}
  1062. Procedure
  1063. sub64(
  1064. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1065. Begin
  1066. z1Ptr := a1 - b1;
  1067. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1068. End;
  1069. {*
  1070. -------------------------------------------------------------------------------
  1071. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1072. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1073. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1074. into three 32-bit pieces which are stored at the locations pointed to by
  1075. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1076. -------------------------------------------------------------------------------
  1077. *}
  1078. Procedure
  1079. sub96(
  1080. a0:bits32;
  1081. a1:bits32;
  1082. a2:bits32;
  1083. b0:bits32;
  1084. b1:bits32;
  1085. b2:bits32;
  1086. VAR z0Ptr:bits32;
  1087. VAR z1Ptr:bits32;
  1088. VAR z2Ptr:bits32
  1089. );
  1090. Var
  1091. z0, z1, z2: bits32;
  1092. borrow0, borrow1: int8;
  1093. Begin
  1094. z2 := a2 - b2;
  1095. borrow1 := int8( a2 < b2 );
  1096. z1 := a1 - b1;
  1097. borrow0 := int8( a1 < b1 );
  1098. z0 := a0 - b0;
  1099. z0 := z0 - bits32( z1 < borrow1 );
  1100. z1 := z1 - borrow1;
  1101. z0 := z0 -borrow0;
  1102. z2Ptr := z2;
  1103. z1Ptr := z1;
  1104. z0Ptr := z0;
  1105. End;
  1106. {*----------------------------------------------------------------------------
  1107. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1108. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1109. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1110. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1111. | `z1Ptr'.
  1112. *----------------------------------------------------------------------------*}
  1113. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1114. begin
  1115. z1Ptr := a1 - b1;
  1116. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1117. end;
  1118. {*----------------------------------------------------------------------------
  1119. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1120. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1121. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1122. | result is broken into three 64-bit pieces which are stored at the locations
  1123. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1124. *----------------------------------------------------------------------------*}
  1125. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1126. var
  1127. z0, z1, z2 : bits64;
  1128. borrow0, borrow1 : int8;
  1129. begin
  1130. z2 := a2 - b2;
  1131. borrow1 := ord( a2 < b2 );
  1132. z1 := a1 - b1;
  1133. borrow0 := ord( a1 < b1 );
  1134. z0 := a0 - b0;
  1135. dec(z0, ord( z1 < borrow1 ));
  1136. dec(z1, borrow1);
  1137. dec(z0, borrow0);
  1138. z2Ptr := z2;
  1139. z1Ptr := z1;
  1140. z0Ptr := z0;
  1141. end;
  1142. {*
  1143. -------------------------------------------------------------------------------
  1144. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1145. into two 32-bit pieces which are stored at the locations pointed to by
  1146. `z0Ptr' and `z1Ptr'.
  1147. -------------------------------------------------------------------------------
  1148. *}
  1149. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1150. :bits32 );
  1151. Var
  1152. aHigh, aLow, bHigh, bLow: bits16;
  1153. z0, zMiddleA, zMiddleB, z1: bits32;
  1154. Begin
  1155. aLow := a and $ffff;
  1156. aHigh := a shr 16;
  1157. bLow := b and $ffff;
  1158. bHigh := b shr 16;
  1159. z1 := ( bits32( aLow) ) * bLow;
  1160. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1161. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1162. z0 := ( bits32 (aHigh) ) * bHigh;
  1163. zMiddleA := zMiddleA + zMiddleB;
  1164. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1165. zMiddleA := zmiddleA shl 16;
  1166. z1 := z1 + zMiddleA;
  1167. z0 := z0 + bits32( z1 < zMiddleA );
  1168. z1Ptr := z1;
  1169. z0Ptr := z0;
  1170. End;
  1171. {*
  1172. -------------------------------------------------------------------------------
  1173. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1174. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1175. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1176. `z2Ptr'.
  1177. -------------------------------------------------------------------------------
  1178. *}
  1179. Procedure
  1180. mul64By32To96(
  1181. a0:bits32;
  1182. a1:bits32;
  1183. b:bits32;
  1184. VAR z0Ptr:bits32;
  1185. VAR z1Ptr:bits32;
  1186. VAR z2Ptr:bits32
  1187. );
  1188. Var
  1189. z0, z1, z2, more1: bits32;
  1190. Begin
  1191. mul32To64( a1, b, z1, z2 );
  1192. mul32To64( a0, b, z0, more1 );
  1193. add64( z0, more1, 0, z1, z0, z1 );
  1194. z2Ptr := z2;
  1195. z1Ptr := z1;
  1196. z0Ptr := z0;
  1197. End;
  1198. {*
  1199. -------------------------------------------------------------------------------
  1200. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1201. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1202. product. The product is broken into four 32-bit pieces which are stored at
  1203. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1204. -------------------------------------------------------------------------------
  1205. *}
  1206. Procedure
  1207. mul64To128(
  1208. a0:bits32;
  1209. a1:bits32;
  1210. b0:bits32;
  1211. b1:bits32;
  1212. VAR z0Ptr:bits32;
  1213. VAR z1Ptr:bits32;
  1214. VAR z2Ptr:bits32;
  1215. VAR z3Ptr:bits32
  1216. );
  1217. Var
  1218. z0, z1, z2, z3: bits32;
  1219. more1, more2: bits32;
  1220. Begin
  1221. mul32To64( a1, b1, z2, z3 );
  1222. mul32To64( a1, b0, z1, more2 );
  1223. add64( z1, more2, 0, z2, z1, z2 );
  1224. mul32To64( a0, b0, z0, more1 );
  1225. add64( z0, more1, 0, z1, z0, z1 );
  1226. mul32To64( a0, b1, more1, more2 );
  1227. add64( more1, more2, 0, z2, more1, z2 );
  1228. add64( z0, z1, 0, more1, z0, z1 );
  1229. z3Ptr := z3;
  1230. z2Ptr := z2;
  1231. z1Ptr := z1;
  1232. z0Ptr := z0;
  1233. End;
  1234. {*----------------------------------------------------------------------------
  1235. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1236. | into two 64-bit pieces which are stored at the locations pointed to by
  1237. | `z0Ptr' and `z1Ptr'.
  1238. *----------------------------------------------------------------------------*}
  1239. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1240. var
  1241. aHigh, aLow, bHigh, bLow : bits32;
  1242. z0, zMiddleA, zMiddleB, z1 : bits64;
  1243. begin
  1244. aLow := a;
  1245. aHigh := a shr 32;
  1246. bLow := b;
  1247. bHigh := b shr 32;
  1248. z1 := ( bits64(aLow) ) * bLow;
  1249. zMiddleA := ( bits64( aLow )) * bHigh;
  1250. zMiddleB := ( bits64( aHigh )) * bLow;
  1251. z0 := ( bits64(aHigh) ) * bHigh;
  1252. inc(zMiddleA, zMiddleB);
  1253. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1254. zMiddleA := zMiddleA shl 32;
  1255. inc(z1, zMiddleA);
  1256. inc(z0, ord( z1 < zMiddleA ));
  1257. z1Ptr := z1;
  1258. z0Ptr := z0;
  1259. end;
  1260. {*----------------------------------------------------------------------------
  1261. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1262. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1263. | product. The product is broken into four 64-bit pieces which are stored at
  1264. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1265. *----------------------------------------------------------------------------*}
  1266. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1267. var
  1268. z0,z1,z2,z3,more1,more2 : bits64;
  1269. begin
  1270. mul64To128( a1, b1, z2, z3 );
  1271. mul64To128( a1, b0, z1, more2 );
  1272. add128( z1, more2, 0, z2, z1, z2 );
  1273. mul64To128( a0, b0, z0, more1 );
  1274. add128( z0, more1, 0, z1, z0, z1 );
  1275. mul64To128( a0, b1, more1, more2 );
  1276. add128( more1, more2, 0, z2, more1, z2 );
  1277. add128( z0, z1, 0, more1, z0, z1 );
  1278. z3Ptr := z3;
  1279. z2Ptr := z2;
  1280. z1Ptr := z1;
  1281. z0Ptr := z0;
  1282. end;
  1283. {*----------------------------------------------------------------------------
  1284. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1285. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1286. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1287. | `z2Ptr'.
  1288. *----------------------------------------------------------------------------*}
  1289. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1290. var
  1291. z0, z1, z2, more1 : bits64;
  1292. begin
  1293. mul64To128( a1, b, z1, z2 );
  1294. mul64To128( a0, b, z0, more1 );
  1295. add128( z0, more1, 0, z1, z0, z1 );
  1296. z2Ptr := z2;
  1297. z1Ptr := z1;
  1298. z0Ptr := z0;
  1299. end;
  1300. {*----------------------------------------------------------------------------
  1301. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1302. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1303. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1304. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1305. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1306. | unsigned integer is returned.
  1307. *----------------------------------------------------------------------------*}
  1308. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1309. var
  1310. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1311. begin
  1312. if ( b <= a0 ) then
  1313. begin
  1314. result:=qword( $FFFFFFFFFFFFFFFF );
  1315. exit;
  1316. end;
  1317. b0 := b shr 32;
  1318. if ( b0 shl 32 <= a0 ) then
  1319. z:=qword( $FFFFFFFF00000000 )
  1320. else
  1321. z:=( a0 div b0 ) shl 32;
  1322. mul64To128( b, z, term0, term1 );
  1323. sub128( a0, a1, term0, term1, rem0, rem1 );
  1324. while ( ( sbits64(rem0) ) < 0 ) do begin
  1325. dec(z,qword( $100000000 ));
  1326. b1 := b shl 32;
  1327. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1328. end;
  1329. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1330. if ( b0 shl 32 <= rem0 ) then
  1331. z:=z or $FFFFFFFF
  1332. else
  1333. z:=z or rem0 div b0;
  1334. result:=z;
  1335. end;
  1336. {*
  1337. -------------------------------------------------------------------------------
  1338. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1339. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1340. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1341. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1342. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1343. unsigned integer is returned.
  1344. -------------------------------------------------------------------------------
  1345. *}
  1346. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1347. Var
  1348. b0, b1: bits32;
  1349. rem0, rem1, term0, term1: bits32;
  1350. z: bits32;
  1351. Begin
  1352. if ( b <= a0 ) then
  1353. Begin
  1354. estimateDiv64To32 := $FFFFFFFF;
  1355. exit;
  1356. End;
  1357. b0 := b shr 16;
  1358. if ( b0 shl 16 <= a0 ) then
  1359. z:= $FFFF0000
  1360. else
  1361. z:= ( a0 div b0 ) shl 16;
  1362. mul32To64( b, z, term0, term1 );
  1363. sub64( a0, a1, term0, term1, rem0, rem1 );
  1364. while ( ( sbits32 (rem0) ) < 0 ) do
  1365. Begin
  1366. z := z - $10000;
  1367. b1 := b shl 16;
  1368. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1369. End;
  1370. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1371. if ( b0 shl 16 <= rem0 ) then
  1372. z := z or $FFFF
  1373. else
  1374. z := z or (rem0 div b0);
  1375. estimateDiv64To32 := z;
  1376. End;
  1377. {*
  1378. -------------------------------------------------------------------------------
  1379. Returns an approximation to the square root of the 32-bit significand given
  1380. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1381. `aExp' (the least significant bit) is 1, the integer returned approximates
  1382. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1383. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1384. case, the approximation returned lies strictly within +/-2 of the exact
  1385. value.
  1386. -------------------------------------------------------------------------------
  1387. *}
  1388. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1389. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1390. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1391. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1392. );
  1393. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1394. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1395. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1396. );
  1397. Var
  1398. index: int8;
  1399. z: bits32;
  1400. Begin
  1401. index := ( a shr 27 ) AND 15;
  1402. if ( aExp AND 1 ) <> 0 then
  1403. Begin
  1404. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1405. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1406. a := a shr 1;
  1407. End
  1408. else
  1409. Begin
  1410. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1411. z := a div z + z;
  1412. if ( $20000 <= z ) then
  1413. z := $FFFF8000
  1414. else
  1415. z := ( z shl 15 );
  1416. if ( z <= a ) then
  1417. Begin
  1418. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1419. exit;
  1420. End;
  1421. End;
  1422. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1423. End;
  1424. {*
  1425. -------------------------------------------------------------------------------
  1426. Returns the number of leading 0 bits before the most-significant 1 bit of
  1427. `a'. If `a' is zero, 32 is returned.
  1428. -------------------------------------------------------------------------------
  1429. *}
  1430. Function countLeadingZeros32( a:bits32 ): int8;
  1431. const countLeadingZerosHigh:array[0..255] of int8 = (
  1432. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1433. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1434. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1435. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1436. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1437. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1438. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1439. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1440. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1441. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1442. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1443. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1444. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1445. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1446. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1447. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1448. );
  1449. Var
  1450. shiftCount: int8;
  1451. Begin
  1452. shiftCount := 0;
  1453. if ( a < $10000 ) then
  1454. Begin
  1455. shiftCount := shiftcount + 16;
  1456. a := a shl 16;
  1457. End;
  1458. if ( a < $1000000 ) then
  1459. Begin
  1460. shiftCount := shiftcount + 8;
  1461. a := a shl 8;
  1462. end;
  1463. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1464. countLeadingZeros32:= shiftCount;
  1465. End;
  1466. {*----------------------------------------------------------------------------
  1467. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1468. | `a'. If `a' is zero, 64 is returned.
  1469. *----------------------------------------------------------------------------*}
  1470. function countLeadingZeros64( a : bits64): int8;
  1471. var
  1472. shiftcount : int8;
  1473. Begin
  1474. shiftCount := 0;
  1475. if ( a < bits64(bits64(1) shl 32 )) then
  1476. shiftCount := shiftcount + 32
  1477. else
  1478. a := a shr 32;
  1479. shiftCount := shiftCount + countLeadingZeros32( a );
  1480. countLeadingZeros64:= shiftCount;
  1481. End;
  1482. {*
  1483. -------------------------------------------------------------------------------
  1484. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1485. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1486. returns 0.
  1487. -------------------------------------------------------------------------------
  1488. *}
  1489. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1490. Begin
  1491. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1492. End;
  1493. {*
  1494. -------------------------------------------------------------------------------
  1495. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1496. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1497. Otherwise, returns 0.
  1498. -------------------------------------------------------------------------------
  1499. *}
  1500. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1501. Begin
  1502. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1503. End;
  1504. {*
  1505. -------------------------------------------------------------------------------
  1506. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1507. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1508. returns 0.
  1509. -------------------------------------------------------------------------------
  1510. *}
  1511. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1512. Begin
  1513. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1514. End;
  1515. {*
  1516. -------------------------------------------------------------------------------
  1517. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1518. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1519. returns 0.
  1520. -------------------------------------------------------------------------------
  1521. *}
  1522. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1523. Begin
  1524. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1525. End;
  1526. const
  1527. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1528. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1529. (*****************************************************************************)
  1530. (* End Low-Level arithmetic *)
  1531. (*****************************************************************************)
  1532. {*
  1533. -------------------------------------------------------------------------------
  1534. Functions and definitions to determine: (1) whether tininess for underflow
  1535. is detected before or after rounding by default, (2) what (if anything)
  1536. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1537. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1538. are propagated from function inputs to output. These details are ENDIAN
  1539. specific
  1540. -------------------------------------------------------------------------------
  1541. *}
  1542. {$IFDEF ENDIAN_LITTLE}
  1543. {*
  1544. -------------------------------------------------------------------------------
  1545. Internal canonical NaN format.
  1546. -------------------------------------------------------------------------------
  1547. *}
  1548. TYPE
  1549. commonNaNT = packed record
  1550. sign: flag;
  1551. high, low : bits32;
  1552. end;
  1553. {*
  1554. -------------------------------------------------------------------------------
  1555. The pattern for a default generated single-precision NaN.
  1556. -------------------------------------------------------------------------------
  1557. *}
  1558. const float32_default_nan = $FFC00000;
  1559. {*
  1560. -------------------------------------------------------------------------------
  1561. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1562. otherwise returns 0.
  1563. -------------------------------------------------------------------------------
  1564. *}
  1565. Function float32_is_nan( a : float32 ): flag;
  1566. Begin
  1567. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1568. End;
  1569. {*
  1570. -------------------------------------------------------------------------------
  1571. Returns 1 if the single-precision floating-point value `a' is a signaling
  1572. NaN; otherwise returns 0.
  1573. -------------------------------------------------------------------------------
  1574. *}
  1575. Function float32_is_signaling_nan( a : float32 ): flag;
  1576. Begin
  1577. float32_is_signaling_nan := flag
  1578. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1579. End;
  1580. {*
  1581. -------------------------------------------------------------------------------
  1582. Returns the result of converting the single-precision floating-point NaN
  1583. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1584. exception is raised.
  1585. -------------------------------------------------------------------------------
  1586. *}
  1587. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1588. var
  1589. z : commonNaNT ;
  1590. Begin
  1591. if ( float32_is_signaling_nan( a ) <> 0) then
  1592. float_raise( float_flag_invalid );
  1593. z.sign := a shr 31;
  1594. z.low := 0;
  1595. z.high := a shl 9;
  1596. c := z;
  1597. End;
  1598. {*
  1599. -------------------------------------------------------------------------------
  1600. Returns the result of converting the canonical NaN `a' to the single-
  1601. precision floating-point format.
  1602. -------------------------------------------------------------------------------
  1603. *}
  1604. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1605. Begin
  1606. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1607. End;
  1608. {*
  1609. -------------------------------------------------------------------------------
  1610. Takes two single-precision floating-point values `a' and `b', one of which
  1611. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1612. signaling NaN, the invalid exception is raised.
  1613. -------------------------------------------------------------------------------
  1614. *}
  1615. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1616. Var
  1617. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1618. label returnLargerSignificand;
  1619. Begin
  1620. aIsNaN := float32_is_nan( a );
  1621. aIsSignalingNaN := float32_is_signaling_nan( a );
  1622. bIsNaN := float32_is_nan( b );
  1623. bIsSignalingNaN := float32_is_signaling_nan( b );
  1624. a := a or $00400000;
  1625. b := b or $00400000;
  1626. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1627. float_raise( float_flag_invalid );
  1628. if ( aIsSignalingNaN )<> 0 then
  1629. Begin
  1630. if ( bIsSignalingNaN ) <> 0 then
  1631. goto returnLargerSignificand;
  1632. if bIsNan <> 0 then
  1633. propagateFloat32NaN := b
  1634. else
  1635. propagateFloat32NaN := a;
  1636. exit;
  1637. End
  1638. else if ( aIsNaN <> 0) then
  1639. Begin
  1640. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1641. Begin
  1642. propagateFloat32NaN := a;
  1643. exit;
  1644. End;
  1645. returnLargerSignificand:
  1646. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1647. Begin
  1648. propagateFloat32NaN := b;
  1649. exit;
  1650. End;
  1651. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1652. Begin
  1653. propagateFloat32NaN := a;
  1654. End;
  1655. if a < b then
  1656. propagateFloat32NaN := a
  1657. else
  1658. propagateFloat32NaN := b;
  1659. exit;
  1660. End
  1661. else
  1662. Begin
  1663. propagateFloat32NaN := b;
  1664. exit;
  1665. End;
  1666. End;
  1667. {*
  1668. -------------------------------------------------------------------------------
  1669. The pattern for a default generated double-precision NaN. The `high' and
  1670. `low' values hold the most- and least-significant bits, respectively.
  1671. -------------------------------------------------------------------------------
  1672. *}
  1673. const
  1674. float64_default_nan_high = $FFF80000;
  1675. float64_default_nan_low = $00000000;
  1676. {*
  1677. -------------------------------------------------------------------------------
  1678. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1679. otherwise returns 0.
  1680. -------------------------------------------------------------------------------
  1681. *}
  1682. Function float64_is_nan( a : float64 ) : flag;
  1683. Begin
  1684. float64_is_nan :=
  1685. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1686. and ( a.low or ( a.high and $000FFFFF ) );
  1687. End;
  1688. {*
  1689. -------------------------------------------------------------------------------
  1690. Returns 1 if the double-precision floating-point value `a' is a signaling
  1691. NaN; otherwise returns 0.
  1692. -------------------------------------------------------------------------------
  1693. *}
  1694. Function float64_is_signaling_nan( a : float64 ): flag;
  1695. Begin
  1696. float64_is_signaling_nan :=
  1697. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1698. and ( a.low or ( a.high and $0007FFFF ) );
  1699. End;
  1700. {*
  1701. -------------------------------------------------------------------------------
  1702. Returns the result of converting the double-precision floating-point NaN
  1703. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1704. exception is raised.
  1705. -------------------------------------------------------------------------------
  1706. *}
  1707. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1708. Var
  1709. z : commonNaNT;
  1710. Begin
  1711. if ( float64_is_signaling_nan( a )<>0 ) then
  1712. float_raise( float_flag_invalid );
  1713. z.sign := a.high shr 31;
  1714. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1715. c := z;
  1716. End;
  1717. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1718. Var
  1719. z : commonNaNT;
  1720. Begin
  1721. if ( float64_is_signaling_nan( a )<>0 ) then
  1722. float_raise( float_flag_invalid );
  1723. z.sign := a.high shr 31;
  1724. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1725. result := z;
  1726. End;
  1727. {*
  1728. -------------------------------------------------------------------------------
  1729. Returns the result of converting the canonical NaN `a' to the double-
  1730. precision floating-point format.
  1731. -------------------------------------------------------------------------------
  1732. *}
  1733. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1734. Var
  1735. z: float64;
  1736. Begin
  1737. shift64Right( a.high, a.low, 12, z.high, z.low );
  1738. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1739. c := z;
  1740. End;
  1741. {*
  1742. -------------------------------------------------------------------------------
  1743. Takes two double-precision floating-point values `a' and `b', one of which
  1744. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1745. signaling NaN, the invalid exception is raised.
  1746. -------------------------------------------------------------------------------
  1747. *}
  1748. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1749. Var
  1750. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1751. label returnLargerSignificand;
  1752. Begin
  1753. aIsNaN := float64_is_nan( a );
  1754. aIsSignalingNaN := float64_is_signaling_nan( a );
  1755. bIsNaN := float64_is_nan( b );
  1756. bIsSignalingNaN := float64_is_signaling_nan( b );
  1757. a.high := a.high or $00080000;
  1758. b.high := b.high or $00080000;
  1759. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1760. float_raise( float_flag_invalid );
  1761. if ( aIsSignalingNaN )<>0 then
  1762. Begin
  1763. if ( bIsSignalingNaN )<>0 then
  1764. goto returnLargerSignificand;
  1765. if bIsNan <> 0 then
  1766. c := b
  1767. else
  1768. c := a;
  1769. exit;
  1770. End
  1771. else if ( aIsNaN )<> 0 then
  1772. Begin
  1773. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1774. Begin
  1775. c := a;
  1776. exit;
  1777. End;
  1778. returnLargerSignificand:
  1779. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1780. Begin
  1781. c := b;
  1782. exit;
  1783. End;
  1784. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1785. Begin
  1786. c := a;
  1787. exit;
  1788. End;
  1789. if a.high < b.high then
  1790. c := a
  1791. else
  1792. c := b;
  1793. exit;
  1794. End
  1795. else
  1796. Begin
  1797. c := b;
  1798. exit;
  1799. End;
  1800. End;
  1801. {*----------------------------------------------------------------------------
  1802. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1803. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1804. | returns 0.
  1805. *----------------------------------------------------------------------------*}
  1806. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1807. begin
  1808. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1809. end;
  1810. {*----------------------------------------------------------------------------
  1811. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1812. | otherwise returns 0.
  1813. *----------------------------------------------------------------------------*}
  1814. function float128_is_nan( a : float128): flag;
  1815. begin
  1816. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1817. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1818. end;
  1819. {*----------------------------------------------------------------------------
  1820. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1821. | signaling NaN; otherwise returns 0.
  1822. *----------------------------------------------------------------------------*}
  1823. function float128_is_signaling_nan( a : float128): flag;
  1824. begin
  1825. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1826. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1827. end;
  1828. {*----------------------------------------------------------------------------
  1829. | Returns the result of converting the quadruple-precision floating-point NaN
  1830. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1831. | exception is raised.
  1832. *----------------------------------------------------------------------------*}
  1833. function float128ToCommonNaN( a : float128): commonNaNT;
  1834. var
  1835. z: commonNaNT;
  1836. qhigh,qlow : qword;
  1837. begin
  1838. if ( float128_is_signaling_nan( a )<>0) then
  1839. float_raise( float_flag_invalid );
  1840. z.sign := a.high shr 63;
  1841. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1842. z.high:=qhigh shr 32;
  1843. z.low:=qhigh and $ffffffff;
  1844. result:=z;
  1845. end;
  1846. {*----------------------------------------------------------------------------
  1847. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1848. | precision floating-point format.
  1849. *----------------------------------------------------------------------------*}
  1850. function commonNaNToFloat128( a : commonNaNT): float128;
  1851. var
  1852. z: float128;
  1853. begin
  1854. shift128Right( a.high, a.low, 16, z.high, z.low );
  1855. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1856. result:=z;
  1857. end;
  1858. {*----------------------------------------------------------------------------
  1859. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1860. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1861. | `b' is a signaling NaN, the invalid exception is raised.
  1862. *----------------------------------------------------------------------------*}
  1863. function propagateFloat128NaN( a: float128; b : float128): float128;
  1864. var
  1865. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1866. label
  1867. returnLargerSignificand;
  1868. begin
  1869. aIsNaN := float128_is_nan( a );
  1870. aIsSignalingNaN := float128_is_signaling_nan( a );
  1871. bIsNaN := float128_is_nan( b );
  1872. bIsSignalingNaN := float128_is_signaling_nan( b );
  1873. a.high := a.high or int64( $0000800000000000 );
  1874. b.high := b.high or int64( $0000800000000000 );
  1875. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1876. float_raise( float_flag_invalid );
  1877. if ( aIsSignalingNaN )<>0 then
  1878. begin
  1879. if ( bIsSignalingNaN )<>0 then
  1880. goto returnLargerSignificand;
  1881. if bIsNaN<>0 then
  1882. result := b
  1883. else
  1884. result := a;
  1885. exit;
  1886. end
  1887. else if ( aIsNaN )<>0 then
  1888. begin
  1889. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1890. begin
  1891. result := a;
  1892. exit;
  1893. end;
  1894. returnLargerSignificand:
  1895. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1896. begin
  1897. result := b;
  1898. exit;
  1899. end;
  1900. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1901. begin
  1902. result := a;
  1903. exit
  1904. end;
  1905. if ( a.high < b.high ) then
  1906. result := a
  1907. else
  1908. result := b;
  1909. exit;
  1910. end
  1911. else
  1912. result:=b;
  1913. end;
  1914. {$ELSE}
  1915. { Big endian code }
  1916. (*----------------------------------------------------------------------------
  1917. | Internal canonical NaN format.
  1918. *----------------------------------------------------------------------------*)
  1919. type
  1920. commonNANT = packed record
  1921. sign : flag;
  1922. high, low : bits32;
  1923. end;
  1924. (*----------------------------------------------------------------------------
  1925. | The pattern for a default generated single-precision NaN.
  1926. *----------------------------------------------------------------------------*)
  1927. const float32_default_nan = $7FFFFFFF;
  1928. (*----------------------------------------------------------------------------
  1929. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1930. | otherwise returns 0.
  1931. *----------------------------------------------------------------------------*)
  1932. function float32_is_nan(a: float32): flag;
  1933. begin
  1934. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1935. end;
  1936. (*----------------------------------------------------------------------------
  1937. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1938. | NaN; otherwise returns 0.
  1939. *----------------------------------------------------------------------------*)
  1940. function float32_is_signaling_nan(a: float32):flag;
  1941. begin
  1942. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1943. end;
  1944. (*----------------------------------------------------------------------------
  1945. | Returns the result of converting the single-precision floating-point NaN
  1946. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1947. | exception is raised.
  1948. *----------------------------------------------------------------------------*)
  1949. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1950. var
  1951. z: commonNANT;
  1952. begin
  1953. if float32_is_signaling_nan(a)<>0 then
  1954. float_raise(float_flag_invalid);
  1955. z.sign := a shr 31;
  1956. z.low := 0;
  1957. z.high := a shl 9;
  1958. c:=z;
  1959. end;
  1960. (*----------------------------------------------------------------------------
  1961. | Returns the result of converting the canonical NaN `a' to the single-
  1962. | precision floating-point format.
  1963. *----------------------------------------------------------------------------*)
  1964. function CommonNanToFloat32(a : CommonNaNT): float32;
  1965. begin
  1966. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1967. end;
  1968. (*----------------------------------------------------------------------------
  1969. | Takes two single-precision floating-point values `a' and `b', one of which
  1970. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1971. | signaling NaN, the invalid exception is raised.
  1972. *----------------------------------------------------------------------------*)
  1973. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1974. var
  1975. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1976. begin
  1977. aIsNaN := float32_is_nan( a );
  1978. aIsSignalingNaN := float32_is_signaling_nan( a );
  1979. bIsNaN := float32_is_nan( b );
  1980. bIsSignalingNaN := float32_is_signaling_nan( b );
  1981. a := a or $00400000;
  1982. b := b or $00400000;
  1983. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1984. float_raise( float_flag_invalid );
  1985. if bIsSignalingNaN<>0 then
  1986. propagateFloat32Nan := b
  1987. else if aIsSignalingNan<>0 then
  1988. propagateFloat32Nan := a
  1989. else if bIsNan<>0 then
  1990. propagateFloat32Nan := b
  1991. else
  1992. propagateFloat32Nan := a;
  1993. end;
  1994. (*----------------------------------------------------------------------------
  1995. | The pattern for a default generated double-precision NaN. The `high' and
  1996. | `low' values hold the most- and least-significant bits, respectively.
  1997. *----------------------------------------------------------------------------*)
  1998. const
  1999. float64_default_nan_high = $7FFFFFFF;
  2000. float64_default_nan_low = $FFFFFFFF;
  2001. (*----------------------------------------------------------------------------
  2002. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2003. | otherwise returns 0.
  2004. *----------------------------------------------------------------------------*)
  2005. function float64_is_nan(a: float64): flag;
  2006. begin
  2007. float64_is_nan := flag (
  2008. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2009. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2010. end;
  2011. (*----------------------------------------------------------------------------
  2012. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2013. | NaN; otherwise returns 0.
  2014. *----------------------------------------------------------------------------*)
  2015. function float64_is_signaling_nan( a:float64): flag;
  2016. begin
  2017. float64_is_signaling_nan := flag(
  2018. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2019. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2020. end;
  2021. (*----------------------------------------------------------------------------
  2022. | Returns the result of converting the double-precision floating-point NaN
  2023. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2024. | exception is raised.
  2025. *----------------------------------------------------------------------------*)
  2026. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2027. var
  2028. z : commonNaNT;
  2029. begin
  2030. if ( float64_is_signaling_nan( a )<>0 ) then
  2031. float_raise( float_flag_invalid );
  2032. z.sign := a.high shr 31;
  2033. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2034. c:=z;
  2035. end;
  2036. (*----------------------------------------------------------------------------
  2037. | Returns the result of converting the canonical NaN `a' to the double-
  2038. | precision floating-point format.
  2039. *----------------------------------------------------------------------------*)
  2040. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2041. var
  2042. z: float64;
  2043. begin
  2044. shift64Right( a.high, a.low, 12, z.high, z.low );
  2045. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2046. c:=z;
  2047. end;
  2048. (*----------------------------------------------------------------------------
  2049. | Takes two double-precision floating-point values `a' and `b', one of which
  2050. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2051. | signaling NaN, the invalid exception is raised.
  2052. *----------------------------------------------------------------------------*)
  2053. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2054. var
  2055. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2056. begin
  2057. aIsNaN := float64_is_nan( a );
  2058. aIsSignalingNaN := float64_is_signaling_nan( a );
  2059. bIsNaN := float64_is_nan( b );
  2060. bIsSignalingNaN := float64_is_signaling_nan( b );
  2061. a.high := a.high or $00080000;
  2062. b.high := b.high or $00080000;
  2063. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2064. float_raise( float_flag_invalid );
  2065. if bIsSignalingNaN<>0 then
  2066. c := b
  2067. else if aIsSignalingNan<>0 then
  2068. c := a
  2069. else if bIsNan<>0 then
  2070. c := b
  2071. else
  2072. c := a;
  2073. end;
  2074. {$ENDIF}
  2075. (****************************************************************************)
  2076. (* END ENDIAN SPECIFIC CODE *)
  2077. (****************************************************************************)
  2078. {*
  2079. -------------------------------------------------------------------------------
  2080. Returns the fraction bits of the single-precision floating-point value `a'.
  2081. -------------------------------------------------------------------------------
  2082. *}
  2083. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2084. Begin
  2085. ExtractFloat32Frac := A AND $007FFFFF;
  2086. End;
  2087. {*
  2088. -------------------------------------------------------------------------------
  2089. Returns the exponent bits of the single-precision floating-point value `a'.
  2090. -------------------------------------------------------------------------------
  2091. *}
  2092. Function extractFloat32Exp( a: float32 ): Int16;
  2093. Begin
  2094. extractFloat32Exp := (a shr 23) AND $FF;
  2095. End;
  2096. {*
  2097. -------------------------------------------------------------------------------
  2098. Returns the sign bit of the single-precision floating-point value `a'.
  2099. -------------------------------------------------------------------------------
  2100. *}
  2101. Function extractFloat32Sign( a: float32 ): Flag;
  2102. Begin
  2103. extractFloat32Sign := a shr 31;
  2104. End;
  2105. {*
  2106. -------------------------------------------------------------------------------
  2107. Normalizes the subnormal single-precision floating-point value represented
  2108. by the denormalized significand `aSig'. The normalized exponent and
  2109. significand are stored at the locations pointed to by `zExpPtr' and
  2110. `zSigPtr', respectively.
  2111. -------------------------------------------------------------------------------
  2112. *}
  2113. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2114. Var
  2115. ShiftCount : BYTE;
  2116. Begin
  2117. shiftCount := countLeadingZeros32( aSig ) - 8;
  2118. zSigPtr := aSig shl shiftCount;
  2119. zExpPtr := 1 - shiftCount;
  2120. End;
  2121. {*
  2122. -------------------------------------------------------------------------------
  2123. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2124. single-precision floating-point value, returning the result. After being
  2125. shifted into the proper positions, the three fields are simply added
  2126. together to form the result. This means that any integer portion of `zSig'
  2127. will be added into the exponent. Since a properly normalized significand
  2128. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2129. than the desired result exponent whenever `zSig' is a complete, normalized
  2130. significand.
  2131. -------------------------------------------------------------------------------
  2132. *}
  2133. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2134. Begin
  2135. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2136. + zSig;
  2137. End;
  2138. {*
  2139. -------------------------------------------------------------------------------
  2140. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2141. and significand `zSig', and returns the proper single-precision floating-
  2142. point value corresponding to the abstract input. Ordinarily, the abstract
  2143. value is simply rounded and packed into the single-precision format, with
  2144. the inexact exception raised if the abstract input cannot be represented
  2145. exactly. However, if the abstract value is too large, the overflow and
  2146. inexact exceptions are raised and an infinity or maximal finite value is
  2147. returned. If the abstract value is too small, the input value is rounded to
  2148. a subnormal number, and the underflow and inexact exceptions are raised if
  2149. the abstract input cannot be represented exactly as a subnormal single-
  2150. precision floating-point number.
  2151. The input significand `zSig' has its binary point between bits 30
  2152. and 29, which is 7 bits to the left of the usual location. This shifted
  2153. significand must be normalized or smaller. If `zSig' is not normalized,
  2154. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2155. and it must not require rounding. In the usual case that `zSig' is
  2156. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2157. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2158. Binary Floating-Point Arithmetic.
  2159. -------------------------------------------------------------------------------
  2160. *}
  2161. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2162. Var
  2163. roundingMode : BYTE;
  2164. roundNearestEven : Flag;
  2165. roundIncrement, roundBits : BYTE;
  2166. IsTiny : Flag;
  2167. Begin
  2168. roundingMode := softfloat_rounding_mode;
  2169. if (roundingMode = float_round_nearest_even) then
  2170. Begin
  2171. roundNearestEven := Flag(TRUE);
  2172. end
  2173. else
  2174. roundNearestEven := Flag(FALSE);
  2175. roundIncrement := $40;
  2176. if ( Boolean(roundNearestEven) = FALSE) then
  2177. Begin
  2178. if ( roundingMode = float_round_to_zero ) Then
  2179. Begin
  2180. roundIncrement := 0;
  2181. End
  2182. else
  2183. Begin
  2184. roundIncrement := $7F;
  2185. if ( zSign <> 0 ) then
  2186. Begin
  2187. if roundingMode = float_round_up then roundIncrement := 0;
  2188. End
  2189. else
  2190. Begin
  2191. if roundingMode = float_round_down then roundIncrement := 0;
  2192. End;
  2193. End
  2194. End;
  2195. roundBits := zSig AND $7F;
  2196. if ($FD <= bits16 (zExp) ) then
  2197. Begin
  2198. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2199. Begin
  2200. float_raise( float_flag_overflow OR float_flag_inexact );
  2201. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2202. exit;
  2203. End;
  2204. if ( zExp < 0 ) then
  2205. Begin
  2206. isTiny :=
  2207. flag(( float_detect_tininess = float_tininess_before_rounding )
  2208. OR ( zExp < -1 )
  2209. OR ( (zSig + roundIncrement) < $80000000 ));
  2210. shift32RightJamming( zSig, - zExp, zSig );
  2211. zExp := 0;
  2212. roundBits := zSig AND $7F;
  2213. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2214. float_raise( float_flag_underflow );
  2215. End;
  2216. End;
  2217. if ( roundBits )<> 0 then
  2218. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2219. zSig := ( zSig + roundIncrement ) shr 7;
  2220. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2221. if ( zSig = 0 ) then zExp := 0;
  2222. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2223. exit;
  2224. End;
  2225. {*
  2226. -------------------------------------------------------------------------------
  2227. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2228. and significand `zSig', and returns the proper single-precision floating-
  2229. point value corresponding to the abstract input. This routine is just like
  2230. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2231. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2232. floating-point exponent.
  2233. -------------------------------------------------------------------------------
  2234. *}
  2235. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2236. Var
  2237. ShiftCount : int8;
  2238. Begin
  2239. shiftCount := countLeadingZeros32( zSig ) - 1;
  2240. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2241. End;
  2242. {*
  2243. -------------------------------------------------------------------------------
  2244. Returns the most-significant 20 fraction bits of the double-precision
  2245. floating-point value `a'.
  2246. -------------------------------------------------------------------------------
  2247. *}
  2248. Function extractFloat64Frac0(a: float64): bits32;
  2249. Begin
  2250. extractFloat64Frac0 := a.high and $000FFFFF;
  2251. End;
  2252. {*
  2253. -------------------------------------------------------------------------------
  2254. Returns the least-significant 32 fraction bits of the double-precision
  2255. floating-point value `a'.
  2256. -------------------------------------------------------------------------------
  2257. *}
  2258. Function extractFloat64Frac1(a: float64): bits32;
  2259. Begin
  2260. extractFloat64Frac1 := a.low;
  2261. End;
  2262. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2263. Function extractFloat64Frac(a: float64): bits64;
  2264. Begin
  2265. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2266. End;
  2267. {*
  2268. -------------------------------------------------------------------------------
  2269. Returns the exponent bits of the double-precision floating-point value `a'.
  2270. -------------------------------------------------------------------------------
  2271. *}
  2272. Function extractFloat64Exp(a: float64): int16;
  2273. Begin
  2274. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2275. End;
  2276. {*
  2277. -------------------------------------------------------------------------------
  2278. Returns the sign bit of the double-precision floating-point value `a'.
  2279. -------------------------------------------------------------------------------
  2280. *}
  2281. Function extractFloat64Sign(a: float64) : flag;
  2282. Begin
  2283. extractFloat64Sign := a.high shr 31;
  2284. End;
  2285. {*
  2286. -------------------------------------------------------------------------------
  2287. Normalizes the subnormal double-precision floating-point value represented
  2288. by the denormalized significand formed by the concatenation of `aSig0' and
  2289. `aSig1'. The normalized exponent is stored at the location pointed to by
  2290. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2291. stored at the location pointed to by `zSig0Ptr', and the least significant
  2292. 32 bits of the normalized significand are stored at the location pointed to
  2293. by `zSig1Ptr'.
  2294. -------------------------------------------------------------------------------
  2295. *}
  2296. Procedure normalizeFloat64Subnormal(
  2297. aSig0: bits32;
  2298. aSig1: bits32;
  2299. VAR zExpPtr : Int16;
  2300. VAR zSig0Ptr : Bits32;
  2301. VAR zSig1Ptr : Bits32
  2302. );
  2303. Var
  2304. ShiftCount : Int8;
  2305. Begin
  2306. if ( aSig0 = 0 ) then
  2307. Begin
  2308. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2309. if ( shiftCount < 0 ) then
  2310. Begin
  2311. zSig0Ptr := aSig1 shr ( - shiftCount );
  2312. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2313. End
  2314. else
  2315. Begin
  2316. zSig0Ptr := aSig1 shl shiftCount;
  2317. zSig1Ptr := 0;
  2318. End;
  2319. zExpPtr := - shiftCount - 31;
  2320. End
  2321. else
  2322. Begin
  2323. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2324. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2325. zExpPtr := 1 - shiftCount;
  2326. End;
  2327. End;
  2328. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2329. var
  2330. shiftCount : int8;
  2331. begin
  2332. shiftCount := countLeadingZeros64( aSig ) - 11;
  2333. zSigPtr := aSig shl shiftCount;
  2334. zExpPtr := 1 - shiftCount;
  2335. end;
  2336. {*
  2337. -------------------------------------------------------------------------------
  2338. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2339. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2340. point value, returning the result. After being shifted into the proper
  2341. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2342. together to form the most significant 32 bits of the result. This means
  2343. that any integer portion of `zSig0' will be added into the exponent. Since
  2344. a properly normalized significand will have an integer portion equal to 1,
  2345. the `zExp' input should be 1 less than the desired result exponent whenever
  2346. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2347. -------------------------------------------------------------------------------
  2348. *}
  2349. Procedure
  2350. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2351. var
  2352. z: Float64;
  2353. Begin
  2354. z.low := zSig1;
  2355. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2356. c := z;
  2357. End;
  2358. {*----------------------------------------------------------------------------
  2359. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2360. | double-precision floating-point value, returning the result. After being
  2361. | shifted into the proper positions, the three fields are simply added
  2362. | together to form the result. This means that any integer portion of `zSig'
  2363. | will be added into the exponent. Since a properly normalized significand
  2364. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2365. | than the desired result exponent whenever `zSig' is a complete, normalized
  2366. | significand.
  2367. *----------------------------------------------------------------------------*}
  2368. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2369. begin
  2370. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2371. end;
  2372. {*
  2373. -------------------------------------------------------------------------------
  2374. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2375. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2376. and `zSig2', and returns the proper double-precision floating-point value
  2377. corresponding to the abstract input. Ordinarily, the abstract value is
  2378. simply rounded and packed into the double-precision format, with the inexact
  2379. exception raised if the abstract input cannot be represented exactly.
  2380. However, if the abstract value is too large, the overflow and inexact
  2381. exceptions are raised and an infinity or maximal finite value is returned.
  2382. If the abstract value is too small, the input value is rounded to a
  2383. subnormal number, and the underflow and inexact exceptions are raised if the
  2384. abstract input cannot be represented exactly as a subnormal double-precision
  2385. floating-point number.
  2386. The input significand must be normalized or smaller. If the input
  2387. significand is not normalized, `zExp' must be 0; in that case, the result
  2388. returned is a subnormal number, and it must not require rounding. In the
  2389. usual case that the input significand is normalized, `zExp' must be 1 less
  2390. than the ``true'' floating-point exponent. The handling of underflow and
  2391. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2392. -------------------------------------------------------------------------------
  2393. *}
  2394. Procedure
  2395. roundAndPackFloat64(
  2396. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2397. Var
  2398. roundingMode : Int8;
  2399. roundNearestEven, increment, isTiny : Flag;
  2400. Begin
  2401. roundingMode := softfloat_rounding_mode;
  2402. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2403. increment := flag( sbits32 (zSig2) < 0 );
  2404. if ( roundNearestEven = flag(FALSE) ) then
  2405. Begin
  2406. if ( roundingMode = float_round_to_zero ) then
  2407. increment := 0
  2408. else
  2409. Begin
  2410. if ( zSign )<> 0 then
  2411. Begin
  2412. increment := flag( roundingMode = float_round_down ) and zSig2;
  2413. End
  2414. else
  2415. Begin
  2416. increment := flag( roundingMode = float_round_up ) and zSig2;
  2417. End
  2418. End
  2419. End;
  2420. if ( $7FD <= bits16 (zExp) ) then
  2421. Begin
  2422. if (( $7FD < zExp )
  2423. or (( zExp = $7FD )
  2424. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2425. and (increment<>0)
  2426. )
  2427. ) then
  2428. Begin
  2429. float_raise( float_flag_overflow OR float_flag_inexact );
  2430. if (( roundingMode = float_round_to_zero )
  2431. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2432. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2433. ) then
  2434. Begin
  2435. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2436. exit;
  2437. End;
  2438. packFloat64( zSign, $7FF, 0, 0, c );
  2439. exit;
  2440. End;
  2441. if ( zExp < 0 ) then
  2442. Begin
  2443. isTiny :=
  2444. flag( float_detect_tininess = float_tininess_before_rounding )
  2445. or flag( zExp < -1 )
  2446. or flag(increment = 0)
  2447. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2448. shift64ExtraRightJamming(
  2449. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2450. zExp := 0;
  2451. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2452. if ( roundNearestEven )<>0 then
  2453. Begin
  2454. increment := flag( sbits32 (zSig2) < 0 );
  2455. End
  2456. else
  2457. Begin
  2458. if ( zSign )<>0 then
  2459. Begin
  2460. increment := flag( roundingMode = float_round_down ) and zSig2;
  2461. End
  2462. else
  2463. Begin
  2464. increment := flag( roundingMode = float_round_up ) and zSig2;
  2465. End
  2466. End;
  2467. End;
  2468. End;
  2469. if ( zSig2 )<>0 then
  2470. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2471. if ( increment )<>0 then
  2472. Begin
  2473. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2474. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2475. End
  2476. else
  2477. Begin
  2478. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2479. End;
  2480. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2481. End;
  2482. {*----------------------------------------------------------------------------
  2483. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2484. | and significand `zSig', and returns the proper double-precision floating-
  2485. | point value corresponding to the abstract input. Ordinarily, the abstract
  2486. | value is simply rounded and packed into the double-precision format, with
  2487. | the inexact exception raised if the abstract input cannot be represented
  2488. | exactly. However, if the abstract value is too large, the overflow and
  2489. | inexact exceptions are raised and an infinity or maximal finite value is
  2490. | returned. If the abstract value is too small, the input value is rounded
  2491. | to a subnormal number, and the underflow and inexact exceptions are raised
  2492. | if the abstract input cannot be represented exactly as a subnormal double-
  2493. | precision floating-point number.
  2494. | The input significand `zSig' has its binary point between bits 62
  2495. | and 61, which is 10 bits to the left of the usual location. This shifted
  2496. | significand must be normalized or smaller. If `zSig' is not normalized,
  2497. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2498. | and it must not require rounding. In the usual case that `zSig' is
  2499. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2500. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2501. | Binary Floating-Point Arithmetic.
  2502. *----------------------------------------------------------------------------*}
  2503. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2504. var
  2505. roundingMode: int8;
  2506. roundNearestEven: flag;
  2507. roundIncrement, roundBits: int16;
  2508. isTiny: flag;
  2509. begin
  2510. roundingMode := softfloat_rounding_mode;
  2511. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2512. roundIncrement := $200;
  2513. if ( roundNearestEven=0 ) then
  2514. begin
  2515. if ( roundingMode = float_round_to_zero ) then
  2516. begin
  2517. roundIncrement := 0;
  2518. end
  2519. else begin
  2520. roundIncrement := $3FF;
  2521. if ( zSign<>0 ) then
  2522. begin
  2523. if ( roundingMode = float_round_up ) then
  2524. roundIncrement := 0;
  2525. end
  2526. else begin
  2527. if ( roundingMode = float_round_down ) then
  2528. roundIncrement := 0;
  2529. end
  2530. end
  2531. end;
  2532. roundBits := zSig and $3FF;
  2533. if ( $7FD <= bits16(zExp) ) then
  2534. begin
  2535. if ( ( $7FD < zExp )
  2536. or ( ( zExp = $7FD )
  2537. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2538. ) then
  2539. begin
  2540. float_raise( float_flag_overflow or float_flag_inexact );
  2541. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2542. exit;
  2543. end;
  2544. if ( zExp < 0 ) then
  2545. begin
  2546. isTiny := ord(
  2547. ( float_detect_tininess = float_tininess_before_rounding )
  2548. or ( zExp < -1 )
  2549. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2550. shift64RightJamming( zSig, - zExp, zSig );
  2551. zExp := 0;
  2552. roundBits := zSig and $3FF;
  2553. if ( isTiny and roundBits )<>0 then
  2554. float_raise( float_flag_underflow );
  2555. end
  2556. end;
  2557. if ( roundBits<>0 ) then
  2558. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2559. zSig := ( zSig + roundIncrement ) shr 10;
  2560. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2561. if ( zSig = 0 ) then
  2562. zExp := 0;
  2563. result:=packFloat64( zSign, zExp, zSig );
  2564. end;
  2565. {*
  2566. -------------------------------------------------------------------------------
  2567. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2568. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2569. returns the proper double-precision floating-point value corresponding
  2570. to the abstract input. This routine is just like `roundAndPackFloat64'
  2571. except that the input significand has fewer bits and does not have to be
  2572. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2573. point exponent.
  2574. -------------------------------------------------------------------------------
  2575. *}
  2576. Procedure
  2577. normalizeRoundAndPackFloat64(
  2578. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2579. Var
  2580. shiftCount : int8;
  2581. zSig2 : bits32;
  2582. Begin
  2583. if ( zSig0 = 0 ) then
  2584. Begin
  2585. zSig0 := zSig1;
  2586. zSig1 := 0;
  2587. zExp := zExp -32;
  2588. End;
  2589. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2590. if ( 0 <= shiftCount ) then
  2591. Begin
  2592. zSig2 := 0;
  2593. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2594. End
  2595. else
  2596. Begin
  2597. shift64ExtraRightJamming
  2598. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2599. End;
  2600. zExp := zExp - shiftCount;
  2601. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2602. End;
  2603. {*
  2604. -------------------------------------------------------------------------------
  2605. Returns the result of converting the 32-bit two's complement integer `a' to
  2606. the single-precision floating-point format. The conversion is performed
  2607. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2608. -------------------------------------------------------------------------------
  2609. *}
  2610. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2611. Var
  2612. zSign : Flag;
  2613. Begin
  2614. if ( a = 0 ) then
  2615. Begin
  2616. int32_to_float32.float32 := 0;
  2617. exit;
  2618. End;
  2619. if ( a = sbits32 ($80000000) ) then
  2620. Begin
  2621. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2622. exit;
  2623. end;
  2624. zSign := flag( a < 0 );
  2625. If zSign<>0 then
  2626. a := -a;
  2627. int32_to_float32.float32:=
  2628. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2629. End;
  2630. {*
  2631. -------------------------------------------------------------------------------
  2632. Returns the result of converting the 32-bit two's complement integer `a' to
  2633. the double-precision floating-point format. The conversion is performed
  2634. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2635. -------------------------------------------------------------------------------
  2636. *}
  2637. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2638. var
  2639. zSign : flag;
  2640. absA : bits32;
  2641. shiftCount : int8;
  2642. zSig0, zSig1 : bits32;
  2643. Begin
  2644. if ( a = 0 ) then
  2645. Begin
  2646. packFloat64( 0, 0, 0, 0, result );
  2647. exit;
  2648. end;
  2649. zSign := flag( a < 0 );
  2650. if ZSign<>0 then
  2651. AbsA := -a
  2652. else
  2653. AbsA := a;
  2654. shiftCount := countLeadingZeros32( absA ) - 11;
  2655. if ( 0 <= shiftCount ) then
  2656. Begin
  2657. zSig0 := absA shl shiftCount;
  2658. zSig1 := 0;
  2659. End
  2660. else
  2661. Begin
  2662. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2663. End;
  2664. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2665. End;
  2666. {*
  2667. -------------------------------------------------------------------------------
  2668. Returns the result of converting the single-precision floating-point value
  2669. `a' to the 32-bit two's complement integer format. The conversion is
  2670. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2671. Arithmetic---which means in particular that the conversion is rounded
  2672. according to the current rounding mode. If `a' is a NaN, the largest
  2673. positive integer is returned. Otherwise, if the conversion overflows, the
  2674. largest integer with the same sign as `a' is returned.
  2675. -------------------------------------------------------------------------------
  2676. *}
  2677. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2678. Var
  2679. aSign: flag;
  2680. aExp, shiftCount: int16;
  2681. aSig, aSigExtra: bits32;
  2682. z: int32;
  2683. roundingMode: int8;
  2684. Begin
  2685. aSig := extractFloat32Frac( a.float32 );
  2686. aExp := extractFloat32Exp( a.float32 );
  2687. aSign := extractFloat32Sign( a.float32 );
  2688. shiftCount := aExp - $96;
  2689. if ( 0 <= shiftCount ) then
  2690. Begin
  2691. if ( $9E <= aExp ) then
  2692. Begin
  2693. if ( a.float32 <> $CF000000 ) then
  2694. Begin
  2695. float_raise( float_flag_invalid );
  2696. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2697. Begin
  2698. float32_to_int32 := $7FFFFFFF;
  2699. exit;
  2700. End;
  2701. End;
  2702. float32_to_int32 := sbits32 ($80000000);
  2703. exit;
  2704. End;
  2705. z := ( aSig or $00800000 ) shl shiftCount;
  2706. if ( aSign<>0 ) then z := - z;
  2707. End
  2708. else
  2709. Begin
  2710. if ( aExp < $7E ) then
  2711. Begin
  2712. aSigExtra := aExp OR aSig;
  2713. z := 0;
  2714. End
  2715. else
  2716. Begin
  2717. aSig := aSig OR $00800000;
  2718. aSigExtra := aSig shl ( shiftCount and 31 );
  2719. z := aSig shr ( - shiftCount );
  2720. End;
  2721. if ( aSigExtra<>0 ) then
  2722. softfloat_exception_flags := softfloat_exception_flags
  2723. or float_flag_inexact;
  2724. roundingMode := softfloat_rounding_mode;
  2725. if ( roundingMode = float_round_nearest_even ) then
  2726. Begin
  2727. if ( sbits32 (aSigExtra) < 0 ) then
  2728. Begin
  2729. Inc(z);
  2730. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2731. z := z and not 1;
  2732. End;
  2733. if ( aSign<>0 ) then
  2734. z := - z;
  2735. End
  2736. else
  2737. Begin
  2738. aSigExtra := flag( aSigExtra <> 0 );
  2739. if ( aSign<>0 ) then
  2740. Begin
  2741. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2742. z := - z;
  2743. End
  2744. else
  2745. Begin
  2746. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2747. End
  2748. End;
  2749. End;
  2750. float32_to_int32 := z;
  2751. End;
  2752. {*
  2753. -------------------------------------------------------------------------------
  2754. Returns the result of converting the single-precision floating-point value
  2755. `a' to the 32-bit two's complement integer format. The conversion is
  2756. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2757. Arithmetic, except that the conversion is always rounded toward zero.
  2758. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2759. the conversion overflows, the largest integer with the same sign as `a' is
  2760. returned.
  2761. -------------------------------------------------------------------------------
  2762. *}
  2763. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2764. Var
  2765. aSign : flag;
  2766. aExp, shiftCount : int16;
  2767. aSig : bits32;
  2768. z : int32;
  2769. Begin
  2770. aSig := extractFloat32Frac( a.float32 );
  2771. aExp := extractFloat32Exp( a.float32 );
  2772. aSign := extractFloat32Sign( a.float32 );
  2773. shiftCount := aExp - $9E;
  2774. if ( 0 <= shiftCount ) then
  2775. Begin
  2776. if ( a.float32 <> $CF000000 ) then
  2777. Begin
  2778. float_raise( float_flag_invalid );
  2779. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2780. Begin
  2781. float32_to_int32_round_to_zero := $7FFFFFFF;
  2782. exit;
  2783. end;
  2784. End;
  2785. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2786. exit;
  2787. End
  2788. else
  2789. if ( aExp <= $7E ) then
  2790. Begin
  2791. if ( aExp or aSig )<>0 then
  2792. softfloat_exception_flags :=
  2793. softfloat_exception_flags or float_flag_inexact;
  2794. float32_to_int32_round_to_zero := 0;
  2795. exit;
  2796. End;
  2797. aSig := ( aSig or $00800000 ) shl 8;
  2798. z := aSig shr ( - shiftCount );
  2799. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2800. Begin
  2801. softfloat_exception_flags :=
  2802. softfloat_exception_flags or float_flag_inexact;
  2803. End;
  2804. if ( aSign<>0 ) then z := - z;
  2805. float32_to_int32_round_to_zero := z;
  2806. End;
  2807. {*
  2808. -------------------------------------------------------------------------------
  2809. Returns the result of converting the single-precision floating-point value
  2810. `a' to the double-precision floating-point format. The conversion is
  2811. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2812. Arithmetic.
  2813. -------------------------------------------------------------------------------
  2814. *}
  2815. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2816. Var
  2817. aSign : flag;
  2818. aExp : int16;
  2819. aSig, zSig0, zSig1: bits32;
  2820. tmp : CommonNanT;
  2821. Begin
  2822. aSig := extractFloat32Frac( a.float32 );
  2823. aExp := extractFloat32Exp( a.float32 );
  2824. aSign := extractFloat32Sign( a.float32 );
  2825. if ( aExp = $FF ) then
  2826. Begin
  2827. if ( aSig<>0 ) then
  2828. Begin
  2829. float32ToCommonNaN(a.float32, tmp);
  2830. commonNaNToFloat64(tmp , result);
  2831. exit;
  2832. End;
  2833. packFloat64( aSign, $7FF, 0, 0, result);
  2834. exit;
  2835. End;
  2836. if ( aExp = 0 ) then
  2837. Begin
  2838. if ( aSig = 0 ) then
  2839. Begin
  2840. packFloat64( aSign, 0, 0, 0, result );
  2841. exit;
  2842. end;
  2843. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2844. Dec(aExp);
  2845. End;
  2846. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2847. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2848. End;
  2849. {*
  2850. -------------------------------------------------------------------------------
  2851. Rounds the single-precision floating-point value `a' to an integer,
  2852. and returns the result as a single-precision floating-point value. The
  2853. operation is performed according to the IEC/IEEE Standard for Binary
  2854. Floating-Point Arithmetic.
  2855. -------------------------------------------------------------------------------
  2856. *}
  2857. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2858. Var
  2859. aSign: flag;
  2860. aExp: int16;
  2861. lastBitMask, roundBitsMask: bits32;
  2862. roundingMode: int8;
  2863. z: float32;
  2864. Begin
  2865. aExp := extractFloat32Exp( a.float32 );
  2866. if ( $96 <= aExp ) then
  2867. Begin
  2868. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2869. Begin
  2870. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2871. exit;
  2872. End;
  2873. float32_round_to_int:=a;
  2874. exit;
  2875. End;
  2876. if ( aExp <= $7E ) then
  2877. Begin
  2878. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2879. Begin
  2880. float32_round_to_int:=a;
  2881. exit;
  2882. end;
  2883. softfloat_exception_flags
  2884. := softfloat_exception_flags OR float_flag_inexact;
  2885. aSign := extractFloat32Sign( a.float32 );
  2886. case ( softfloat_rounding_mode ) of
  2887. float_round_nearest_even:
  2888. Begin
  2889. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2890. Begin
  2891. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2892. exit;
  2893. End;
  2894. End;
  2895. float_round_down:
  2896. Begin
  2897. if aSign <> 0 then
  2898. float32_round_to_int.float32 := $BF800000
  2899. else
  2900. float32_round_to_int.float32 := 0;
  2901. exit;
  2902. End;
  2903. float_round_up:
  2904. Begin
  2905. if aSign <> 0 then
  2906. float32_round_to_int.float32 := $80000000
  2907. else
  2908. float32_round_to_int.float32 := $3F800000;
  2909. exit;
  2910. End;
  2911. end;
  2912. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2913. End;
  2914. lastBitMask := 1;
  2915. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2916. lastBitMask := lastBitMask shl ($96 - aExp);
  2917. roundBitsMask := lastBitMask - 1;
  2918. z := a.float32;
  2919. roundingMode := softfloat_rounding_mode;
  2920. if ( roundingMode = float_round_nearest_even ) then
  2921. Begin
  2922. z := z + (lastBitMask shr 1);
  2923. if ( ( z and roundBitsMask ) = 0 ) then
  2924. z := z and not lastBitMask;
  2925. End
  2926. else if ( roundingMode <> float_round_to_zero ) then
  2927. Begin
  2928. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2929. Begin
  2930. z := z + roundBitsMask;
  2931. End;
  2932. End;
  2933. z := z and not roundBitsMask;
  2934. if ( z <> a.float32 ) then
  2935. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2936. float32_round_to_int.float32 := z;
  2937. End;
  2938. {*
  2939. -------------------------------------------------------------------------------
  2940. Returns the result of adding the absolute values of the single-precision
  2941. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2942. before being returned. `zSign' is ignored if the result is a NaN.
  2943. The addition is performed according to the IEC/IEEE Standard for Binary
  2944. Floating-Point Arithmetic.
  2945. -------------------------------------------------------------------------------
  2946. *}
  2947. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2948. Var
  2949. aExp, bExp, zExp: int16;
  2950. aSig, bSig, zSig: bits32;
  2951. expDiff: int16;
  2952. label roundAndPack;
  2953. Begin
  2954. aSig:=extractFloat32Frac( a );
  2955. aExp:=extractFloat32Exp( a );
  2956. bSig:=extractFloat32Frac( b );
  2957. bExp := extractFloat32Exp( b );
  2958. expDiff := aExp - bExp;
  2959. aSig := aSig shl 6;
  2960. bSig := bSig shl 6;
  2961. if ( 0 < expDiff ) then
  2962. Begin
  2963. if ( aExp = $FF ) then
  2964. Begin
  2965. if ( aSig <> 0) then
  2966. Begin
  2967. addFloat32Sigs := propagateFloat32NaN( a, b );
  2968. exit;
  2969. End;
  2970. addFloat32Sigs := a;
  2971. exit;
  2972. End;
  2973. if ( bExp = 0 ) then
  2974. Begin
  2975. Dec(expDiff);
  2976. End
  2977. else
  2978. Begin
  2979. bSig := bSig or $20000000;
  2980. End;
  2981. shift32RightJamming( bSig, expDiff, bSig );
  2982. zExp := aExp;
  2983. End
  2984. else
  2985. If ( expDiff < 0 ) then
  2986. Begin
  2987. if ( bExp = $FF ) then
  2988. Begin
  2989. if ( bSig<>0 ) then
  2990. Begin
  2991. addFloat32Sigs := propagateFloat32NaN( a, b );
  2992. exit;
  2993. end;
  2994. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  2995. exit;
  2996. End;
  2997. if ( aExp = 0 ) then
  2998. Begin
  2999. Inc(expDiff);
  3000. End
  3001. else
  3002. Begin
  3003. aSig := aSig OR $20000000;
  3004. End;
  3005. shift32RightJamming( aSig, - expDiff, aSig );
  3006. zExp := bExp;
  3007. End
  3008. else
  3009. Begin
  3010. if ( aExp = $FF ) then
  3011. Begin
  3012. if ( aSig OR bSig )<> 0 then
  3013. Begin
  3014. addFloat32Sigs := propagateFloat32NaN( a, b );
  3015. exit;
  3016. end;
  3017. addFloat32Sigs := a;
  3018. exit;
  3019. End;
  3020. if ( aExp = 0 ) then
  3021. Begin
  3022. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3023. exit;
  3024. end;
  3025. zSig := $40000000 + aSig + bSig;
  3026. zExp := aExp;
  3027. goto roundAndPack;
  3028. End;
  3029. aSig := aSig OR $20000000;
  3030. zSig := ( aSig + bSig ) shl 1;
  3031. Dec(zExp);
  3032. if ( sbits32 (zSig) < 0 ) then
  3033. Begin
  3034. zSig := aSig + bSig;
  3035. Inc(zExp);
  3036. End;
  3037. roundAndPack:
  3038. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3039. End;
  3040. {*
  3041. -------------------------------------------------------------------------------
  3042. Returns the result of subtracting the absolute values of the single-
  3043. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3044. difference is negated before being returned. `zSign' is ignored if the
  3045. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3046. Standard for Binary Floating-Point Arithmetic.
  3047. -------------------------------------------------------------------------------
  3048. *}
  3049. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3050. Var
  3051. aExp, bExp, zExp: int16;
  3052. aSig, bSig, zSig: bits32;
  3053. expDiff : int16;
  3054. label aExpBigger;
  3055. label bExpBigger;
  3056. label aBigger;
  3057. label bBigger;
  3058. label normalizeRoundAndPack;
  3059. Begin
  3060. aSig := extractFloat32Frac( a );
  3061. aExp := extractFloat32Exp( a );
  3062. bSig := extractFloat32Frac( b );
  3063. bExp := extractFloat32Exp( b );
  3064. expDiff := aExp - bExp;
  3065. aSig := aSig shl 7;
  3066. bSig := bSig shl 7;
  3067. if ( 0 < expDiff ) then goto aExpBigger;
  3068. if ( expDiff < 0 ) then goto bExpBigger;
  3069. if ( aExp = $FF ) then
  3070. Begin
  3071. if ( aSig OR bSig )<> 0 then
  3072. Begin
  3073. subFloat32Sigs := propagateFloat32NaN( a, b );
  3074. exit;
  3075. End;
  3076. float_raise( float_flag_invalid );
  3077. subFloat32Sigs := float32_default_nan;
  3078. exit;
  3079. End;
  3080. if ( aExp = 0 ) then
  3081. Begin
  3082. aExp := 1;
  3083. bExp := 1;
  3084. End;
  3085. if ( bSig < aSig ) Then goto aBigger;
  3086. if ( aSig < bSig ) Then goto bBigger;
  3087. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3088. exit;
  3089. bExpBigger:
  3090. if ( bExp = $FF ) then
  3091. Begin
  3092. if ( bSig<>0 ) then
  3093. Begin
  3094. subFloat32Sigs := propagateFloat32NaN( a, b );
  3095. exit;
  3096. End;
  3097. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3098. exit;
  3099. End;
  3100. if ( aExp = 0 ) then
  3101. Begin
  3102. Inc(expDiff);
  3103. End
  3104. else
  3105. Begin
  3106. aSig := aSig OR $40000000;
  3107. End;
  3108. shift32RightJamming( aSig, - expDiff, aSig );
  3109. bSig := bSig OR $40000000;
  3110. bBigger:
  3111. zSig := bSig - aSig;
  3112. zExp := bExp;
  3113. zSign := zSign xor 1;
  3114. goto normalizeRoundAndPack;
  3115. aExpBigger:
  3116. if ( aExp = $FF ) then
  3117. Begin
  3118. if ( aSig <> 0) then
  3119. Begin
  3120. subFloat32Sigs := propagateFloat32NaN( a, b );
  3121. exit;
  3122. End;
  3123. subFloat32Sigs := a;
  3124. exit;
  3125. End;
  3126. if ( bExp = 0 ) then
  3127. Begin
  3128. Dec(expDiff);
  3129. End
  3130. else
  3131. Begin
  3132. bSig := bSig OR $40000000;
  3133. End;
  3134. shift32RightJamming( bSig, expDiff, bSig );
  3135. aSig := aSig OR $40000000;
  3136. aBigger:
  3137. zSig := aSig - bSig;
  3138. zExp := aExp;
  3139. normalizeRoundAndPack:
  3140. Dec(zExp);
  3141. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3142. End;
  3143. {*
  3144. -------------------------------------------------------------------------------
  3145. Returns the result of adding the single-precision floating-point values `a'
  3146. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3147. Binary Floating-Point Arithmetic.
  3148. -------------------------------------------------------------------------------
  3149. *}
  3150. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3151. Var
  3152. aSign, bSign: Flag;
  3153. Begin
  3154. aSign := extractFloat32Sign( a.float32 );
  3155. bSign := extractFloat32Sign( b.float32 );
  3156. if ( aSign = bSign ) then
  3157. Begin
  3158. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3159. End
  3160. else
  3161. Begin
  3162. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3163. End;
  3164. End;
  3165. {*
  3166. -------------------------------------------------------------------------------
  3167. Returns the result of subtracting the single-precision floating-point values
  3168. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3169. for Binary Floating-Point Arithmetic.
  3170. -------------------------------------------------------------------------------
  3171. *}
  3172. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3173. Var
  3174. aSign, bSign: flag;
  3175. Begin
  3176. aSign := extractFloat32Sign( a.float32 );
  3177. bSign := extractFloat32Sign( b.float32 );
  3178. if ( aSign = bSign ) then
  3179. Begin
  3180. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3181. End
  3182. else
  3183. Begin
  3184. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3185. End;
  3186. End;
  3187. {*
  3188. -------------------------------------------------------------------------------
  3189. Returns the result of multiplying the single-precision floating-point values
  3190. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3191. for Binary Floating-Point Arithmetic.
  3192. -------------------------------------------------------------------------------
  3193. *}
  3194. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3195. Var
  3196. aSign, bSign, zSign: flag;
  3197. aExp, bExp, zExp : int16;
  3198. aSig, bSig, zSig0, zSig1: bits32;
  3199. Begin
  3200. aSig := extractFloat32Frac( a.float32 );
  3201. aExp := extractFloat32Exp( a.float32 );
  3202. aSign := extractFloat32Sign( a.float32 );
  3203. bSig := extractFloat32Frac( b.float32 );
  3204. bExp := extractFloat32Exp( b.float32 );
  3205. bSign := extractFloat32Sign( b.float32 );
  3206. zSign := aSign xor bSign;
  3207. if ( aExp = $FF ) then
  3208. Begin
  3209. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3210. Begin
  3211. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3212. End;
  3213. if ( ( bExp OR bSig ) = 0 ) then
  3214. Begin
  3215. float_raise( float_flag_invalid );
  3216. float32_mul.float32 := float32_default_nan;
  3217. exit;
  3218. End;
  3219. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3220. exit;
  3221. End;
  3222. if ( bExp = $FF ) then
  3223. Begin
  3224. if ( bSig <> 0 ) then
  3225. Begin
  3226. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3227. exit;
  3228. End;
  3229. if ( ( aExp OR aSig ) = 0 ) then
  3230. Begin
  3231. float_raise( float_flag_invalid );
  3232. float32_mul.float32 := float32_default_nan;
  3233. exit;
  3234. End;
  3235. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3236. exit;
  3237. End;
  3238. if ( aExp = 0 ) then
  3239. Begin
  3240. if ( aSig = 0 ) then
  3241. Begin
  3242. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3243. exit;
  3244. End;
  3245. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3246. End;
  3247. if ( bExp = 0 ) then
  3248. Begin
  3249. if ( bSig = 0 ) then
  3250. Begin
  3251. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3252. exit;
  3253. End;
  3254. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3255. End;
  3256. zExp := aExp + bExp - $7F;
  3257. aSig := ( aSig OR $00800000 ) shl 7;
  3258. bSig := ( bSig OR $00800000 ) shl 8;
  3259. mul32To64( aSig, bSig, zSig0, zSig1 );
  3260. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3261. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3262. Begin
  3263. zSig0 := zSig0 shl 1;
  3264. Dec(zExp);
  3265. End;
  3266. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3267. End;
  3268. {*
  3269. -------------------------------------------------------------------------------
  3270. Returns the result of dividing the single-precision floating-point value `a'
  3271. by the corresponding value `b'. The operation is performed according to the
  3272. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3273. -------------------------------------------------------------------------------
  3274. *}
  3275. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3276. Var
  3277. aSign, bSign, zSign: flag;
  3278. aExp, bExp, zExp: int16;
  3279. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3280. Begin
  3281. aSig := extractFloat32Frac( a.float32 );
  3282. aExp := extractFloat32Exp( a.float32 );
  3283. aSign := extractFloat32Sign( a.float32 );
  3284. bSig := extractFloat32Frac( b.float32 );
  3285. bExp := extractFloat32Exp( b.float32 );
  3286. bSign := extractFloat32Sign( b.float32 );
  3287. zSign := aSign xor bSign;
  3288. if ( aExp = $FF ) then
  3289. Begin
  3290. if ( aSig <> 0 ) then
  3291. Begin
  3292. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3293. exit;
  3294. End;
  3295. if ( bExp = $FF ) then
  3296. Begin
  3297. if ( bSig <> 0) then
  3298. Begin
  3299. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3300. End;
  3301. float_raise( float_flag_invalid );
  3302. float32_div.float32 := float32_default_nan;
  3303. exit;
  3304. End;
  3305. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3306. exit;
  3307. End;
  3308. if ( bExp = $FF ) then
  3309. Begin
  3310. if ( bSig <> 0) then
  3311. Begin
  3312. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3313. exit;
  3314. End;
  3315. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3316. exit;
  3317. End;
  3318. if ( bExp = 0 ) Then
  3319. Begin
  3320. if ( bSig = 0 ) Then
  3321. Begin
  3322. if ( ( aExp OR aSig ) = 0 ) then
  3323. Begin
  3324. float_raise( float_flag_invalid );
  3325. float32_div.float32 := float32_default_nan;
  3326. exit;
  3327. End;
  3328. float_raise( float_flag_divbyzero );
  3329. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3330. exit;
  3331. End;
  3332. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3333. End;
  3334. if ( aExp = 0 ) Then
  3335. Begin
  3336. if ( aSig = 0 ) Then
  3337. Begin
  3338. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3339. exit;
  3340. End;
  3341. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3342. End;
  3343. zExp := aExp - bExp + $7D;
  3344. aSig := ( aSig OR $00800000 ) shl 7;
  3345. bSig := ( bSig OR $00800000 ) shl 8;
  3346. if ( bSig <= ( aSig + aSig ) ) then
  3347. Begin
  3348. aSig := aSig shr 1;
  3349. Inc(zExp);
  3350. End;
  3351. zSig := estimateDiv64To32( aSig, 0, bSig );
  3352. if ( ( zSig and $3F ) <= 2 ) then
  3353. Begin
  3354. mul32To64( bSig, zSig, term0, term1 );
  3355. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3356. while ( sbits32 (rem0) < 0 ) do
  3357. Begin
  3358. Dec(zSig);
  3359. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3360. End;
  3361. zSig := zSig or bits32( rem1 <> 0 );
  3362. End;
  3363. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3364. End;
  3365. {*
  3366. -------------------------------------------------------------------------------
  3367. Returns the remainder of the single-precision floating-point value `a'
  3368. with respect to the corresponding value `b'. The operation is performed
  3369. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3370. -------------------------------------------------------------------------------
  3371. *}
  3372. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3373. Var
  3374. aSign, bSign, zSign: flag;
  3375. aExp, bExp, expDiff: int16;
  3376. aSig, bSig, q, allZero, alternateASig: bits32;
  3377. sigMean: sbits32;
  3378. Begin
  3379. aSig := extractFloat32Frac( a.float32 );
  3380. aExp := extractFloat32Exp( a.float32 );
  3381. aSign := extractFloat32Sign( a.float32 );
  3382. bSig := extractFloat32Frac( b.float32 );
  3383. bExp := extractFloat32Exp( b.float32 );
  3384. bSign := extractFloat32Sign( b.float32 );
  3385. if ( aExp = $FF ) then
  3386. Begin
  3387. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3388. Begin
  3389. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3390. exit;
  3391. End;
  3392. float_raise( float_flag_invalid );
  3393. float32_rem.float32 := float32_default_nan;
  3394. exit;
  3395. End;
  3396. if ( bExp = $FF ) then
  3397. Begin
  3398. if ( bSig <> 0 ) then
  3399. Begin
  3400. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3401. exit;
  3402. End;
  3403. float32_rem := a;
  3404. exit;
  3405. End;
  3406. if ( bExp = 0 ) then
  3407. Begin
  3408. if ( bSig = 0 ) then
  3409. Begin
  3410. float_raise( float_flag_invalid );
  3411. float32_rem.float32 := float32_default_nan;
  3412. exit;
  3413. End;
  3414. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3415. End;
  3416. if ( aExp = 0 ) then
  3417. Begin
  3418. if ( aSig = 0 ) then
  3419. Begin
  3420. float32_rem := a;
  3421. exit;
  3422. End;
  3423. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3424. End;
  3425. expDiff := aExp - bExp;
  3426. aSig := ( aSig OR $00800000 ) shl 8;
  3427. bSig := ( bSig OR $00800000 ) shl 8;
  3428. if ( expDiff < 0 ) then
  3429. Begin
  3430. if ( expDiff < -1 ) then
  3431. Begin
  3432. float32_rem := a;
  3433. exit;
  3434. End;
  3435. aSig := aSig shr 1;
  3436. End;
  3437. q := bits32( bSig <= aSig );
  3438. if ( q <> 0) then
  3439. aSig := aSig - bSig;
  3440. expDiff := expDiff - 32;
  3441. while ( 0 < expDiff ) do
  3442. Begin
  3443. q := estimateDiv64To32( aSig, 0, bSig );
  3444. if (2 < q) then
  3445. q := q - 2
  3446. else
  3447. q := 0;
  3448. aSig := - ( ( bSig shr 2 ) * q );
  3449. expDiff := expDiff - 30;
  3450. End;
  3451. expDiff := expDiff + 32;
  3452. if ( 0 < expDiff ) then
  3453. Begin
  3454. q := estimateDiv64To32( aSig, 0, bSig );
  3455. if (2 < q) then
  3456. q := q - 2
  3457. else
  3458. q := 0;
  3459. q := q shr (32 - expDiff);
  3460. bSig := bSig shr 2;
  3461. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3462. End
  3463. else
  3464. Begin
  3465. aSig := aSig shr 2;
  3466. bSig := bSig shr 2;
  3467. End;
  3468. Repeat
  3469. alternateASig := aSig;
  3470. Inc(q);
  3471. aSig := aSig - bSig;
  3472. Until not ( 0 <= sbits32 (aSig) );
  3473. sigMean := aSig + alternateASig;
  3474. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3475. Begin
  3476. aSig := alternateASig;
  3477. End;
  3478. zSign := flag( sbits32 (aSig) < 0 );
  3479. if ( zSign<>0 ) then
  3480. aSig := - aSig;
  3481. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3482. End;
  3483. {*
  3484. -------------------------------------------------------------------------------
  3485. Returns the square root of the single-precision floating-point value `a'.
  3486. The operation is performed according to the IEC/IEEE Standard for Binary
  3487. Floating-Point Arithmetic.
  3488. -------------------------------------------------------------------------------
  3489. *}
  3490. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3491. Var
  3492. aSign : flag;
  3493. aExp, zExp : int16;
  3494. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3495. label roundAndPack;
  3496. Begin
  3497. aSig := extractFloat32Frac( a.float32 );
  3498. aExp := extractFloat32Exp( a.float32 );
  3499. aSign := extractFloat32Sign( a.float32 );
  3500. if ( aExp = $FF ) then
  3501. Begin
  3502. if ( aSig <> 0) then
  3503. Begin
  3504. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3505. exit;
  3506. End;
  3507. if ( aSign = 0) then
  3508. Begin
  3509. float32_sqrt := a;
  3510. exit;
  3511. End;
  3512. float_raise( float_flag_invalid );
  3513. float32_sqrt.float32 := float32_default_nan;
  3514. exit;
  3515. End;
  3516. if ( aSign <> 0) then
  3517. Begin
  3518. if ( ( aExp OR aSig ) = 0 ) then
  3519. Begin
  3520. float32_sqrt := a;
  3521. exit;
  3522. End;
  3523. float_raise( float_flag_invalid );
  3524. float32_sqrt.float32 := float32_default_nan;
  3525. exit;
  3526. End;
  3527. if ( aExp = 0 ) then
  3528. Begin
  3529. if ( aSig = 0 ) then
  3530. Begin
  3531. float32_sqrt.float32 := 0;
  3532. exit;
  3533. End;
  3534. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3535. End;
  3536. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3537. aSig := ( aSig OR $00800000 ) shl 8;
  3538. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3539. if ( ( zSig and $7F ) <= 5 ) then
  3540. Begin
  3541. if ( zSig < 2 ) then
  3542. Begin
  3543. zSig := $7FFFFFFF;
  3544. goto roundAndPack;
  3545. End
  3546. else
  3547. Begin
  3548. aSig := aSig shr (aExp and 1);
  3549. mul32To64( zSig, zSig, term0, term1 );
  3550. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3551. while ( sbits32 (rem0) < 0 ) do
  3552. Begin
  3553. Dec(zSig);
  3554. shortShift64Left( 0, zSig, 1, term0, term1 );
  3555. term1 := term1 or 1;
  3556. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3557. End;
  3558. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3559. End;
  3560. End;
  3561. shift32RightJamming( zSig, 1, zSig );
  3562. roundAndPack:
  3563. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3564. End;
  3565. {*
  3566. -------------------------------------------------------------------------------
  3567. Returns 1 if the single-precision floating-point value `a' is equal to
  3568. the corresponding value `b', and 0 otherwise. The comparison is performed
  3569. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3570. -------------------------------------------------------------------------------
  3571. *}
  3572. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3573. Begin
  3574. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3575. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3576. ) then
  3577. Begin
  3578. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3579. Begin
  3580. float_raise( float_flag_invalid );
  3581. End;
  3582. float32_eq := 0;
  3583. exit;
  3584. End;
  3585. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3586. End;
  3587. {*
  3588. -------------------------------------------------------------------------------
  3589. Returns 1 if the single-precision floating-point value `a' is less than
  3590. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3591. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3592. Arithmetic.
  3593. -------------------------------------------------------------------------------
  3594. *}
  3595. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3596. var
  3597. aSign, bSign: flag;
  3598. Begin
  3599. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3600. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3601. ) then
  3602. Begin
  3603. float_raise( float_flag_invalid );
  3604. float32_le := 0;
  3605. exit;
  3606. End;
  3607. aSign := extractFloat32Sign( a.float32 );
  3608. bSign := extractFloat32Sign( b.float32 );
  3609. if ( aSign <> bSign ) then
  3610. Begin
  3611. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3612. exit;
  3613. End;
  3614. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3615. End;
  3616. {*
  3617. -------------------------------------------------------------------------------
  3618. Returns 1 if the single-precision floating-point value `a' is less than
  3619. the corresponding value `b', and 0 otherwise. The comparison is performed
  3620. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3621. -------------------------------------------------------------------------------
  3622. *}
  3623. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3624. var
  3625. aSign, bSign: flag;
  3626. Begin
  3627. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3628. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3629. ) then
  3630. Begin
  3631. float_raise( float_flag_invalid );
  3632. float32_lt :=0;
  3633. exit;
  3634. End;
  3635. aSign := extractFloat32Sign( a.float32 );
  3636. bSign := extractFloat32Sign( b.float32 );
  3637. if ( aSign <> bSign ) then
  3638. Begin
  3639. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3640. exit;
  3641. End;
  3642. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3643. End;
  3644. {*
  3645. -------------------------------------------------------------------------------
  3646. Returns 1 if the single-precision floating-point value `a' is equal to
  3647. the corresponding value `b', and 0 otherwise. The invalid exception is
  3648. raised if either operand is a NaN. Otherwise, the comparison is performed
  3649. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3650. -------------------------------------------------------------------------------
  3651. *}
  3652. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3653. Begin
  3654. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3655. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3656. ) then
  3657. Begin
  3658. float_raise( float_flag_invalid );
  3659. float32_eq_signaling := 0;
  3660. exit;
  3661. End;
  3662. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3663. End;
  3664. {*
  3665. -------------------------------------------------------------------------------
  3666. Returns 1 if the single-precision floating-point value `a' is less than or
  3667. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3668. cause an exception. Otherwise, the comparison is performed according to the
  3669. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3670. -------------------------------------------------------------------------------
  3671. *}
  3672. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3673. Var
  3674. aSign, bSign: flag;
  3675. aExp, bExp: int16;
  3676. Begin
  3677. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3678. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3679. ) then
  3680. Begin
  3681. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3682. Begin
  3683. float_raise( float_flag_invalid );
  3684. End;
  3685. float32_le_quiet := 0;
  3686. exit;
  3687. End;
  3688. aSign := extractFloat32Sign( a );
  3689. bSign := extractFloat32Sign( b );
  3690. if ( aSign <> bSign ) then
  3691. Begin
  3692. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3693. exit;
  3694. End;
  3695. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3696. End;
  3697. {*
  3698. -------------------------------------------------------------------------------
  3699. Returns 1 if the single-precision floating-point value `a' is less than
  3700. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3701. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3702. Standard for Binary Floating-Point Arithmetic.
  3703. -------------------------------------------------------------------------------
  3704. *}
  3705. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3706. Var
  3707. aSign, bSign: flag;
  3708. Begin
  3709. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3710. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3711. ) then
  3712. Begin
  3713. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3714. Begin
  3715. float_raise( float_flag_invalid );
  3716. End;
  3717. float32_lt_quiet := 0;
  3718. exit;
  3719. End;
  3720. aSign := extractFloat32Sign( a );
  3721. bSign := extractFloat32Sign( b );
  3722. if ( aSign <> bSign ) then
  3723. Begin
  3724. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3725. exit;
  3726. End;
  3727. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3728. End;
  3729. {*
  3730. -------------------------------------------------------------------------------
  3731. Returns the result of converting the double-precision floating-point value
  3732. `a' to the 32-bit two's complement integer format. The conversion is
  3733. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3734. Arithmetic---which means in particular that the conversion is rounded
  3735. according to the current rounding mode. If `a' is a NaN, the largest
  3736. positive integer is returned. Otherwise, if the conversion overflows, the
  3737. largest integer with the same sign as `a' is returned.
  3738. -------------------------------------------------------------------------------
  3739. *}
  3740. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3741. var
  3742. aSign: flag;
  3743. aExp, shiftCount: int16;
  3744. aSig0, aSig1, absZ, aSigExtra: bits32;
  3745. z: int32;
  3746. roundingMode: int8;
  3747. label invalid;
  3748. Begin
  3749. aSig1 := extractFloat64Frac1( a );
  3750. aSig0 := extractFloat64Frac0( a );
  3751. aExp := extractFloat64Exp( a );
  3752. aSign := extractFloat64Sign( a );
  3753. shiftCount := aExp - $413;
  3754. if ( 0 <= shiftCount ) then
  3755. Begin
  3756. if ( $41E < aExp ) then
  3757. Begin
  3758. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3759. aSign := 0;
  3760. goto invalid;
  3761. End;
  3762. shortShift64Left(
  3763. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3764. if ( $80000000 < absZ ) then
  3765. goto invalid;
  3766. End
  3767. else
  3768. Begin
  3769. aSig1 := flag( aSig1 <> 0 );
  3770. if ( aExp < $3FE ) then
  3771. Begin
  3772. aSigExtra := aExp OR aSig0 OR aSig1;
  3773. absZ := 0;
  3774. End
  3775. else
  3776. Begin
  3777. aSig0 := aSig0 OR $00100000;
  3778. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3779. absZ := aSig0 shr ( - shiftCount );
  3780. End;
  3781. End;
  3782. roundingMode := softfloat_rounding_mode;
  3783. if ( roundingMode = float_round_nearest_even ) then
  3784. Begin
  3785. if ( sbits32(aSigExtra) < 0 ) then
  3786. Begin
  3787. Inc(absZ);
  3788. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3789. absZ := absZ and not 1;
  3790. End;
  3791. if aSign <> 0 then
  3792. z := - absZ
  3793. else
  3794. z := absZ;
  3795. End
  3796. else
  3797. Begin
  3798. aSigExtra := bits32( aSigExtra <> 0 );
  3799. if ( aSign <> 0) then
  3800. Begin
  3801. z := - ( absZ
  3802. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3803. End
  3804. else
  3805. Begin
  3806. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3807. End
  3808. End;
  3809. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3810. Begin
  3811. invalid:
  3812. float_raise( float_flag_invalid );
  3813. if (aSign <> 0 ) then
  3814. float64_to_int32 := sbits32 ($80000000)
  3815. else
  3816. float64_to_int32 := $7FFFFFFF;
  3817. exit;
  3818. End;
  3819. if ( aSigExtra <> 0) then
  3820. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3821. float64_to_int32 := z;
  3822. End;
  3823. {*
  3824. -------------------------------------------------------------------------------
  3825. Returns the result of converting the double-precision floating-point value
  3826. `a' to the 32-bit two's complement integer format. The conversion is
  3827. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3828. Arithmetic, except that the conversion is always rounded toward zero.
  3829. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3830. the conversion overflows, the largest integer with the same sign as `a' is
  3831. returned.
  3832. -------------------------------------------------------------------------------
  3833. *}
  3834. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3835. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3836. Var
  3837. aSign: flag;
  3838. aExp, shiftCount: int16;
  3839. aSig0, aSig1, absZ, aSigExtra: bits32;
  3840. z: int32;
  3841. label invalid;
  3842. Begin
  3843. aSig1 := extractFloat64Frac1( a );
  3844. aSig0 := extractFloat64Frac0( a );
  3845. aExp := extractFloat64Exp( a );
  3846. aSign := extractFloat64Sign( a );
  3847. shiftCount := aExp - $413;
  3848. if ( 0 <= shiftCount ) then
  3849. Begin
  3850. if ( $41E < aExp ) then
  3851. Begin
  3852. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3853. aSign := 0;
  3854. goto invalid;
  3855. End;
  3856. shortShift64Left(
  3857. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3858. End
  3859. else
  3860. Begin
  3861. if ( aExp < $3FF ) then
  3862. Begin
  3863. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3864. Begin
  3865. softfloat_exception_flags :=
  3866. softfloat_exception_flags or float_flag_inexact;
  3867. End;
  3868. float64_to_int32_round_to_zero := 0;
  3869. exit;
  3870. End;
  3871. aSig0 := aSig0 or $00100000;
  3872. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3873. absZ := aSig0 shr ( - shiftCount );
  3874. End;
  3875. if aSign <> 0 then
  3876. z := - absZ
  3877. else
  3878. z := absZ;
  3879. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3880. Begin
  3881. invalid:
  3882. float_raise( float_flag_invalid );
  3883. if (aSign <> 0) then
  3884. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3885. else
  3886. float64_to_int32_round_to_zero := $7FFFFFFF;
  3887. exit;
  3888. End;
  3889. if ( aSigExtra <> 0) then
  3890. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3891. float64_to_int32_round_to_zero := z;
  3892. End;
  3893. {*
  3894. -------------------------------------------------------------------------------
  3895. Returns the result of converting the double-precision floating-point value
  3896. `a' to the single-precision floating-point format. The conversion is
  3897. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3898. Arithmetic.
  3899. -------------------------------------------------------------------------------
  3900. *}
  3901. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3902. Var
  3903. aSign: flag;
  3904. aExp: int16;
  3905. aSig0, aSig1, zSig: bits32;
  3906. allZero: bits32;
  3907. tmp : CommonNanT;
  3908. Begin
  3909. aSig1 := extractFloat64Frac1( a );
  3910. aSig0 := extractFloat64Frac0( a );
  3911. aExp := extractFloat64Exp( a );
  3912. aSign := extractFloat64Sign( a );
  3913. if ( aExp = $7FF ) then
  3914. Begin
  3915. if ( aSig0 OR aSig1 ) <> 0 then
  3916. Begin
  3917. float64ToCommonNaN( a, tmp );
  3918. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3919. exit;
  3920. End;
  3921. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3922. exit;
  3923. End;
  3924. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3925. if ( aExp <> 0) then
  3926. zSig := zSig OR $40000000;
  3927. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3928. End;
  3929. {*
  3930. -------------------------------------------------------------------------------
  3931. Rounds the double-precision floating-point value `a' to an integer,
  3932. and returns the result as a double-precision floating-point value. The
  3933. operation is performed according to the IEC/IEEE Standard for Binary
  3934. Floating-Point Arithmetic.
  3935. -------------------------------------------------------------------------------
  3936. *}
  3937. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3938. Var
  3939. aSign: flag;
  3940. aExp: int16;
  3941. lastBitMask, roundBitsMask: bits32;
  3942. roundingMode: int8;
  3943. z: float64;
  3944. Begin
  3945. aExp := extractFloat64Exp( a );
  3946. if ( $413 <= aExp ) then
  3947. Begin
  3948. if ( $433 <= aExp ) then
  3949. Begin
  3950. if ( ( aExp = $7FF )
  3951. AND
  3952. (
  3953. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3954. ) <>0)
  3955. ) then
  3956. Begin
  3957. propagateFloat64NaN( a, a, result );
  3958. exit;
  3959. End;
  3960. result := a;
  3961. exit;
  3962. End;
  3963. lastBitMask := 1;
  3964. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3965. roundBitsMask := lastBitMask - 1;
  3966. z := a;
  3967. roundingMode := softfloat_rounding_mode;
  3968. if ( roundingMode = float_round_nearest_even ) then
  3969. Begin
  3970. if ( lastBitMask <> 0) then
  3971. Begin
  3972. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3973. if ( ( z.low and roundBitsMask ) = 0 ) then
  3974. z.low := z.low and not lastBitMask;
  3975. End
  3976. else
  3977. Begin
  3978. if ( sbits32 (z.low) < 0 ) then
  3979. Begin
  3980. Inc(z.high);
  3981. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3982. z.high := z.high and not 1;
  3983. End;
  3984. End;
  3985. End
  3986. else if ( roundingMode <> float_round_to_zero ) then
  3987. Begin
  3988. if ( extractFloat64Sign( z )
  3989. xor flag( roundingMode = float_round_up ) )<> 0 then
  3990. Begin
  3991. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3992. End;
  3993. End;
  3994. z.low := z.low and not roundBitsMask;
  3995. End
  3996. else
  3997. Begin
  3998. if ( aExp <= $3FE ) then
  3999. Begin
  4000. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4001. Begin
  4002. result := a;
  4003. exit;
  4004. End;
  4005. softfloat_exception_flags := softfloat_exception_flags or
  4006. float_flag_inexact;
  4007. aSign := extractFloat64Sign( a );
  4008. case ( softfloat_rounding_mode ) of
  4009. float_round_nearest_even:
  4010. Begin
  4011. if ( ( aExp = $3FE )
  4012. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4013. ) then
  4014. Begin
  4015. packFloat64( aSign, $3FF, 0, 0, result );
  4016. exit;
  4017. End;
  4018. End;
  4019. float_round_down:
  4020. Begin
  4021. if aSign<>0 then
  4022. packFloat64( 1, $3FF, 0, 0, result )
  4023. else
  4024. packFloat64( 0, 0, 0, 0, result );
  4025. exit;
  4026. End;
  4027. float_round_up:
  4028. Begin
  4029. if aSign <> 0 then
  4030. packFloat64( 1, 0, 0, 0, result )
  4031. else
  4032. packFloat64( 0, $3FF, 0, 0, result );
  4033. exit;
  4034. End;
  4035. end;
  4036. packFloat64( aSign, 0, 0, 0, result );
  4037. exit;
  4038. End;
  4039. lastBitMask := 1;
  4040. lastBitMask := lastBitMask shl ($413 - aExp);
  4041. roundBitsMask := lastBitMask - 1;
  4042. z.low := 0;
  4043. z.high := a.high;
  4044. roundingMode := softfloat_rounding_mode;
  4045. if ( roundingMode = float_round_nearest_even ) then
  4046. Begin
  4047. z.high := z.high + lastBitMask shr 1;
  4048. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4049. Begin
  4050. z.high := z.high and not lastBitMask;
  4051. End;
  4052. End
  4053. else if ( roundingMode <> float_round_to_zero ) then
  4054. Begin
  4055. if ( extractFloat64Sign( z )
  4056. xor flag( roundingMode = float_round_up ) )<> 0 then
  4057. Begin
  4058. z.high := z.high or bits32( a.low <> 0 );
  4059. z.high := z.high + roundBitsMask;
  4060. End;
  4061. End;
  4062. z.high := z.high and not roundBitsMask;
  4063. End;
  4064. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4065. Begin
  4066. softfloat_exception_flags :=
  4067. softfloat_exception_flags or float_flag_inexact;
  4068. End;
  4069. result := z;
  4070. End;
  4071. {*
  4072. -------------------------------------------------------------------------------
  4073. Returns the result of adding the absolute values of the double-precision
  4074. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4075. before being returned. `zSign' is ignored if the result is a NaN.
  4076. The addition is performed according to the IEC/IEEE Standard for Binary
  4077. Floating-Point Arithmetic.
  4078. -------------------------------------------------------------------------------
  4079. *}
  4080. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4081. Var
  4082. aExp, bExp, zExp: int16;
  4083. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4084. expDiff: int16;
  4085. label shiftRight1;
  4086. label roundAndPack;
  4087. Begin
  4088. aSig1 := extractFloat64Frac1( a );
  4089. aSig0 := extractFloat64Frac0( a );
  4090. aExp := extractFloat64Exp( a );
  4091. bSig1 := extractFloat64Frac1( b );
  4092. bSig0 := extractFloat64Frac0( b );
  4093. bExp := extractFloat64Exp( b );
  4094. expDiff := aExp - bExp;
  4095. if ( 0 < expDiff ) then
  4096. Begin
  4097. if ( aExp = $7FF ) then
  4098. Begin
  4099. if ( aSig0 OR aSig1 ) <> 0 then
  4100. Begin
  4101. propagateFloat64NaN( a, b, out );
  4102. exit;
  4103. end;
  4104. out := a;
  4105. exit;
  4106. End;
  4107. if ( bExp = 0 ) then
  4108. Begin
  4109. Dec(expDiff);
  4110. End
  4111. else
  4112. Begin
  4113. bSig0 := bSig0 or $00100000;
  4114. End;
  4115. shift64ExtraRightJamming(
  4116. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4117. zExp := aExp;
  4118. End
  4119. else if ( expDiff < 0 ) then
  4120. Begin
  4121. if ( bExp = $7FF ) then
  4122. Begin
  4123. if ( bSig0 OR bSig1 ) <> 0 then
  4124. Begin
  4125. propagateFloat64NaN( a, b, out );
  4126. exit;
  4127. End;
  4128. packFloat64( zSign, $7FF, 0, 0, out );
  4129. End;
  4130. if ( aExp = 0 ) then
  4131. Begin
  4132. Inc(expDiff);
  4133. End
  4134. else
  4135. Begin
  4136. aSig0 := aSig0 or $00100000;
  4137. End;
  4138. shift64ExtraRightJamming(
  4139. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4140. zExp := bExp;
  4141. End
  4142. else
  4143. Begin
  4144. if ( aExp = $7FF ) then
  4145. Begin
  4146. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4147. Begin
  4148. propagateFloat64NaN( a, b, out );
  4149. exit;
  4150. End;
  4151. out := a;
  4152. exit;
  4153. End;
  4154. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4155. if ( aExp = 0 ) then
  4156. Begin
  4157. packFloat64( zSign, 0, zSig0, zSig1, out );
  4158. exit;
  4159. End;
  4160. zSig2 := 0;
  4161. zSig0 := zSig0 or $00200000;
  4162. zExp := aExp;
  4163. goto shiftRight1;
  4164. End;
  4165. aSig0 := aSig0 or $00100000;
  4166. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4167. Dec(zExp);
  4168. if ( zSig0 < $00200000 ) then
  4169. goto roundAndPack;
  4170. Inc(zExp);
  4171. shiftRight1:
  4172. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4173. roundAndPack:
  4174. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4175. End;
  4176. {*
  4177. -------------------------------------------------------------------------------
  4178. Returns the result of subtracting the absolute values of the double-
  4179. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4180. difference is negated before being returned. `zSign' is ignored if the
  4181. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4182. Standard for Binary Floating-Point Arithmetic.
  4183. -------------------------------------------------------------------------------
  4184. *}
  4185. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4186. Var
  4187. aExp, bExp, zExp: int16;
  4188. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4189. expDiff: int16;
  4190. z: float64;
  4191. label aExpBigger;
  4192. label bExpBigger;
  4193. label aBigger;
  4194. label bBigger;
  4195. label normalizeRoundAndPack;
  4196. Begin
  4197. aSig1 := extractFloat64Frac1( a );
  4198. aSig0 := extractFloat64Frac0( a );
  4199. aExp := extractFloat64Exp( a );
  4200. bSig1 := extractFloat64Frac1( b );
  4201. bSig0 := extractFloat64Frac0( b );
  4202. bExp := extractFloat64Exp( b );
  4203. expDiff := aExp - bExp;
  4204. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4205. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4206. if ( 0 < expDiff ) then goto aExpBigger;
  4207. if ( expDiff < 0 ) then goto bExpBigger;
  4208. if ( aExp = $7FF ) then
  4209. Begin
  4210. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4211. Begin
  4212. propagateFloat64NaN( a, b, out );
  4213. exit;
  4214. End;
  4215. float_raise( float_flag_invalid );
  4216. z.low := float64_default_nan_low;
  4217. z.high := float64_default_nan_high;
  4218. out := z;
  4219. exit;
  4220. End;
  4221. if ( aExp = 0 ) then
  4222. Begin
  4223. aExp := 1;
  4224. bExp := 1;
  4225. End;
  4226. if ( bSig0 < aSig0 ) then goto aBigger;
  4227. if ( aSig0 < bSig0 ) then goto bBigger;
  4228. if ( bSig1 < aSig1 ) then goto aBigger;
  4229. if ( aSig1 < bSig1 ) then goto bBigger;
  4230. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4231. exit;
  4232. bExpBigger:
  4233. if ( bExp = $7FF ) then
  4234. Begin
  4235. if ( bSig0 OR bSig1 ) <> 0 then
  4236. Begin
  4237. propagateFloat64NaN( a, b, out );
  4238. exit;
  4239. End;
  4240. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4241. exit;
  4242. End;
  4243. if ( aExp = 0 ) then
  4244. Begin
  4245. Inc(expDiff);
  4246. End
  4247. else
  4248. Begin
  4249. aSig0 := aSig0 or $40000000;
  4250. End;
  4251. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4252. bSig0 := bSig0 or $40000000;
  4253. bBigger:
  4254. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4255. zExp := bExp;
  4256. zSign := zSign xor 1;
  4257. goto normalizeRoundAndPack;
  4258. aExpBigger:
  4259. if ( aExp = $7FF ) then
  4260. Begin
  4261. if ( aSig0 OR aSig1 ) <> 0 then
  4262. Begin
  4263. propagateFloat64NaN( a, b, out );
  4264. exit;
  4265. End;
  4266. out := a;
  4267. exit;
  4268. End;
  4269. if ( bExp = 0 ) then
  4270. Begin
  4271. Dec(expDiff);
  4272. End
  4273. else
  4274. Begin
  4275. bSig0 := bSig0 or $40000000;
  4276. End;
  4277. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4278. aSig0 := aSig0 or $40000000;
  4279. aBigger:
  4280. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4281. zExp := aExp;
  4282. normalizeRoundAndPack:
  4283. Dec(zExp);
  4284. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4285. End;
  4286. {*
  4287. -------------------------------------------------------------------------------
  4288. Returns the result of adding the double-precision floating-point values `a'
  4289. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4290. Binary Floating-Point Arithmetic.
  4291. -------------------------------------------------------------------------------
  4292. *}
  4293. Function float64_add( a: float64; b : float64) : Float64;
  4294. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4295. Var
  4296. aSign, bSign: flag;
  4297. Begin
  4298. aSign := extractFloat64Sign( a );
  4299. bSign := extractFloat64Sign( b );
  4300. if ( aSign = bSign ) then
  4301. Begin
  4302. addFloat64Sigs( a, b, aSign, result );
  4303. End
  4304. else
  4305. Begin
  4306. subFloat64Sigs( a, b, aSign, result );
  4307. End;
  4308. End;
  4309. {*
  4310. -------------------------------------------------------------------------------
  4311. Returns the result of subtracting the double-precision floating-point values
  4312. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4313. for Binary Floating-Point Arithmetic.
  4314. -------------------------------------------------------------------------------
  4315. *}
  4316. Function float64_sub(a: float64; b : float64) : Float64;
  4317. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4318. Var
  4319. aSign, bSign: flag;
  4320. Begin
  4321. aSign := extractFloat64Sign( a );
  4322. bSign := extractFloat64Sign( b );
  4323. if ( aSign = bSign ) then
  4324. Begin
  4325. subFloat64Sigs( a, b, aSign, result );
  4326. End
  4327. else
  4328. Begin
  4329. addFloat64Sigs( a, b, aSign, result );
  4330. End;
  4331. End;
  4332. {*
  4333. -------------------------------------------------------------------------------
  4334. Returns the result of multiplying the double-precision floating-point values
  4335. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4336. for Binary Floating-Point Arithmetic.
  4337. -------------------------------------------------------------------------------
  4338. *}
  4339. Function float64_mul( a: float64; b:float64) : Float64;
  4340. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4341. Var
  4342. aSign, bSign, zSign: flag;
  4343. aExp, bExp, zExp: int16;
  4344. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4345. z: float64;
  4346. label invalid;
  4347. Begin
  4348. aSig1 := extractFloat64Frac1( a );
  4349. aSig0 := extractFloat64Frac0( a );
  4350. aExp := extractFloat64Exp( a );
  4351. aSign := extractFloat64Sign( a );
  4352. bSig1 := extractFloat64Frac1( b );
  4353. bSig0 := extractFloat64Frac0( b );
  4354. bExp := extractFloat64Exp( b );
  4355. bSign := extractFloat64Sign( b );
  4356. zSign := aSign xor bSign;
  4357. if ( aExp = $7FF ) then
  4358. Begin
  4359. if ( (( aSig0 OR aSig1 ) <>0)
  4360. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4361. Begin
  4362. propagateFloat64NaN( a, b, result );
  4363. exit;
  4364. End;
  4365. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4366. packFloat64( zSign, $7FF, 0, 0, result );
  4367. exit;
  4368. End;
  4369. if ( bExp = $7FF ) then
  4370. Begin
  4371. if ( bSig0 OR bSig1 )<> 0 then
  4372. Begin
  4373. propagateFloat64NaN( a, b, result );
  4374. exit;
  4375. End;
  4376. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4377. Begin
  4378. invalid:
  4379. float_raise( float_flag_invalid );
  4380. z.low := float64_default_nan_low;
  4381. z.high := float64_default_nan_high;
  4382. result := z;
  4383. exit;
  4384. End;
  4385. packFloat64( zSign, $7FF, 0, 0, result );
  4386. exit;
  4387. End;
  4388. if ( aExp = 0 ) then
  4389. Begin
  4390. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4391. Begin
  4392. packFloat64( zSign, 0, 0, 0, result );
  4393. exit;
  4394. End;
  4395. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4396. End;
  4397. if ( bExp = 0 ) then
  4398. Begin
  4399. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4400. Begin
  4401. packFloat64( zSign, 0, 0, 0, result );
  4402. exit;
  4403. End;
  4404. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4405. End;
  4406. zExp := aExp + bExp - $400;
  4407. aSig0 := aSig0 or $00100000;
  4408. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4409. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4410. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4411. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4412. if ( $00200000 <= zSig0 ) then
  4413. Begin
  4414. shift64ExtraRightJamming(
  4415. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4416. Inc(zExp);
  4417. End;
  4418. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4419. End;
  4420. {*
  4421. -------------------------------------------------------------------------------
  4422. Returns the result of dividing the double-precision floating-point value `a'
  4423. by the corresponding value `b'. The operation is performed according to the
  4424. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4425. -------------------------------------------------------------------------------
  4426. *}
  4427. Function float64_div(a: float64; b : float64) : Float64;
  4428. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4429. Var
  4430. aSign, bSign, zSign: flag;
  4431. aExp, bExp, zExp: int16;
  4432. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4433. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4434. z: float64;
  4435. label invalid;
  4436. Begin
  4437. aSig1 := extractFloat64Frac1( a );
  4438. aSig0 := extractFloat64Frac0( a );
  4439. aExp := extractFloat64Exp( a );
  4440. aSign := extractFloat64Sign( a );
  4441. bSig1 := extractFloat64Frac1( b );
  4442. bSig0 := extractFloat64Frac0( b );
  4443. bExp := extractFloat64Exp( b );
  4444. bSign := extractFloat64Sign( b );
  4445. zSign := aSign xor bSign;
  4446. if ( aExp = $7FF ) then
  4447. Begin
  4448. if ( aSig0 OR aSig1 )<> 0 then
  4449. Begin
  4450. propagateFloat64NaN( a, b, result );
  4451. exit;
  4452. end;
  4453. if ( bExp = $7FF ) then
  4454. Begin
  4455. if ( bSig0 OR bSig1 )<>0 then
  4456. Begin
  4457. propagateFloat64NaN( a, b, result );
  4458. exit;
  4459. End;
  4460. goto invalid;
  4461. End;
  4462. packFloat64( zSign, $7FF, 0, 0, result );
  4463. exit;
  4464. End;
  4465. if ( bExp = $7FF ) then
  4466. Begin
  4467. if ( bSig0 OR bSig1 )<> 0 then
  4468. Begin
  4469. propagateFloat64NaN( a, b, result );
  4470. exit;
  4471. End;
  4472. packFloat64( zSign, 0, 0, 0, result );
  4473. exit;
  4474. End;
  4475. if ( bExp = 0 ) then
  4476. Begin
  4477. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4478. Begin
  4479. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4480. Begin
  4481. invalid:
  4482. float_raise( float_flag_invalid );
  4483. z.low := float64_default_nan_low;
  4484. z.high := float64_default_nan_high;
  4485. result := z;
  4486. exit;
  4487. End;
  4488. float_raise( float_flag_divbyzero );
  4489. packFloat64( zSign, $7FF, 0, 0, result );
  4490. exit;
  4491. End;
  4492. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4493. End;
  4494. if ( aExp = 0 ) then
  4495. Begin
  4496. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4497. Begin
  4498. packFloat64( zSign, 0, 0, 0, result );
  4499. exit;
  4500. End;
  4501. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4502. End;
  4503. zExp := aExp - bExp + $3FD;
  4504. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4505. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4506. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4507. Begin
  4508. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4509. Inc(zExp);
  4510. End;
  4511. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4512. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4513. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4514. while ( sbits32 (rem0) < 0 ) do
  4515. Begin
  4516. Dec(zSig0);
  4517. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4518. End;
  4519. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4520. if ( ( zSig1 and $3FF ) <= 4 ) then
  4521. Begin
  4522. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4523. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4524. while ( sbits32 (rem1) < 0 ) do
  4525. Begin
  4526. Dec(zSig1);
  4527. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4528. End;
  4529. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4530. End;
  4531. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4532. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4533. End;
  4534. {*
  4535. -------------------------------------------------------------------------------
  4536. Returns the remainder of the double-precision floating-point value `a'
  4537. with respect to the corresponding value `b'. The operation is performed
  4538. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4539. -------------------------------------------------------------------------------
  4540. *}
  4541. Function float64_rem(a: float64; b : float64) : float64;
  4542. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4543. Var
  4544. aSign, bSign, zSign: flag;
  4545. aExp, bExp, expDiff: int16;
  4546. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4547. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4548. sigMean0: sbits32;
  4549. z: float64;
  4550. label invalid;
  4551. Begin
  4552. aSig1 := extractFloat64Frac1( a );
  4553. aSig0 := extractFloat64Frac0( a );
  4554. aExp := extractFloat64Exp( a );
  4555. aSign := extractFloat64Sign( a );
  4556. bSig1 := extractFloat64Frac1( b );
  4557. bSig0 := extractFloat64Frac0( b );
  4558. bExp := extractFloat64Exp( b );
  4559. bSign := extractFloat64Sign( b );
  4560. if ( aExp = $7FF ) then
  4561. Begin
  4562. if ((( aSig0 OR aSig1 )<>0)
  4563. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4564. Begin
  4565. propagateFloat64NaN( a, b, result );
  4566. exit;
  4567. End;
  4568. goto invalid;
  4569. End;
  4570. if ( bExp = $7FF ) then
  4571. Begin
  4572. if ( bSig0 OR bSig1 ) <> 0 then
  4573. Begin
  4574. propagateFloat64NaN( a, b, result );
  4575. exit;
  4576. End;
  4577. result := a;
  4578. exit;
  4579. End;
  4580. if ( bExp = 0 ) then
  4581. Begin
  4582. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4583. Begin
  4584. invalid:
  4585. float_raise( float_flag_invalid );
  4586. z.low := float64_default_nan_low;
  4587. z.high := float64_default_nan_high;
  4588. result := z;
  4589. exit;
  4590. End;
  4591. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4592. End;
  4593. if ( aExp = 0 ) then
  4594. Begin
  4595. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4596. Begin
  4597. result := a;
  4598. exit;
  4599. End;
  4600. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4601. End;
  4602. expDiff := aExp - bExp;
  4603. if ( expDiff < -1 ) then
  4604. Begin
  4605. result := a;
  4606. exit;
  4607. End;
  4608. shortShift64Left(
  4609. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4610. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4611. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4612. if ( q )<>0 then
  4613. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4614. expDiff := expDiff - 32;
  4615. while ( 0 < expDiff ) do
  4616. Begin
  4617. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4618. if 4 < q then
  4619. q:= q - 4
  4620. else
  4621. q := 0;
  4622. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4623. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4624. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4625. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4626. expDiff := expDiff - 29;
  4627. End;
  4628. if ( -32 < expDiff ) then
  4629. Begin
  4630. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4631. if 4 < q then
  4632. q := q - 4
  4633. else
  4634. q := 0;
  4635. q := q shr (- expDiff);
  4636. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4637. expDiff := expDiff + 24;
  4638. if ( expDiff < 0 ) then
  4639. Begin
  4640. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4641. End
  4642. else
  4643. Begin
  4644. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4645. End;
  4646. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4647. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4648. End
  4649. else
  4650. Begin
  4651. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4652. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4653. End;
  4654. Repeat
  4655. alternateASig0 := aSig0;
  4656. alternateASig1 := aSig1;
  4657. Inc(q);
  4658. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4659. Until not ( 0 <= sbits32 (aSig0) );
  4660. add64(
  4661. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4662. if ( ( sigMean0 < 0 )
  4663. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4664. Begin
  4665. aSig0 := alternateASig0;
  4666. aSig1 := alternateASig1;
  4667. End;
  4668. zSign := flag( sbits32 (aSig0) < 0 );
  4669. if ( zSign <> 0 ) then
  4670. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4671. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4672. End;
  4673. {*
  4674. -------------------------------------------------------------------------------
  4675. Returns the square root of the double-precision floating-point value `a'.
  4676. The operation is performed according to the IEC/IEEE Standard for Binary
  4677. Floating-Point Arithmetic.
  4678. -------------------------------------------------------------------------------
  4679. *}
  4680. Procedure float64_sqrt( a: float64; var out: float64 );
  4681. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4682. Var
  4683. aSign: flag;
  4684. aExp, zExp: int16;
  4685. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4686. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4687. z: float64;
  4688. label invalid;
  4689. Begin
  4690. aSig1 := extractFloat64Frac1( a );
  4691. aSig0 := extractFloat64Frac0( a );
  4692. aExp := extractFloat64Exp( a );
  4693. aSign := extractFloat64Sign( a );
  4694. if ( aExp = $7FF ) then
  4695. Begin
  4696. if ( aSig0 OR aSig1 ) <> 0 then
  4697. Begin
  4698. propagateFloat64NaN( a, a, out );
  4699. exit;
  4700. End;
  4701. if ( aSign = 0) then
  4702. Begin
  4703. out := a;
  4704. exit;
  4705. End;
  4706. goto invalid;
  4707. End;
  4708. if ( aSign <> 0 ) then
  4709. Begin
  4710. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4711. Begin
  4712. out := a;
  4713. exit;
  4714. End;
  4715. invalid:
  4716. float_raise( float_flag_invalid );
  4717. z.low := float64_default_nan_low;
  4718. z.high := float64_default_nan_high;
  4719. out := z;
  4720. exit;
  4721. End;
  4722. if ( aExp = 0 ) then
  4723. Begin
  4724. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4725. Begin
  4726. packFloat64( 0, 0, 0, 0, out );
  4727. exit;
  4728. End;
  4729. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4730. End;
  4731. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4732. aSig0 := aSig0 or $00100000;
  4733. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4734. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4735. if ( zSig0 = 0 ) then
  4736. zSig0 := $7FFFFFFF;
  4737. doubleZSig0 := zSig0 + zSig0;
  4738. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4739. mul32To64( zSig0, zSig0, term0, term1 );
  4740. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4741. while ( sbits32 (rem0) < 0 ) do
  4742. Begin
  4743. Dec(zSig0);
  4744. doubleZSig0 := doubleZSig0 - 2;
  4745. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4746. End;
  4747. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4748. if ( ( zSig1 and $1FF ) <= 5 ) then
  4749. Begin
  4750. if ( zSig1 = 0 ) then
  4751. zSig1 := 1;
  4752. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4753. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4754. mul32To64( zSig1, zSig1, term2, term3 );
  4755. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4756. while ( sbits32 (rem1) < 0 ) do
  4757. Begin
  4758. Dec(zSig1);
  4759. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4760. term3 := term3 or 1;
  4761. term2 := term2 or doubleZSig0;
  4762. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4763. End;
  4764. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4765. End;
  4766. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4767. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4768. End;
  4769. {*
  4770. -------------------------------------------------------------------------------
  4771. Returns 1 if the double-precision floating-point value `a' is equal to
  4772. the corresponding value `b', and 0 otherwise. The comparison is performed
  4773. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4774. -------------------------------------------------------------------------------
  4775. *}
  4776. Function float64_eq(a: float64; b: float64): flag;
  4777. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4778. Begin
  4779. if
  4780. (
  4781. ( extractFloat64Exp( a ) = $7FF )
  4782. AND
  4783. (
  4784. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4785. )
  4786. )
  4787. OR (
  4788. ( extractFloat64Exp( b ) = $7FF )
  4789. AND (
  4790. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4791. )
  4792. )
  4793. ) then
  4794. Begin
  4795. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4796. float_raise( float_flag_invalid );
  4797. float64_eq := 0;
  4798. exit;
  4799. End;
  4800. float64_eq := flag(
  4801. ( a.low = b.low )
  4802. AND ( ( a.high = b.high )
  4803. OR ( ( a.low = 0 )
  4804. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4805. ));
  4806. End;
  4807. {*
  4808. -------------------------------------------------------------------------------
  4809. Returns 1 if the double-precision floating-point value `a' is less than
  4810. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4811. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4812. Arithmetic.
  4813. -------------------------------------------------------------------------------
  4814. *}
  4815. Function float64_le(a: float64;b: float64): flag;
  4816. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4817. Var
  4818. aSign, bSign: flag;
  4819. Begin
  4820. if
  4821. (
  4822. ( extractFloat64Exp( a ) = $7FF )
  4823. AND
  4824. (
  4825. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4826. )
  4827. )
  4828. OR (
  4829. ( extractFloat64Exp( b ) = $7FF )
  4830. AND (
  4831. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4832. )
  4833. )
  4834. ) then
  4835. Begin
  4836. float_raise( float_flag_invalid );
  4837. float64_le := 0;
  4838. exit;
  4839. End;
  4840. aSign := extractFloat64Sign( a );
  4841. bSign := extractFloat64Sign( b );
  4842. if ( aSign <> bSign ) then
  4843. Begin
  4844. float64_le := flag(
  4845. (aSign <> 0)
  4846. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4847. = 0 ));
  4848. exit;
  4849. End;
  4850. if aSign <> 0 then
  4851. float64_le := le64( b.high, b.low, a.high, a.low )
  4852. else
  4853. float64_le := le64( a.high, a.low, b.high, b.low );
  4854. End;
  4855. {*
  4856. -------------------------------------------------------------------------------
  4857. Returns 1 if the double-precision floating-point value `a' is less than
  4858. the corresponding value `b', and 0 otherwise. The comparison is performed
  4859. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4860. -------------------------------------------------------------------------------
  4861. *}
  4862. Function float64_lt(a: float64;b: float64): flag;
  4863. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4864. Var
  4865. aSign, bSign: flag;
  4866. Begin
  4867. if
  4868. (
  4869. ( extractFloat64Exp( a ) = $7FF )
  4870. AND
  4871. (
  4872. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4873. )
  4874. )
  4875. OR (
  4876. ( extractFloat64Exp( b ) = $7FF )
  4877. AND (
  4878. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4879. )
  4880. )
  4881. ) then
  4882. Begin
  4883. float_raise( float_flag_invalid );
  4884. float64_lt := 0;
  4885. exit;
  4886. End;
  4887. aSign := extractFloat64Sign( a );
  4888. bSign := extractFloat64Sign( b );
  4889. if ( aSign <> bSign ) then
  4890. Begin
  4891. float64_lt := flag(
  4892. (aSign <> 0)
  4893. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4894. <> 0 ));
  4895. exit;
  4896. End;
  4897. if aSign <> 0 then
  4898. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4899. else
  4900. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4901. End;
  4902. {*
  4903. -------------------------------------------------------------------------------
  4904. Returns 1 if the double-precision floating-point value `a' is equal to
  4905. the corresponding value `b', and 0 otherwise. The invalid exception is
  4906. raised if either operand is a NaN. Otherwise, the comparison is performed
  4907. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4908. -------------------------------------------------------------------------------
  4909. *}
  4910. Function float64_eq_signaling( a: float64; b: float64): flag;
  4911. Begin
  4912. if
  4913. (
  4914. ( extractFloat64Exp( a ) = $7FF )
  4915. AND
  4916. (
  4917. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4918. )
  4919. )
  4920. OR (
  4921. ( extractFloat64Exp( b ) = $7FF )
  4922. AND (
  4923. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4924. )
  4925. )
  4926. ) then
  4927. Begin
  4928. float_raise( float_flag_invalid );
  4929. float64_eq_signaling := 0;
  4930. exit;
  4931. End;
  4932. float64_eq_signaling := flag(
  4933. ( a.low = b.low )
  4934. AND ( ( a.high = b.high )
  4935. OR ( ( a.low = 0 )
  4936. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4937. ));
  4938. End;
  4939. {*
  4940. -------------------------------------------------------------------------------
  4941. Returns 1 if the double-precision floating-point value `a' is less than or
  4942. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4943. cause an exception. Otherwise, the comparison is performed according to the
  4944. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4945. -------------------------------------------------------------------------------
  4946. *}
  4947. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4948. Var
  4949. aSign, bSign : flag;
  4950. Begin
  4951. if
  4952. (
  4953. ( extractFloat64Exp( a ) = $7FF )
  4954. AND
  4955. (
  4956. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4957. )
  4958. )
  4959. OR (
  4960. ( extractFloat64Exp( b ) = $7FF )
  4961. AND (
  4962. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4963. )
  4964. )
  4965. ) then
  4966. Begin
  4967. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4968. float_raise( float_flag_invalid );
  4969. float64_le_quiet := 0;
  4970. exit;
  4971. End;
  4972. aSign := extractFloat64Sign( a );
  4973. bSign := extractFloat64Sign( b );
  4974. if ( aSign <> bSign ) then
  4975. Begin
  4976. float64_le_quiet := flag
  4977. ((aSign <> 0)
  4978. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4979. = 0 ));
  4980. exit;
  4981. End;
  4982. if aSign <> 0 then
  4983. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4984. else
  4985. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4986. End;
  4987. {*
  4988. -------------------------------------------------------------------------------
  4989. Returns 1 if the double-precision floating-point value `a' is less than
  4990. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4991. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4992. Standard for Binary Floating-Point Arithmetic.
  4993. -------------------------------------------------------------------------------
  4994. *}
  4995. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  4996. Var
  4997. aSign, bSign: flag;
  4998. Begin
  4999. if
  5000. (
  5001. ( extractFloat64Exp( a ) = $7FF )
  5002. AND
  5003. (
  5004. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5005. )
  5006. )
  5007. OR (
  5008. ( extractFloat64Exp( b ) = $7FF )
  5009. AND (
  5010. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5011. )
  5012. )
  5013. ) then
  5014. Begin
  5015. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5016. float_raise( float_flag_invalid );
  5017. float64_lt_quiet := 0;
  5018. exit;
  5019. End;
  5020. aSign := extractFloat64Sign( a );
  5021. bSign := extractFloat64Sign( b );
  5022. if ( aSign <> bSign ) then
  5023. Begin
  5024. float64_lt_quiet := flag(
  5025. (aSign<>0)
  5026. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5027. <> 0 ));
  5028. exit;
  5029. End;
  5030. If aSign <> 0 then
  5031. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5032. else
  5033. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5034. End;
  5035. {*----------------------------------------------------------------------------
  5036. | Returns the result of converting the 64-bit two's complement integer `a'
  5037. | to the single-precision floating-point format. The conversion is performed
  5038. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5039. *----------------------------------------------------------------------------*}
  5040. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5041. var
  5042. zSign : flag;
  5043. absA : uint64;
  5044. shiftCount: int8;
  5045. zSig : bits32;
  5046. intval : int64rec;
  5047. Begin
  5048. if ( a = 0 ) then
  5049. begin
  5050. int64_to_float32.float32 := 0;
  5051. exit;
  5052. end;
  5053. if a < 0 then
  5054. zSign := flag(TRUE)
  5055. else
  5056. zSign := flag(FALSE);
  5057. if zSign<>0 then
  5058. absA := -a
  5059. else
  5060. absA := a;
  5061. shiftCount := countLeadingZeros64( absA ) - 40;
  5062. if ( 0 <= shiftCount ) then
  5063. begin
  5064. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5065. end
  5066. else
  5067. begin
  5068. shiftCount := shiftCount + 7;
  5069. if ( shiftCount < 0 ) then
  5070. begin
  5071. intval.low := int64rec(AbsA).low;
  5072. intval.high := int64rec(AbsA).high;
  5073. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5074. intval.low, intval.high);
  5075. int64rec(absA).low := intval.low;
  5076. int64rec(absA).high := intval.high;
  5077. end
  5078. else
  5079. absA := absA shl shiftCount;
  5080. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5081. end;
  5082. End;
  5083. {*----------------------------------------------------------------------------
  5084. | Returns the result of converting the 64-bit two's complement integer `a'
  5085. | to the single-precision floating-point format. The conversion is performed
  5086. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5087. | Unisgned version.
  5088. *----------------------------------------------------------------------------*}
  5089. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5090. var
  5091. zSign : flag;
  5092. absA : uint64;
  5093. shiftCount: int8;
  5094. zSig : bits32;
  5095. intval : int64rec;
  5096. Begin
  5097. if ( a = 0 ) then
  5098. begin
  5099. qword_to_float32.float32 := 0;
  5100. exit;
  5101. end;
  5102. zSign := flag(FALSE);
  5103. absA := a;
  5104. shiftCount := countLeadingZeros64( absA ) - 40;
  5105. if ( 0 <= shiftCount ) then
  5106. begin
  5107. qword_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5108. end
  5109. else
  5110. begin
  5111. shiftCount := shiftCount + 7;
  5112. if ( shiftCount < 0 ) then
  5113. begin
  5114. intval.low := int64rec(AbsA).low;
  5115. intval.high := int64rec(AbsA).high;
  5116. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5117. intval.low, intval.high);
  5118. int64rec(absA).low := intval.low;
  5119. int64rec(absA).high := intval.high;
  5120. end
  5121. else
  5122. absA := absA shl shiftCount;
  5123. qword_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5124. end;
  5125. End;
  5126. {*----------------------------------------------------------------------------
  5127. | Returns the result of converting the 64-bit two's complement integer `a'
  5128. | to the double-precision floating-point format. The conversion is performed
  5129. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5130. *----------------------------------------------------------------------------*}
  5131. function qword_to_float64( a: qword ): float64;
  5132. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5133. var
  5134. zSign : flag;
  5135. float_result : float64;
  5136. intval : int64rec;
  5137. AbsA : bits64;
  5138. shiftcount : int8;
  5139. zSig0, zSig1 : bits32;
  5140. Begin
  5141. if ( a = 0 ) then
  5142. Begin
  5143. packFloat64( 0, 0, 0, 0, result );
  5144. exit;
  5145. end;
  5146. zSign := flag(FALSE);
  5147. AbsA := a;
  5148. shiftCount := countLeadingZeros64( absA ) - 11;
  5149. if ( 0 <= shiftCount ) then
  5150. Begin
  5151. absA := absA shl shiftcount;
  5152. zSig0:=int64rec(absA).high;
  5153. zSig1:=int64rec(absA).low;
  5154. End
  5155. else
  5156. Begin
  5157. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5158. - shiftCount, zSig0, zSig1 );
  5159. End;
  5160. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5161. qword_to_float64:= float_result;
  5162. End;
  5163. {*----------------------------------------------------------------------------
  5164. | Returns the result of converting the 64-bit two's complement integer `a'
  5165. | to the double-precision floating-point format. The conversion is performed
  5166. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5167. *----------------------------------------------------------------------------*}
  5168. function int64_to_float64( a: int64 ): float64;
  5169. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5170. var
  5171. zSign : flag;
  5172. float_result : float64;
  5173. intval : int64rec;
  5174. AbsA : bits64;
  5175. shiftcount : int8;
  5176. zSig0, zSig1 : bits32;
  5177. Begin
  5178. if ( a = 0 ) then
  5179. Begin
  5180. packFloat64( 0, 0, 0, 0, result );
  5181. exit;
  5182. end;
  5183. zSign := flag( a < 0 );
  5184. if ZSign<>0 then
  5185. AbsA := -a
  5186. else
  5187. AbsA := a;
  5188. shiftCount := countLeadingZeros64( absA ) - 11;
  5189. if ( 0 <= shiftCount ) then
  5190. Begin
  5191. absA := absA shl shiftcount;
  5192. zSig0:=int64rec(absA).high;
  5193. zSig1:=int64rec(absA).low;
  5194. End
  5195. else
  5196. Begin
  5197. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5198. - shiftCount, zSig0, zSig1 );
  5199. End;
  5200. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5201. int64_to_float64:= float_result;
  5202. End;
  5203. {*----------------------------------------------------------------------------
  5204. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5205. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5206. | Otherwise, returns 0.
  5207. *----------------------------------------------------------------------------*}
  5208. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5209. begin
  5210. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5211. end;
  5212. {*----------------------------------------------------------------------------
  5213. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5214. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5215. | Otherwise, returns 0.
  5216. *----------------------------------------------------------------------------*}
  5217. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5218. begin
  5219. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5220. end;
  5221. {*----------------------------------------------------------------------------
  5222. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5223. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5224. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5225. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5226. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5227. | the most-significant bit of the extra result, and the other 63 bits of the
  5228. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5229. | were all zero. This extra result is stored in the location pointed to by
  5230. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5231. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5232. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5233. | fixed-point value is shifted right by the number of bits given in `count',
  5234. | and the integer part of the result is returned at the locations pointed to
  5235. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5236. | corrupted as described above, and is returned at the location pointed to by
  5237. | `z2Ptr'.)
  5238. *----------------------------------------------------------------------------*}
  5239. procedure shift128ExtraRightJamming(
  5240. a0: bits64;
  5241. a1: bits64;
  5242. a2: bits64;
  5243. count: int16;
  5244. var z0Ptr: bits64;
  5245. var z1Ptr: bits64;
  5246. var z2Ptr: bits64);
  5247. var
  5248. z0, z1, z2: bits64;
  5249. negCount: int8;
  5250. begin
  5251. negCount := ( - count ) and 63;
  5252. if ( count = 0 ) then
  5253. begin
  5254. z2 := a2;
  5255. z1 := a1;
  5256. z0 := a0;
  5257. end
  5258. else begin
  5259. if ( count < 64 ) then
  5260. begin
  5261. z2 := a1 shr negCount;
  5262. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5263. z0 := a0 shr count;
  5264. end
  5265. else begin
  5266. if ( count = 64 ) then
  5267. begin
  5268. z2 := a1;
  5269. z1 := a0;
  5270. end
  5271. else begin
  5272. a2 := a2 or a1;
  5273. if ( count < 128 ) then
  5274. begin
  5275. z2 := a0 shl negCount;
  5276. z1 := a0 shr ( count and 63 );
  5277. end
  5278. else begin
  5279. if ( count = 128 ) then
  5280. z2 := a0
  5281. else
  5282. z2 := ord( a0 <> 0 );
  5283. z1 := 0;
  5284. end;
  5285. end;
  5286. z0 := 0;
  5287. end;
  5288. z2 := z2 or ord( a2 <> 0 );
  5289. end;
  5290. z2Ptr := z2;
  5291. z1Ptr := z1;
  5292. z0Ptr := z0;
  5293. end;
  5294. {*----------------------------------------------------------------------------
  5295. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5296. | _plus_ the number of bits given in `count'. The shifted result is at most
  5297. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5298. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5299. | shifted off is the most-significant bit of the extra result, and the other
  5300. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5301. | bits shifted off were all zero. This extra result is stored in the location
  5302. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5303. | (This routine makes more sense if `a0' and `a1' are considered to form
  5304. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5305. | point value is shifted right by the number of bits given in `count', and
  5306. | the integer part of the result is returned at the location pointed to by
  5307. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5308. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5309. *----------------------------------------------------------------------------*}
  5310. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5311. var
  5312. z0, z1: bits64;
  5313. negCount: int8;
  5314. begin
  5315. negCount := ( - count ) and 63;
  5316. if ( count = 0 ) then
  5317. begin
  5318. z1 := a1;
  5319. z0 := a0;
  5320. end
  5321. else if ( count < 64 ) then
  5322. begin
  5323. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5324. z0 := a0 shr count;
  5325. end
  5326. else begin
  5327. if ( count = 64 ) then
  5328. begin
  5329. z1 := a0 or ord( a1 <> 0 );
  5330. end
  5331. else begin
  5332. z1 := ord( ( a0 or a1 ) <> 0 );
  5333. end;
  5334. z0 := 0;
  5335. end;
  5336. z1Ptr := z1;
  5337. z0Ptr := z0;
  5338. end;
  5339. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5340. {*----------------------------------------------------------------------------
  5341. | Returns the fraction bits of the extended double-precision floating-point
  5342. | value `a'.
  5343. *----------------------------------------------------------------------------*}
  5344. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5345. begin
  5346. result:=a.low;
  5347. end;
  5348. {*----------------------------------------------------------------------------
  5349. | Returns the exponent bits of the extended double-precision floating-point
  5350. | value `a'.
  5351. *----------------------------------------------------------------------------*}
  5352. function extractFloatx80Exp(a : floatx80): int32;inline;
  5353. begin
  5354. result:=a.high and $7FFF;
  5355. end;
  5356. {*----------------------------------------------------------------------------
  5357. | Returns the sign bit of the extended double-precision floating-point value
  5358. | `a'.
  5359. *----------------------------------------------------------------------------*}
  5360. function extractFloatx80Sign(a : floatx80): flag;inline;
  5361. begin
  5362. result:=a.high shr 15;
  5363. end;
  5364. {*----------------------------------------------------------------------------
  5365. | Normalizes the subnormal extended double-precision floating-point value
  5366. | represented by the denormalized significand `aSig'. The normalized exponent
  5367. | and significand are stored at the locations pointed to by `zExpPtr' and
  5368. | `zSigPtr', respectively.
  5369. *----------------------------------------------------------------------------*}
  5370. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5371. var
  5372. shiftCount: int8;
  5373. begin
  5374. shiftCount := countLeadingZeros64( aSig );
  5375. zSigPtr := aSig shl shiftCount;
  5376. zExpPtr := 1 - shiftCount;
  5377. end;
  5378. {*----------------------------------------------------------------------------
  5379. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5380. | extended double-precision floating-point value, returning the result.
  5381. *----------------------------------------------------------------------------*}
  5382. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5383. var
  5384. z: floatx80;
  5385. begin
  5386. z.low := zSig;
  5387. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5388. result:=z;
  5389. end;
  5390. {*----------------------------------------------------------------------------
  5391. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5392. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5393. | and returns the proper extended double-precision floating-point value
  5394. | corresponding to the abstract input. Ordinarily, the abstract value is
  5395. | rounded and packed into the extended double-precision format, with the
  5396. | inexact exception raised if the abstract input cannot be represented
  5397. | exactly. However, if the abstract value is too large, the overflow and
  5398. | inexact exceptions are raised and an infinity or maximal finite value is
  5399. | returned. If the abstract value is too small, the input value is rounded to
  5400. | a subnormal number, and the underflow and inexact exceptions are raised if
  5401. | the abstract input cannot be represented exactly as a subnormal extended
  5402. | double-precision floating-point number.
  5403. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5404. | number of bits as single or double precision, respectively. Otherwise, the
  5405. | result is rounded to the full precision of the extended double-precision
  5406. | format.
  5407. | The input significand must be normalized or smaller. If the input
  5408. | significand is not normalized, `zExp' must be 0; in that case, the result
  5409. | returned is a subnormal number, and it must not require rounding. The
  5410. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5411. | Floating-Point Arithmetic.
  5412. *----------------------------------------------------------------------------*}
  5413. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5414. var
  5415. roundingMode: int8;
  5416. roundNearestEven, increment, isTiny: flag;
  5417. roundIncrement, roundMask, roundBits: int64;
  5418. label
  5419. precision80;
  5420. begin
  5421. roundingMode := softfloat_rounding_mode;
  5422. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5423. if ( roundingPrecision = 80 ) then
  5424. goto precision80;
  5425. if ( roundingPrecision = 64 ) then
  5426. begin
  5427. roundIncrement := int64( $0000000000000400 );
  5428. roundMask := int64( $00000000000007FF );
  5429. end
  5430. else if ( roundingPrecision = 32 ) then
  5431. begin
  5432. roundIncrement := int64( $0000008000000000 );
  5433. roundMask := int64( $000000FFFFFFFFFF );
  5434. end
  5435. else begin
  5436. goto precision80;
  5437. end;
  5438. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5439. if ( not (roundNearestEven<>0) ) then
  5440. begin
  5441. if ( roundingMode = float_round_to_zero ) then
  5442. begin
  5443. roundIncrement := 0;
  5444. end
  5445. else begin
  5446. roundIncrement := roundMask;
  5447. if ( zSign<>0 ) then
  5448. begin
  5449. if ( roundingMode = float_round_up ) then
  5450. roundIncrement := 0;
  5451. end
  5452. else begin
  5453. if ( roundingMode = float_round_down ) then
  5454. roundIncrement := 0;
  5455. end;
  5456. end;
  5457. end;
  5458. roundBits := zSig0 and roundMask;
  5459. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5460. if ( ( $7FFE < zExp )
  5461. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5462. ) begin
  5463. goto overflow;
  5464. end;
  5465. if ( zExp <= 0 ) begin
  5466. isTiny =
  5467. ( float_detect_tininess = float_tininess_before_rounding )
  5468. or ( zExp < 0 )
  5469. or ( zSig0 <= zSig0 + roundIncrement );
  5470. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5471. zExp := 0;
  5472. roundBits := zSig0 and roundMask;
  5473. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5474. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5475. zSig0 += roundIncrement;
  5476. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5477. roundIncrement := roundMask + 1;
  5478. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5479. roundMask |= roundIncrement;
  5480. end;
  5481. zSig0 = ~ roundMask;
  5482. result:=packFloatx80( zSign, zExp, zSig0 );
  5483. end;
  5484. end;
  5485. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5486. zSig0 += roundIncrement;
  5487. if ( zSig0 < roundIncrement ) begin
  5488. ++zExp;
  5489. zSig0 := LIT64( $8000000000000000 );
  5490. end;
  5491. roundIncrement := roundMask + 1;
  5492. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5493. roundMask |= roundIncrement;
  5494. end;
  5495. zSig0 = ~ roundMask;
  5496. if ( zSig0 = 0 ) zExp := 0;
  5497. result:=packFloatx80( zSign, zExp, zSig0 );
  5498. precision80:
  5499. increment := ( (sbits64) zSig1 < 0 );
  5500. if ( ! roundNearestEven ) begin
  5501. if ( roundingMode = float_round_to_zero ) begin
  5502. increment := 0;
  5503. end;
  5504. else begin
  5505. if ( zSign ) begin
  5506. increment := ( roundingMode = float_round_down ) and zSig1;
  5507. end;
  5508. else begin
  5509. increment := ( roundingMode = float_round_up ) and zSig1;
  5510. end;
  5511. end;
  5512. end;
  5513. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5514. if ( ( $7FFE < zExp )
  5515. or ( ( zExp = $7FFE )
  5516. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5517. and increment
  5518. )
  5519. ) begin
  5520. roundMask := 0;
  5521. overflow:
  5522. float_raise( float_flag_overflow or float_flag_inexact );
  5523. if ( ( roundingMode = float_round_to_zero )
  5524. or ( zSign and ( roundingMode = float_round_up ) )
  5525. or ( ! zSign and ( roundingMode = float_round_down ) )
  5526. ) begin
  5527. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5528. end;
  5529. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5530. end;
  5531. if ( zExp <= 0 ) begin
  5532. isTiny =
  5533. ( float_detect_tininess = float_tininess_before_rounding )
  5534. or ( zExp < 0 )
  5535. or ! increment
  5536. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5537. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  5538. zExp := 0;
  5539. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5540. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5541. if ( roundNearestEven ) begin
  5542. increment := ( (sbits64) zSig1 < 0 );
  5543. end;
  5544. else begin
  5545. if ( zSign ) begin
  5546. increment := ( roundingMode = float_round_down ) and zSig1;
  5547. end;
  5548. else begin
  5549. increment := ( roundingMode = float_round_up ) and zSig1;
  5550. end;
  5551. end;
  5552. if ( increment ) begin
  5553. ++zSig0;
  5554. zSig0 =
  5555. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5556. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5557. end;
  5558. result:=packFloatx80( zSign, zExp, zSig0 );
  5559. end;
  5560. end;
  5561. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5562. if ( increment ) begin
  5563. ++zSig0;
  5564. if ( zSig0 = 0 ) begin
  5565. ++zExp;
  5566. zSig0 := LIT64( $8000000000000000 );
  5567. end;
  5568. else begin
  5569. zSig0 = ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5570. end;
  5571. end;
  5572. else begin
  5573. if ( zSig0 = 0 ) zExp := 0;
  5574. end;
  5575. result:=packFloatx80( zSign, zExp, zSig0 );
  5576. end;
  5577. {*----------------------------------------------------------------------------
  5578. | Takes an abstract floating-point value having sign `zSign', exponent
  5579. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5580. | and returns the proper extended double-precision floating-point value
  5581. | corresponding to the abstract input. This routine is just like
  5582. | `roundAndPackFloatx80' except that the input significand does not have to be
  5583. | normalized.
  5584. *----------------------------------------------------------------------------*}
  5585. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5586. var
  5587. shiftCount: int8;
  5588. begin
  5589. if ( zSig0 = 0 ) begin
  5590. zSig0 := zSig1;
  5591. zSig1 := 0;
  5592. zExp -= 64;
  5593. end;
  5594. shiftCount := countLeadingZeros64( zSig0 );
  5595. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5596. zExp := eExp - shiftCount;
  5597. return
  5598. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5599. end;
  5600. {*----------------------------------------------------------------------------
  5601. | Returns the result of converting the extended double-precision floating-
  5602. | point value `a' to the 32-bit two's complement integer format. The
  5603. | conversion is performed according to the IEC/IEEE Standard for Binary
  5604. | Floating-Point Arithmetic---which means in particular that the conversion
  5605. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5606. | largest positive integer is returned. Otherwise, if the conversion
  5607. | overflows, the largest integer with the same sign as `a' is returned.
  5608. *----------------------------------------------------------------------------*}
  5609. function floatx80_to_int32(a: floatx80): int32;
  5610. var
  5611. aSign: flag;
  5612. aExp, shiftCount: int32;
  5613. aSig: bits64;
  5614. begin
  5615. aSig := extractFloatx80Frac( a );
  5616. aExp := extractFloatx80Exp( a );
  5617. aSign := extractFloatx80Sign( a );
  5618. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5619. shiftCount := $4037 - aExp;
  5620. if ( shiftCount <= 0 ) shiftCount := 1;
  5621. shift64RightJamming( aSig, shiftCount, aSig );
  5622. result := roundAndPackInt32( aSign, aSig );
  5623. end;
  5624. {*----------------------------------------------------------------------------
  5625. | Returns the result of converting the extended double-precision floating-
  5626. | point value `a' to the 32-bit two's complement integer format. The
  5627. | conversion is performed according to the IEC/IEEE Standard for Binary
  5628. | Floating-Point Arithmetic, except that the conversion is always rounded
  5629. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5630. | Otherwise, if the conversion overflows, the largest integer with the same
  5631. | sign as `a' is returned.
  5632. *----------------------------------------------------------------------------*}
  5633. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5634. var
  5635. aSign: flag;
  5636. aExp, shiftCount: int32;
  5637. aSig, savedASig: bits64;
  5638. z: int32;
  5639. begin
  5640. aSig := extractFloatx80Frac( a );
  5641. aExp := extractFloatx80Exp( a );
  5642. aSign := extractFloatx80Sign( a );
  5643. if ( $401E < aExp ) begin
  5644. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5645. goto invalid;
  5646. end;
  5647. else if ( aExp < $3FFF ) begin
  5648. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5649. result := 0;
  5650. end;
  5651. shiftCount := $403E - aExp;
  5652. savedASig := aSig;
  5653. aSig >>= shiftCount;
  5654. z := aSig;
  5655. if ( aSign ) z := - z;
  5656. if ( ( z < 0 ) xor aSign ) begin
  5657. invalid:
  5658. float_raise( float_flag_invalid );
  5659. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5660. end;
  5661. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5662. softfloat_exception_flags or= float_flag_inexact;
  5663. end;
  5664. result := z;
  5665. end;
  5666. {*----------------------------------------------------------------------------
  5667. | Returns the result of converting the extended double-precision floating-
  5668. | point value `a' to the 64-bit two's complement integer format. The
  5669. | conversion is performed according to the IEC/IEEE Standard for Binary
  5670. | Floating-Point Arithmetic---which means in particular that the conversion
  5671. | is rounded according to the current rounding mode. If `a' is a NaN,
  5672. | the largest positive integer is returned. Otherwise, if the conversion
  5673. | overflows, the largest integer with the same sign as `a' is returned.
  5674. *----------------------------------------------------------------------------*}
  5675. function floatx80_to_int64(a: floatx80): int64;
  5676. var
  5677. aSign: flag;
  5678. aExp, shiftCount: int32;
  5679. aSig, aSigExtra: bits64;
  5680. begin
  5681. aSig := extractFloatx80Frac( a );
  5682. aExp := extractFloatx80Exp( a );
  5683. aSign := extractFloatx80Sign( a );
  5684. shiftCount := $403E - aExp;
  5685. if ( shiftCount <= 0 ) begin
  5686. if ( shiftCount ) begin
  5687. float_raise( float_flag_invalid );
  5688. if ( ! aSign
  5689. or ( ( aExp = $7FFF )
  5690. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5691. ) begin
  5692. result := LIT64( $7FFFFFFFFFFFFFFF );
  5693. end;
  5694. result := (sbits64) LIT64( $8000000000000000 );
  5695. end;
  5696. aSigExtra := 0;
  5697. end;
  5698. else begin
  5699. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  5700. end;
  5701. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5702. end;
  5703. {*----------------------------------------------------------------------------
  5704. | Returns the result of converting the extended double-precision floating-
  5705. | point value `a' to the 64-bit two's complement integer format. The
  5706. | conversion is performed according to the IEC/IEEE Standard for Binary
  5707. | Floating-Point Arithmetic, except that the conversion is always rounded
  5708. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5709. | Otherwise, if the conversion overflows, the largest integer with the same
  5710. | sign as `a' is returned.
  5711. *----------------------------------------------------------------------------*}
  5712. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5713. var
  5714. aSign: flag;
  5715. aExp, shiftCount: int32;
  5716. aSig: bits64;
  5717. z: int64;
  5718. begin
  5719. aSig := extractFloatx80Frac( a );
  5720. aExp := extractFloatx80Exp( a );
  5721. aSign := extractFloatx80Sign( a );
  5722. shiftCount := aExp - $403E;
  5723. if ( 0 <= shiftCount ) begin
  5724. aSig = LIT64( $7FFFFFFFFFFFFFFF );
  5725. if ( ( a.high <> $C03E ) or aSig ) begin
  5726. float_raise( float_flag_invalid );
  5727. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5728. result := LIT64( $7FFFFFFFFFFFFFFF );
  5729. end;
  5730. end;
  5731. result := (sbits64) LIT64( $8000000000000000 );
  5732. end;
  5733. else if ( aExp < $3FFF ) begin
  5734. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5735. result := 0;
  5736. end;
  5737. z := aSig>>( - shiftCount );
  5738. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5739. softfloat_exception_flags or= float_flag_inexact;
  5740. end;
  5741. if ( aSign ) z := - z;
  5742. result := z;
  5743. end;
  5744. {*----------------------------------------------------------------------------
  5745. | Returns the result of converting the extended double-precision floating-
  5746. | point value `a' to the single-precision floating-point format. The
  5747. | conversion is performed according to the IEC/IEEE Standard for Binary
  5748. | Floating-Point Arithmetic.
  5749. *----------------------------------------------------------------------------*}
  5750. function floatx80_to_float32(a: floatx80): float32;
  5751. var
  5752. aSign: flag;
  5753. aExp: int32;
  5754. aSig: bits64;
  5755. begin
  5756. aSig := extractFloatx80Frac( a );
  5757. aExp := extractFloatx80Exp( a );
  5758. aSign := extractFloatx80Sign( a );
  5759. if ( aExp = $7FFF ) begin
  5760. if ( (bits64) ( aSig shl 1 ) ) begin
  5761. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5762. end;
  5763. result := packFloat32( aSign, $FF, 0 );
  5764. end;
  5765. shift64RightJamming( aSig, 33, aSig );
  5766. if ( aExp or aSig ) aExp -= $3F81;
  5767. result := roundAndPackFloat32( aSign, aExp, aSig );
  5768. end;
  5769. {*----------------------------------------------------------------------------
  5770. | Returns the result of converting the extended double-precision floating-
  5771. | point value `a' to the double-precision floating-point format. The
  5772. | conversion is performed according to the IEC/IEEE Standard for Binary
  5773. | Floating-Point Arithmetic.
  5774. *----------------------------------------------------------------------------*}
  5775. function floatx80_to_float64(a: floatx80): float64;
  5776. var
  5777. aSign: flag;
  5778. aExp: int32;
  5779. aSig, zSig: bits64;
  5780. begin
  5781. aSig := extractFloatx80Frac( a );
  5782. aExp := extractFloatx80Exp( a );
  5783. aSign := extractFloatx80Sign( a );
  5784. if ( aExp = $7FFF ) begin
  5785. if ( (bits64) ( aSig shl 1 ) ) begin
  5786. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5787. end;
  5788. result := packFloat64( aSign, $7FF, 0 );
  5789. end;
  5790. shift64RightJamming( aSig, 1, zSig );
  5791. if ( aExp or aSig ) aExp -= $3C01;
  5792. result := roundAndPackFloat64( aSign, aExp, zSig );
  5793. end;
  5794. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5795. {*----------------------------------------------------------------------------
  5796. | Returns the result of converting the extended double-precision floating-
  5797. | point value `a' to the quadruple-precision floating-point format. The
  5798. | conversion is performed according to the IEC/IEEE Standard for Binary
  5799. | Floating-Point Arithmetic.
  5800. *----------------------------------------------------------------------------*}
  5801. function floatx80_to_float128(a: floatx80): float128;
  5802. var
  5803. aSign: flag;
  5804. aExp: int16;
  5805. aSig, zSig0, zSig1: bits64;
  5806. begin
  5807. aSig := extractFloatx80Frac( a );
  5808. aExp := extractFloatx80Exp( a );
  5809. aSign := extractFloatx80Sign( a );
  5810. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5811. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5812. end;
  5813. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  5814. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5815. end;
  5816. {$endif FPC_SOFTFLOAT_FLOAT128}
  5817. {*----------------------------------------------------------------------------
  5818. | Rounds the extended double-precision floating-point value `a' to an integer,
  5819. | and Returns the result as an extended quadruple-precision floating-point
  5820. | value. The operation is performed according to the IEC/IEEE Standard for
  5821. | Binary Floating-Point Arithmetic.
  5822. *----------------------------------------------------------------------------*}
  5823. function floatx80_round_to_int(a: floatx80): floatx80;
  5824. var
  5825. aSign: flag;
  5826. aExp: int32;
  5827. lastBitMask, roundBitsMask: bits64;
  5828. roundingMode: int8;
  5829. z: floatx80;
  5830. begin
  5831. aExp := extractFloatx80Exp( a );
  5832. if ( $403E <= aExp ) begin
  5833. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5834. result := propagateFloatx80NaN( a, a );
  5835. end;
  5836. result := a;
  5837. end;
  5838. if ( aExp < $3FFF ) begin
  5839. if ( ( aExp = 0 )
  5840. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5841. result := a;
  5842. end;
  5843. softfloat_exception_flags or= float_flag_inexact;
  5844. aSign := extractFloatx80Sign( a );
  5845. switch ( softfloat_rounding_mode ) begin
  5846. case float_round_nearest_even:
  5847. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5848. ) begin
  5849. result :=
  5850. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5851. end;
  5852. break;
  5853. case float_round_down:
  5854. result :=
  5855. aSign ?
  5856. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5857. : packFloatx80( 0, 0, 0 );
  5858. case float_round_up:
  5859. result :=
  5860. aSign ? packFloatx80( 1, 0, 0 )
  5861. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5862. end;
  5863. result := packFloatx80( aSign, 0, 0 );
  5864. end;
  5865. lastBitMask := 1;
  5866. lastBitMask shl = $403E - aExp;
  5867. roundBitsMask := lastBitMask - 1;
  5868. z := a;
  5869. roundingMode := softfloat_rounding_mode;
  5870. if ( roundingMode = float_round_nearest_even ) begin
  5871. z.low += lastBitMask>>1;
  5872. if ( ( z.low and roundBitsMask ) = 0 ) z.low = ~ lastBitMask;
  5873. end;
  5874. else if ( roundingMode <> float_round_to_zero ) begin
  5875. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5876. z.low += roundBitsMask;
  5877. end;
  5878. end;
  5879. z.low = ~ roundBitsMask;
  5880. if ( z.low = 0 ) begin
  5881. ++z.high;
  5882. z.low := LIT64( $8000000000000000 );
  5883. end;
  5884. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5885. result := z;
  5886. end;
  5887. {*----------------------------------------------------------------------------
  5888. | Returns the result of adding the absolute values of the extended double-
  5889. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5890. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5891. | The addition is performed according to the IEC/IEEE Standard for Binary
  5892. | Floating-Point Arithmetic.
  5893. *----------------------------------------------------------------------------*}
  5894. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5895. var
  5896. aExp, bExp, zExp: int32;
  5897. aSig, bSig, zSig0, zSig1: bits64;
  5898. expDiff: int32;
  5899. begin
  5900. aSig := extractFloatx80Frac( a );
  5901. aExp := extractFloatx80Exp( a );
  5902. bSig := extractFloatx80Frac( b );
  5903. bExp := extractFloatx80Exp( b );
  5904. expDiff := aExp - bExp;
  5905. if ( 0 < expDiff ) begin
  5906. if ( aExp = $7FFF ) begin
  5907. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5908. result := a;
  5909. end;
  5910. if ( bExp = 0 ) --expDiff;
  5911. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5912. zExp := aExp;
  5913. end;
  5914. else if ( expDiff < 0 ) begin
  5915. if ( bExp = $7FFF ) begin
  5916. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5917. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5918. end;
  5919. if ( aExp = 0 ) ++expDiff;
  5920. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5921. zExp := bExp;
  5922. end;
  5923. else begin
  5924. if ( aExp = $7FFF ) begin
  5925. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5926. result := propagateFloatx80NaN( a, b );
  5927. end;
  5928. result := a;
  5929. end;
  5930. zSig1 := 0;
  5931. zSig0 := aSig + bSig;
  5932. if ( aExp = 0 ) begin
  5933. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  5934. goto roundAndPack;
  5935. end;
  5936. zExp := aExp;
  5937. goto shiftRight1;
  5938. end;
  5939. zSig0 := aSig + bSig;
  5940. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5941. shiftRight1:
  5942. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  5943. zSig0 or= LIT64( $8000000000000000 );
  5944. ++zExp;
  5945. roundAndPack:
  5946. result :=
  5947. roundAndPackFloatx80(
  5948. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5949. end;
  5950. {*----------------------------------------------------------------------------
  5951. | Returns the result of subtracting the absolute values of the extended
  5952. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5953. | difference is negated before being returned. `zSign' is ignored if the
  5954. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5955. | Standard for Binary Floating-Point Arithmetic.
  5956. *----------------------------------------------------------------------------*}
  5957. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5958. var
  5959. aExp, bExp, zExp: int32;
  5960. aSig, bSig, zSig0, zSig1: bits64;
  5961. expDiff: int32;
  5962. z: floatx80;
  5963. begin
  5964. aSig := extractFloatx80Frac( a );
  5965. aExp := extractFloatx80Exp( a );
  5966. bSig := extractFloatx80Frac( b );
  5967. bExp := extractFloatx80Exp( b );
  5968. expDiff := aExp - bExp;
  5969. if ( 0 < expDiff ) goto aExpBigger;
  5970. if ( expDiff < 0 ) goto bExpBigger;
  5971. if ( aExp = $7FFF ) begin
  5972. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5973. result := propagateFloatx80NaN( a, b );
  5974. end;
  5975. float_raise( float_flag_invalid );
  5976. z.low := floatx80_default_nan_low;
  5977. z.high := floatx80_default_nan_high;
  5978. result := z;
  5979. end;
  5980. if ( aExp = 0 ) begin
  5981. aExp := 1;
  5982. bExp := 1;
  5983. end;
  5984. zSig1 := 0;
  5985. if ( bSig < aSig ) goto aBigger;
  5986. if ( aSig < bSig ) goto bBigger;
  5987. result := packFloatx80( softfloat_rounding_mode = float_round_down, 0, 0 );
  5988. bExpBigger:
  5989. if ( bExp = $7FFF ) begin
  5990. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5991. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5992. end;
  5993. if ( aExp = 0 ) ++expDiff;
  5994. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5995. bBigger:
  5996. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  5997. zExp := bExp;
  5998. zSign xor = 1;
  5999. goto normalizeRoundAndPack;
  6000. aExpBigger:
  6001. if ( aExp = $7FFF ) begin
  6002. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6003. result := a;
  6004. end;
  6005. if ( bExp = 0 ) --expDiff;
  6006. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6007. aBigger:
  6008. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6009. zExp := aExp;
  6010. normalizeRoundAndPack:
  6011. result :=
  6012. normalizeRoundAndPackFloatx80(
  6013. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6014. end;
  6015. {*----------------------------------------------------------------------------
  6016. | Returns the result of adding the extended double-precision floating-point
  6017. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6018. | Standard for Binary Floating-Point Arithmetic.
  6019. *----------------------------------------------------------------------------*}
  6020. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6021. var
  6022. aSign, bSign: flag;
  6023. begin
  6024. aSign := extractFloatx80Sign( a );
  6025. bSign := extractFloatx80Sign( b );
  6026. if ( aSign = bSign ) begin
  6027. result := addFloatx80Sigs( a, b, aSign );
  6028. end;
  6029. else begin
  6030. result := subFloatx80Sigs( a, b, aSign );
  6031. end;
  6032. end;
  6033. {*----------------------------------------------------------------------------
  6034. | Returns the result of subtracting the extended double-precision floating-
  6035. | point values `a' and `b'. The operation is performed according to the
  6036. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6037. *----------------------------------------------------------------------------*}
  6038. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6039. var
  6040. aSign, bSign: flag;
  6041. begin
  6042. aSign := extractFloatx80Sign( a );
  6043. bSign := extractFloatx80Sign( b );
  6044. if ( aSign = bSign ) begin
  6045. result := subFloatx80Sigs( a, b, aSign );
  6046. end;
  6047. else begin
  6048. result := addFloatx80Sigs( a, b, aSign );
  6049. end;
  6050. end;
  6051. {*----------------------------------------------------------------------------
  6052. | Returns the result of multiplying the extended double-precision floating-
  6053. | point values `a' and `b'. The operation is performed according to the
  6054. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6055. *----------------------------------------------------------------------------*}
  6056. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6057. var
  6058. aSign, bSign, zSign: flag;
  6059. aExp, bExp, zExp: int32;
  6060. aSig, bSig, zSig0, zSig1: bits64;
  6061. z: floatx80;
  6062. begin
  6063. aSig := extractFloatx80Frac( a );
  6064. aExp := extractFloatx80Exp( a );
  6065. aSign := extractFloatx80Sign( a );
  6066. bSig := extractFloatx80Frac( b );
  6067. bExp := extractFloatx80Exp( b );
  6068. bSign := extractFloatx80Sign( b );
  6069. zSign := aSign xor bSign;
  6070. if ( aExp = $7FFF ) begin
  6071. if ( (bits64) ( aSig shl 1 )
  6072. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6073. result := propagateFloatx80NaN( a, b );
  6074. end;
  6075. if ( ( bExp or bSig ) = 0 ) goto invalid;
  6076. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6077. end;
  6078. if ( bExp = $7FFF ) begin
  6079. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6080. if ( ( aExp or aSig ) = 0 ) begin
  6081. invalid:
  6082. float_raise( float_flag_invalid );
  6083. z.low := floatx80_default_nan_low;
  6084. z.high := floatx80_default_nan_high;
  6085. result := z;
  6086. end;
  6087. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6088. end;
  6089. if ( aExp = 0 ) begin
  6090. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6091. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6092. end;
  6093. if ( bExp = 0 ) begin
  6094. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6095. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6096. end;
  6097. zExp := aExp + bExp - $3FFE;
  6098. mul64To128( aSig, bSig, zSig0, zSig1 );
  6099. if ( 0 < (sbits64) zSig0 ) begin
  6100. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6101. --zExp;
  6102. end;
  6103. result :=
  6104. roundAndPackFloatx80(
  6105. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6106. end;
  6107. {*----------------------------------------------------------------------------
  6108. | Returns the result of dividing the extended double-precision floating-point
  6109. | value `a' by the corresponding value `b'. The operation is performed
  6110. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6111. *----------------------------------------------------------------------------*}
  6112. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6113. var
  6114. aSign, bSign, zSign: flag;
  6115. aExp, bExp, zExp: int32;
  6116. aSig, bSig, zSig0, zSig1: bits64;
  6117. rem0, rem1, rem2, term0, term1, term2: bits64;
  6118. z: floatx80;
  6119. begin
  6120. aSig := extractFloatx80Frac( a );
  6121. aExp := extractFloatx80Exp( a );
  6122. aSign := extractFloatx80Sign( a );
  6123. bSig := extractFloatx80Frac( b );
  6124. bExp := extractFloatx80Exp( b );
  6125. bSign := extractFloatx80Sign( b );
  6126. zSign := aSign xor bSign;
  6127. if ( aExp = $7FFF ) begin
  6128. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6129. if ( bExp = $7FFF ) begin
  6130. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6131. goto invalid;
  6132. end;
  6133. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6134. end;
  6135. if ( bExp = $7FFF ) begin
  6136. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6137. result := packFloatx80( zSign, 0, 0 );
  6138. end;
  6139. if ( bExp = 0 ) begin
  6140. if ( bSig = 0 ) begin
  6141. if ( ( aExp or aSig ) = 0 ) begin
  6142. invalid:
  6143. float_raise( float_flag_invalid );
  6144. z.low := floatx80_default_nan_low;
  6145. z.high := floatx80_default_nan_high;
  6146. result := z;
  6147. end;
  6148. float_raise( float_flag_divbyzero );
  6149. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6150. end;
  6151. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6152. end;
  6153. if ( aExp = 0 ) begin
  6154. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6155. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6156. end;
  6157. zExp := aExp - bExp + $3FFE;
  6158. rem1 := 0;
  6159. if ( bSig <= aSig ) begin
  6160. shift128Right( aSig, 0, 1, aSig, rem1 );
  6161. ++zExp;
  6162. end;
  6163. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6164. mul64To128( bSig, zSig0, term0, term1 );
  6165. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6166. while ( (sbits64) rem0 < 0 ) begin
  6167. --zSig0;
  6168. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6169. end;
  6170. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6171. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  6172. mul64To128( bSig, zSig1, term1, term2 );
  6173. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6174. while ( (sbits64) rem1 < 0 ) begin
  6175. --zSig1;
  6176. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6177. end;
  6178. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  6179. end;
  6180. result :=
  6181. roundAndPackFloatx80(
  6182. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6183. end;
  6184. {*----------------------------------------------------------------------------
  6185. | Returns the remainder of the extended double-precision floating-point value
  6186. | `a' with respect to the corresponding value `b'. The operation is performed
  6187. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6188. *----------------------------------------------------------------------------*}
  6189. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6190. var
  6191. aSign, bSign, zSign: flag;
  6192. aExp, bExp, expDiff: int32;
  6193. aSig0, aSig1, bSig: bits64;
  6194. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6195. z: floatx80;
  6196. begin
  6197. aSig0 := extractFloatx80Frac( a );
  6198. aExp := extractFloatx80Exp( a );
  6199. aSign := extractFloatx80Sign( a );
  6200. bSig := extractFloatx80Frac( b );
  6201. bExp := extractFloatx80Exp( b );
  6202. bSign := extractFloatx80Sign( b );
  6203. if ( aExp = $7FFF ) begin
  6204. if ( (bits64) ( aSig0 shl 1 )
  6205. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6206. result := propagateFloatx80NaN( a, b );
  6207. end;
  6208. goto invalid;
  6209. end;
  6210. if ( bExp = $7FFF ) begin
  6211. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6212. result := a;
  6213. end;
  6214. if ( bExp = 0 ) begin
  6215. if ( bSig = 0 ) begin
  6216. invalid:
  6217. float_raise( float_flag_invalid );
  6218. z.low := floatx80_default_nan_low;
  6219. z.high := floatx80_default_nan_high;
  6220. result := z;
  6221. end;
  6222. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6223. end;
  6224. if ( aExp = 0 ) begin
  6225. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  6226. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6227. end;
  6228. bSig or= LIT64( $8000000000000000 );
  6229. zSign := aSign;
  6230. expDiff := aExp - bExp;
  6231. aSig1 := 0;
  6232. if ( expDiff < 0 ) begin
  6233. if ( expDiff < -1 ) result := a;
  6234. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6235. expDiff := 0;
  6236. end;
  6237. q := ( bSig <= aSig0 );
  6238. if ( q ) aSig0 -= bSig;
  6239. expDiff -= 64;
  6240. while ( 0 < expDiff ) begin
  6241. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6242. q := ( 2 < q ) ? q - 2 : 0;
  6243. mul64To128( bSig, q, term0, term1 );
  6244. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6245. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6246. expDiff -= 62;
  6247. end;
  6248. expDiff += 64;
  6249. if ( 0 < expDiff ) begin
  6250. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6251. q := ( 2 < q ) ? q - 2 : 0;
  6252. q >>= 64 - expDiff;
  6253. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6254. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6255. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6256. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  6257. ++q;
  6258. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6259. end;
  6260. end;
  6261. else begin
  6262. term1 := 0;
  6263. term0 := bSig;
  6264. end;
  6265. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6266. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6267. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6268. and ( q and 1 ) )
  6269. ) begin
  6270. aSig0 := alternateASig0;
  6271. aSig1 := alternateASig1;
  6272. zSign := ! zSign;
  6273. end;
  6274. result :=
  6275. normalizeRoundAndPackFloatx80(
  6276. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6277. end;
  6278. {*----------------------------------------------------------------------------
  6279. | Returns the square root of the extended double-precision floating-point
  6280. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6281. | for Binary Floating-Point Arithmetic.
  6282. *----------------------------------------------------------------------------*}
  6283. function floatx80_sqrt(a: floatx80): floatx80;
  6284. var
  6285. aSign: flag;
  6286. aExp, zExp: int32;
  6287. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6288. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6289. z: floatx80;
  6290. label
  6291. invalid;
  6292. begin
  6293. aSig0 := extractFloatx80Frac( a );
  6294. aExp := extractFloatx80Exp( a );
  6295. aSign := extractFloatx80Sign( a );
  6296. if ( aExp = $7FFF ) begin
  6297. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  6298. if ( ! aSign ) result := a;
  6299. goto invalid;
  6300. end;
  6301. if ( aSign ) begin
  6302. if ( ( aExp or aSig0 ) = 0 ) result := a;
  6303. invalid:
  6304. float_raise( float_flag_invalid );
  6305. z.low := floatx80_default_nan_low;
  6306. z.high := floatx80_default_nan_high;
  6307. result := z;
  6308. end;
  6309. if ( aExp = 0 ) begin
  6310. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  6311. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6312. end;
  6313. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  6314. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  6315. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6316. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6317. doubleZSig0 := zSig0 shl 1;
  6318. mul64To128( zSig0, zSig0, term0, term1 );
  6319. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6320. while ( (sbits64) rem0 < 0 ) begin
  6321. --zSig0;
  6322. doubleZSig0 -= 2;
  6323. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  6324. end;
  6325. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6326. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  6327. if ( zSig1 = 0 ) zSig1 := 1;
  6328. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6329. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6330. mul64To128( zSig1, zSig1, term2, term3 );
  6331. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6332. while ( (sbits64) rem1 < 0 ) begin
  6333. --zSig1;
  6334. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6335. term3 or= 1;
  6336. term2 or= doubleZSig0;
  6337. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  6338. end;
  6339. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6340. end;
  6341. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  6342. zSig0 or= doubleZSig0;
  6343. result :=
  6344. roundAndPackFloatx80(
  6345. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6346. end;
  6347. {*----------------------------------------------------------------------------
  6348. | Returns 1 if the extended double-precision floating-point value `a' is
  6349. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6350. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6351. | Arithmetic.
  6352. *----------------------------------------------------------------------------*}
  6353. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6354. begin
  6355. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6356. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6357. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6358. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6359. ) begin
  6360. if ( floatx80_is_signaling_nan( a )
  6361. or floatx80_is_signaling_nan( b ) ) begin
  6362. float_raise( float_flag_invalid );
  6363. end;
  6364. result := 0;
  6365. end;
  6366. result :=
  6367. ( a.low = b.low )
  6368. and ( ( a.high = b.high )
  6369. or ( ( a.low = 0 )
  6370. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6371. );
  6372. end;
  6373. {*----------------------------------------------------------------------------
  6374. | Returns 1 if the extended double-precision floating-point value `a' is
  6375. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6376. | comparison is performed according to the IEC/IEEE Standard for Binary
  6377. | Floating-Point Arithmetic.
  6378. *----------------------------------------------------------------------------*}
  6379. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6380. var
  6381. aSign, bSign: flag;
  6382. begin
  6383. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6384. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6385. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6386. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6387. ) begin
  6388. float_raise( float_flag_invalid );
  6389. result := 0;
  6390. end;
  6391. aSign := extractFloatx80Sign( a );
  6392. bSign := extractFloatx80Sign( b );
  6393. if ( aSign <> bSign ) begin
  6394. result :=
  6395. aSign
  6396. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6397. = 0 );
  6398. end;
  6399. result :=
  6400. aSign ? le128( b.high, b.low, a.high, a.low )
  6401. : le128( a.high, a.low, b.high, b.low );
  6402. end;
  6403. {*----------------------------------------------------------------------------
  6404. | Returns 1 if the extended double-precision floating-point value `a' is
  6405. | less than the corresponding value `b', and 0 otherwise. The comparison
  6406. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6407. | Arithmetic.
  6408. *----------------------------------------------------------------------------*}
  6409. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6410. var
  6411. aSign, bSign: flag;
  6412. begin
  6413. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6414. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6415. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6416. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6417. ) begin
  6418. float_raise( float_flag_invalid );
  6419. result := 0;
  6420. end;
  6421. aSign := extractFloatx80Sign( a );
  6422. bSign := extractFloatx80Sign( b );
  6423. if ( aSign <> bSign ) begin
  6424. result :=
  6425. aSign
  6426. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6427. <> 0 );
  6428. end;
  6429. result :=
  6430. aSign ? lt128( b.high, b.low, a.high, a.low )
  6431. : lt128( a.high, a.low, b.high, b.low );
  6432. end;
  6433. {*----------------------------------------------------------------------------
  6434. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6435. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6436. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6437. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6438. *----------------------------------------------------------------------------*}
  6439. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6440. begin
  6441. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6442. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6443. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6444. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6445. ) begin
  6446. float_raise( float_flag_invalid );
  6447. result := 0;
  6448. end;
  6449. result :=
  6450. ( a.low = b.low )
  6451. and ( ( a.high = b.high )
  6452. or ( ( a.low = 0 )
  6453. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6454. );
  6455. end;
  6456. {*----------------------------------------------------------------------------
  6457. | Returns 1 if the extended double-precision floating-point value `a' is less
  6458. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6459. | do not cause an exception. Otherwise, the comparison is performed according
  6460. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6461. *----------------------------------------------------------------------------*}
  6462. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6463. var
  6464. aSign, bSign: flag;
  6465. begin
  6466. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6467. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6468. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6469. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6470. ) begin
  6471. if ( floatx80_is_signaling_nan( a )
  6472. or floatx80_is_signaling_nan( b ) ) begin
  6473. float_raise( float_flag_invalid );
  6474. end;
  6475. result := 0;
  6476. end;
  6477. aSign := extractFloatx80Sign( a );
  6478. bSign := extractFloatx80Sign( b );
  6479. if ( aSign <> bSign ) begin
  6480. result :=
  6481. aSign
  6482. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6483. = 0 );
  6484. end;
  6485. result :=
  6486. aSign ? le128( b.high, b.low, a.high, a.low )
  6487. : le128( a.high, a.low, b.high, b.low );
  6488. end;
  6489. {*----------------------------------------------------------------------------
  6490. | Returns 1 if the extended double-precision floating-point value `a' is less
  6491. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6492. | an exception. Otherwise, the comparison is performed according to the
  6493. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6494. *----------------------------------------------------------------------------*}
  6495. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6496. var
  6497. aSign, bSign: flag;
  6498. begin
  6499. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6500. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6501. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6502. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6503. ) begin
  6504. if ( floatx80_is_signaling_nan( a )
  6505. or floatx80_is_signaling_nan( b ) ) begin
  6506. float_raise( float_flag_invalid );
  6507. end;
  6508. result := 0;
  6509. end;
  6510. aSign := extractFloatx80Sign( a );
  6511. bSign := extractFloatx80Sign( b );
  6512. if ( aSign <> bSign ) begin
  6513. result :=
  6514. aSign
  6515. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6516. <> 0 );
  6517. end;
  6518. result :=
  6519. aSign ? lt128( b.high, b.low, a.high, a.low )
  6520. : lt128( a.high, a.low, b.high, b.low );
  6521. end;
  6522. {$endif FPC_SOFTFLOAT_FLOATX80}
  6523. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6524. {*----------------------------------------------------------------------------
  6525. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6526. | floating-point value `a'.
  6527. *----------------------------------------------------------------------------*}
  6528. function extractFloat128Frac1(a : float128): bits64;
  6529. begin
  6530. result:=a.low;
  6531. end;
  6532. {*----------------------------------------------------------------------------
  6533. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6534. | floating-point value `a'.
  6535. *----------------------------------------------------------------------------*}
  6536. function extractFloat128Frac0(a : float128): bits64;
  6537. begin
  6538. result:=a.high and int64($0000FFFFFFFFFFFF);
  6539. end;
  6540. {*----------------------------------------------------------------------------
  6541. | Returns the exponent bits of the quadruple-precision floating-point value
  6542. | `a'.
  6543. *----------------------------------------------------------------------------*}
  6544. function extractFloat128Exp(a : float128): int32;
  6545. begin
  6546. result:=( a.high shr 48 ) and $7FFF;
  6547. end;
  6548. {*----------------------------------------------------------------------------
  6549. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6550. *----------------------------------------------------------------------------*}
  6551. function extractFloat128Sign(a : float128): flag;
  6552. begin
  6553. result:=a.high shr 63;
  6554. end;
  6555. {*----------------------------------------------------------------------------
  6556. | Normalizes the subnormal quadruple-precision floating-point value
  6557. | represented by the denormalized significand formed by the concatenation of
  6558. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6559. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6560. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6561. | least significant 64 bits of the normalized significand are stored at the
  6562. | location pointed to by `zSig1Ptr'.
  6563. *----------------------------------------------------------------------------*}
  6564. procedure normalizeFloat128Subnormal(
  6565. aSig0: bits64;
  6566. aSig1: bits64;
  6567. var zExpPtr: int32;
  6568. var zSig0Ptr: bits64;
  6569. var zSig1Ptr: bits64);
  6570. var
  6571. shiftCount: int8;
  6572. begin
  6573. if ( aSig0 = 0 ) then
  6574. begin
  6575. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6576. if ( shiftCount < 0 ) then
  6577. begin
  6578. zSig0Ptr := aSig1 shr ( - shiftCount );
  6579. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6580. end
  6581. else begin
  6582. zSig0Ptr := aSig1 shl shiftCount;
  6583. zSig1Ptr := 0;
  6584. end;
  6585. zExpPtr := - shiftCount - 63;
  6586. end
  6587. else begin
  6588. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6589. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6590. zExpPtr := 1 - shiftCount;
  6591. end;
  6592. end;
  6593. {*----------------------------------------------------------------------------
  6594. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6595. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6596. | floating-point value, returning the result. After being shifted into the
  6597. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6598. | added together to form the most significant 32 bits of the result. This
  6599. | means that any integer portion of `zSig0' will be added into the exponent.
  6600. | Since a properly normalized significand will have an integer portion equal
  6601. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6602. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6603. | significand.
  6604. *----------------------------------------------------------------------------*}
  6605. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6606. var
  6607. z: float128;
  6608. begin
  6609. z.low := zSig1;
  6610. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6611. result:=z;
  6612. end;
  6613. {*----------------------------------------------------------------------------
  6614. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6615. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6616. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6617. | corresponding to the abstract input. Ordinarily, the abstract value is
  6618. | simply rounded and packed into the quadruple-precision format, with the
  6619. | inexact exception raised if the abstract input cannot be represented
  6620. | exactly. However, if the abstract value is too large, the overflow and
  6621. | inexact exceptions are raised and an infinity or maximal finite value is
  6622. | returned. If the abstract value is too small, the input value is rounded to
  6623. | a subnormal number, and the underflow and inexact exceptions are raised if
  6624. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6625. | precision floating-point number.
  6626. | The input significand must be normalized or smaller. If the input
  6627. | significand is not normalized, `zExp' must be 0; in that case, the result
  6628. | returned is a subnormal number, and it must not require rounding. In the
  6629. | usual case that the input significand is normalized, `zExp' must be 1 less
  6630. | than the ``true'' floating-point exponent. The handling of underflow and
  6631. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6632. *----------------------------------------------------------------------------*}
  6633. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6634. var
  6635. roundingMode: int8;
  6636. roundNearestEven, increment, isTiny: flag;
  6637. begin
  6638. roundingMode := softfloat_rounding_mode;
  6639. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6640. increment := ord( sbits64(zSig2) < 0 );
  6641. if ( roundNearestEven=0 ) then
  6642. begin
  6643. if ( roundingMode = float_round_to_zero ) then
  6644. begin
  6645. increment := 0;
  6646. end
  6647. else begin
  6648. if ( zSign<>0 ) then
  6649. begin
  6650. increment := ord( roundingMode = float_round_down ) and zSig2;
  6651. end
  6652. else begin
  6653. increment := ord( roundingMode = float_round_up ) and zSig2;
  6654. end;
  6655. end;
  6656. end;
  6657. if ( $7FFD <= bits32(zExp) ) then
  6658. begin
  6659. if ( ord( $7FFD < zExp )
  6660. or ( ord( zExp = $7FFD )
  6661. and eq128(
  6662. int64( $0001FFFFFFFFFFFF ),
  6663. int64( $FFFFFFFFFFFFFFFF ),
  6664. zSig0,
  6665. zSig1
  6666. )
  6667. and increment
  6668. )
  6669. )<>0 then
  6670. begin
  6671. float_raise( float_flag_overflow or float_flag_inexact );
  6672. if ( ord( roundingMode = float_round_to_zero )
  6673. or ( zSign and ord( roundingMode = float_round_up ) )
  6674. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6675. )<>0 then
  6676. begin
  6677. result :=
  6678. packFloat128(
  6679. zSign,
  6680. $7FFE,
  6681. int64( $0000FFFFFFFFFFFF ),
  6682. int64( $FFFFFFFFFFFFFFFF )
  6683. );
  6684. end;
  6685. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6686. end;
  6687. if ( zExp < 0 ) then
  6688. begin
  6689. isTiny :=
  6690. ord(( float_detect_tininess = float_tininess_before_rounding )
  6691. or ( zExp < -1 )
  6692. or not( increment<>0 )
  6693. or boolean(lt128(
  6694. zSig0,
  6695. zSig1,
  6696. int64( $0001FFFFFFFFFFFF ),
  6697. int64( $FFFFFFFFFFFFFFFF )
  6698. )));
  6699. shift128ExtraRightJamming(
  6700. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6701. zExp := 0;
  6702. if ( isTiny and zSig2 )<>0 then
  6703. float_raise( float_flag_underflow );
  6704. if ( roundNearestEven<>0 ) then
  6705. begin
  6706. increment := ord( sbits64(zSig2) < 0 );
  6707. end
  6708. else begin
  6709. if ( zSign<>0 ) then
  6710. begin
  6711. increment := ord( roundingMode = float_round_down ) and zSig2;
  6712. end
  6713. else begin
  6714. increment := ord( roundingMode = float_round_up ) and zSig2;
  6715. end;
  6716. end;
  6717. end;
  6718. end;
  6719. if ( zSig2<>0 ) then
  6720. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6721. if ( increment<>0 ) then
  6722. begin
  6723. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6724. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6725. end
  6726. else begin
  6727. if ( ( zSig0 or zSig1 ) = 0 ) then
  6728. zExp := 0;
  6729. end;
  6730. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6731. end;
  6732. {*----------------------------------------------------------------------------
  6733. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6734. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6735. | returns the proper quadruple-precision floating-point value corresponding
  6736. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6737. | except that the input significand has fewer bits and does not have to be
  6738. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6739. | point exponent.
  6740. *----------------------------------------------------------------------------*}
  6741. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6742. var
  6743. shiftCount: int8;
  6744. zSig2: bits64;
  6745. begin
  6746. if ( zSig0 = 0 ) then
  6747. begin
  6748. zSig0 := zSig1;
  6749. zSig1 := 0;
  6750. dec(zExp, 64);
  6751. end;
  6752. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6753. if ( 0 <= shiftCount ) then
  6754. begin
  6755. zSig2 := 0;
  6756. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6757. end
  6758. else begin
  6759. shift128ExtraRightJamming(
  6760. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6761. end;
  6762. dec(zExp, shiftCount);
  6763. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6764. end;
  6765. {*----------------------------------------------------------------------------
  6766. | Returns the result of converting the quadruple-precision floating-point
  6767. | value `a' to the 32-bit two's complement integer format. The conversion
  6768. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6769. | Arithmetic---which means in particular that the conversion is rounded
  6770. | according to the current rounding mode. If `a' is a NaN, the largest
  6771. | positive integer is returned. Otherwise, if the conversion overflows, the
  6772. | largest integer with the same sign as `a' is returned.
  6773. *----------------------------------------------------------------------------*}
  6774. function float128_to_int32(a: float128): int32;
  6775. var
  6776. aSign: flag;
  6777. aExp, shiftCount: int32;
  6778. aSig0, aSig1: bits64;
  6779. begin
  6780. aSig1 := extractFloat128Frac1( a );
  6781. aSig0 := extractFloat128Frac0( a );
  6782. aExp := extractFloat128Exp( a );
  6783. aSign := extractFloat128Sign( a );
  6784. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6785. aSign := 0;
  6786. if ( aExp<>0 ) then
  6787. aSig0 := aSig0 or int64( $0001000000000000 );
  6788. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6789. shiftCount := $4028 - aExp;
  6790. if ( 0 < shiftCount ) then
  6791. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6792. result := roundAndPackInt32( aSign, aSig0 );
  6793. end;
  6794. {*----------------------------------------------------------------------------
  6795. | Returns the result of converting the quadruple-precision floating-point
  6796. | value `a' to the 32-bit two's complement integer format. The conversion
  6797. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6798. | Arithmetic, except that the conversion is always rounded toward zero. If
  6799. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6800. | conversion overflows, the largest integer with the same sign as `a' is
  6801. | returned.
  6802. *----------------------------------------------------------------------------*}
  6803. function float128_to_int32_round_to_zero(a: float128): int32;
  6804. var
  6805. aSign: flag;
  6806. aExp, shiftCount: int32;
  6807. aSig0, aSig1, savedASig: bits64;
  6808. z: int32;
  6809. label
  6810. invalid;
  6811. begin
  6812. aSig1 := extractFloat128Frac1( a );
  6813. aSig0 := extractFloat128Frac0( a );
  6814. aExp := extractFloat128Exp( a );
  6815. aSign := extractFloat128Sign( a );
  6816. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6817. if ( $401E < aExp ) then
  6818. begin
  6819. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6820. aSign := 0;
  6821. goto invalid;
  6822. end
  6823. else if ( aExp < $3FFF ) then
  6824. begin
  6825. if ( aExp or aSig0 )<>0 then
  6826. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6827. result := 0;
  6828. exit;
  6829. end;
  6830. aSig0 := aSig0 or int64( $0001000000000000 );
  6831. shiftCount := $402F - aExp;
  6832. savedASig := aSig0;
  6833. aSig0 := aSig0 shr shiftCount;
  6834. z := aSig0;
  6835. if ( aSign )<>0 then
  6836. z := - z;
  6837. if ( ord( z < 0 ) xor aSign )<>0 then
  6838. begin
  6839. invalid:
  6840. float_raise( float_flag_invalid );
  6841. if aSign<>0 then
  6842. result:=$80000000
  6843. else
  6844. result:=$7FFFFFFF;
  6845. exit;
  6846. end;
  6847. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6848. begin
  6849. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6850. end;
  6851. result := z;
  6852. end;
  6853. {*----------------------------------------------------------------------------
  6854. | Returns the result of converting the quadruple-precision floating-point
  6855. | value `a' to the 64-bit two's complement integer format. The conversion
  6856. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6857. | Arithmetic---which means in particular that the conversion is rounded
  6858. | according to the current rounding mode. If `a' is a NaN, the largest
  6859. | positive integer is returned. Otherwise, if the conversion overflows, the
  6860. | largest integer with the same sign as `a' is returned.
  6861. *----------------------------------------------------------------------------*}
  6862. function float128_to_int64(a: float128): int64;
  6863. var
  6864. aSign: flag;
  6865. aExp, shiftCount: int32;
  6866. aSig0, aSig1: bits64;
  6867. begin
  6868. aSig1 := extractFloat128Frac1( a );
  6869. aSig0 := extractFloat128Frac0( a );
  6870. aExp := extractFloat128Exp( a );
  6871. aSign := extractFloat128Sign( a );
  6872. if ( aExp<>0 ) then
  6873. aSig0 := aSig0 or int64( $0001000000000000 );
  6874. shiftCount := $402F - aExp;
  6875. if ( shiftCount <= 0 ) then
  6876. begin
  6877. if ( $403E < aExp ) then
  6878. begin
  6879. float_raise( float_flag_invalid );
  6880. if ( (aSign=0)
  6881. or ( ( aExp = $7FFF )
  6882. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6883. )
  6884. ) then
  6885. begin
  6886. result := int64( $7FFFFFFFFFFFFFFF );
  6887. end;
  6888. result := int64( $8000000000000000 );
  6889. end;
  6890. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6891. end
  6892. else begin
  6893. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6894. end;
  6895. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6896. end;
  6897. {*----------------------------------------------------------------------------
  6898. | Returns the result of converting the quadruple-precision floating-point
  6899. | value `a' to the 64-bit two's complement integer format. The conversion
  6900. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6901. | Arithmetic, except that the conversion is always rounded toward zero.
  6902. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6903. | the conversion overflows, the largest integer with the same sign as `a' is
  6904. | returned.
  6905. *----------------------------------------------------------------------------*}
  6906. function float128_to_int64_round_to_zero(a: float128): int64;
  6907. var
  6908. aSign: flag;
  6909. aExp, shiftCount: int32;
  6910. aSig0, aSig1: bits64;
  6911. z: int64;
  6912. begin
  6913. aSig1 := extractFloat128Frac1( a );
  6914. aSig0 := extractFloat128Frac0( a );
  6915. aExp := extractFloat128Exp( a );
  6916. aSign := extractFloat128Sign( a );
  6917. if ( aExp<>0 ) then
  6918. aSig0 := aSig0 or int64( $0001000000000000 );
  6919. shiftCount := aExp - $402F;
  6920. if ( 0 < shiftCount ) then
  6921. begin
  6922. if ( $403E <= aExp ) then
  6923. begin
  6924. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6925. if ( ( a.high = int64( $C03E000000000000 ) )
  6926. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6927. begin
  6928. if ( aSig1<>0 ) then
  6929. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6930. end
  6931. else begin
  6932. float_raise( float_flag_invalid );
  6933. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6934. begin
  6935. result := int64( $7FFFFFFFFFFFFFFF );
  6936. exit;
  6937. end;
  6938. end;
  6939. result := int64( $8000000000000000 );
  6940. exit;
  6941. end;
  6942. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6943. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6944. begin
  6945. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6946. end;
  6947. end
  6948. else begin
  6949. if ( aExp < $3FFF ) then
  6950. begin
  6951. if ( aExp or aSig0 or aSig1 )<>0 then
  6952. begin
  6953. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6954. end;
  6955. result := 0;
  6956. exit;
  6957. end;
  6958. z := aSig0 shr ( - shiftCount );
  6959. if ( (aSig1<>0)
  6960. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6961. begin
  6962. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6963. end;
  6964. end;
  6965. if ( aSign<>0 ) then
  6966. z := - z;
  6967. result := z;
  6968. end;
  6969. {*----------------------------------------------------------------------------
  6970. | Returns the result of converting the quadruple-precision floating-point
  6971. | value `a' to the single-precision floating-point format. The conversion
  6972. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6973. | Arithmetic.
  6974. *----------------------------------------------------------------------------*}
  6975. function float128_to_float32(a: float128): float32;
  6976. var
  6977. aSign: flag;
  6978. aExp: int32;
  6979. aSig0, aSig1: bits64;
  6980. zSig: bits32;
  6981. begin
  6982. aSig1 := extractFloat128Frac1( a );
  6983. aSig0 := extractFloat128Frac0( a );
  6984. aExp := extractFloat128Exp( a );
  6985. aSign := extractFloat128Sign( a );
  6986. if ( aExp = $7FFF ) then
  6987. begin
  6988. if ( aSig0 or aSig1 )<>0 then
  6989. begin
  6990. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6991. exit;
  6992. end;
  6993. result := packFloat32( aSign, $FF, 0 );
  6994. exit;
  6995. end;
  6996. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6997. shift64RightJamming( aSig0, 18, aSig0 );
  6998. zSig := aSig0;
  6999. if ( aExp or zSig )<>0 then
  7000. begin
  7001. zSig := zSig or $40000000;
  7002. dec(aExp,$3F81);
  7003. end;
  7004. result := roundAndPackFloat32( aSign, aExp, zSig );
  7005. end;
  7006. {*----------------------------------------------------------------------------
  7007. | Returns the result of converting the quadruple-precision floating-point
  7008. | value `a' to the double-precision floating-point format. The conversion
  7009. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7010. | Arithmetic.
  7011. *----------------------------------------------------------------------------*}
  7012. function float128_to_float64(a: float128): float64;
  7013. var
  7014. aSign: flag;
  7015. aExp: int32;
  7016. aSig0, aSig1: bits64;
  7017. begin
  7018. aSig1 := extractFloat128Frac1( a );
  7019. aSig0 := extractFloat128Frac0( a );
  7020. aExp := extractFloat128Exp( a );
  7021. aSign := extractFloat128Sign( a );
  7022. if ( aExp = $7FFF ) then
  7023. begin
  7024. if ( aSig0 or aSig1 )<>0 then
  7025. begin
  7026. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7027. exit;
  7028. end;
  7029. result:=packFloat64( aSign, $7FF, 0);
  7030. exit;
  7031. end;
  7032. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7033. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7034. if ( aExp or aSig0 )<>0 then
  7035. begin
  7036. aSig0 := aSig0 or int64( $4000000000000000 );
  7037. dec(aExp,$3C01);
  7038. end;
  7039. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7040. end;
  7041. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7042. {*----------------------------------------------------------------------------
  7043. | Returns the result of converting the quadruple-precision floating-point
  7044. | value `a' to the extended double-precision floating-point format. The
  7045. | conversion is performed according to the IEC/IEEE Standard for Binary
  7046. | Floating-Point Arithmetic.
  7047. *----------------------------------------------------------------------------*}
  7048. function float128_to_floatx80(a: float128): floatx80;
  7049. var
  7050. aSign: flag;
  7051. aExp: int32;
  7052. aSig0, aSig1: bits64;
  7053. begin
  7054. aSig1 := extractFloat128Frac1( a );
  7055. aSig0 := extractFloat128Frac0( a );
  7056. aExp := extractFloat128Exp( a );
  7057. aSign := extractFloat128Sign( a );
  7058. if ( aExp = $7FFF ) begin
  7059. if ( aSig0 or aSig1 ) begin
  7060. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7061. exit;
  7062. end;
  7063. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  7064. exit;
  7065. end;
  7066. if ( aExp = 0 ) begin
  7067. if ( ( aSig0 or aSig1 ) = 0 ) then
  7068. begin
  7069. result := packFloatx80( aSign, 0, 0 );
  7070. exit;
  7071. end;
  7072. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7073. end;
  7074. else begin
  7075. aSig0 or= int64( $0001000000000000 );
  7076. end;
  7077. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7078. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7079. end;
  7080. {$endif FPC_SOFTFLOAT_FLOATX80}
  7081. {*----------------------------------------------------------------------------
  7082. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7083. | Returns the result as a quadruple-precision floating-point value. The
  7084. | operation is performed according to the IEC/IEEE Standard for Binary
  7085. | Floating-Point Arithmetic.
  7086. *----------------------------------------------------------------------------*}
  7087. function float128_round_to_int(a: float128): float128;
  7088. var
  7089. aSign: flag;
  7090. aExp: int32;
  7091. lastBitMask, roundBitsMask: bits64;
  7092. roundingMode: int8;
  7093. z: float128;
  7094. begin
  7095. aExp := extractFloat128Exp( a );
  7096. if ( $402F <= aExp ) then
  7097. begin
  7098. if ( $406F <= aExp ) then
  7099. begin
  7100. if ( ( aExp = $7FFF )
  7101. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7102. ) then
  7103. begin
  7104. result := propagateFloat128NaN( a, a );
  7105. exit;
  7106. end;
  7107. result := a;
  7108. exit;
  7109. end;
  7110. lastBitMask := 1;
  7111. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7112. roundBitsMask := lastBitMask - 1;
  7113. z := a;
  7114. roundingMode := softfloat_rounding_mode;
  7115. if ( roundingMode = float_round_nearest_even ) then
  7116. begin
  7117. if ( lastBitMask )<>0 then
  7118. begin
  7119. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7120. if ( ( z.low and roundBitsMask ) = 0 ) then
  7121. z.low := z.low and not(lastBitMask);
  7122. end
  7123. else begin
  7124. if ( sbits64(z.low) < 0 ) then
  7125. begin
  7126. inc(z.high);
  7127. if ( bits64( z.low shl 1 ) = 0 ) then
  7128. z.high := z.high and not(1);
  7129. end;
  7130. end;
  7131. end
  7132. else if ( roundingMode <> float_round_to_zero ) then
  7133. begin
  7134. if ( extractFloat128Sign( z )
  7135. xor ord( roundingMode = float_round_up ) )<>0 then
  7136. begin
  7137. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7138. end;
  7139. end;
  7140. z.low := z.low and not(roundBitsMask);
  7141. end
  7142. else begin
  7143. if ( aExp < $3FFF ) then
  7144. begin
  7145. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7146. begin
  7147. result := a;
  7148. exit;
  7149. end;
  7150. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7151. aSign := extractFloat128Sign( a );
  7152. case softfloat_rounding_mode of
  7153. float_round_nearest_even:
  7154. if ( ( aExp = $3FFE )
  7155. and ( (extractFloat128Frac0( a )<>0)
  7156. or (extractFloat128Frac1( a )<>0) )
  7157. ) then begin
  7158. begin
  7159. result := packFloat128( aSign, $3FFF, 0, 0 );
  7160. exit;
  7161. end;
  7162. end;
  7163. float_round_down:
  7164. begin
  7165. if aSign<>0 then
  7166. result:=packFloat128( 1, $3FFF, 0, 0 )
  7167. else
  7168. result:=packFloat128( 0, 0, 0, 0 );
  7169. exit;
  7170. end;
  7171. float_round_up:
  7172. begin
  7173. if aSign<>0 then
  7174. result := packFloat128( 1, 0, 0, 0 )
  7175. else
  7176. result:=packFloat128( 0, $3FFF, 0, 0 );
  7177. exit;
  7178. end;
  7179. end;
  7180. result := packFloat128( aSign, 0, 0, 0 );
  7181. exit;
  7182. end;
  7183. lastBitMask := 1;
  7184. lastBitMask := lastBitMask shl ($402F - aExp);
  7185. roundBitsMask := lastBitMask - 1;
  7186. z.low := 0;
  7187. z.high := a.high;
  7188. roundingMode := softfloat_rounding_mode;
  7189. if ( roundingMode = float_round_nearest_even ) then begin
  7190. inc(z.high,lastBitMask shr 1);
  7191. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7192. z.high := z.high and not(lastBitMask);
  7193. end;
  7194. end
  7195. else if ( roundingMode <> float_round_to_zero ) then begin
  7196. if ( (extractFloat128Sign( z )<>0)
  7197. xor ( roundingMode = float_round_up ) ) then begin
  7198. z.high := z.high or ord( a.low <> 0 );
  7199. z.high := z.high+roundBitsMask;
  7200. end;
  7201. end;
  7202. z.high := z.high and not(roundBitsMask);
  7203. end;
  7204. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7205. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7206. end;
  7207. result := z;
  7208. end;
  7209. {*----------------------------------------------------------------------------
  7210. | Returns the result of adding the absolute values of the quadruple-precision
  7211. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7212. | before being returned. `zSign' is ignored if the result is a NaN.
  7213. | The addition is performed according to the IEC/IEEE Standard for Binary
  7214. | Floating-Point Arithmetic.
  7215. *----------------------------------------------------------------------------*}
  7216. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7217. var
  7218. aExp, bExp, zExp: int32;
  7219. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7220. expDiff: int32;
  7221. label
  7222. shiftRight1,roundAndPack;
  7223. begin
  7224. aSig1 := extractFloat128Frac1( a );
  7225. aSig0 := extractFloat128Frac0( a );
  7226. aExp := extractFloat128Exp( a );
  7227. bSig1 := extractFloat128Frac1( b );
  7228. bSig0 := extractFloat128Frac0( b );
  7229. bExp := extractFloat128Exp( b );
  7230. expDiff := aExp - bExp;
  7231. if ( 0 < expDiff ) then begin
  7232. if ( aExp = $7FFF ) then begin
  7233. if ( aSig0 or aSig1 )<>0 then
  7234. begin
  7235. result := propagateFloat128NaN( a, b );
  7236. exit;
  7237. end;
  7238. result := a;
  7239. exit;
  7240. end;
  7241. if ( bExp = 0 ) then begin
  7242. dec(expDiff);
  7243. end
  7244. else begin
  7245. bSig0 := bSig0 or int64( $0001000000000000 );
  7246. end;
  7247. shift128ExtraRightJamming(
  7248. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7249. zExp := aExp;
  7250. end
  7251. else if ( expDiff < 0 ) then begin
  7252. if ( bExp = $7FFF ) then begin
  7253. if ( bSig0 or bSig1 )<>0 then
  7254. begin
  7255. result := propagateFloat128NaN( a, b );
  7256. exit;
  7257. end;
  7258. result := packFloat128( zSign, $7FFF, 0, 0 );
  7259. exit;
  7260. end;
  7261. if ( aExp = 0 ) then begin
  7262. inc(expDiff);
  7263. end
  7264. else begin
  7265. aSig0 := aSig0 or int64( $0001000000000000 );
  7266. end;
  7267. shift128ExtraRightJamming(
  7268. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7269. zExp := bExp;
  7270. end
  7271. else begin
  7272. if ( aExp = $7FFF ) then begin
  7273. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7274. result := propagateFloat128NaN( a, b );
  7275. exit;
  7276. end;
  7277. result := a;
  7278. exit;
  7279. end;
  7280. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7281. if ( aExp = 0 ) then
  7282. begin
  7283. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7284. exit;
  7285. end;
  7286. zSig2 := 0;
  7287. zSig0 := zSig0 or int64( $0002000000000000 );
  7288. zExp := aExp;
  7289. goto shiftRight1;
  7290. end;
  7291. aSig0 := aSig0 or int64( $0001000000000000 );
  7292. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7293. dec(zExp);
  7294. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7295. inc(zExp);
  7296. shiftRight1:
  7297. shift128ExtraRightJamming(
  7298. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7299. roundAndPack:
  7300. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7301. end;
  7302. {*----------------------------------------------------------------------------
  7303. | Returns the result of subtracting the absolute values of the quadruple-
  7304. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7305. | difference is negated before being returned. `zSign' is ignored if the
  7306. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7307. | Standard for Binary Floating-Point Arithmetic.
  7308. *----------------------------------------------------------------------------*}
  7309. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7310. var
  7311. aExp, bExp, zExp: int32;
  7312. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7313. expDiff: int32;
  7314. z: float128;
  7315. label
  7316. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7317. begin
  7318. aSig1 := extractFloat128Frac1( a );
  7319. aSig0 := extractFloat128Frac0( a );
  7320. aExp := extractFloat128Exp( a );
  7321. bSig1 := extractFloat128Frac1( b );
  7322. bSig0 := extractFloat128Frac0( b );
  7323. bExp := extractFloat128Exp( b );
  7324. expDiff := aExp - bExp;
  7325. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7326. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  7327. if ( 0 < expDiff ) then goto aExpBigger;
  7328. if ( expDiff < 0 ) then goto bExpBigger;
  7329. if ( aExp = $7FFF ) then begin
  7330. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7331. result := propagateFloat128NaN( a, b );
  7332. exit;
  7333. end;
  7334. float_raise( float_flag_invalid );
  7335. z.low := float128_default_nan_low;
  7336. z.high := float128_default_nan_high;
  7337. result := z;
  7338. exit;
  7339. end;
  7340. if ( aExp = 0 ) then begin
  7341. aExp := 1;
  7342. bExp := 1;
  7343. end;
  7344. if ( bSig0 < aSig0 ) then goto aBigger;
  7345. if ( aSig0 < bSig0 ) then goto bBigger;
  7346. if ( bSig1 < aSig1 ) then goto aBigger;
  7347. if ( aSig1 < bSig1 ) then goto bBigger;
  7348. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  7349. exit;
  7350. bExpBigger:
  7351. if ( bExp = $7FFF ) then begin
  7352. if ( bSig0 or bSig1 )<>0 then
  7353. begin
  7354. result := propagateFloat128NaN( a, b );
  7355. exit;
  7356. end;
  7357. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  7358. exit;
  7359. end;
  7360. if ( aExp = 0 ) then begin
  7361. inc(expDiff);
  7362. end
  7363. else begin
  7364. aSig0 := aSig0 or int64( $4000000000000000 );
  7365. end;
  7366. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7367. bSig0 := bSig0 or int64( $4000000000000000 );
  7368. bBigger:
  7369. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  7370. zExp := bExp;
  7371. zSign := zSign xor 1;
  7372. goto normalizeRoundAndPack;
  7373. aExpBigger:
  7374. if ( aExp = $7FFF ) then begin
  7375. if ( aSig0 or aSig1 )<>0 then
  7376. begin
  7377. result := propagateFloat128NaN( a, b );
  7378. exit;
  7379. end;
  7380. result := a;
  7381. exit;
  7382. end;
  7383. if ( bExp = 0 ) then begin
  7384. dec(expDiff);
  7385. end
  7386. else begin
  7387. bSig0 := bSig0 or int64( $4000000000000000 );
  7388. end;
  7389. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  7390. aSig0 := aSig0 or int64( $4000000000000000 );
  7391. aBigger:
  7392. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7393. zExp := aExp;
  7394. normalizeRoundAndPack:
  7395. dec(zExp);
  7396. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7397. end;
  7398. {*----------------------------------------------------------------------------
  7399. | Returns the result of adding the quadruple-precision floating-point values
  7400. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7401. | for Binary Floating-Point Arithmetic.
  7402. *----------------------------------------------------------------------------*}
  7403. function float128_add(a: float128; b: float128): float128;
  7404. var
  7405. aSign, bSign: flag;
  7406. begin
  7407. aSign := extractFloat128Sign( a );
  7408. bSign := extractFloat128Sign( b );
  7409. if ( aSign = bSign ) then begin
  7410. result := addFloat128Sigs( a, b, aSign );
  7411. end
  7412. else begin
  7413. result := subFloat128Sigs( a, b, aSign );
  7414. end;
  7415. end;
  7416. {*----------------------------------------------------------------------------
  7417. | Returns the result of subtracting the quadruple-precision floating-point
  7418. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7419. | Standard for Binary Floating-Point Arithmetic.
  7420. *----------------------------------------------------------------------------*}
  7421. function float128_sub(a: float128; b: float128): float128;
  7422. var
  7423. aSign, bSign: flag;
  7424. begin
  7425. aSign := extractFloat128Sign( a );
  7426. bSign := extractFloat128Sign( b );
  7427. if ( aSign = bSign ) then begin
  7428. result := subFloat128Sigs( a, b, aSign );
  7429. end
  7430. else begin
  7431. result := addFloat128Sigs( a, b, aSign );
  7432. end;
  7433. end;
  7434. {*----------------------------------------------------------------------------
  7435. | Returns the result of multiplying the quadruple-precision floating-point
  7436. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7437. | Standard for Binary Floating-Point Arithmetic.
  7438. *----------------------------------------------------------------------------*}
  7439. function float128_mul(a: float128; b: float128): float128;
  7440. var
  7441. aSign, bSign, zSign: flag;
  7442. aExp, bExp, zExp: int32;
  7443. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7444. z: float128;
  7445. label
  7446. invalid;
  7447. begin
  7448. aSig1 := extractFloat128Frac1( a );
  7449. aSig0 := extractFloat128Frac0( a );
  7450. aExp := extractFloat128Exp( a );
  7451. aSign := extractFloat128Sign( a );
  7452. bSig1 := extractFloat128Frac1( b );
  7453. bSig0 := extractFloat128Frac0( b );
  7454. bExp := extractFloat128Exp( b );
  7455. bSign := extractFloat128Sign( b );
  7456. zSign := aSign xor bSign;
  7457. if ( aExp = $7FFF ) then begin
  7458. if ( (( aSig0 or aSig1 )<>0)
  7459. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7460. result := propagateFloat128NaN( a, b );
  7461. exit;
  7462. end;
  7463. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  7464. result := packFloat128( zSign, $7FFF, 0, 0 );
  7465. exit;
  7466. end;
  7467. if ( bExp = $7FFF ) then begin
  7468. if ( bSig0 or bSig1 )<>0 then
  7469. begin
  7470. result := propagateFloat128NaN( a, b );
  7471. exit;
  7472. end;
  7473. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7474. invalid:
  7475. float_raise( float_flag_invalid );
  7476. z.low := float128_default_nan_low;
  7477. z.high := float128_default_nan_high;
  7478. result := z;
  7479. exit;
  7480. end;
  7481. result := packFloat128( zSign, $7FFF, 0, 0 );
  7482. exit;
  7483. end;
  7484. if ( aExp = 0 ) then begin
  7485. if ( ( aSig0 or aSig1 ) = 0 ) then
  7486. begin
  7487. result := packFloat128( zSign, 0, 0, 0 );
  7488. exit;
  7489. end;
  7490. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7491. end;
  7492. if ( bExp = 0 ) then begin
  7493. if ( ( bSig0 or bSig1 ) = 0 ) then
  7494. begin
  7495. result := packFloat128( zSign, 0, 0, 0 );
  7496. exit;
  7497. end;
  7498. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7499. end;
  7500. zExp := aExp + bExp - $4000;
  7501. aSig0 := aSig0 or int64( $0001000000000000 );
  7502. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  7503. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  7504. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  7505. zSig2 := zSig2 or ord( zSig3 <> 0 );
  7506. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  7507. shift128ExtraRightJamming(
  7508. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7509. inc(zExp);
  7510. end;
  7511. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7512. end;
  7513. {*----------------------------------------------------------------------------
  7514. | Returns the result of dividing the quadruple-precision floating-point value
  7515. | `a' by the corresponding value `b'. The operation is performed according to
  7516. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7517. *----------------------------------------------------------------------------*}
  7518. function float128_div(a: float128; b: float128): float128;
  7519. var
  7520. aSign, bSign, zSign: flag;
  7521. aExp, bExp, zExp: int32;
  7522. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7523. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7524. z: float128;
  7525. label
  7526. invalid;
  7527. begin
  7528. aSig1 := extractFloat128Frac1( a );
  7529. aSig0 := extractFloat128Frac0( a );
  7530. aExp := extractFloat128Exp( a );
  7531. aSign := extractFloat128Sign( a );
  7532. bSig1 := extractFloat128Frac1( b );
  7533. bSig0 := extractFloat128Frac0( b );
  7534. bExp := extractFloat128Exp( b );
  7535. bSign := extractFloat128Sign( b );
  7536. zSign := aSign xor bSign;
  7537. if ( aExp = $7FFF ) then begin
  7538. if ( aSig0 or aSig1 )<>0 then
  7539. begin
  7540. result := propagateFloat128NaN( a, b );
  7541. exit;
  7542. end;
  7543. if ( bExp = $7FFF ) then begin
  7544. if ( bSig0 or bSig1 )<>0 then
  7545. begin
  7546. result := propagateFloat128NaN( a, b );
  7547. exit;
  7548. end;
  7549. goto invalid;
  7550. end;
  7551. result := packFloat128( zSign, $7FFF, 0, 0 );
  7552. exit;
  7553. end;
  7554. if ( bExp = $7FFF ) then begin
  7555. if ( bSig0 or bSig1 )<>0 then
  7556. begin
  7557. result := propagateFloat128NaN( a, b );
  7558. exit;
  7559. end;
  7560. result := packFloat128( zSign, 0, 0, 0 );
  7561. exit;
  7562. end;
  7563. if ( bExp = 0 ) then begin
  7564. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7565. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7566. invalid:
  7567. float_raise( float_flag_invalid );
  7568. z.low := float128_default_nan_low;
  7569. z.high := float128_default_nan_high;
  7570. result := z;
  7571. exit;
  7572. end;
  7573. float_raise( float_flag_divbyzero );
  7574. result := packFloat128( zSign, $7FFF, 0, 0 );
  7575. exit;
  7576. end;
  7577. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7578. end;
  7579. if ( aExp = 0 ) then begin
  7580. if ( ( aSig0 or aSig1 ) = 0 ) then
  7581. begin
  7582. result := packFloat128( zSign, 0, 0, 0 );
  7583. exit;
  7584. end;
  7585. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7586. end;
  7587. zExp := aExp - bExp + $3FFD;
  7588. shortShift128Left(
  7589. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  7590. shortShift128Left(
  7591. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7592. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  7593. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  7594. inc(zExp);
  7595. end;
  7596. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7597. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  7598. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  7599. while ( sbits64(rem0) < 0 ) do begin
  7600. dec(zSig0);
  7601. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  7602. end;
  7603. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7604. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  7605. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  7606. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  7607. while ( sbits64(rem1) < 0 ) do begin
  7608. dec(zSig1);
  7609. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  7610. end;
  7611. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7612. end;
  7613. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  7614. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7615. end;
  7616. {*----------------------------------------------------------------------------
  7617. | Returns the remainder of the quadruple-precision floating-point value `a'
  7618. | with respect to the corresponding value `b'. The operation is performed
  7619. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7620. *----------------------------------------------------------------------------*}
  7621. function float128_rem(a: float128; b: float128): float128;
  7622. var
  7623. aSign, bSign, zSign: flag;
  7624. aExp, bExp, expDiff: int32;
  7625. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7626. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7627. sigMean0: sbits64;
  7628. z: float128;
  7629. label
  7630. invalid;
  7631. begin
  7632. aSig1 := extractFloat128Frac1( a );
  7633. aSig0 := extractFloat128Frac0( a );
  7634. aExp := extractFloat128Exp( a );
  7635. aSign := extractFloat128Sign( a );
  7636. bSig1 := extractFloat128Frac1( b );
  7637. bSig0 := extractFloat128Frac0( b );
  7638. bExp := extractFloat128Exp( b );
  7639. bSign := extractFloat128Sign( b );
  7640. if ( aExp = $7FFF ) then begin
  7641. if ( (( aSig0 or aSig1 )<>0)
  7642. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7643. result := propagateFloat128NaN( a, b );
  7644. exit;
  7645. end;
  7646. goto invalid;
  7647. end;
  7648. if ( bExp = $7FFF ) then begin
  7649. if ( bSig0 or bSig1 )<>0 then
  7650. begin
  7651. result := propagateFloat128NaN( a, b );
  7652. exit;
  7653. end;
  7654. result := a;
  7655. exit;
  7656. end;
  7657. if ( bExp = 0 ) then begin
  7658. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7659. invalid:
  7660. float_raise( float_flag_invalid );
  7661. z.low := float128_default_nan_low;
  7662. z.high := float128_default_nan_high;
  7663. result := z;
  7664. exit;
  7665. end;
  7666. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7667. end;
  7668. if ( aExp = 0 ) then begin
  7669. if ( ( aSig0 or aSig1 ) = 0 ) then
  7670. begin
  7671. result := a;
  7672. exit;
  7673. end;
  7674. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7675. end;
  7676. expDiff := aExp - bExp;
  7677. if ( expDiff < -1 ) then
  7678. begin
  7679. result := a;
  7680. exit;
  7681. end;
  7682. shortShift128Left(
  7683. aSig0 or int64( $0001000000000000 ),
  7684. aSig1,
  7685. 15 - ord( expDiff < 0 ),
  7686. aSig0,
  7687. aSig1
  7688. );
  7689. shortShift128Left(
  7690. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7691. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7692. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7693. dec(expDiff,64);
  7694. while ( 0 < expDiff ) do begin
  7695. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7696. if ( 4 < q ) then
  7697. q := q - 4
  7698. else
  7699. q := 0;
  7700. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7701. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  7702. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  7703. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  7704. dec(expDiff,61);
  7705. end;
  7706. if ( -64 < expDiff ) then begin
  7707. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7708. if ( 4 < q ) then
  7709. q := q - 4
  7710. else
  7711. q := 0;
  7712. q := q shr (- expDiff);
  7713. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7714. inc(expDiff,52);
  7715. if ( expDiff < 0 ) then begin
  7716. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7717. end
  7718. else begin
  7719. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  7720. end;
  7721. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7722. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  7723. end
  7724. else begin
  7725. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  7726. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7727. end;
  7728. repeat
  7729. alternateASig0 := aSig0;
  7730. alternateASig1 := aSig1;
  7731. inc(q);
  7732. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7733. until not( 0 <= sbits64(aSig0) );
  7734. add128(
  7735. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  7736. if ( ( sigMean0 < 0 )
  7737. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  7738. aSig0 := alternateASig0;
  7739. aSig1 := alternateASig1;
  7740. end;
  7741. zSign := ord( sbits64(aSig0) < 0 );
  7742. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  7743. result :=
  7744. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7745. end;
  7746. {*----------------------------------------------------------------------------
  7747. | Returns the square root of the quadruple-precision floating-point value `a'.
  7748. | The operation is performed according to the IEC/IEEE Standard for Binary
  7749. | Floating-Point Arithmetic.
  7750. *----------------------------------------------------------------------------*}
  7751. function float128_sqrt(a: float128): float128;
  7752. var
  7753. aSign: flag;
  7754. aExp, zExp: int32;
  7755. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7756. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7757. z: float128;
  7758. label
  7759. invalid;
  7760. begin
  7761. aSig1 := extractFloat128Frac1( a );
  7762. aSig0 := extractFloat128Frac0( a );
  7763. aExp := extractFloat128Exp( a );
  7764. aSign := extractFloat128Sign( a );
  7765. if ( aExp = $7FFF ) then begin
  7766. if ( aSig0 or aSig1 )<>0 then
  7767. begin
  7768. result := propagateFloat128NaN( a, a );
  7769. exit;
  7770. end;
  7771. if ( aSign=0 ) then
  7772. begin
  7773. result := a;
  7774. exit;
  7775. end;
  7776. goto invalid;
  7777. end;
  7778. if ( aSign<>0 ) then begin
  7779. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  7780. begin
  7781. result := a;
  7782. exit;
  7783. end;
  7784. invalid:
  7785. float_raise( float_flag_invalid );
  7786. z.low := float128_default_nan_low;
  7787. z.high := float128_default_nan_high;
  7788. result := z;
  7789. exit;
  7790. end;
  7791. if ( aExp = 0 ) then begin
  7792. if ( ( aSig0 or aSig1 ) = 0 ) then
  7793. begin
  7794. result := packFloat128( 0, 0, 0, 0 );
  7795. exit;
  7796. end;
  7797. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7798. end;
  7799. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7800. aSig0 := aSig0 or int64( $0001000000000000 );
  7801. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7802. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  7803. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7804. doubleZSig0 := zSig0 shl 1;
  7805. mul64To128( zSig0, zSig0, term0, term1 );
  7806. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7807. while ( sbits64(rem0) < 0 ) do begin
  7808. dec(zSig0);
  7809. dec(doubleZSig0,2);
  7810. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7811. end;
  7812. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7813. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  7814. if ( zSig1 = 0 ) then zSig1 := 1;
  7815. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7816. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7817. mul64To128( zSig1, zSig1, term2, term3 );
  7818. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7819. while ( sbits64(rem1) < 0 ) do begin
  7820. dec(zSig1);
  7821. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7822. term3 := term3 or 1;
  7823. term2 := term2 or doubleZSig0;
  7824. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7825. end;
  7826. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7827. end;
  7828. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  7829. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7830. end;
  7831. {*----------------------------------------------------------------------------
  7832. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7833. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7834. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7835. *----------------------------------------------------------------------------*}
  7836. function float128_eq(a: float128; b: float128): flag;
  7837. begin
  7838. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7839. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7840. or ( ( extractFloat128Exp( b ) = $7FFF )
  7841. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7842. ) then begin
  7843. if ( (float128_is_signaling_nan( a )<>0)
  7844. or (float128_is_signaling_nan( b )<>0) ) then begin
  7845. float_raise( float_flag_invalid );
  7846. end;
  7847. result := 0;
  7848. exit;
  7849. end;
  7850. result := ord(
  7851. ( a.low = b.low )
  7852. and ( ( a.high = b.high )
  7853. or ( ( a.low = 0 )
  7854. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  7855. ));
  7856. end;
  7857. {*----------------------------------------------------------------------------
  7858. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7859. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7860. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7861. | Arithmetic.
  7862. *----------------------------------------------------------------------------*}
  7863. function float128_le(a: float128; b: float128): flag;
  7864. var
  7865. aSign, bSign: flag;
  7866. begin
  7867. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7868. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7869. or ( ( extractFloat128Exp( b ) = $7FFF )
  7870. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7871. ) then begin
  7872. float_raise( float_flag_invalid );
  7873. result := 0;
  7874. exit;
  7875. end;
  7876. aSign := extractFloat128Sign( a );
  7877. bSign := extractFloat128Sign( b );
  7878. if ( aSign <> bSign ) then begin
  7879. result := ord(
  7880. (aSign<>0)
  7881. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7882. = 0 ));
  7883. exit;
  7884. end;
  7885. if aSign<>0 then
  7886. result := le128( b.high, b.low, a.high, a.low )
  7887. else
  7888. result := le128( a.high, a.low, b.high, b.low );
  7889. end;
  7890. {*----------------------------------------------------------------------------
  7891. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7892. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7893. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7894. *----------------------------------------------------------------------------*}
  7895. function float128_lt(a: float128; b: float128): flag;
  7896. var
  7897. aSign, bSign: flag;
  7898. begin
  7899. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7900. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7901. or ( ( extractFloat128Exp( b ) = $7FFF )
  7902. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7903. ) then begin
  7904. float_raise( float_flag_invalid );
  7905. result := 0;
  7906. exit;
  7907. end;
  7908. aSign := extractFloat128Sign( a );
  7909. bSign := extractFloat128Sign( b );
  7910. if ( aSign <> bSign ) then begin
  7911. result := ord(
  7912. (aSign<>0)
  7913. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7914. <> 0 ));
  7915. exit;
  7916. end;
  7917. if aSign<>0 then
  7918. result := lt128( b.high, b.low, a.high, a.low )
  7919. else
  7920. result := lt128( a.high, a.low, b.high, b.low );
  7921. end;
  7922. {*----------------------------------------------------------------------------
  7923. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7924. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7925. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7926. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7927. *----------------------------------------------------------------------------*}
  7928. function float128_eq_signaling(a: float128; b: float128): flag;
  7929. begin
  7930. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7931. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7932. or ( ( extractFloat128Exp( b ) = $7FFF )
  7933. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7934. ) then begin
  7935. float_raise( float_flag_invalid );
  7936. result := 0;
  7937. exit;
  7938. end;
  7939. result := ord(
  7940. ( a.low = b.low )
  7941. and ( ( a.high = b.high )
  7942. or ( ( a.low = 0 )
  7943. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7944. ));
  7945. end;
  7946. {*----------------------------------------------------------------------------
  7947. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7948. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7949. | cause an exception. Otherwise, the comparison is performed according to the
  7950. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7951. *----------------------------------------------------------------------------*}
  7952. function float128_le_quiet(a: float128; b: float128): flag;
  7953. var
  7954. aSign, bSign: flag;
  7955. begin
  7956. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7957. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7958. or ( ( extractFloat128Exp( b ) = $7FFF )
  7959. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7960. ) then begin
  7961. if ( (float128_is_signaling_nan( a )<>0)
  7962. or (float128_is_signaling_nan( b )<>0) ) then begin
  7963. float_raise( float_flag_invalid );
  7964. end;
  7965. result := 0;
  7966. exit;
  7967. end;
  7968. aSign := extractFloat128Sign( a );
  7969. bSign := extractFloat128Sign( b );
  7970. if ( aSign <> bSign ) then begin
  7971. result := ord(
  7972. (aSign<>0)
  7973. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7974. = 0 ));
  7975. exit;
  7976. end;
  7977. if aSign<>0 then
  7978. result := le128( b.high, b.low, a.high, a.low )
  7979. else
  7980. result := le128( a.high, a.low, b.high, b.low );
  7981. end;
  7982. {*----------------------------------------------------------------------------
  7983. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7984. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7985. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7986. | Standard for Binary Floating-Point Arithmetic.
  7987. *----------------------------------------------------------------------------*}
  7988. function float128_lt_quiet(a: float128; b: float128): flag;
  7989. var
  7990. aSign, bSign: flag;
  7991. begin
  7992. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7993. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7994. or ( ( extractFloat128Exp( b ) = $7FFF )
  7995. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7996. ) then begin
  7997. if ( (float128_is_signaling_nan( a )<>0)
  7998. or (float128_is_signaling_nan( b )<>0) ) then begin
  7999. float_raise( float_flag_invalid );
  8000. end;
  8001. result := 0;
  8002. exit;
  8003. end;
  8004. aSign := extractFloat128Sign( a );
  8005. bSign := extractFloat128Sign( b );
  8006. if ( aSign <> bSign ) then begin
  8007. result := ord(
  8008. (aSign<>0)
  8009. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8010. <> 0 ));
  8011. exit;
  8012. end;
  8013. if aSign<>0 then
  8014. result:=lt128( b.high, b.low, a.high, a.low )
  8015. else
  8016. result:=lt128( a.high, a.low, b.high, b.low );
  8017. end;
  8018. {----------------------------------------------------------------------------
  8019. | Returns the result of converting the double-precision floating-point value
  8020. | `a' to the quadruple-precision floating-point format. The conversion is
  8021. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8022. | Arithmetic.
  8023. *----------------------------------------------------------------------------}
  8024. function float64_to_float128( a : float64) : float128;
  8025. var
  8026. aSign : flag;
  8027. aExp : int16;
  8028. aSig, zSig0, zSig1 : bits64;
  8029. begin
  8030. aSig := extractFloat64Frac( a );
  8031. aExp := extractFloat64Exp( a );
  8032. aSign := extractFloat64Sign( a );
  8033. if ( aExp = $7FF ) then begin
  8034. if ( aSig<>0 ) then
  8035. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8036. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8037. exit;
  8038. end;
  8039. if ( aExp = 0 ) then begin
  8040. if ( aSig = 0 ) then
  8041. begin
  8042. result:=packFloat128( aSign, 0, 0, 0 );
  8043. exit;
  8044. end;
  8045. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8046. dec(aExp);
  8047. end;
  8048. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8049. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8050. end;
  8051. {$endif FPC_SOFTFLOAT_FLOAT128}
  8052. {$endif not(defined(fpc_softfpu_interface))}
  8053. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8054. end.
  8055. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}