softfpu.pp 327 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768277827882798280828182828283828482858286828782888289829082918292829382948295829682978298829983008301830283038304830583068307830883098310831183128313831483158316831783188319832083218322832383248325832683278328832983308331833283338334833583368337833883398340834183428343834483458346834783488349835083518352835383548355835683578358835983608361836283638364836583668367836883698370837183728373837483758376837783788379838083818382838383848385838683878388838983908391839283938394839583968397839883998400840184028403840484058406840784088409841084118412841384148415841684178418841984208421842284238424842584268427842884298430843184328433843484358436843784388439844084418442844384448445844684478448844984508451845284538454845584568457845884598460846184628463846484658466846784688469847084718472847384748475847684778478847984808481848284838484848584868487848884898490849184928493849484958496849784988499850085018502850385048505850685078508850985108511851285138514851585168517851885198520852185228523852485258526852785288529853085318532853385348535853685378538853985408541854285438544854585468547854885498550855185528553855485558556855785588559856085618562856385648565856685678568856985708571857285738574857585768577857885798580858185828583858485858586858785888589859085918592859385948595859685978598859986008601860286038604860586068607860886098610861186128613861486158616861786188619862086218622862386248625862686278628862986308631863286338634863586368637863886398640864186428643864486458646864786488649865086518652865386548655865686578658865986608661866286638664866586668667866886698670867186728673867486758676867786788679868086818682868386848685868686878688868986908691869286938694869586968697869886998700870187028703870487058706870787088709871087118712871387148715871687178718871987208721872287238724872587268727872887298730873187328733873487358736873787388739874087418742874387448745874687478748874987508751875287538754875587568757875887598760876187628763876487658766876787688769877087718772877387748775877687778778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927792789279928092819282928392849285928692879288928992909291929292939294929592969297929892999300930193029303930493059306930793089309931093119312931393149315931693179318931993209321932293239324932593269327932893299330933193329333933493359336933793389339934093419342934393449345934693479348934993509351935293539354935593569357935893599360936193629363936493659366936793689369937093719372937393749375937693779378937993809381938293839384938593869387938893899390939193929393
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. {$IFNDEF FPC_DOTTEDUNITS}
  63. unit softfpu;
  64. {$ENDIF FPC_DOTTEDUNITS}
  65. { Overflow checking must be disabled,
  66. since some operations expect overflow!
  67. }
  68. {$Q-}
  69. {$goto on}
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. {$ifndef FPC_SYSTEM_HAS_float32}
  80. float32 = longword;
  81. {$define FPC_SYSTEM_HAS_float32}
  82. {$endif ndef FPC_SYSTEM_HAS_float32}
  83. { we use here a record in the function header because
  84. the record allows bitwise conversion to single }
  85. float32rec = record
  86. float32 : float32;
  87. end;
  88. flag = byte;
  89. bits8 = byte;
  90. sbits8 = shortint;
  91. bits16 = word;
  92. sbits16 = smallint;
  93. sbits32 = longint;
  94. bits32 = longword;
  95. {$ifndef fpc}
  96. qword = int64;
  97. {$endif}
  98. { now part of the system unit
  99. uint64 = qword;
  100. }
  101. bits64 = qword;
  102. sbits64 = int64;
  103. {$ifdef ENDIAN_LITTLE}
  104. {$ifndef FPC_SYSTEM_HAS_float64}
  105. float64 = record
  106. case byte of
  107. // force the record to be aligned like a double
  108. // else *_to_double will fail for cpus like sparc
  109. // and avoid expensive unpacking/packing operations
  110. 1: (dummy : double);
  111. 2: (low,high : bits32);
  112. end;
  113. {$endif ndef FPC_SYSTEM_HAS_float64}
  114. floatx80 = record
  115. case byte of
  116. // force the record to be aligned like a double
  117. // else *_to_double will fail for cpus like sparc
  118. // and avoid expensive unpacking/packing operations
  119. 1: (dummy : extended);
  120. 2: (low : qword;high : word);
  121. end;
  122. float128 = record
  123. case byte of
  124. // force the record to be aligned like a double
  125. // else *_to_double will fail for cpus like sparc
  126. // and avoid expensive unpacking/packing operations
  127. 1: (dummy : qword);
  128. 2: (low,high : qword);
  129. end;
  130. {$else}
  131. {$ifndef FPC_SYSTEM_HAS_float64}
  132. float64 = record
  133. case byte of
  134. // force the record to be aligned like a double
  135. // else *_to_double will fail for cpus like sparc
  136. 1: (dummy : double);
  137. 2: (high,low : bits32);
  138. end;
  139. {$endif ndef FPC_SYSTEM_HAS_float64}
  140. floatx80 = record
  141. case byte of
  142. // force the record to be aligned like a double
  143. // else *_to_double will fail for cpus like sparc
  144. // and avoid expensive unpacking/packing operations
  145. 1: (dummy : qword);
  146. 2: (high : word;low : qword);
  147. end;
  148. float128 = record
  149. case byte of
  150. // force the record to be aligned like a double
  151. // else *_to_double will fail for cpus like sparc
  152. // and avoid expensive unpacking/packing operations
  153. 1: (dummy : qword);
  154. 2: (high : qword;low : qword);
  155. end;
  156. {$endif}
  157. {$define FPC_SYSTEM_HAS_float64}
  158. {*
  159. -------------------------------------------------------------------------------
  160. Returns 1 if the double-precision floating-point value `a' is less than
  161. the corresponding value `b', and 0 otherwise. The comparison is performed
  162. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  163. -------------------------------------------------------------------------------
  164. *}
  165. Function float64_lt(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  166. {*
  167. -------------------------------------------------------------------------------
  168. Returns 1 if the double-precision floating-point value `a' is less than
  169. or equal to the corresponding value `b', and 0 otherwise. The comparison
  170. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  171. Arithmetic.
  172. -------------------------------------------------------------------------------
  173. *}
  174. Function float64_le(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  175. {*
  176. -------------------------------------------------------------------------------
  177. Returns 1 if the double-precision floating-point value `a' is equal to
  178. the corresponding value `b', and 0 otherwise. The comparison is performed
  179. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  180. -------------------------------------------------------------------------------
  181. *}
  182. Function float64_eq(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  183. {*
  184. -------------------------------------------------------------------------------
  185. Returns the square root of the double-precision floating-point value `a'.
  186. The operation is performed according to the IEC/IEEE Standard for Binary
  187. Floating-Point Arithmetic.
  188. -------------------------------------------------------------------------------
  189. *}
  190. function float64_sqrt( a: float64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  191. {*
  192. -------------------------------------------------------------------------------
  193. Returns the remainder of the double-precision floating-point value `a'
  194. with respect to the corresponding value `b'. The operation is performed
  195. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  196. -------------------------------------------------------------------------------
  197. *}
  198. Function float64_rem(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  199. {*
  200. -------------------------------------------------------------------------------
  201. Returns the result of dividing the double-precision floating-point value `a'
  202. by the corresponding value `b'. The operation is performed according to the
  203. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  204. -------------------------------------------------------------------------------
  205. *}
  206. Function float64_div(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  207. {*
  208. -------------------------------------------------------------------------------
  209. Returns the result of multiplying the double-precision floating-point values
  210. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  211. for Binary Floating-Point Arithmetic.
  212. -------------------------------------------------------------------------------
  213. *}
  214. Function float64_mul( a: float64; b:float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  215. {*
  216. -------------------------------------------------------------------------------
  217. Returns the result of subtracting the double-precision floating-point values
  218. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  219. for Binary Floating-Point Arithmetic.
  220. -------------------------------------------------------------------------------
  221. *}
  222. Function float64_sub(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  223. {*
  224. -------------------------------------------------------------------------------
  225. Returns the result of adding the double-precision floating-point values `a'
  226. and `b'. The operation is performed according to the IEC/IEEE Standard for
  227. Binary Floating-Point Arithmetic.
  228. -------------------------------------------------------------------------------
  229. *}
  230. Function float64_add( a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  231. {*
  232. -------------------------------------------------------------------------------
  233. Rounds the double-precision floating-point value `a' to an integer,
  234. and returns the result as a double-precision floating-point value. The
  235. operation is performed according to the IEC/IEEE Standard for Binary
  236. Floating-Point Arithmetic.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_round_to_int(a: float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the single-precision floating-point format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic.
  246. -------------------------------------------------------------------------------
  247. *}
  248. Function float64_to_float32(a: float64) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  249. {*
  250. -------------------------------------------------------------------------------
  251. Returns the result of converting the double-precision floating-point value
  252. `a' to the 32-bit two's complement integer format. The conversion is
  253. performed according to the IEC/IEEE Standard for Binary Floating-Point
  254. Arithmetic, except that the conversion is always rounded toward zero.
  255. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  256. the conversion overflows, the largest integer with the same sign as `a' is
  257. returned.
  258. -------------------------------------------------------------------------------
  259. *}
  260. Function float64_to_int32_round_to_zero(a: float64 ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  261. {*
  262. -------------------------------------------------------------------------------
  263. Returns the result of converting the double-precision floating-point value
  264. `a' to the 32-bit two's complement integer format. The conversion is
  265. performed according to the IEC/IEEE Standard for Binary Floating-Point
  266. Arithmetic---which means in particular that the conversion is rounded
  267. according to the current rounding mode. If `a' is a NaN, the largest
  268. positive integer is returned. Otherwise, if the conversion overflows, the
  269. largest integer with the same sign as `a' is returned.
  270. -------------------------------------------------------------------------------
  271. *}
  272. Function float64_to_int32(a: float64): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  273. {*
  274. -------------------------------------------------------------------------------
  275. Returns 1 if the single-precision floating-point value `a' is less than
  276. the corresponding value `b', and 0 otherwise. The comparison is performed
  277. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  278. -------------------------------------------------------------------------------
  279. *}
  280. Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  281. {*
  282. -------------------------------------------------------------------------------
  283. Returns 1 if the single-precision floating-point value `a' is less than
  284. or equal to the corresponding value `b', and 0 otherwise. The comparison
  285. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  286. Arithmetic.
  287. -------------------------------------------------------------------------------
  288. *}
  289. Function float32_le( a: float32rec; b : float32rec ):flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  290. {*
  291. -------------------------------------------------------------------------------
  292. Returns 1 if the single-precision floating-point value `a' is equal to
  293. the corresponding value `b', and 0 otherwise. The comparison is performed
  294. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  295. -------------------------------------------------------------------------------
  296. *}
  297. Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  298. {*
  299. -------------------------------------------------------------------------------
  300. Returns the square root of the single-precision floating-point value `a'.
  301. The operation is performed according to the IEC/IEEE Standard for Binary
  302. Floating-Point Arithmetic.
  303. -------------------------------------------------------------------------------
  304. *}
  305. Function float32_sqrt(a: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  306. {*
  307. -------------------------------------------------------------------------------
  308. Returns the remainder of the single-precision floating-point value `a'
  309. with respect to the corresponding value `b'. The operation is performed
  310. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  311. -------------------------------------------------------------------------------
  312. *}
  313. Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  314. {*
  315. -------------------------------------------------------------------------------
  316. Returns the result of dividing the single-precision floating-point value `a'
  317. by the corresponding value `b'. The operation is performed according to the
  318. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  319. -------------------------------------------------------------------------------
  320. *}
  321. Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  322. {*
  323. -------------------------------------------------------------------------------
  324. Returns the result of multiplying the single-precision floating-point values
  325. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  326. for Binary Floating-Point Arithmetic.
  327. -------------------------------------------------------------------------------
  328. *}
  329. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  330. {*
  331. -------------------------------------------------------------------------------
  332. Returns the result of subtracting the single-precision floating-point values
  333. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  334. for Binary Floating-Point Arithmetic.
  335. -------------------------------------------------------------------------------
  336. *}
  337. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  338. {*
  339. -------------------------------------------------------------------------------
  340. Returns the result of adding the single-precision floating-point values `a'
  341. and `b'. The operation is performed according to the IEC/IEEE Standard for
  342. Binary Floating-Point Arithmetic.
  343. -------------------------------------------------------------------------------
  344. *}
  345. Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  346. {*
  347. -------------------------------------------------------------------------------
  348. Rounds the single-precision floating-point value `a' to an integer,
  349. and returns the result as a single-precision floating-point value. The
  350. operation is performed according to the IEC/IEEE Standard for Binary
  351. Floating-Point Arithmetic.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_round_to_int( a: float32rec): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the double-precision floating-point format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic.
  361. -------------------------------------------------------------------------------
  362. *}
  363. Function float32_to_float64( a : float32rec) : Float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  364. {*
  365. -------------------------------------------------------------------------------
  366. Returns the result of converting the single-precision floating-point value
  367. `a' to the 32-bit two's complement integer format. The conversion is
  368. performed according to the IEC/IEEE Standard for Binary Floating-Point
  369. Arithmetic, except that the conversion is always rounded toward zero.
  370. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  371. the conversion overflows, the largest integer with the same sign as `a' is
  372. returned.
  373. -------------------------------------------------------------------------------
  374. *}
  375. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  376. {*
  377. -------------------------------------------------------------------------------
  378. Returns the result of converting the single-precision floating-point value
  379. `a' to the 32-bit two's complement integer format. The conversion is
  380. performed according to the IEC/IEEE Standard for Binary Floating-Point
  381. Arithmetic---which means in particular that the conversion is rounded
  382. according to the current rounding mode. If `a' is a NaN, the largest
  383. positive integer is returned. Otherwise, if the conversion overflows, the
  384. largest integer with the same sign as `a' is returned.
  385. -------------------------------------------------------------------------------
  386. *}
  387. Function float32_to_int32( a : float32rec) : int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  388. {*
  389. -------------------------------------------------------------------------------
  390. Returns the result of converting the 32-bit two's complement integer `a' to
  391. the double-precision floating-point format. The conversion is performed
  392. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. -------------------------------------------------------------------------------
  394. *}
  395. Function int32_to_float64( a: int32) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  396. {*
  397. -------------------------------------------------------------------------------
  398. Returns the result of converting the 32-bit two's complement integer `a' to
  399. the single-precision floating-point format. The conversion is performed
  400. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  401. -------------------------------------------------------------------------------
  402. *}
  403. Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  404. {*----------------------------------------------------------------------------
  405. | Returns the result of converting the 64-bit two's complement integer `a'
  406. | to the double-precision floating-point format. The conversion is performed
  407. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  408. *----------------------------------------------------------------------------*}
  409. Function int64_to_float64( a: int64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  410. Function qword_to_float64( a: qword ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  411. {*----------------------------------------------------------------------------
  412. | Returns the result of converting the 64-bit two's complement integer `a'
  413. | to the single-precision floating-point format. The conversion is performed
  414. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  415. *----------------------------------------------------------------------------*}
  416. Function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  417. Function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  418. // +++
  419. function float32_to_int64( a: float32 ): int64;
  420. function float32_to_int64_round_to_zero( a: float32 ): int64;
  421. function float32_eq_signaling( a: float32; b: float32) : flag;
  422. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  423. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  424. function float32_is_signaling_nan( a : float32 ): flag;
  425. function float32_is_nan( a : float32 ): flag;
  426. function float64_to_int64( a: float64 ): int64;
  427. function float64_to_int64_round_to_zero( a: float64 ): int64;
  428. function float64_eq_signaling( a: float64; b: float64): flag;
  429. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  430. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  431. function float64_is_signaling_nan( a : float64 ): flag;
  432. function float64_is_nan( a : float64 ): flag;
  433. // ===
  434. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  435. {*----------------------------------------------------------------------------
  436. | Extended double-precision rounding precision
  437. *----------------------------------------------------------------------------*}
  438. var // threadvar!?
  439. floatx80_rounding_precision : int8 = 80;
  440. function int32_to_floatx80( a: int32 ): floatx80;
  441. function int64_to_floatx80( a: int64 ): floatx80;
  442. function qword_to_floatx80( a: qword ): floatx80;
  443. function float32_to_floatx80( a: float32 ): floatx80;
  444. function float64_to_floatx80( a: float64 ): floatx80;
  445. function floatx80_to_int32( a: floatx80 ): int32;
  446. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  447. function floatx80_to_int64( a: floatx80 ): int64;
  448. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  449. function floatx80_to_float32( a: floatx80 ): float32;
  450. function floatx80_to_float64( a: floatx80 ): float64;
  451. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  452. function floatx80_to_float128( a: floatx80 ): float128;
  453. {$endif FPC_SOFTFLOAT_FLOAT128}
  454. function floatx80_round_to_int( a: floatx80 ): floatx80;
  455. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  456. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  457. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  458. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  459. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  460. function floatx80_sqrt( a: floatx80 ): floatx80;
  461. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  462. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  463. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  464. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  465. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  466. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  467. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  468. function floatx80_is_nan(a : floatx80 ): flag;
  469. {$endif FPC_SOFTFLOAT_FLOATX80}
  470. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  471. function int32_to_float128( a: int32 ): float128;
  472. function int64_to_float128( a: int64 ): float128;
  473. function qword_to_float128( a: qword ): float128;
  474. function float32_to_float128( a: float32 ): float128;
  475. function float128_is_nan( a : float128): flag;
  476. function float128_is_signaling_nan( a : float128): flag;
  477. function float128_to_int32(a: float128): int32;
  478. function float128_to_int32_round_to_zero(a: float128): int32;
  479. function float128_to_int64(a: float128): int64;
  480. function float128_to_int64_round_to_zero(a: float128): int64;
  481. function float128_to_float32(a: float128): float32;
  482. function float128_to_float64(a: float128): float64;
  483. function float64_to_float128( a : float64) : float128;
  484. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  485. function float128_to_floatx80(a: float128): floatx80;
  486. {$endif FPC_SOFTFLOAT_FLOATX80}
  487. function float128_round_to_int(a: float128): float128;
  488. function float128_add(a: float128; b: float128): float128;
  489. function float128_sub(a: float128; b: float128): float128;
  490. function float128_mul(a: float128; b: float128): float128;
  491. function float128_div(a: float128; b: float128): float128;
  492. function float128_rem(a: float128; b: float128): float128;
  493. function float128_sqrt(a: float128): float128;
  494. function float128_eq(a: float128; b: float128): flag;
  495. function float128_le(a: float128; b: float128): flag;
  496. function float128_lt(a: float128; b: float128): flag;
  497. function float128_eq_signaling(a: float128; b: float128): flag;
  498. function float128_le_quiet(a: float128; b: float128): flag;
  499. function float128_lt_quiet(a: float128; b: float128): flag;
  500. {$endif FPC_SOFTFLOAT_FLOAT128}
  501. CONST
  502. {-------------------------------------------------------------------------------
  503. Software IEC/IEEE floating-point underflow tininess-detection mode.
  504. -------------------------------------------------------------------------------
  505. *}
  506. float_tininess_after_rounding = 0;
  507. float_tininess_before_rounding = 1;
  508. {*
  509. -------------------------------------------------------------------------------
  510. Underflow tininess-detection mode, statically initialized to default value.
  511. (The declaration in `softfloat.h' must match the `int8' type here.)
  512. -------------------------------------------------------------------------------
  513. *}
  514. var // threadvar!?
  515. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  516. {$endif not(defined(fpc_softfpu_implementation))}
  517. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  518. implementation
  519. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  520. {$if not(defined(fpc_softfpu_interface))}
  521. {$ifdef FPC}
  522. { disable range and overflow checking explicitly }
  523. { This might be more essential for x80 and 128-bit
  524. floating point types and could, maybe be
  525. restricted to code handle floatx80 and float128 }
  526. {$push}
  527. {$R-}
  528. {$Q-}
  529. {$endif FPC}
  530. (*****************************************************************************)
  531. (*----------------------------------------------------------------------------*)
  532. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  533. (* division and square root approximations. (Can be specialized to target if *)
  534. (* desired.) *)
  535. (* ---------------------------------------------------------------------------*)
  536. (*****************************************************************************)
  537. { This procedure serves as a single access point to softfloat_exception_flags.
  538. It also helps to reduce code size a bit because softfloat_exception_flags is
  539. a threadvar. }
  540. procedure set_inexact_flag;
  541. begin
  542. include(softfloat_exception_flags,float_flag_inexact);
  543. end;
  544. {*----------------------------------------------------------------------------
  545. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  546. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  547. | input. If `zSign' is 1, the input is negated before being converted to an
  548. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  549. | is simply rounded to an integer, with the inexact exception raised if the
  550. | input cannot be represented exactly as an integer. However, if the fixed-
  551. | point input is too large, the invalid exception is raised and the largest
  552. | positive or negative integer is returned.
  553. *----------------------------------------------------------------------------*}
  554. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  555. var
  556. roundingMode: TFPURoundingMode;
  557. roundNearestEven: boolean;
  558. roundIncrement, roundBits: int8;
  559. z: int32;
  560. begin
  561. roundingMode := softfloat_rounding_mode;
  562. roundNearestEven := (roundingMode = float_round_nearest_even);
  563. roundIncrement := $40;
  564. if not roundNearestEven then
  565. begin
  566. if ( roundingMode = float_round_to_zero ) then
  567. begin
  568. roundIncrement := 0;
  569. end
  570. else begin
  571. roundIncrement := $7F;
  572. if ( zSign<>0 ) then
  573. begin
  574. if ( roundingMode = float_round_up ) then
  575. roundIncrement := 0;
  576. end
  577. else begin
  578. if ( roundingMode = float_round_down ) then
  579. roundIncrement := 0;
  580. end;
  581. end;
  582. end;
  583. roundBits := lo(absZ) and $7F;
  584. absZ := ( absZ + roundIncrement ) shr 7;
  585. absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
  586. z := absZ;
  587. if ( zSign<>0 ) then
  588. z := - z;
  589. if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  590. begin
  591. float_raise( float_flag_invalid );
  592. if zSign<>0 then
  593. result:=sbits32($80000000)
  594. else
  595. result:=$7FFFFFFF;
  596. exit;
  597. end;
  598. if ( roundBits<>0 ) then
  599. set_inexact_flag;
  600. result:=z;
  601. end;
  602. {*----------------------------------------------------------------------------
  603. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  604. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  605. | and returns the properly rounded 64-bit integer corresponding to the input.
  606. | If `zSign' is 1, the input is negated before being converted to an integer.
  607. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  608. | the inexact exception raised if the input cannot be represented exactly as
  609. | an integer. However, if the fixed-point input is too large, the invalid
  610. | exception is raised and the largest positive or negative integer is
  611. | returned.
  612. *----------------------------------------------------------------------------*}
  613. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  614. var
  615. roundingMode: TFPURoundingMode;
  616. roundNearestEven, increment: flag;
  617. z: int64;
  618. label
  619. overflow;
  620. begin
  621. roundingMode := softfloat_rounding_mode;
  622. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  623. increment := ord( sbits64(absZ1) < 0 );
  624. if ( roundNearestEven=0 ) then
  625. begin
  626. if ( roundingMode = float_round_to_zero ) then
  627. begin
  628. increment := 0;
  629. end
  630. else begin
  631. if ( zSign<>0 ) then
  632. begin
  633. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  634. end
  635. else begin
  636. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  637. end;
  638. end;
  639. end;
  640. if ( increment<>0 ) then
  641. begin
  642. inc(absZ0);
  643. if ( absZ0 = 0 ) then
  644. goto overflow;
  645. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  646. end;
  647. z := absZ0;
  648. if ( zSign<>0 ) then
  649. z := - z;
  650. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  651. begin
  652. overflow:
  653. float_raise( float_flag_invalid );
  654. if zSign<>0 then
  655. result:=int64($8000000000000000)
  656. else
  657. result:=int64($7FFFFFFFFFFFFFFF);
  658. exit;
  659. end;
  660. if ( absZ1<>0 ) then
  661. set_inexact_flag;
  662. result:=z;
  663. end;
  664. {*
  665. -------------------------------------------------------------------------------
  666. Shifts `a' right by the number of bits given in `count'. If any nonzero
  667. bits are shifted off, they are ``jammed'' into the least significant bit of
  668. the result by setting the least significant bit to 1. The value of `count'
  669. can be arbitrarily large; in particular, if `count' is greater than 32, the
  670. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  671. The result is stored in the location pointed to by `zPtr'.
  672. -------------------------------------------------------------------------------
  673. *}
  674. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  675. var
  676. z: Bits32;
  677. Begin
  678. if ( count = 0 ) then
  679. z := a
  680. else
  681. if ( count < 32 ) then
  682. Begin
  683. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  684. End
  685. else
  686. Begin
  687. z := bits32( a <> 0 );
  688. End;
  689. zPtr := z;
  690. End;
  691. {*----------------------------------------------------------------------------
  692. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  693. | number of bits given in `count'. Any bits shifted off are lost. The value
  694. | of `count' can be arbitrarily large; in particular, if `count' is greater
  695. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  696. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  697. *----------------------------------------------------------------------------*}
  698. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  699. var
  700. z0, z1: bits64;
  701. negCount: int8;
  702. begin
  703. negCount := ( - count ) and 63;
  704. if ( count = 0 ) then
  705. begin
  706. z1 := a1;
  707. z0 := a0;
  708. end
  709. else if ( count < 64 ) then
  710. begin
  711. z1 := ( a0 shl negCount ) or ( a1 shr count );
  712. z0 := a0 shr count;
  713. end
  714. else
  715. begin
  716. if ( count < 128 ) then
  717. z1 := a0 shr ( count and 63 )
  718. else
  719. z1 := 0;
  720. z0 := 0;
  721. end;
  722. z1Ptr := z1;
  723. z0Ptr := z0;
  724. end;
  725. {*----------------------------------------------------------------------------
  726. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  727. | number of bits given in `count'. If any nonzero bits are shifted off, they
  728. | are ``jammed'' into the least significant bit of the result by setting the
  729. | least significant bit to 1. The value of `count' can be arbitrarily large;
  730. | in particular, if `count' is greater than 128, the result will be either
  731. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  732. | nonzero. The result is broken into two 64-bit pieces which are stored at
  733. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  734. *----------------------------------------------------------------------------*}
  735. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  736. var
  737. z0,z1 : bits64;
  738. negCount : int8;
  739. begin
  740. negCount := ( - count ) and 63;
  741. if ( count = 0 ) then begin
  742. z1 := a1;
  743. z0 := a0;
  744. end
  745. else if ( count < 64 ) then begin
  746. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  747. z0 := a0 shr count;
  748. end
  749. else begin
  750. if ( count = 64 ) then begin
  751. z1 := a0 or ord( a1 <> 0 );
  752. end
  753. else if ( count < 128 ) then begin
  754. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  755. end
  756. else begin
  757. z1 := ord( ( a0 or a1 ) <> 0 );
  758. end;
  759. z0 := 0;
  760. end;
  761. z1Ptr := z1;
  762. z0Ptr := z0;
  763. end;
  764. {*
  765. -------------------------------------------------------------------------------
  766. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  767. number of bits given in `count'. Any bits shifted off are lost. The value
  768. of `count' can be arbitrarily large; in particular, if `count' is greater
  769. than 64, the result will be 0. The result is broken into two 32-bit pieces
  770. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  771. -------------------------------------------------------------------------------
  772. *}
  773. Procedure
  774. shift64Right(
  775. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  776. Var
  777. z0, z1: bits32;
  778. negCount : int8;
  779. Begin
  780. negCount := ( - count ) AND 31;
  781. if ( count = 0 ) then
  782. Begin
  783. z1 := a1;
  784. z0 := a0;
  785. End
  786. else if ( count < 32 ) then
  787. Begin
  788. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  789. z0 := a0 shr count;
  790. End
  791. else
  792. Begin
  793. if (count < 64) then
  794. z1 := ( a0 shr ( count AND 31 ) )
  795. else
  796. z1 := 0;
  797. z0 := 0;
  798. End;
  799. z1Ptr := z1;
  800. z0Ptr := z0;
  801. End;
  802. {*
  803. -------------------------------------------------------------------------------
  804. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  805. number of bits given in `count'. If any nonzero bits are shifted off, they
  806. are ``jammed'' into the least significant bit of the result by setting the
  807. least significant bit to 1. The value of `count' can be arbitrarily large;
  808. in particular, if `count' is greater than 64, the result will be either 0
  809. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  810. nonzero. The result is broken into two 32-bit pieces which are stored at
  811. the locations pointed to by `z0Ptr' and `z1Ptr'.
  812. -------------------------------------------------------------------------------
  813. *}
  814. Procedure
  815. shift64RightJamming(
  816. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  817. VAR
  818. z0, z1 : bits32;
  819. negCount : int8;
  820. Begin
  821. negCount := ( - count ) AND 31;
  822. if ( count = 0 ) then
  823. Begin
  824. z1 := a1;
  825. z0 := a0;
  826. End
  827. else
  828. if ( count < 32 ) then
  829. Begin
  830. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  831. z0 := a0 shr count;
  832. End
  833. else
  834. Begin
  835. if ( count = 32 ) then
  836. Begin
  837. z1 := a0 OR bits32( a1 <> 0 );
  838. End
  839. else
  840. if ( count < 64 ) Then
  841. Begin
  842. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  843. End
  844. else
  845. Begin
  846. z1 := bits32( ( a0 OR a1 ) <> 0 );
  847. End;
  848. z0 := 0;
  849. End;
  850. z1Ptr := z1;
  851. z0Ptr := z0;
  852. End;
  853. {*----------------------------------------------------------------------------
  854. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  855. | bits are shifted off, they are ``jammed'' into the least significant bit of
  856. | the result by setting the least significant bit to 1. The value of `count'
  857. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  858. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  859. | The result is stored in the location pointed to by `zPtr'.
  860. *----------------------------------------------------------------------------*}
  861. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  862. var
  863. z: bits64;
  864. begin
  865. if ( count = 0 ) then
  866. begin
  867. z := a;
  868. end
  869. else if ( count < 64 ) then
  870. begin
  871. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  872. end
  873. else
  874. begin
  875. z := ord( a <> 0 );
  876. end;
  877. zPtr := z;
  878. end;
  879. {$if not defined(shift64ExtraRightJamming)}
  880. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  881. overload;
  882. forward;
  883. {$endif}
  884. {*
  885. -------------------------------------------------------------------------------
  886. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  887. by 32 _plus_ the number of bits given in `count'. The shifted result is
  888. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  889. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  890. off form a third 32-bit result as follows: The _last_ bit shifted off is
  891. the most-significant bit of the extra result, and the other 31 bits of the
  892. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  893. were all zero. This extra result is stored in the location pointed to by
  894. `z2Ptr'. The value of `count' can be arbitrarily large.
  895. (This routine makes more sense if `a0', `a1', and `a2' are considered
  896. to form a fixed-point value with binary point between `a1' and `a2'. This
  897. fixed-point value is shifted right by the number of bits given in `count',
  898. and the integer part of the result is returned at the locations pointed to
  899. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  900. corrupted as described above, and is returned at the location pointed to by
  901. `z2Ptr'.)
  902. -------------------------------------------------------------------------------
  903. }
  904. Procedure
  905. shift64ExtraRightJamming(
  906. a0: bits32;
  907. a1: bits32;
  908. a2: bits32;
  909. count: int16;
  910. VAR z0Ptr: bits32;
  911. VAR z1Ptr: bits32;
  912. VAR z2Ptr: bits32
  913. ); overload;
  914. Var
  915. z0, z1, z2: bits32;
  916. negCount : int8;
  917. Begin
  918. negCount := ( - count ) AND 31;
  919. if ( count = 0 ) then
  920. Begin
  921. z2 := a2;
  922. z1 := a1;
  923. z0 := a0;
  924. End
  925. else
  926. Begin
  927. if ( count < 32 ) Then
  928. Begin
  929. z2 := a1 shl negCount;
  930. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  931. z0 := a0 shr count;
  932. End
  933. else
  934. Begin
  935. if ( count = 32 ) then
  936. Begin
  937. z2 := a1;
  938. z1 := a0;
  939. End
  940. else
  941. Begin
  942. a2 := a2 or a1;
  943. if ( count < 64 ) then
  944. Begin
  945. z2 := a0 shl negCount;
  946. z1 := a0 shr ( count AND 31 );
  947. End
  948. else
  949. Begin
  950. if count = 64 then
  951. z2 := a0
  952. else
  953. z2 := bits32(a0 <> 0);
  954. z1 := 0;
  955. End;
  956. End;
  957. z0 := 0;
  958. End;
  959. z2 := z2 or bits32( a2 <> 0 );
  960. End;
  961. z2Ptr := z2;
  962. z1Ptr := z1;
  963. z0Ptr := z0;
  964. End;
  965. {*
  966. -------------------------------------------------------------------------------
  967. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  968. number of bits given in `count'. Any bits shifted off are lost. The value
  969. of `count' must be less than 32. The result is broken into two 32-bit
  970. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  971. -------------------------------------------------------------------------------
  972. *}
  973. Procedure
  974. shortShift64Left(
  975. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  976. Begin
  977. z1Ptr := a1 shl count;
  978. if count = 0 then
  979. z0Ptr := a0
  980. else
  981. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  982. End;
  983. {*
  984. -------------------------------------------------------------------------------
  985. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  986. by the number of bits given in `count'. Any bits shifted off are lost.
  987. The value of `count' must be less than 32. The result is broken into three
  988. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  989. `z1Ptr', and `z2Ptr'.
  990. -------------------------------------------------------------------------------
  991. *}
  992. Procedure
  993. shortShift96Left(
  994. a0: bits32;
  995. a1: bits32;
  996. a2: bits32;
  997. count: int16;
  998. VAR z0Ptr: bits32;
  999. VAR z1Ptr: bits32;
  1000. VAR z2Ptr: bits32
  1001. );
  1002. Var
  1003. z0, z1, z2: bits32;
  1004. negCount: int8;
  1005. Begin
  1006. z2 := a2 shl count;
  1007. z1 := a1 shl count;
  1008. z0 := a0 shl count;
  1009. if ( 0 < count ) then
  1010. Begin
  1011. negCount := ( ( - count ) AND 31 );
  1012. z1 := z1 or (a2 shr negCount);
  1013. z0 := z0 or (a1 shr negCount);
  1014. End;
  1015. z2Ptr := z2;
  1016. z1Ptr := z1;
  1017. z0Ptr := z0;
  1018. End;
  1019. {*----------------------------------------------------------------------------
  1020. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1021. | number of bits given in `count'. Any bits shifted off are lost. The value
  1022. | of `count' must be less than 64. The result is broken into two 64-bit
  1023. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1024. *----------------------------------------------------------------------------*}
  1025. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1026. begin
  1027. z1Ptr := a1 shl count;
  1028. if count=0 then
  1029. z0Ptr:=a0
  1030. else
  1031. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1032. end;
  1033. {*
  1034. -------------------------------------------------------------------------------
  1035. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1036. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1037. any carry out is lost. The result is broken into two 32-bit pieces which
  1038. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1039. -------------------------------------------------------------------------------
  1040. *}
  1041. Procedure
  1042. add64(
  1043. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1044. Var
  1045. z1: bits32;
  1046. Begin
  1047. z1 := a1 + b1;
  1048. z1Ptr := z1;
  1049. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1050. End;
  1051. {*
  1052. -------------------------------------------------------------------------------
  1053. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1054. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1055. modulo 2^96, so any carry out is lost. The result is broken into three
  1056. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1057. `z1Ptr', and `z2Ptr'.
  1058. -------------------------------------------------------------------------------
  1059. *}
  1060. Procedure
  1061. add96(
  1062. a0: bits32;
  1063. a1: bits32;
  1064. a2: bits32;
  1065. b0: bits32;
  1066. b1: bits32;
  1067. b2: bits32;
  1068. VAR z0Ptr: bits32;
  1069. VAR z1Ptr: bits32;
  1070. VAR z2Ptr: bits32
  1071. );
  1072. var
  1073. z0, z1, z2: bits32;
  1074. carry0, carry1: int8;
  1075. Begin
  1076. z2 := a2 + b2;
  1077. carry1 := int8( z2 < a2 );
  1078. z1 := a1 + b1;
  1079. carry0 := int8( z1 < a1 );
  1080. z0 := a0 + b0;
  1081. z1 := z1 + carry1;
  1082. z0 := z0 + bits32( z1 < carry1 );
  1083. z0 := z0 + carry0;
  1084. z2Ptr := z2;
  1085. z1Ptr := z1;
  1086. z0Ptr := z0;
  1087. End;
  1088. {*----------------------------------------------------------------------------
  1089. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1090. | by the number of bits given in `count'. Any bits shifted off are lost.
  1091. | The value of `count' must be less than 64. The result is broken into three
  1092. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1093. | `z1Ptr', and `z2Ptr'.
  1094. *----------------------------------------------------------------------------*}
  1095. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1096. var
  1097. z0, z1, z2 : bits64;
  1098. negCount : int8;
  1099. begin
  1100. z2 := a2 shl count;
  1101. z1 := a1 shl count;
  1102. z0 := a0 shl count;
  1103. if ( 0 < count ) then
  1104. begin
  1105. negCount := ( ( - count ) and 63 );
  1106. z1 := z1 or (a2 shr negCount);
  1107. z0 := z0 or (a1 shr negCount);
  1108. end;
  1109. z2Ptr := z2;
  1110. z1Ptr := z1;
  1111. z0Ptr := z0;
  1112. end;
  1113. {*----------------------------------------------------------------------------
  1114. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1115. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1116. | any carry out is lost. The result is broken into two 64-bit pieces which
  1117. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1118. *----------------------------------------------------------------------------*}
  1119. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1120. var
  1121. z1 : bits64;
  1122. begin
  1123. z1 := a1 + b1;
  1124. z1Ptr := z1;
  1125. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1126. end;
  1127. {*----------------------------------------------------------------------------
  1128. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1129. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1130. | modulo 2^192, so any carry out is lost. The result is broken into three
  1131. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1132. | `z1Ptr', and `z2Ptr'.
  1133. *----------------------------------------------------------------------------*}
  1134. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1135. var
  1136. z0, z1, z2 : bits64;
  1137. carry0, carry1 : int8;
  1138. begin
  1139. z2 := a2 + b2;
  1140. carry1 := ord( z2 < a2 );
  1141. z1 := a1 + b1;
  1142. carry0 := ord( z1 < a1 );
  1143. z0 := a0 + b0;
  1144. inc(z1, carry1);
  1145. inc(z0, ord( z1 < carry1 ));
  1146. inc(z0, carry0);
  1147. z2Ptr := z2;
  1148. z1Ptr := z1;
  1149. z0Ptr := z0;
  1150. end;
  1151. {*
  1152. -------------------------------------------------------------------------------
  1153. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1154. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1155. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1156. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1157. `z1Ptr'.
  1158. -------------------------------------------------------------------------------
  1159. *}
  1160. Procedure
  1161. sub64(
  1162. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1163. Begin
  1164. z1Ptr := a1 - b1;
  1165. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1166. End;
  1167. {*
  1168. -------------------------------------------------------------------------------
  1169. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1170. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1171. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1172. into three 32-bit pieces which are stored at the locations pointed to by
  1173. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1174. -------------------------------------------------------------------------------
  1175. *}
  1176. Procedure
  1177. sub96(
  1178. a0:bits32;
  1179. a1:bits32;
  1180. a2:bits32;
  1181. b0:bits32;
  1182. b1:bits32;
  1183. b2:bits32;
  1184. VAR z0Ptr:bits32;
  1185. VAR z1Ptr:bits32;
  1186. VAR z2Ptr:bits32
  1187. );
  1188. Var
  1189. z0, z1, z2: bits32;
  1190. borrow0, borrow1: int8;
  1191. Begin
  1192. z2 := a2 - b2;
  1193. borrow1 := int8( a2 < b2 );
  1194. z1 := a1 - b1;
  1195. borrow0 := int8( a1 < b1 );
  1196. z0 := a0 - b0;
  1197. z0 := z0 - bits32( z1 < borrow1 );
  1198. z1 := z1 - borrow1;
  1199. z0 := z0 -borrow0;
  1200. z2Ptr := z2;
  1201. z1Ptr := z1;
  1202. z0Ptr := z0;
  1203. End;
  1204. {*----------------------------------------------------------------------------
  1205. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1206. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1207. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1208. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1209. | `z1Ptr'.
  1210. *----------------------------------------------------------------------------*}
  1211. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1212. begin
  1213. z1Ptr := a1 - b1;
  1214. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1215. end;
  1216. {*----------------------------------------------------------------------------
  1217. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1218. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1219. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1220. | result is broken into three 64-bit pieces which are stored at the locations
  1221. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1222. *----------------------------------------------------------------------------*}
  1223. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1224. var
  1225. z0, z1, z2 : bits64;
  1226. borrow0, borrow1 : int8;
  1227. begin
  1228. z2 := a2 - b2;
  1229. borrow1 := ord( a2 < b2 );
  1230. z1 := a1 - b1;
  1231. borrow0 := ord( a1 < b1 );
  1232. z0 := a0 - b0;
  1233. dec(z0, ord( z1 < borrow1 ));
  1234. dec(z1, borrow1);
  1235. dec(z0, borrow0);
  1236. z2Ptr := z2;
  1237. z1Ptr := z1;
  1238. z0Ptr := z0;
  1239. end;
  1240. {*
  1241. -------------------------------------------------------------------------------
  1242. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1243. into two 32-bit pieces which are stored at the locations pointed to by
  1244. `z0Ptr' and `z1Ptr'.
  1245. -------------------------------------------------------------------------------
  1246. *}
  1247. {$IFDEF SOFTFPU_COMPILER_MUL32TO64}
  1248. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1249. var
  1250. tmp: qword;
  1251. begin
  1252. tmp:=qword(a) * b;
  1253. z0ptr:=hi(tmp);
  1254. z1ptr:=lo(tmp);
  1255. end;
  1256. {$ELSE}
  1257. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1258. :bits32 );
  1259. Var
  1260. aHigh, aLow, bHigh, bLow: bits16;
  1261. z0, zMiddleA, zMiddleB, z1: bits32;
  1262. Begin
  1263. aLow := bits16(a);
  1264. aHigh := a shr 16;
  1265. bLow := bits16(b);
  1266. bHigh := b shr 16;
  1267. z1 := ( bits32( aLow) ) * bLow;
  1268. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1269. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1270. z0 := ( bits32 (aHigh) ) * bHigh;
  1271. zMiddleA := zMiddleA + zMiddleB;
  1272. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1273. zMiddleA := zmiddleA shl 16;
  1274. z1 := z1 + zMiddleA;
  1275. z0 := z0 + bits32( z1 < zMiddleA );
  1276. z1Ptr := z1;
  1277. z0Ptr := z0;
  1278. End;
  1279. {$ENDIF}
  1280. {*
  1281. -------------------------------------------------------------------------------
  1282. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1283. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1284. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1285. `z2Ptr'.
  1286. -------------------------------------------------------------------------------
  1287. *}
  1288. Procedure
  1289. mul64By32To96(
  1290. a0:bits32;
  1291. a1:bits32;
  1292. b:bits32;
  1293. VAR z0Ptr:bits32;
  1294. VAR z1Ptr:bits32;
  1295. VAR z2Ptr:bits32
  1296. );
  1297. Var
  1298. z0, z1, z2, more1: bits32;
  1299. Begin
  1300. mul32To64( a1, b, z1, z2 );
  1301. mul32To64( a0, b, z0, more1 );
  1302. add64( z0, more1, 0, z1, z0, z1 );
  1303. z2Ptr := z2;
  1304. z1Ptr := z1;
  1305. z0Ptr := z0;
  1306. End;
  1307. {*
  1308. -------------------------------------------------------------------------------
  1309. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1310. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1311. product. The product is broken into four 32-bit pieces which are stored at
  1312. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1313. -------------------------------------------------------------------------------
  1314. *}
  1315. Procedure
  1316. mul64To128(
  1317. a0:bits32;
  1318. a1:bits32;
  1319. b0:bits32;
  1320. b1:bits32;
  1321. VAR z0Ptr:bits32;
  1322. VAR z1Ptr:bits32;
  1323. VAR z2Ptr:bits32;
  1324. VAR z3Ptr:bits32
  1325. );
  1326. Var
  1327. z0, z1, z2, z3: bits32;
  1328. more1, more2: bits32;
  1329. Begin
  1330. mul32To64( a1, b1, z2, z3 );
  1331. mul32To64( a1, b0, z1, more2 );
  1332. add64( z1, more2, 0, z2, z1, z2 );
  1333. mul32To64( a0, b0, z0, more1 );
  1334. add64( z0, more1, 0, z1, z0, z1 );
  1335. mul32To64( a0, b1, more1, more2 );
  1336. add64( more1, more2, 0, z2, more1, z2 );
  1337. add64( z0, z1, 0, more1, z0, z1 );
  1338. z3Ptr := z3;
  1339. z2Ptr := z2;
  1340. z1Ptr := z1;
  1341. z0Ptr := z0;
  1342. End;
  1343. {*----------------------------------------------------------------------------
  1344. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1345. | into two 64-bit pieces which are stored at the locations pointed to by
  1346. | `z0Ptr' and `z1Ptr'.
  1347. *----------------------------------------------------------------------------*}
  1348. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1349. var
  1350. aHigh, aLow, bHigh, bLow : bits32;
  1351. z0, zMiddleA, zMiddleB, z1 : bits64;
  1352. begin
  1353. aLow := a;
  1354. aHigh := a shr 32;
  1355. bLow := b;
  1356. bHigh := b shr 32;
  1357. z1 := ( bits64(aLow) ) * bLow;
  1358. zMiddleA := ( bits64( aLow )) * bHigh;
  1359. zMiddleB := ( bits64( aHigh )) * bLow;
  1360. z0 := ( bits64(aHigh) ) * bHigh;
  1361. inc(zMiddleA, zMiddleB);
  1362. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1363. zMiddleA := zMiddleA shl 32;
  1364. inc(z1, zMiddleA);
  1365. inc(z0, ord( z1 < zMiddleA ));
  1366. z1Ptr := z1;
  1367. z0Ptr := z0;
  1368. end;
  1369. {*----------------------------------------------------------------------------
  1370. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1371. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1372. | product. The product is broken into four 64-bit pieces which are stored at
  1373. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1374. *----------------------------------------------------------------------------*}
  1375. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1376. var
  1377. z0,z1,z2,z3,more1,more2 : bits64;
  1378. begin
  1379. mul64To128( a1, b1, z2, z3 );
  1380. mul64To128( a1, b0, z1, more2 );
  1381. add128( z1, more2, 0, z2, z1, z2 );
  1382. mul64To128( a0, b0, z0, more1 );
  1383. add128( z0, more1, 0, z1, z0, z1 );
  1384. mul64To128( a0, b1, more1, more2 );
  1385. add128( more1, more2, 0, z2, more1, z2 );
  1386. add128( z0, z1, 0, more1, z0, z1 );
  1387. z3Ptr := z3;
  1388. z2Ptr := z2;
  1389. z1Ptr := z1;
  1390. z0Ptr := z0;
  1391. end;
  1392. {*----------------------------------------------------------------------------
  1393. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1394. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1395. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1396. | `z2Ptr'.
  1397. *----------------------------------------------------------------------------*}
  1398. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1399. var
  1400. z0, z1, z2, more1 : bits64;
  1401. begin
  1402. mul64To128( a1, b, z1, z2 );
  1403. mul64To128( a0, b, z0, more1 );
  1404. add128( z0, more1, 0, z1, z0, z1 );
  1405. z2Ptr := z2;
  1406. z1Ptr := z1;
  1407. z0Ptr := z0;
  1408. end;
  1409. {*----------------------------------------------------------------------------
  1410. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1411. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1412. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1413. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1414. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1415. | unsigned integer is returned.
  1416. *----------------------------------------------------------------------------*}
  1417. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1418. var
  1419. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1420. begin
  1421. if ( b <= a0 ) then
  1422. begin
  1423. result:=qword( $FFFFFFFFFFFFFFFF );
  1424. exit;
  1425. end;
  1426. b0 := b shr 32;
  1427. if ( b0 shl 32 <= a0 ) then
  1428. z:=qword( $FFFFFFFF00000000 )
  1429. else
  1430. z:=( a0 div b0 ) shl 32;
  1431. mul64To128( b, z, term0, term1 );
  1432. sub128( a0, a1, term0, term1, rem0, rem1 );
  1433. while ( ( sbits64(rem0) ) < 0 ) do begin
  1434. dec(z,qword( $100000000 ));
  1435. b1 := b shl 32;
  1436. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1437. end;
  1438. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1439. if ( b0 shl 32 <= rem0 ) then
  1440. z:=z or $FFFFFFFF
  1441. else
  1442. z:=z or rem0 div b0;
  1443. result:=z;
  1444. end;
  1445. {*
  1446. -------------------------------------------------------------------------------
  1447. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1448. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1449. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1450. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1451. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1452. unsigned integer is returned.
  1453. -------------------------------------------------------------------------------
  1454. *}
  1455. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1456. Var
  1457. b0, b1: bits32;
  1458. rem0, rem1, term0, term1: bits32;
  1459. z: bits32;
  1460. Begin
  1461. if ( b <= a0 ) then
  1462. Begin
  1463. estimateDiv64To32 := $FFFFFFFF;
  1464. exit;
  1465. End;
  1466. b0 := b shr 16;
  1467. if ( b0 shl 16 <= a0 ) then
  1468. z:= $FFFF0000
  1469. else
  1470. z:= ( a0 div b0 ) shl 16;
  1471. mul32To64( b, z, term0, term1 );
  1472. sub64( a0, a1, term0, term1, rem0, rem1 );
  1473. while ( ( sbits32 (rem0) ) < 0 ) do
  1474. Begin
  1475. z := z - $10000;
  1476. b1 := b shl 16;
  1477. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1478. End;
  1479. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1480. if ( b0 shl 16 <= rem0 ) then
  1481. z := z or $FFFF
  1482. else
  1483. z := z or (rem0 div b0);
  1484. estimateDiv64To32 := z;
  1485. End;
  1486. {*
  1487. -------------------------------------------------------------------------------
  1488. Returns an approximation to the square root of the 32-bit significand given
  1489. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1490. `aExp' (the least significant bit) is 1, the integer returned approximates
  1491. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1492. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1493. case, the approximation returned lies strictly within +/-2 of the exact
  1494. value.
  1495. -------------------------------------------------------------------------------
  1496. *}
  1497. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1498. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1499. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1500. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1501. );
  1502. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1503. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1504. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1505. );
  1506. Var
  1507. index: int8;
  1508. z: bits32;
  1509. Begin
  1510. index := ( a shr 27 ) AND 15;
  1511. if ( aExp AND 1 ) <> 0 then
  1512. Begin
  1513. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1514. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1515. a := a shr 1;
  1516. End
  1517. else
  1518. Begin
  1519. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1520. z := a div z + z;
  1521. if ( $20000 <= z ) then
  1522. z := $FFFF8000
  1523. else
  1524. z := ( z shl 15 );
  1525. if ( z <= a ) then
  1526. Begin
  1527. estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
  1528. exit;
  1529. End;
  1530. End;
  1531. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1532. End;
  1533. {*
  1534. -------------------------------------------------------------------------------
  1535. Returns the number of leading 0 bits before the most-significant 1 bit of
  1536. `a'. If `a' is zero, 32 is returned.
  1537. -------------------------------------------------------------------------------
  1538. *}
  1539. Function countLeadingZeros32( a:bits32 ): int8;
  1540. const countLeadingZerosHigh:array[0..255] of int8 = (
  1541. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1542. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1543. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1544. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1545. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1546. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1547. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1548. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1549. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1550. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1551. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1552. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1553. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1554. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1555. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1556. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1557. );
  1558. Var
  1559. shiftCount: int8;
  1560. Begin
  1561. shiftCount := 0;
  1562. if ( a < $10000 ) then
  1563. Begin
  1564. shiftCount := shiftcount + 16;
  1565. a := a shl 16;
  1566. End;
  1567. if ( a < $1000000 ) then
  1568. Begin
  1569. shiftCount := shiftcount + 8;
  1570. a := a shl 8;
  1571. end;
  1572. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1573. countLeadingZeros32:= shiftCount;
  1574. End;
  1575. {*----------------------------------------------------------------------------
  1576. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1577. | `a'. If `a' is zero, 64 is returned.
  1578. *----------------------------------------------------------------------------*}
  1579. function countLeadingZeros64( a : bits64): int8;
  1580. var
  1581. shiftcount : int8;
  1582. Begin
  1583. shiftCount := 0;
  1584. if ( a < bits64(bits64(1) shl 32 )) then
  1585. shiftCount := shiftcount + 32
  1586. else
  1587. a := a shr 32;
  1588. shiftCount := shiftCount + countLeadingZeros32( a );
  1589. countLeadingZeros64:= shiftCount;
  1590. End;
  1591. {*
  1592. -------------------------------------------------------------------------------
  1593. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1594. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1595. Otherwise, returns 0.
  1596. -------------------------------------------------------------------------------
  1597. *}
  1598. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1599. Begin
  1600. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1601. End;
  1602. {*
  1603. -------------------------------------------------------------------------------
  1604. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1605. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1606. returns 0.
  1607. -------------------------------------------------------------------------------
  1608. *}
  1609. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1610. Begin
  1611. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1612. End;
  1613. const
  1614. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1615. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1616. (*****************************************************************************)
  1617. (* End Low-Level arithmetic *)
  1618. (*****************************************************************************)
  1619. {*----------------------------------------------------------------------------
  1620. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1621. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1622. | returns 0.
  1623. *----------------------------------------------------------------------------*}
  1624. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1625. begin
  1626. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1627. end;
  1628. {*
  1629. -------------------------------------------------------------------------------
  1630. Functions and definitions to determine: (1) whether tininess for underflow
  1631. is detected before or after rounding by default, (2) what (if anything)
  1632. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1633. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1634. are propagated from function inputs to output. These details are ENDIAN
  1635. specific
  1636. -------------------------------------------------------------------------------
  1637. *}
  1638. {$IFDEF ENDIAN_LITTLE}
  1639. {*
  1640. -------------------------------------------------------------------------------
  1641. Internal canonical NaN format.
  1642. -------------------------------------------------------------------------------
  1643. *}
  1644. TYPE
  1645. commonNaNT = record
  1646. high, low : bits32;
  1647. sign: flag;
  1648. end;
  1649. {*
  1650. -------------------------------------------------------------------------------
  1651. The pattern for a default generated single-precision NaN.
  1652. -------------------------------------------------------------------------------
  1653. *}
  1654. const float32_default_nan = $FFC00000;
  1655. {*
  1656. -------------------------------------------------------------------------------
  1657. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1658. otherwise returns 0.
  1659. -------------------------------------------------------------------------------
  1660. *}
  1661. Function float32_is_nan( a : float32 ): flag;
  1662. Begin
  1663. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1664. End;
  1665. {*
  1666. -------------------------------------------------------------------------------
  1667. Returns 1 if the single-precision floating-point value `a' is a signaling
  1668. NaN; otherwise returns 0.
  1669. -------------------------------------------------------------------------------
  1670. *}
  1671. Function float32_is_signaling_nan( a : float32 ): flag;
  1672. Begin
  1673. float32_is_signaling_nan := flag
  1674. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1675. End;
  1676. {*
  1677. -------------------------------------------------------------------------------
  1678. Returns the result of converting the single-precision floating-point NaN
  1679. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1680. exception is raised.
  1681. -------------------------------------------------------------------------------
  1682. *}
  1683. function float32ToCommonNaN(a: float32) : commonNaNT;
  1684. var
  1685. z : commonNaNT ;
  1686. Begin
  1687. if ( float32_is_signaling_nan( a ) <> 0) then
  1688. float_raise( float_flag_invalid );
  1689. z.sign := a shr 31;
  1690. z.low := 0;
  1691. z.high := a shl 9;
  1692. result := z;
  1693. End;
  1694. {*
  1695. -------------------------------------------------------------------------------
  1696. Returns the result of converting the canonical NaN `a' to the single-
  1697. precision floating-point format.
  1698. -------------------------------------------------------------------------------
  1699. *}
  1700. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1701. Begin
  1702. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1703. End;
  1704. {*
  1705. -------------------------------------------------------------------------------
  1706. Takes two single-precision floating-point values `a' and `b', one of which
  1707. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1708. signaling NaN, the invalid exception is raised.
  1709. -------------------------------------------------------------------------------
  1710. *}
  1711. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1712. Var
  1713. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1714. label returnLargerSignificand;
  1715. Begin
  1716. aIsNaN := float32_is_nan( a );
  1717. aIsSignalingNaN := float32_is_signaling_nan( a );
  1718. bIsNaN := float32_is_nan( b );
  1719. bIsSignalingNaN := float32_is_signaling_nan( b );
  1720. a := a or $00400000;
  1721. b := b or $00400000;
  1722. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1723. float_raise( float_flag_invalid );
  1724. if ( aIsSignalingNaN )<> 0 then
  1725. Begin
  1726. if ( bIsSignalingNaN ) <> 0 then
  1727. goto returnLargerSignificand;
  1728. if bIsNan <> 0 then
  1729. propagateFloat32NaN := b
  1730. else
  1731. propagateFloat32NaN := a;
  1732. exit;
  1733. End
  1734. else if ( aIsNaN <> 0) then
  1735. Begin
  1736. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1737. Begin
  1738. propagateFloat32NaN := a;
  1739. exit;
  1740. End;
  1741. returnLargerSignificand:
  1742. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1743. Begin
  1744. propagateFloat32NaN := b;
  1745. exit;
  1746. End;
  1747. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1748. Begin
  1749. propagateFloat32NaN := a;
  1750. End;
  1751. if a < b then
  1752. propagateFloat32NaN := a
  1753. else
  1754. propagateFloat32NaN := b;
  1755. exit;
  1756. End
  1757. else
  1758. Begin
  1759. propagateFloat32NaN := b;
  1760. exit;
  1761. End;
  1762. End;
  1763. {*
  1764. -------------------------------------------------------------------------------
  1765. The pattern for a default generated double-precision NaN. The `high' and
  1766. `low' values hold the most- and least-significant bits, respectively.
  1767. -------------------------------------------------------------------------------
  1768. *}
  1769. const
  1770. float64_default_nan_high = $FFF80000;
  1771. float64_default_nan_low = $00000000;
  1772. {*
  1773. -------------------------------------------------------------------------------
  1774. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1775. otherwise returns 0.
  1776. -------------------------------------------------------------------------------
  1777. *}
  1778. Function float64_is_nan( a : float64 ) : flag;
  1779. Begin
  1780. float64_is_nan :=
  1781. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1782. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1783. End;
  1784. {*
  1785. -------------------------------------------------------------------------------
  1786. Returns 1 if the double-precision floating-point value `a' is a signaling
  1787. NaN; otherwise returns 0.
  1788. -------------------------------------------------------------------------------
  1789. *}
  1790. Function float64_is_signaling_nan( a : float64 ): flag;
  1791. Begin
  1792. float64_is_signaling_nan :=
  1793. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1794. and ( a.low or ( a.high and $0007FFFF ) );
  1795. End;
  1796. {*
  1797. -------------------------------------------------------------------------------
  1798. Returns the result of converting the double-precision floating-point NaN
  1799. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1800. exception is raised.
  1801. -------------------------------------------------------------------------------
  1802. *}
  1803. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1804. Var
  1805. z : commonNaNT;
  1806. Begin
  1807. if ( float64_is_signaling_nan( a )<>0 ) then
  1808. float_raise( float_flag_invalid );
  1809. z.sign := a.high shr 31;
  1810. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1811. result := z;
  1812. End;
  1813. {*
  1814. -------------------------------------------------------------------------------
  1815. Returns the result of converting the canonical NaN `a' to the double-
  1816. precision floating-point format.
  1817. -------------------------------------------------------------------------------
  1818. *}
  1819. function commonNaNToFloat64( a : commonNaNT) : float64;
  1820. Var
  1821. z: float64;
  1822. Begin
  1823. shift64Right( a.high, a.low, 12, z.high, z.low );
  1824. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1825. result := z;
  1826. End;
  1827. {*
  1828. -------------------------------------------------------------------------------
  1829. Takes two double-precision floating-point values `a' and `b', one of which
  1830. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1831. signaling NaN, the invalid exception is raised.
  1832. -------------------------------------------------------------------------------
  1833. *}
  1834. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1835. Var
  1836. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1837. label returnLargerSignificand;
  1838. Begin
  1839. aIsNaN := float64_is_nan( a );
  1840. aIsSignalingNaN := float64_is_signaling_nan( a );
  1841. bIsNaN := float64_is_nan( b );
  1842. bIsSignalingNaN := float64_is_signaling_nan( b );
  1843. a.high := a.high or $00080000;
  1844. b.high := b.high or $00080000;
  1845. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1846. float_raise( float_flag_invalid );
  1847. if ( aIsSignalingNaN )<>0 then
  1848. Begin
  1849. if ( bIsSignalingNaN )<>0 then
  1850. goto returnLargerSignificand;
  1851. if bIsNan <> 0 then
  1852. c := b
  1853. else
  1854. c := a;
  1855. exit;
  1856. End
  1857. else if ( aIsNaN )<> 0 then
  1858. Begin
  1859. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1860. Begin
  1861. c := a;
  1862. exit;
  1863. End;
  1864. returnLargerSignificand:
  1865. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1866. Begin
  1867. c := b;
  1868. exit;
  1869. End;
  1870. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1871. Begin
  1872. c := a;
  1873. exit;
  1874. End;
  1875. if a.high < b.high then
  1876. c := a
  1877. else
  1878. c := b;
  1879. exit;
  1880. End
  1881. else
  1882. Begin
  1883. c := b;
  1884. exit;
  1885. End;
  1886. End;
  1887. {*----------------------------------------------------------------------------
  1888. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1889. | otherwise returns 0.
  1890. *----------------------------------------------------------------------------*}
  1891. function float128_is_nan( a : float128): flag;
  1892. begin
  1893. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1894. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1895. end;
  1896. {*----------------------------------------------------------------------------
  1897. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1898. | signaling NaN; otherwise returns 0.
  1899. *----------------------------------------------------------------------------*}
  1900. function float128_is_signaling_nan( a : float128): flag;
  1901. begin
  1902. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1903. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1904. end;
  1905. {*----------------------------------------------------------------------------
  1906. | Returns the result of converting the quadruple-precision floating-point NaN
  1907. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1908. | exception is raised.
  1909. *----------------------------------------------------------------------------*}
  1910. function float128ToCommonNaN( a : float128): commonNaNT;
  1911. var
  1912. z: commonNaNT;
  1913. qhigh,qlow : qword;
  1914. begin
  1915. if ( float128_is_signaling_nan( a )<>0) then
  1916. float_raise( float_flag_invalid );
  1917. z.sign := a.high shr 63;
  1918. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1919. z.high:=qhigh shr 32;
  1920. z.low:=qhigh and $ffffffff;
  1921. result:=z;
  1922. end;
  1923. {*----------------------------------------------------------------------------
  1924. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1925. | precision floating-point format.
  1926. *----------------------------------------------------------------------------*}
  1927. function commonNaNToFloat128( a : commonNaNT): float128;
  1928. var
  1929. z: float128;
  1930. begin
  1931. shift128Right( a.high, a.low, 16, z.high, z.low );
  1932. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1933. result:=z;
  1934. end;
  1935. {*----------------------------------------------------------------------------
  1936. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1937. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1938. | `b' is a signaling NaN, the invalid exception is raised.
  1939. *----------------------------------------------------------------------------*}
  1940. function propagateFloat128NaN( a: float128; b : float128): float128;
  1941. var
  1942. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1943. label
  1944. returnLargerSignificand;
  1945. begin
  1946. aIsNaN := float128_is_nan( a );
  1947. aIsSignalingNaN := float128_is_signaling_nan( a );
  1948. bIsNaN := float128_is_nan( b );
  1949. bIsSignalingNaN := float128_is_signaling_nan( b );
  1950. a.high := a.high or int64( $0000800000000000 );
  1951. b.high := b.high or int64( $0000800000000000 );
  1952. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1953. float_raise( float_flag_invalid );
  1954. if ( aIsSignalingNaN )<>0 then
  1955. begin
  1956. if ( bIsSignalingNaN )<>0 then
  1957. goto returnLargerSignificand;
  1958. if bIsNaN<>0 then
  1959. result := b
  1960. else
  1961. result := a;
  1962. exit;
  1963. end
  1964. else if ( aIsNaN )<>0 then
  1965. begin
  1966. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1967. begin
  1968. result := a;
  1969. exit;
  1970. end;
  1971. returnLargerSignificand:
  1972. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1973. begin
  1974. result := b;
  1975. exit;
  1976. end;
  1977. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1978. begin
  1979. result := a;
  1980. exit
  1981. end;
  1982. if ( a.high < b.high ) then
  1983. result := a
  1984. else
  1985. result := b;
  1986. exit;
  1987. end
  1988. else
  1989. result:=b;
  1990. end;
  1991. {$ELSE}
  1992. { Big endian code }
  1993. (*----------------------------------------------------------------------------
  1994. | Internal canonical NaN format.
  1995. *----------------------------------------------------------------------------*)
  1996. type
  1997. commonNANT = record
  1998. high, low : bits32;
  1999. sign : flag;
  2000. end;
  2001. (*----------------------------------------------------------------------------
  2002. | The pattern for a default generated single-precision NaN.
  2003. *----------------------------------------------------------------------------*)
  2004. const float32_default_nan = $7FFFFFFF;
  2005. (*----------------------------------------------------------------------------
  2006. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  2007. | otherwise returns 0.
  2008. *----------------------------------------------------------------------------*)
  2009. function float32_is_nan(a: float32): flag;
  2010. begin
  2011. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2012. end;
  2013. (*----------------------------------------------------------------------------
  2014. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2015. | NaN; otherwise returns 0.
  2016. *----------------------------------------------------------------------------*)
  2017. function float32_is_signaling_nan(a: float32):flag;
  2018. begin
  2019. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2020. end;
  2021. (*----------------------------------------------------------------------------
  2022. | Returns the result of converting the single-precision floating-point NaN
  2023. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2024. | exception is raised.
  2025. *----------------------------------------------------------------------------*)
  2026. function float32ToCommonNaN( a: float32) : commonNaNT;
  2027. var
  2028. z: commonNANT;
  2029. begin
  2030. if float32_is_signaling_nan(a)<>0 then
  2031. float_raise(float_flag_invalid);
  2032. z.sign := a shr 31;
  2033. z.low := 0;
  2034. z.high := a shl 9;
  2035. result:=z;
  2036. end;
  2037. (*----------------------------------------------------------------------------
  2038. | Returns the result of converting the canonical NaN `a' to the single-
  2039. | precision floating-point format.
  2040. *----------------------------------------------------------------------------*)
  2041. function CommonNanToFloat32(a : CommonNaNT): float32;
  2042. begin
  2043. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2044. end;
  2045. (*----------------------------------------------------------------------------
  2046. | Takes two single-precision floating-point values `a' and `b', one of which
  2047. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2048. | signaling NaN, the invalid exception is raised.
  2049. *----------------------------------------------------------------------------*)
  2050. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2051. var
  2052. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2053. begin
  2054. aIsNaN := float32_is_nan( a );
  2055. aIsSignalingNaN := float32_is_signaling_nan( a );
  2056. bIsNaN := float32_is_nan( b );
  2057. bIsSignalingNaN := float32_is_signaling_nan( b );
  2058. a := a or $00400000;
  2059. b := b or $00400000;
  2060. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2061. float_raise( float_flag_invalid );
  2062. if bIsSignalingNaN<>0 then
  2063. propagateFloat32Nan := b
  2064. else if aIsSignalingNan<>0 then
  2065. propagateFloat32Nan := a
  2066. else if bIsNan<>0 then
  2067. propagateFloat32Nan := b
  2068. else
  2069. propagateFloat32Nan := a;
  2070. end;
  2071. (*----------------------------------------------------------------------------
  2072. | The pattern for a default generated double-precision NaN. The `high' and
  2073. | `low' values hold the most- and least-significant bits, respectively.
  2074. *----------------------------------------------------------------------------*)
  2075. const
  2076. float64_default_nan_high = $7FFFFFFF;
  2077. float64_default_nan_low = $FFFFFFFF;
  2078. (*----------------------------------------------------------------------------
  2079. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2080. | otherwise returns 0.
  2081. *----------------------------------------------------------------------------*)
  2082. function float64_is_nan(a: float64): flag;
  2083. begin
  2084. float64_is_nan := flag (
  2085. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2086. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2087. end;
  2088. (*----------------------------------------------------------------------------
  2089. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2090. | NaN; otherwise returns 0.
  2091. *----------------------------------------------------------------------------*)
  2092. function float64_is_signaling_nan( a:float64): flag;
  2093. begin
  2094. float64_is_signaling_nan := flag(
  2095. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2096. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2097. end;
  2098. (*----------------------------------------------------------------------------
  2099. | Returns the result of converting the double-precision floating-point NaN
  2100. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2101. | exception is raised.
  2102. *----------------------------------------------------------------------------*)
  2103. function float64ToCommonNaN( a : float64) : commonNaNT;
  2104. var
  2105. z : commonNaNT;
  2106. begin
  2107. if ( float64_is_signaling_nan( a )<>0 ) then
  2108. float_raise( float_flag_invalid );
  2109. z.sign := a.high shr 31;
  2110. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2111. result:=z;
  2112. end;
  2113. (*----------------------------------------------------------------------------
  2114. | Returns the result of converting the canonical NaN `a' to the double-
  2115. | precision floating-point format.
  2116. *----------------------------------------------------------------------------*)
  2117. function commonNaNToFloat64( a : commonNaNT): float64;
  2118. var
  2119. z: float64;
  2120. begin
  2121. shift64Right( a.high, a.low, 12, z.high, z.low );
  2122. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2123. result:=z;
  2124. end;
  2125. (*----------------------------------------------------------------------------
  2126. | Takes two double-precision floating-point values `a' and `b', one of which
  2127. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2128. | signaling NaN, the invalid exception is raised.
  2129. *----------------------------------------------------------------------------*)
  2130. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2131. var
  2132. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2133. begin
  2134. aIsNaN := float64_is_nan( a );
  2135. aIsSignalingNaN := float64_is_signaling_nan( a );
  2136. bIsNaN := float64_is_nan( b );
  2137. bIsSignalingNaN := float64_is_signaling_nan( b );
  2138. a.high := a.high or $00080000;
  2139. b.high := b.high or $00080000;
  2140. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2141. float_raise( float_flag_invalid );
  2142. if bIsSignalingNaN<>0 then
  2143. c := b
  2144. else if aIsSignalingNan<>0 then
  2145. c := a
  2146. else if bIsNan<>0 then
  2147. c := b
  2148. else
  2149. c := a;
  2150. end;
  2151. {*----------------------------------------------------------------------------
  2152. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  2153. | otherwise returns 0.
  2154. *----------------------------------------------------------------------------*}
  2155. function float128_is_nan( a : float128): flag;
  2156. begin
  2157. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  2158. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  2159. end;
  2160. {*----------------------------------------------------------------------------
  2161. | Returns 1 if the quadruple-precision floating-point value `a' is a
  2162. | signaling NaN; otherwise returns 0.
  2163. *----------------------------------------------------------------------------*}
  2164. function float128_is_signaling_nan( a : float128): flag;
  2165. begin
  2166. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  2167. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  2168. end;
  2169. {*----------------------------------------------------------------------------
  2170. | Returns the result of converting the quadruple-precision floating-point NaN
  2171. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2172. | exception is raised.
  2173. *----------------------------------------------------------------------------*}
  2174. function float128ToCommonNaN( a : float128): commonNaNT;
  2175. var
  2176. z: commonNaNT;
  2177. qhigh,qlow : qword;
  2178. begin
  2179. if ( float128_is_signaling_nan( a )<>0) then
  2180. float_raise( float_flag_invalid );
  2181. z.sign := a.high shr 63;
  2182. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  2183. z.high:=qhigh shr 32;
  2184. z.low:=qhigh and $ffffffff;
  2185. result:=z;
  2186. end;
  2187. {*----------------------------------------------------------------------------
  2188. | Returns the result of converting the canonical NaN `a' to the quadruple-
  2189. | precision floating-point format.
  2190. *----------------------------------------------------------------------------*}
  2191. function commonNaNToFloat128( a : commonNaNT): float128;
  2192. var
  2193. z: float128;
  2194. begin
  2195. shift128Right( a.high, a.low, 16, z.high, z.low );
  2196. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  2197. result:=z;
  2198. end;
  2199. {*----------------------------------------------------------------------------
  2200. | Takes two quadruple-precision floating-point values `a' and `b', one of
  2201. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  2202. | `b' is a signaling NaN, the invalid exception is raised.
  2203. *----------------------------------------------------------------------------*}
  2204. function propagateFloat128NaN( a: float128; b : float128): float128;
  2205. var
  2206. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2207. label
  2208. returnLargerSignificand;
  2209. begin
  2210. aIsNaN := float128_is_nan( a );
  2211. aIsSignalingNaN := float128_is_signaling_nan( a );
  2212. bIsNaN := float128_is_nan( b );
  2213. bIsSignalingNaN := float128_is_signaling_nan( b );
  2214. a.high := a.high or int64( $0000800000000000 );
  2215. b.high := b.high or int64( $0000800000000000 );
  2216. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2217. float_raise( float_flag_invalid );
  2218. if ( aIsSignalingNaN )<>0 then
  2219. begin
  2220. if ( bIsSignalingNaN )<>0 then
  2221. goto returnLargerSignificand;
  2222. if bIsNaN<>0 then
  2223. result := b
  2224. else
  2225. result := a;
  2226. exit;
  2227. end
  2228. else if ( aIsNaN )<>0 then
  2229. begin
  2230. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  2231. begin
  2232. result := a;
  2233. exit;
  2234. end;
  2235. returnLargerSignificand:
  2236. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  2237. begin
  2238. result := b;
  2239. exit;
  2240. end;
  2241. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  2242. begin
  2243. result := a;
  2244. exit
  2245. end;
  2246. if ( a.high < b.high ) then
  2247. result := a
  2248. else
  2249. result := b;
  2250. exit;
  2251. end
  2252. else
  2253. result:=b;
  2254. end;
  2255. {$ENDIF}
  2256. (****************************************************************************)
  2257. (* END ENDIAN SPECIFIC CODE *)
  2258. (****************************************************************************)
  2259. {*
  2260. -------------------------------------------------------------------------------
  2261. Returns the fraction bits of the single-precision floating-point value `a'.
  2262. -------------------------------------------------------------------------------
  2263. *}
  2264. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2265. Begin
  2266. ExtractFloat32Frac := A AND $007FFFFF;
  2267. End;
  2268. {*
  2269. -------------------------------------------------------------------------------
  2270. Returns the exponent bits of the single-precision floating-point value `a'.
  2271. -------------------------------------------------------------------------------
  2272. *}
  2273. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2274. Begin
  2275. extractFloat32Exp := (a shr 23) AND $FF;
  2276. End;
  2277. {*
  2278. -------------------------------------------------------------------------------
  2279. Returns the sign bit of the single-precision floating-point value `a'.
  2280. -------------------------------------------------------------------------------
  2281. *}
  2282. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2283. Begin
  2284. extractFloat32Sign := a shr 31;
  2285. End;
  2286. {*
  2287. -------------------------------------------------------------------------------
  2288. Normalizes the subnormal single-precision floating-point value represented
  2289. by the denormalized significand `aSig'. The normalized exponent and
  2290. significand are stored at the locations pointed to by `zExpPtr' and
  2291. `zSigPtr', respectively.
  2292. -------------------------------------------------------------------------------
  2293. *}
  2294. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2295. Var
  2296. ShiftCount : BYTE;
  2297. Begin
  2298. shiftCount := countLeadingZeros32( aSig ) - 8;
  2299. zSigPtr := aSig shl shiftCount;
  2300. zExpPtr := 1 - shiftCount;
  2301. End;
  2302. {*
  2303. -------------------------------------------------------------------------------
  2304. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2305. single-precision floating-point value, returning the result. After being
  2306. shifted into the proper positions, the three fields are simply added
  2307. together to form the result. This means that any integer portion of `zSig'
  2308. will be added into the exponent. Since a properly normalized significand
  2309. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2310. than the desired result exponent whenever `zSig' is a complete, normalized
  2311. significand.
  2312. -------------------------------------------------------------------------------
  2313. *}
  2314. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2315. Begin
  2316. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2317. + zSig;
  2318. End;
  2319. {*
  2320. -------------------------------------------------------------------------------
  2321. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2322. and significand `zSig', and returns the proper single-precision floating-
  2323. point value corresponding to the abstract input. Ordinarily, the abstract
  2324. value is simply rounded and packed into the single-precision format, with
  2325. the inexact exception raised if the abstract input cannot be represented
  2326. exactly. However, if the abstract value is too large, the overflow and
  2327. inexact exceptions are raised and an infinity or maximal finite value is
  2328. returned. If the abstract value is too small, the input value is rounded to
  2329. a subnormal number, and the underflow and inexact exceptions are raised if
  2330. the abstract input cannot be represented exactly as a subnormal single-
  2331. precision floating-point number.
  2332. The input significand `zSig' has its binary point between bits 30
  2333. and 29, which is 7 bits to the left of the usual location. This shifted
  2334. significand must be normalized or smaller. If `zSig' is not normalized,
  2335. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2336. and it must not require rounding. In the usual case that `zSig' is
  2337. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2338. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2339. Binary Floating-Point Arithmetic.
  2340. -------------------------------------------------------------------------------
  2341. *}
  2342. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2343. Var
  2344. roundingMode : TFPURoundingMode;
  2345. roundNearestEven : boolean;
  2346. roundIncrement, roundBits : BYTE;
  2347. IsTiny : boolean;
  2348. Begin
  2349. roundingMode := softfloat_rounding_mode;
  2350. roundNearestEven := (roundingMode = float_round_nearest_even);
  2351. roundIncrement := $40;
  2352. if not roundNearestEven then
  2353. Begin
  2354. if ( roundingMode = float_round_to_zero ) Then
  2355. Begin
  2356. roundIncrement := 0;
  2357. End
  2358. else
  2359. Begin
  2360. roundIncrement := $7F;
  2361. if ( zSign <> 0 ) then
  2362. Begin
  2363. if roundingMode = float_round_up then roundIncrement := 0;
  2364. End
  2365. else
  2366. Begin
  2367. if roundingMode = float_round_down then roundIncrement := 0;
  2368. End;
  2369. End
  2370. End;
  2371. roundBits := zSig AND $7F;
  2372. if ($FD <= bits16 (zExp) ) then
  2373. Begin
  2374. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2375. Begin
  2376. float_raise( [float_flag_overflow,float_flag_inexact] );
  2377. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2378. exit;
  2379. End;
  2380. if ( zExp < 0 ) then
  2381. Begin
  2382. isTiny :=
  2383. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2384. OR ( zExp < -1 )
  2385. OR ( (zSig + roundIncrement) < $80000000 );
  2386. shift32RightJamming( zSig, - zExp, zSig );
  2387. zExp := 0;
  2388. roundBits := zSig AND $7F;
  2389. if ( isTiny and (roundBits<>0) ) then
  2390. float_raise( float_flag_underflow );
  2391. End;
  2392. End;
  2393. if ( roundBits )<> 0 then
  2394. set_inexact_flag;
  2395. zSig := ( zSig + roundIncrement ) shr 7;
  2396. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2397. if ( zSig = 0 ) then zExp := 0;
  2398. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2399. End;
  2400. {*
  2401. -------------------------------------------------------------------------------
  2402. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2403. and significand `zSig', and returns the proper single-precision floating-
  2404. point value corresponding to the abstract input. This routine is just like
  2405. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2406. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2407. floating-point exponent.
  2408. -------------------------------------------------------------------------------
  2409. *}
  2410. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2411. Var
  2412. ShiftCount : int8;
  2413. Begin
  2414. shiftCount := countLeadingZeros32( zSig ) - 1;
  2415. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2416. End;
  2417. {*
  2418. -------------------------------------------------------------------------------
  2419. Returns the most-significant 20 fraction bits of the double-precision
  2420. floating-point value `a'.
  2421. -------------------------------------------------------------------------------
  2422. *}
  2423. Function extractFloat64Frac0(a: float64): bits32; inline;
  2424. Begin
  2425. extractFloat64Frac0 := a.high and $000FFFFF;
  2426. End;
  2427. {*
  2428. -------------------------------------------------------------------------------
  2429. Returns the least-significant 32 fraction bits of the double-precision
  2430. floating-point value `a'.
  2431. -------------------------------------------------------------------------------
  2432. *}
  2433. Function extractFloat64Frac1(a: float64): bits32; inline;
  2434. Begin
  2435. extractFloat64Frac1 := a.low;
  2436. End;
  2437. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2438. Function extractFloat64Frac(a: float64): bits64; inline;
  2439. Begin
  2440. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2441. End;
  2442. {*
  2443. -------------------------------------------------------------------------------
  2444. Returns the exponent bits of the double-precision floating-point value `a'.
  2445. -------------------------------------------------------------------------------
  2446. *}
  2447. Function extractFloat64Exp(a: float64): int16; inline;
  2448. Begin
  2449. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2450. End;
  2451. {*
  2452. -------------------------------------------------------------------------------
  2453. Returns the sign bit of the double-precision floating-point value `a'.
  2454. -------------------------------------------------------------------------------
  2455. *}
  2456. Function extractFloat64Sign(a: float64) : flag; inline;
  2457. Begin
  2458. extractFloat64Sign := a.high shr 31;
  2459. End;
  2460. {*
  2461. -------------------------------------------------------------------------------
  2462. Normalizes the subnormal double-precision floating-point value represented
  2463. by the denormalized significand formed by the concatenation of `aSig0' and
  2464. `aSig1'. The normalized exponent is stored at the location pointed to by
  2465. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2466. stored at the location pointed to by `zSig0Ptr', and the least significant
  2467. 32 bits of the normalized significand are stored at the location pointed to
  2468. by `zSig1Ptr'.
  2469. -------------------------------------------------------------------------------
  2470. *}
  2471. Procedure normalizeFloat64Subnormal(
  2472. aSig0: bits32;
  2473. aSig1: bits32;
  2474. VAR zExpPtr : Int16;
  2475. VAR zSig0Ptr : Bits32;
  2476. VAR zSig1Ptr : Bits32
  2477. );
  2478. Var
  2479. ShiftCount : Int8;
  2480. Begin
  2481. if ( aSig0 = 0 ) then
  2482. Begin
  2483. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2484. if ( shiftCount < 0 ) then
  2485. Begin
  2486. zSig0Ptr := aSig1 shr ( - shiftCount );
  2487. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2488. End
  2489. else
  2490. Begin
  2491. zSig0Ptr := aSig1 shl shiftCount;
  2492. zSig1Ptr := 0;
  2493. End;
  2494. zExpPtr := - shiftCount - 31;
  2495. End
  2496. else
  2497. Begin
  2498. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2499. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2500. zExpPtr := 1 - shiftCount;
  2501. End;
  2502. End;
  2503. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2504. var
  2505. shiftCount : int8;
  2506. begin
  2507. shiftCount := countLeadingZeros64( aSig ) - 11;
  2508. zSigPtr := aSig shl shiftCount;
  2509. zExpPtr := 1 - shiftCount;
  2510. end;
  2511. {*
  2512. -------------------------------------------------------------------------------
  2513. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2514. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2515. point value, returning the result. After being shifted into the proper
  2516. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2517. together to form the most significant 32 bits of the result. This means
  2518. that any integer portion of `zSig0' will be added into the exponent. Since
  2519. a properly normalized significand will have an integer portion equal to 1,
  2520. the `zExp' input should be 1 less than the desired result exponent whenever
  2521. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2522. -------------------------------------------------------------------------------
  2523. *}
  2524. Procedure
  2525. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2526. var
  2527. z: Float64;
  2528. Begin
  2529. z.low := zSig1;
  2530. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2531. c := z;
  2532. End;
  2533. {*----------------------------------------------------------------------------
  2534. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2535. | double-precision floating-point value, returning the result. After being
  2536. | shifted into the proper positions, the three fields are simply added
  2537. | together to form the result. This means that any integer portion of `zSig'
  2538. | will be added into the exponent. Since a properly normalized significand
  2539. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2540. | than the desired result exponent whenever `zSig' is a complete, normalized
  2541. | significand.
  2542. *----------------------------------------------------------------------------*}
  2543. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2544. begin
  2545. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2546. end;
  2547. {*
  2548. -------------------------------------------------------------------------------
  2549. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2550. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2551. and `zSig2', and returns the proper double-precision floating-point value
  2552. corresponding to the abstract input. Ordinarily, the abstract value is
  2553. simply rounded and packed into the double-precision format, with the inexact
  2554. exception raised if the abstract input cannot be represented exactly.
  2555. However, if the abstract value is too large, the overflow and inexact
  2556. exceptions are raised and an infinity or maximal finite value is returned.
  2557. If the abstract value is too small, the input value is rounded to a
  2558. subnormal number, and the underflow and inexact exceptions are raised if the
  2559. abstract input cannot be represented exactly as a subnormal double-precision
  2560. floating-point number.
  2561. The input significand must be normalized or smaller. If the input
  2562. significand is not normalized, `zExp' must be 0; in that case, the result
  2563. returned is a subnormal number, and it must not require rounding. In the
  2564. usual case that the input significand is normalized, `zExp' must be 1 less
  2565. than the ``true'' floating-point exponent. The handling of underflow and
  2566. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2567. -------------------------------------------------------------------------------
  2568. *}
  2569. Procedure
  2570. roundAndPackFloat64(
  2571. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2572. Var
  2573. roundingMode : TFPURoundingMode;
  2574. roundNearestEven, increment, isTiny : Flag;
  2575. Begin
  2576. roundingMode := softfloat_rounding_mode;
  2577. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2578. increment := flag( sbits32 (zSig2) < 0 );
  2579. if ( roundNearestEven = flag(FALSE) ) then
  2580. Begin
  2581. if ( roundingMode = float_round_to_zero ) then
  2582. increment := 0
  2583. else
  2584. Begin
  2585. if ( zSign )<> 0 then
  2586. Begin
  2587. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2588. End
  2589. else
  2590. Begin
  2591. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2592. End
  2593. End
  2594. End;
  2595. if ( $7FD <= bits16 (zExp) ) then
  2596. Begin
  2597. if (( $7FD < zExp )
  2598. or (( zExp = $7FD )
  2599. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2600. and (increment<>0)
  2601. )
  2602. ) then
  2603. Begin
  2604. float_raise( [float_flag_overflow,float_flag_inexact] );
  2605. if (( roundingMode = float_round_to_zero )
  2606. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2607. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2608. ) then
  2609. Begin
  2610. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2611. exit;
  2612. End;
  2613. packFloat64( zSign, $7FF, 0, 0, c );
  2614. exit;
  2615. End;
  2616. if ( zExp < 0 ) then
  2617. Begin
  2618. isTiny :=
  2619. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2620. or flag( zExp < -1 )
  2621. or flag(increment = 0)
  2622. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2623. shift64ExtraRightJamming(
  2624. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2625. zExp := 0;
  2626. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2627. if ( roundNearestEven )<>0 then
  2628. Begin
  2629. increment := flag( sbits32 (zSig2) < 0 );
  2630. End
  2631. else
  2632. Begin
  2633. if ( zSign )<>0 then
  2634. Begin
  2635. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2636. End
  2637. else
  2638. Begin
  2639. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2640. End
  2641. End;
  2642. End;
  2643. End;
  2644. if ( zSig2 )<>0 then
  2645. set_inexact_flag;
  2646. if ( increment )<>0 then
  2647. Begin
  2648. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2649. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2650. End
  2651. else
  2652. Begin
  2653. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2654. End;
  2655. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2656. End;
  2657. {*----------------------------------------------------------------------------
  2658. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2659. | and significand `zSig', and returns the proper double-precision floating-
  2660. | point value corresponding to the abstract input. Ordinarily, the abstract
  2661. | value is simply rounded and packed into the double-precision format, with
  2662. | the inexact exception raised if the abstract input cannot be represented
  2663. | exactly. However, if the abstract value is too large, the overflow and
  2664. | inexact exceptions are raised and an infinity or maximal finite value is
  2665. | returned. If the abstract value is too small, the input value is rounded
  2666. | to a subnormal number, and the underflow and inexact exceptions are raised
  2667. | if the abstract input cannot be represented exactly as a subnormal double-
  2668. | precision floating-point number.
  2669. | The input significand `zSig' has its binary point between bits 62
  2670. | and 61, which is 10 bits to the left of the usual location. This shifted
  2671. | significand must be normalized or smaller. If `zSig' is not normalized,
  2672. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2673. | and it must not require rounding. In the usual case that `zSig' is
  2674. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2675. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2676. | Binary Floating-Point Arithmetic.
  2677. *----------------------------------------------------------------------------*}
  2678. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2679. var
  2680. roundingMode: TFPURoundingMode;
  2681. roundNearestEven: flag;
  2682. roundIncrement, roundBits: int16;
  2683. isTiny: flag;
  2684. begin
  2685. roundingMode := softfloat_rounding_mode;
  2686. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2687. roundIncrement := $200;
  2688. if ( roundNearestEven=0 ) then
  2689. begin
  2690. if ( roundingMode = float_round_to_zero ) then
  2691. begin
  2692. roundIncrement := 0;
  2693. end
  2694. else begin
  2695. roundIncrement := $3FF;
  2696. if ( zSign<>0 ) then
  2697. begin
  2698. if ( roundingMode = float_round_up ) then
  2699. roundIncrement := 0;
  2700. end
  2701. else begin
  2702. if ( roundingMode = float_round_down ) then
  2703. roundIncrement := 0;
  2704. end
  2705. end
  2706. end;
  2707. roundBits := zSig and $3FF;
  2708. if ( $7FD <= bits16(zExp) ) then
  2709. begin
  2710. if ( ( $7FD < zExp )
  2711. or ( ( zExp = $7FD )
  2712. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2713. ) then
  2714. begin
  2715. float_raise( [float_flag_overflow,float_flag_inexact] );
  2716. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2717. exit;
  2718. end;
  2719. if ( zExp < 0 ) then
  2720. begin
  2721. isTiny := ord(
  2722. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2723. or ( zExp < -1 )
  2724. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2725. shift64RightJamming( zSig, - zExp, zSig );
  2726. zExp := 0;
  2727. roundBits := zSig and $3FF;
  2728. if ( isTiny and roundBits )<>0 then
  2729. float_raise( float_flag_underflow );
  2730. end
  2731. end;
  2732. if ( roundBits<>0 ) then
  2733. set_inexact_flag;
  2734. zSig := ( zSig + roundIncrement ) shr 10;
  2735. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2736. if ( zSig = 0 ) then
  2737. zExp := 0;
  2738. result:=packFloat64( zSign, zExp, zSig );
  2739. end;
  2740. {*
  2741. -------------------------------------------------------------------------------
  2742. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2743. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2744. returns the proper double-precision floating-point value corresponding
  2745. to the abstract input. This routine is just like `roundAndPackFloat64'
  2746. except that the input significand has fewer bits and does not have to be
  2747. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2748. point exponent.
  2749. -------------------------------------------------------------------------------
  2750. *}
  2751. Procedure
  2752. normalizeRoundAndPackFloat64(
  2753. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2754. Var
  2755. shiftCount : int8;
  2756. zSig2 : bits32;
  2757. Begin
  2758. if ( zSig0 = 0 ) then
  2759. Begin
  2760. zSig0 := zSig1;
  2761. zSig1 := 0;
  2762. zExp := zExp -32;
  2763. End;
  2764. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2765. if ( 0 <= shiftCount ) then
  2766. Begin
  2767. zSig2 := 0;
  2768. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2769. End
  2770. else
  2771. Begin
  2772. shift64ExtraRightJamming
  2773. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2774. End;
  2775. zExp := zExp - shiftCount;
  2776. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2777. End;
  2778. {*
  2779. ----------------------------------------------------------------------------
  2780. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2781. and significand `zSig', and returns the proper double-precision floating-
  2782. point value corresponding to the abstract input. This routine is just like
  2783. `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  2784. Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2785. floating-point exponent.
  2786. ----------------------------------------------------------------------------
  2787. *}
  2788. function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  2789. var
  2790. shiftCount: int8;
  2791. begin
  2792. shiftCount := countLeadingZeros64( zSig ) - 1;
  2793. result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  2794. end;
  2795. {*
  2796. -------------------------------------------------------------------------------
  2797. Returns the result of converting the 32-bit two's complement integer `a' to
  2798. the single-precision floating-point format. The conversion is performed
  2799. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2800. -------------------------------------------------------------------------------
  2801. *}
  2802. Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2803. Var
  2804. zSign : Flag;
  2805. Begin
  2806. if ( a = 0 ) then
  2807. Begin
  2808. int32_to_float32.float32 := 0;
  2809. exit;
  2810. End;
  2811. if ( a = sbits32 ($80000000) ) then
  2812. Begin
  2813. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2814. exit;
  2815. end;
  2816. zSign := flag( a < 0 );
  2817. If zSign<>0 then
  2818. a := -a;
  2819. int32_to_float32.float32:=
  2820. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2821. End;
  2822. {*
  2823. -------------------------------------------------------------------------------
  2824. Returns the result of converting the 32-bit two's complement integer `a' to
  2825. the double-precision floating-point format. The conversion is performed
  2826. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2827. -------------------------------------------------------------------------------
  2828. *}
  2829. Function int32_to_float64( a: int32) : float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2830. var
  2831. zSign : flag;
  2832. absA : bits32;
  2833. shiftCount : int8;
  2834. zSig0, zSig1 : bits32;
  2835. Begin
  2836. if ( a = 0 ) then
  2837. Begin
  2838. packFloat64( 0, 0, 0, 0, result );
  2839. exit;
  2840. end;
  2841. zSign := flag( a < 0 );
  2842. if ZSign<>0 then
  2843. AbsA := -a
  2844. else
  2845. AbsA := a;
  2846. shiftCount := countLeadingZeros32( absA ) - 11;
  2847. if ( 0 <= shiftCount ) then
  2848. Begin
  2849. zSig0 := absA shl shiftCount;
  2850. zSig1 := 0;
  2851. End
  2852. else
  2853. Begin
  2854. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2855. End;
  2856. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2857. End;
  2858. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2859. {$if not defined(packFloatx80)}
  2860. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2861. forward;
  2862. {$endif}
  2863. {*----------------------------------------------------------------------------
  2864. | Returns the result of converting the 32-bit two's complement integer `a'
  2865. | to the extended double-precision floating-point format. The conversion
  2866. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2867. | Arithmetic.
  2868. *----------------------------------------------------------------------------*}
  2869. function int32_to_floatx80( a: int32 ): floatx80;
  2870. var
  2871. zSign: flag;
  2872. absA: uint32;
  2873. shiftCount: int8;
  2874. zSig: bits64;
  2875. begin
  2876. if ( a = 0 ) then begin
  2877. result := packFloatx80( 0, 0, 0 );
  2878. exit;
  2879. end;
  2880. zSign := ord( a < 0 );
  2881. if zSign <> 0 then absA := - a else absA := a;
  2882. shiftCount := countLeadingZeros32( absA ) + 32;
  2883. zSig := absA;
  2884. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2885. end;
  2886. {$endif FPC_SOFTFLOAT_FLOATX80}
  2887. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2888. {$if not defined(packFloat128)}
  2889. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2890. forward;
  2891. {$endif}
  2892. {*----------------------------------------------------------------------------
  2893. | Returns the result of converting the 32-bit two's complement integer `a' to
  2894. | the quadruple-precision floating-point format. The conversion is performed
  2895. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2896. *----------------------------------------------------------------------------*}
  2897. function int32_to_float128( a: int32 ): float128;
  2898. var
  2899. zSign: flag;
  2900. absA: uint32;
  2901. shiftCount: int8;
  2902. zSig0: bits64;
  2903. begin
  2904. if ( a = 0 ) then begin
  2905. result := packFloat128( 0, 0, 0, 0 );
  2906. exit;
  2907. end;
  2908. zSign := ord( a < 0 );
  2909. if zSign <> 0 then absA := - a else absA := a;
  2910. shiftCount := countLeadingZeros32( absA ) + 17;
  2911. zSig0 := absA;
  2912. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2913. end;
  2914. {$endif FPC_SOFTFLOAT_FLOAT128}
  2915. {*
  2916. -------------------------------------------------------------------------------
  2917. Returns the result of converting the single-precision floating-point value
  2918. `a' to the 32-bit two's complement integer format. The conversion is
  2919. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2920. Arithmetic---which means in particular that the conversion is rounded
  2921. according to the current rounding mode. If `a' is a NaN, the largest
  2922. positive integer is returned. Otherwise, if the conversion overflows, the
  2923. largest integer with the same sign as `a' is returned.
  2924. -------------------------------------------------------------------------------
  2925. *}
  2926. Function float32_to_int32( a : float32rec) : int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2927. Var
  2928. aSign: flag;
  2929. aExp, shiftCount: int16;
  2930. aSig, aSigExtra: bits32;
  2931. z: int32;
  2932. roundingMode: TFPURoundingMode;
  2933. Begin
  2934. aSig := extractFloat32Frac( a.float32 );
  2935. aExp := extractFloat32Exp( a.float32 );
  2936. aSign := extractFloat32Sign( a.float32 );
  2937. shiftCount := aExp - $96;
  2938. if ( 0 <= shiftCount ) then
  2939. Begin
  2940. if ( $9E <= aExp ) then
  2941. Begin
  2942. if ( a.float32 <> $CF000000 ) then
  2943. Begin
  2944. float_raise( float_flag_invalid );
  2945. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2946. Begin
  2947. float32_to_int32 := $7FFFFFFF;
  2948. exit;
  2949. End;
  2950. End;
  2951. float32_to_int32 := sbits32 ($80000000);
  2952. exit;
  2953. End;
  2954. z := ( aSig or $00800000 ) shl shiftCount;
  2955. if ( aSign<>0 ) then z := - z;
  2956. End
  2957. else
  2958. Begin
  2959. if ( aExp < $7E ) then
  2960. Begin
  2961. aSigExtra := aExp OR aSig;
  2962. z := 0;
  2963. End
  2964. else
  2965. Begin
  2966. aSig := aSig OR $00800000;
  2967. aSigExtra := aSig shl ( shiftCount and 31 );
  2968. z := aSig shr ( - shiftCount );
  2969. End;
  2970. if ( aSigExtra<>0 ) then
  2971. set_inexact_flag;
  2972. roundingMode := softfloat_rounding_mode;
  2973. if ( roundingMode = float_round_nearest_even ) then
  2974. Begin
  2975. if ( sbits32 (aSigExtra) < 0 ) then
  2976. Begin
  2977. Inc(z);
  2978. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2979. z := z and not 1;
  2980. End;
  2981. if ( aSign<>0 ) then
  2982. z := - z;
  2983. End
  2984. else
  2985. Begin
  2986. aSigExtra := flag( aSigExtra <> 0 );
  2987. if ( aSign<>0 ) then
  2988. Begin
  2989. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2990. z := - z;
  2991. End
  2992. else
  2993. Begin
  2994. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2995. End
  2996. End;
  2997. End;
  2998. float32_to_int32 := z;
  2999. End;
  3000. {*
  3001. -------------------------------------------------------------------------------
  3002. Returns the result of converting the single-precision floating-point value
  3003. `a' to the 32-bit two's complement integer format. The conversion is
  3004. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3005. Arithmetic, except that the conversion is always rounded toward zero.
  3006. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3007. the conversion overflows, the largest integer with the same sign as `a' is
  3008. returned.
  3009. -------------------------------------------------------------------------------
  3010. *}
  3011. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3012. Var
  3013. aSign : flag;
  3014. aExp, shiftCount : int16;
  3015. aSig : bits32;
  3016. z : int32;
  3017. Begin
  3018. aSig := extractFloat32Frac( a.float32 );
  3019. aExp := extractFloat32Exp( a.float32 );
  3020. aSign := extractFloat32Sign( a.float32 );
  3021. shiftCount := aExp - $9E;
  3022. if ( 0 <= shiftCount ) then
  3023. Begin
  3024. if ( a.float32 <> $CF000000 ) then
  3025. Begin
  3026. float_raise( float_flag_invalid );
  3027. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  3028. Begin
  3029. float32_to_int32_round_to_zero := $7FFFFFFF;
  3030. exit;
  3031. end;
  3032. End;
  3033. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  3034. exit;
  3035. End
  3036. else
  3037. if ( aExp <= $7E ) then
  3038. Begin
  3039. if ( aExp or aSig )<>0 then
  3040. set_inexact_flag;
  3041. float32_to_int32_round_to_zero := 0;
  3042. exit;
  3043. End;
  3044. aSig := ( aSig or $00800000 ) shl 8;
  3045. z := aSig shr ( - shiftCount );
  3046. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  3047. Begin
  3048. set_inexact_flag;
  3049. End;
  3050. if ( aSign<>0 ) then z := - z;
  3051. float32_to_int32_round_to_zero := z;
  3052. End;
  3053. {*----------------------------------------------------------------------------
  3054. | Returns the result of converting the single-precision floating-point value
  3055. | `a' to the 64-bit two's complement integer format. The conversion is
  3056. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3057. | Arithmetic---which means in particular that the conversion is rounded
  3058. | according to the current rounding mode. If `a' is a NaN, the largest
  3059. | positive integer is returned. Otherwise, if the conversion overflows, the
  3060. | largest integer with the same sign as `a' is returned.
  3061. *----------------------------------------------------------------------------*}
  3062. function float32_to_int64( a: float32 ): int64;
  3063. var
  3064. aSign: flag;
  3065. aExp, shiftCount: int16;
  3066. aSig: bits32;
  3067. aSig64, aSigExtra: bits64;
  3068. begin
  3069. aSig := extractFloat32Frac( a );
  3070. aExp := extractFloat32Exp( a );
  3071. aSign := extractFloat32Sign( a );
  3072. shiftCount := $BE - aExp;
  3073. if ( shiftCount < 0 ) then begin
  3074. float_raise( float_flag_invalid );
  3075. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3076. result := $7FFFFFFFFFFFFFFF;
  3077. exit;
  3078. end;
  3079. result := $8000000000000000;
  3080. exit;
  3081. end;
  3082. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  3083. aSig64 := aSig;
  3084. aSig64 := aSig64 shl 40;
  3085. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  3086. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  3087. end;
  3088. {*----------------------------------------------------------------------------
  3089. | Returns the result of converting the single-precision floating-point value
  3090. | `a' to the 64-bit two's complement integer format. The conversion is
  3091. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3092. | Arithmetic, except that the conversion is always rounded toward zero. If
  3093. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  3094. | conversion overflows, the largest integer with the same sign as `a' is
  3095. | returned.
  3096. *----------------------------------------------------------------------------*}
  3097. function float32_to_int64_round_to_zero( a: float32 ): int64;
  3098. var
  3099. aSign: flag;
  3100. aExp, shiftCount: int16;
  3101. aSig: bits32;
  3102. aSig64: bits64;
  3103. z: int64;
  3104. begin
  3105. aSig := extractFloat32Frac( a );
  3106. aExp := extractFloat32Exp( a );
  3107. aSign := extractFloat32Sign( a );
  3108. shiftCount := aExp - $BE;
  3109. if ( 0 <= shiftCount ) then begin
  3110. if ( a <> $DF000000 ) then begin
  3111. float_raise( float_flag_invalid );
  3112. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3113. result := $7FFFFFFFFFFFFFFF;
  3114. exit;
  3115. end;
  3116. end;
  3117. result := $8000000000000000;
  3118. exit;
  3119. end
  3120. else if ( aExp <= $7E ) then begin
  3121. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3122. result := 0;
  3123. exit;
  3124. end;
  3125. aSig64 := aSig or $00800000;
  3126. aSig64 := aSig64 shl 40;
  3127. z := aSig64 shr ( - shiftCount );
  3128. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3129. set_inexact_flag;
  3130. if ( aSign <> 0 ) then z := - z;
  3131. result := z;
  3132. end;
  3133. {*
  3134. -------------------------------------------------------------------------------
  3135. Returns the result of converting the single-precision floating-point value
  3136. `a' to the double-precision floating-point format. The conversion is
  3137. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3138. Arithmetic.
  3139. -------------------------------------------------------------------------------
  3140. *}
  3141. Function float32_to_float64( a : float32rec) : Float64;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3142. Var
  3143. aSign : flag;
  3144. aExp : int16;
  3145. aSig, zSig0, zSig1: bits32;
  3146. tmp : CommonNanT;
  3147. Begin
  3148. aSig := extractFloat32Frac( a.float32 );
  3149. aExp := extractFloat32Exp( a.float32 );
  3150. aSign := extractFloat32Sign( a.float32 );
  3151. if ( aExp = $FF ) then
  3152. Begin
  3153. if ( aSig<>0 ) then
  3154. Begin
  3155. tmp:=float32ToCommonNaN(a.float32);
  3156. result:=commonNaNToFloat64(tmp);
  3157. exit;
  3158. End;
  3159. packFloat64( aSign, $7FF, 0, 0, result);
  3160. exit;
  3161. End;
  3162. if ( aExp = 0 ) then
  3163. Begin
  3164. if ( aSig = 0 ) then
  3165. Begin
  3166. packFloat64( aSign, 0, 0, 0, result );
  3167. exit;
  3168. end;
  3169. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3170. Dec(aExp);
  3171. End;
  3172. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3173. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3174. End;
  3175. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3176. {*----------------------------------------------------------------------------
  3177. | Returns the result of converting the canonical NaN `a' to the extended
  3178. | double-precision floating-point format.
  3179. *----------------------------------------------------------------------------*}
  3180. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3181. var
  3182. z : floatx80;
  3183. begin
  3184. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3185. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3186. result := z;
  3187. end;
  3188. {*----------------------------------------------------------------------------
  3189. | Returns the result of converting the single-precision floating-point value
  3190. | `a' to the extended double-precision floating-point format. The conversion
  3191. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3192. | Arithmetic.
  3193. *----------------------------------------------------------------------------*}
  3194. function float32_to_floatx80( a: float32 ): floatx80;
  3195. var
  3196. aSign: flag;
  3197. aExp: int16;
  3198. aSig: bits32;
  3199. tmp: commonNaNT;
  3200. begin
  3201. aSig := extractFloat32Frac( a );
  3202. aExp := extractFloat32Exp( a );
  3203. aSign := extractFloat32Sign( a );
  3204. if ( aExp = $FF ) then begin
  3205. if ( aSig <> 0 ) then begin
  3206. tmp:=float32ToCommonNaN(a);
  3207. result := commonNaNToFloatx80( tmp );
  3208. exit;
  3209. end;
  3210. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3211. exit;
  3212. end;
  3213. if ( aExp = 0 ) then begin
  3214. if ( aSig = 0 ) then begin
  3215. result := packFloatx80( aSign, 0, 0 );
  3216. exit;
  3217. end;
  3218. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3219. end;
  3220. aSig := aSig or $00800000;
  3221. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3222. end;
  3223. {$endif FPC_SOFTFLOAT_FLOATX80}
  3224. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3225. {*----------------------------------------------------------------------------
  3226. | Returns the result of converting the single-precision floating-point value
  3227. | `a' to the double-precision floating-point format. The conversion is
  3228. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3229. | Arithmetic.
  3230. *----------------------------------------------------------------------------*}
  3231. function float32_to_float128( a: float32 ): float128;
  3232. var
  3233. aSign: flag;
  3234. aExp: int16;
  3235. aSig: bits32;
  3236. tmp: commonNaNT;
  3237. begin
  3238. aSig := extractFloat32Frac( a );
  3239. aExp := extractFloat32Exp( a );
  3240. aSign := extractFloat32Sign( a );
  3241. if ( aExp = $FF ) then begin
  3242. if ( aSig <> 0 ) then begin
  3243. tmp:=float32ToCommonNaN(a);
  3244. result := commonNaNToFloat128( tmp );
  3245. exit;
  3246. end;
  3247. result := packFloat128( aSign, $7FFF, 0, 0 );
  3248. exit;
  3249. end;
  3250. if ( aExp = 0 ) then begin
  3251. if ( aSig = 0 ) then begin
  3252. result := packFloat128( aSign, 0, 0, 0 );
  3253. exit;
  3254. end;
  3255. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3256. dec( aExp );
  3257. end;
  3258. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3259. end;
  3260. {$endif FPC_SOFTFLOAT_FLOAT128}
  3261. {*
  3262. -------------------------------------------------------------------------------
  3263. Rounds the single-precision floating-point value `a' to an integer,
  3264. and returns the result as a single-precision floating-point value. The
  3265. operation is performed according to the IEC/IEEE Standard for Binary
  3266. Floating-Point Arithmetic.
  3267. -------------------------------------------------------------------------------
  3268. *}
  3269. Function float32_round_to_int( a: float32rec): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3270. Var
  3271. aSign: flag;
  3272. aExp: int16;
  3273. lastBitMask, roundBitsMask: bits32;
  3274. roundingMode: TFPURoundingMode;
  3275. z: float32;
  3276. Begin
  3277. aExp := extractFloat32Exp( a.float32 );
  3278. if ( $96 <= aExp ) then
  3279. Begin
  3280. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3281. Begin
  3282. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3283. exit;
  3284. End;
  3285. float32_round_to_int:=a;
  3286. exit;
  3287. End;
  3288. if ( aExp <= $7E ) then
  3289. Begin
  3290. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3291. Begin
  3292. float32_round_to_int:=a;
  3293. exit;
  3294. end;
  3295. set_inexact_flag;
  3296. aSign := extractFloat32Sign( a.float32 );
  3297. case ( softfloat_rounding_mode ) of
  3298. float_round_nearest_even:
  3299. Begin
  3300. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3301. Begin
  3302. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3303. exit;
  3304. End;
  3305. End;
  3306. float_round_down:
  3307. Begin
  3308. if aSign <> 0 then
  3309. float32_round_to_int.float32 := $BF800000
  3310. else
  3311. float32_round_to_int.float32 := 0;
  3312. exit;
  3313. End;
  3314. float_round_up:
  3315. Begin
  3316. if aSign <> 0 then
  3317. float32_round_to_int.float32 := $80000000
  3318. else
  3319. float32_round_to_int.float32 := $3F800000;
  3320. exit;
  3321. End;
  3322. end;
  3323. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3324. exit;
  3325. End;
  3326. lastBitMask := 1;
  3327. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3328. lastBitMask := lastBitMask shl ($96 - aExp);
  3329. roundBitsMask := lastBitMask - 1;
  3330. z := a.float32;
  3331. roundingMode := softfloat_rounding_mode;
  3332. if ( roundingMode = float_round_nearest_even ) then
  3333. Begin
  3334. z := z + (lastBitMask shr 1);
  3335. if ( ( z and roundBitsMask ) = 0 ) then
  3336. z := z and not lastBitMask;
  3337. End
  3338. else if ( roundingMode <> float_round_to_zero ) then
  3339. Begin
  3340. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3341. Begin
  3342. z := z + roundBitsMask;
  3343. End;
  3344. End;
  3345. z := z and not roundBitsMask;
  3346. if ( z <> a.float32 ) then
  3347. set_inexact_flag;
  3348. float32_round_to_int.float32 := z;
  3349. End;
  3350. {*
  3351. -------------------------------------------------------------------------------
  3352. Returns the result of adding the absolute values of the single-precision
  3353. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3354. before being returned. `zSign' is ignored if the result is a NaN.
  3355. The addition is performed according to the IEC/IEEE Standard for Binary
  3356. Floating-Point Arithmetic.
  3357. -------------------------------------------------------------------------------
  3358. *}
  3359. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3360. Var
  3361. aExp, bExp, zExp: int16;
  3362. aSig, bSig, zSig: bits32;
  3363. expDiff: int16;
  3364. label roundAndPack;
  3365. Begin
  3366. aSig:=extractFloat32Frac( a );
  3367. aExp:=extractFloat32Exp( a );
  3368. bSig:=extractFloat32Frac( b );
  3369. bExp := extractFloat32Exp( b );
  3370. expDiff := aExp - bExp;
  3371. aSig := aSig shl 6;
  3372. bSig := bSig shl 6;
  3373. if ( 0 < expDiff ) then
  3374. Begin
  3375. if ( aExp = $FF ) then
  3376. Begin
  3377. if ( aSig <> 0) then
  3378. Begin
  3379. addFloat32Sigs := propagateFloat32NaN( a, b );
  3380. exit;
  3381. End;
  3382. addFloat32Sigs := a;
  3383. exit;
  3384. End;
  3385. if ( bExp = 0 ) then
  3386. Begin
  3387. Dec(expDiff);
  3388. End
  3389. else
  3390. Begin
  3391. bSig := bSig or $20000000;
  3392. End;
  3393. shift32RightJamming( bSig, expDiff, bSig );
  3394. zExp := aExp;
  3395. End
  3396. else
  3397. If ( expDiff < 0 ) then
  3398. Begin
  3399. if ( bExp = $FF ) then
  3400. Begin
  3401. if ( bSig<>0 ) then
  3402. Begin
  3403. addFloat32Sigs := propagateFloat32NaN( a, b );
  3404. exit;
  3405. end;
  3406. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3407. exit;
  3408. End;
  3409. if ( aExp = 0 ) then
  3410. Begin
  3411. Inc(expDiff);
  3412. End
  3413. else
  3414. Begin
  3415. aSig := aSig OR $20000000;
  3416. End;
  3417. shift32RightJamming( aSig, - expDiff, aSig );
  3418. zExp := bExp;
  3419. End
  3420. else
  3421. Begin
  3422. if ( aExp = $FF ) then
  3423. Begin
  3424. if ( aSig OR bSig )<> 0 then
  3425. Begin
  3426. addFloat32Sigs := propagateFloat32NaN( a, b );
  3427. exit;
  3428. end;
  3429. addFloat32Sigs := a;
  3430. exit;
  3431. End;
  3432. if ( aExp = 0 ) then
  3433. Begin
  3434. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3435. exit;
  3436. end;
  3437. zSig := $40000000 + aSig + bSig;
  3438. zExp := aExp;
  3439. goto roundAndPack;
  3440. End;
  3441. aSig := aSig OR $20000000;
  3442. zSig := ( aSig + bSig ) shl 1;
  3443. Dec(zExp);
  3444. if ( sbits32 (zSig) < 0 ) then
  3445. Begin
  3446. zSig := aSig + bSig;
  3447. Inc(zExp);
  3448. End;
  3449. roundAndPack:
  3450. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3451. End;
  3452. {*
  3453. -------------------------------------------------------------------------------
  3454. Returns the result of subtracting the absolute values of the single-
  3455. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3456. difference is negated before being returned. `zSign' is ignored if the
  3457. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3458. Standard for Binary Floating-Point Arithmetic.
  3459. -------------------------------------------------------------------------------
  3460. *}
  3461. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3462. Var
  3463. aExp, bExp, zExp: int16;
  3464. aSig, bSig, zSig: bits32;
  3465. expDiff : int16;
  3466. label aExpBigger;
  3467. label bExpBigger;
  3468. label aBigger;
  3469. label bBigger;
  3470. label normalizeRoundAndPack;
  3471. Begin
  3472. aSig := extractFloat32Frac( a );
  3473. aExp := extractFloat32Exp( a );
  3474. bSig := extractFloat32Frac( b );
  3475. bExp := extractFloat32Exp( b );
  3476. expDiff := aExp - bExp;
  3477. aSig := aSig shl 7;
  3478. bSig := bSig shl 7;
  3479. if ( 0 < expDiff ) then goto aExpBigger;
  3480. if ( expDiff < 0 ) then goto bExpBigger;
  3481. if ( aExp = $FF ) then
  3482. Begin
  3483. if ( aSig OR bSig )<> 0 then
  3484. Begin
  3485. subFloat32Sigs := propagateFloat32NaN( a, b );
  3486. exit;
  3487. End;
  3488. float_raise( float_flag_invalid );
  3489. subFloat32Sigs := float32_default_nan;
  3490. exit;
  3491. End;
  3492. if ( aExp = 0 ) then
  3493. Begin
  3494. aExp := 1;
  3495. bExp := 1;
  3496. End;
  3497. if ( bSig < aSig ) Then goto aBigger;
  3498. if ( aSig < bSig ) Then goto bBigger;
  3499. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3500. exit;
  3501. bExpBigger:
  3502. if ( bExp = $FF ) then
  3503. Begin
  3504. if ( bSig<>0 ) then
  3505. Begin
  3506. subFloat32Sigs := propagateFloat32NaN( a, b );
  3507. exit;
  3508. End;
  3509. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3510. exit;
  3511. End;
  3512. if ( aExp = 0 ) then
  3513. Begin
  3514. Inc(expDiff);
  3515. End
  3516. else
  3517. Begin
  3518. aSig := aSig OR $40000000;
  3519. End;
  3520. shift32RightJamming( aSig, - expDiff, aSig );
  3521. bSig := bSig OR $40000000;
  3522. bBigger:
  3523. zSig := bSig - aSig;
  3524. zExp := bExp;
  3525. zSign := zSign xor 1;
  3526. goto normalizeRoundAndPack;
  3527. aExpBigger:
  3528. if ( aExp = $FF ) then
  3529. Begin
  3530. if ( aSig <> 0) then
  3531. Begin
  3532. subFloat32Sigs := propagateFloat32NaN( a, b );
  3533. exit;
  3534. End;
  3535. subFloat32Sigs := a;
  3536. exit;
  3537. End;
  3538. if ( bExp = 0 ) then
  3539. Begin
  3540. Dec(expDiff);
  3541. End
  3542. else
  3543. Begin
  3544. bSig := bSig OR $40000000;
  3545. End;
  3546. shift32RightJamming( bSig, expDiff, bSig );
  3547. aSig := aSig OR $40000000;
  3548. aBigger:
  3549. zSig := aSig - bSig;
  3550. zExp := aExp;
  3551. normalizeRoundAndPack:
  3552. Dec(zExp);
  3553. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3554. End;
  3555. {*
  3556. -------------------------------------------------------------------------------
  3557. Returns the result of adding the single-precision floating-point values `a'
  3558. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3559. Binary Floating-Point Arithmetic.
  3560. -------------------------------------------------------------------------------
  3561. *}
  3562. Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3563. Var
  3564. aSign, bSign: Flag;
  3565. Begin
  3566. aSign := extractFloat32Sign( a.float32 );
  3567. bSign := extractFloat32Sign( b.float32 );
  3568. if ( aSign = bSign ) then
  3569. Begin
  3570. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3571. End
  3572. else
  3573. Begin
  3574. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3575. End;
  3576. End;
  3577. {*
  3578. -------------------------------------------------------------------------------
  3579. Returns the result of subtracting the single-precision floating-point values
  3580. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3581. for Binary Floating-Point Arithmetic.
  3582. -------------------------------------------------------------------------------
  3583. *}
  3584. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3585. Var
  3586. aSign, bSign: flag;
  3587. Begin
  3588. aSign := extractFloat32Sign( a.float32 );
  3589. bSign := extractFloat32Sign( b.float32 );
  3590. if ( aSign = bSign ) then
  3591. Begin
  3592. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3593. End
  3594. else
  3595. Begin
  3596. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3597. End;
  3598. End;
  3599. {*
  3600. -------------------------------------------------------------------------------
  3601. Returns the result of multiplying the single-precision floating-point values
  3602. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3603. for Binary Floating-Point Arithmetic.
  3604. -------------------------------------------------------------------------------
  3605. *}
  3606. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3607. Var
  3608. aSign, bSign, zSign: flag;
  3609. aExp, bExp, zExp : int16;
  3610. aSig, bSig, zSig0, zSig1: bits32;
  3611. Begin
  3612. aSig := extractFloat32Frac( a.float32 );
  3613. aExp := extractFloat32Exp( a.float32 );
  3614. aSign := extractFloat32Sign( a.float32 );
  3615. bSig := extractFloat32Frac( b.float32 );
  3616. bExp := extractFloat32Exp( b.float32 );
  3617. bSign := extractFloat32Sign( b.float32 );
  3618. zSign := aSign xor bSign;
  3619. if ( aExp = $FF ) then
  3620. Begin
  3621. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3622. Begin
  3623. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3624. exit;
  3625. End;
  3626. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3627. Begin
  3628. float_raise( float_flag_invalid );
  3629. float32_mul.float32 := float32_default_nan;
  3630. exit;
  3631. End;
  3632. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3633. exit;
  3634. End;
  3635. if ( bExp = $FF ) then
  3636. Begin
  3637. if ( bSig <> 0 ) then
  3638. Begin
  3639. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3640. exit;
  3641. End;
  3642. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3643. Begin
  3644. float_raise( float_flag_invalid );
  3645. float32_mul.float32 := float32_default_nan;
  3646. exit;
  3647. End;
  3648. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3649. exit;
  3650. End;
  3651. if ( aExp = 0 ) then
  3652. Begin
  3653. if ( aSig = 0 ) then
  3654. Begin
  3655. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3656. exit;
  3657. End;
  3658. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3659. End;
  3660. if ( bExp = 0 ) then
  3661. Begin
  3662. if ( bSig = 0 ) then
  3663. Begin
  3664. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3665. exit;
  3666. End;
  3667. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3668. End;
  3669. zExp := aExp + bExp - $7F;
  3670. aSig := ( aSig OR $00800000 ) shl 7;
  3671. bSig := ( bSig OR $00800000 ) shl 8;
  3672. mul32To64( aSig, bSig, zSig0, zSig1 );
  3673. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3674. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3675. Begin
  3676. zSig0 := zSig0 shl 1;
  3677. Dec(zExp);
  3678. End;
  3679. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3680. End;
  3681. {*
  3682. -------------------------------------------------------------------------------
  3683. Returns the result of dividing the single-precision floating-point value `a'
  3684. by the corresponding value `b'. The operation is performed according to the
  3685. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3686. -------------------------------------------------------------------------------
  3687. *}
  3688. Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3689. Var
  3690. aSign, bSign, zSign: flag;
  3691. aExp, bExp, zExp: int16;
  3692. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3693. Begin
  3694. aSig := extractFloat32Frac( a.float32 );
  3695. aExp := extractFloat32Exp( a.float32 );
  3696. aSign := extractFloat32Sign( a.float32 );
  3697. bSig := extractFloat32Frac( b.float32 );
  3698. bExp := extractFloat32Exp( b.float32 );
  3699. bSign := extractFloat32Sign( b.float32 );
  3700. zSign := aSign xor bSign;
  3701. if ( aExp = $FF ) then
  3702. Begin
  3703. if ( aSig <> 0 ) then
  3704. Begin
  3705. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3706. exit;
  3707. End;
  3708. if ( bExp = $FF ) then
  3709. Begin
  3710. if ( bSig <> 0) then
  3711. Begin
  3712. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3713. exit;
  3714. End;
  3715. float_raise( float_flag_invalid );
  3716. float32_div.float32 := float32_default_nan;
  3717. exit;
  3718. End;
  3719. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3720. exit;
  3721. End;
  3722. if ( bExp = $FF ) then
  3723. Begin
  3724. if ( bSig <> 0) then
  3725. Begin
  3726. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3727. exit;
  3728. End;
  3729. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3730. exit;
  3731. End;
  3732. if ( bExp = 0 ) Then
  3733. Begin
  3734. if ( bSig = 0 ) Then
  3735. Begin
  3736. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3737. Begin
  3738. float_raise( float_flag_invalid );
  3739. float32_div.float32 := float32_default_nan;
  3740. exit;
  3741. End;
  3742. float_raise( float_flag_divbyzero );
  3743. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3744. exit;
  3745. End;
  3746. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3747. End;
  3748. if ( aExp = 0 ) Then
  3749. Begin
  3750. if ( aSig = 0 ) Then
  3751. Begin
  3752. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3753. exit;
  3754. End;
  3755. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3756. End;
  3757. zExp := aExp - bExp + $7D;
  3758. aSig := ( aSig OR $00800000 ) shl 7;
  3759. bSig := ( bSig OR $00800000 ) shl 8;
  3760. if ( bSig <= ( aSig + aSig ) ) then
  3761. Begin
  3762. aSig := aSig shr 1;
  3763. Inc(zExp);
  3764. End;
  3765. zSig := estimateDiv64To32( aSig, 0, bSig );
  3766. if ( ( zSig and $3F ) <= 2 ) then
  3767. Begin
  3768. mul32To64( bSig, zSig, term0, term1 );
  3769. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3770. while ( sbits32 (rem0) < 0 ) do
  3771. Begin
  3772. Dec(zSig);
  3773. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3774. End;
  3775. zSig := zSig or bits32( rem1 <> 0 );
  3776. End;
  3777. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3778. End;
  3779. {*
  3780. -------------------------------------------------------------------------------
  3781. Returns the remainder of the single-precision floating-point value `a'
  3782. with respect to the corresponding value `b'. The operation is performed
  3783. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3784. -------------------------------------------------------------------------------
  3785. *}
  3786. Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3787. Var
  3788. aSign, zSign: flag;
  3789. aExp, bExp, expDiff: int16;
  3790. aSig, bSig, q, alternateASig: bits32;
  3791. sigMean: sbits32;
  3792. Begin
  3793. aSig := extractFloat32Frac( a.float32 );
  3794. aExp := extractFloat32Exp( a.float32 );
  3795. aSign := extractFloat32Sign( a.float32 );
  3796. bSig := extractFloat32Frac( b.float32 );
  3797. bExp := extractFloat32Exp( b.float32 );
  3798. if ( aExp = $FF ) then
  3799. Begin
  3800. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3801. Begin
  3802. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3803. exit;
  3804. End;
  3805. float_raise( float_flag_invalid );
  3806. float32_rem.float32 := float32_default_nan;
  3807. exit;
  3808. End;
  3809. if ( bExp = $FF ) then
  3810. Begin
  3811. if ( bSig <> 0 ) then
  3812. Begin
  3813. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3814. exit;
  3815. End;
  3816. float32_rem := a;
  3817. exit;
  3818. End;
  3819. if ( bExp = 0 ) then
  3820. Begin
  3821. if ( bSig = 0 ) then
  3822. Begin
  3823. float_raise( float_flag_invalid );
  3824. float32_rem.float32 := float32_default_nan;
  3825. exit;
  3826. End;
  3827. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3828. End;
  3829. if ( aExp = 0 ) then
  3830. Begin
  3831. if ( aSig = 0 ) then
  3832. Begin
  3833. float32_rem := a;
  3834. exit;
  3835. End;
  3836. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3837. End;
  3838. expDiff := aExp - bExp;
  3839. aSig := ( aSig OR $00800000 ) shl 8;
  3840. bSig := ( bSig OR $00800000 ) shl 8;
  3841. if ( expDiff < 0 ) then
  3842. Begin
  3843. if ( expDiff < -1 ) then
  3844. Begin
  3845. float32_rem := a;
  3846. exit;
  3847. End;
  3848. aSig := aSig shr 1;
  3849. End;
  3850. q := bits32( bSig <= aSig );
  3851. if ( q <> 0) then
  3852. aSig := aSig - bSig;
  3853. expDiff := expDiff - 32;
  3854. while ( 0 < expDiff ) do
  3855. Begin
  3856. q := estimateDiv64To32( aSig, 0, bSig );
  3857. if (2 < q) then
  3858. q := q - 2
  3859. else
  3860. q := 0;
  3861. aSig := - ( ( bSig shr 2 ) * q );
  3862. expDiff := expDiff - 30;
  3863. End;
  3864. expDiff := expDiff + 32;
  3865. if ( 0 < expDiff ) then
  3866. Begin
  3867. q := estimateDiv64To32( aSig, 0, bSig );
  3868. if (2 < q) then
  3869. q := q - 2
  3870. else
  3871. q := 0;
  3872. q := q shr (32 - expDiff);
  3873. bSig := bSig shr 2;
  3874. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3875. End
  3876. else
  3877. Begin
  3878. aSig := aSig shr 2;
  3879. bSig := bSig shr 2;
  3880. End;
  3881. Repeat
  3882. alternateASig := aSig;
  3883. Inc(q);
  3884. aSig := aSig - bSig;
  3885. Until not ( 0 <= sbits32 (aSig) );
  3886. sigMean := aSig + alternateASig;
  3887. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3888. Begin
  3889. aSig := alternateASig;
  3890. End;
  3891. zSign := flag( sbits32 (aSig) < 0 );
  3892. if ( zSign<>0 ) then
  3893. aSig := - aSig;
  3894. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3895. End;
  3896. {*
  3897. -------------------------------------------------------------------------------
  3898. Returns the square root of the single-precision floating-point value `a'.
  3899. The operation is performed according to the IEC/IEEE Standard for Binary
  3900. Floating-Point Arithmetic.
  3901. -------------------------------------------------------------------------------
  3902. *}
  3903. Function float32_sqrt(a: float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3904. Var
  3905. aSign : flag;
  3906. aExp, zExp : int16;
  3907. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3908. label roundAndPack;
  3909. Begin
  3910. aSig := extractFloat32Frac( a.float32 );
  3911. aExp := extractFloat32Exp( a.float32 );
  3912. aSign := extractFloat32Sign( a.float32 );
  3913. if ( aExp = $FF ) then
  3914. Begin
  3915. if ( aSig <> 0) then
  3916. Begin
  3917. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3918. exit;
  3919. End;
  3920. if ( aSign = 0) then
  3921. Begin
  3922. float32_sqrt := a;
  3923. exit;
  3924. End;
  3925. float_raise( float_flag_invalid );
  3926. float32_sqrt.float32 := float32_default_nan;
  3927. exit;
  3928. End;
  3929. if ( aSign <> 0) then
  3930. Begin
  3931. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3932. Begin
  3933. float32_sqrt := a;
  3934. exit;
  3935. End;
  3936. float_raise( float_flag_invalid );
  3937. float32_sqrt.float32 := float32_default_nan;
  3938. exit;
  3939. End;
  3940. if ( aExp = 0 ) then
  3941. Begin
  3942. if ( aSig = 0 ) then
  3943. Begin
  3944. float32_sqrt.float32 := 0;
  3945. exit;
  3946. End;
  3947. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3948. End;
  3949. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3950. aSig := ( aSig OR $00800000 ) shl 8;
  3951. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3952. if ( ( zSig and $7F ) <= 5 ) then
  3953. Begin
  3954. if ( zSig < 2 ) then
  3955. Begin
  3956. zSig := $7FFFFFFF;
  3957. goto roundAndPack;
  3958. End
  3959. else
  3960. Begin
  3961. aSig := aSig shr (aExp and 1);
  3962. mul32To64( zSig, zSig, term0, term1 );
  3963. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3964. while ( sbits32 (rem0) < 0 ) do
  3965. Begin
  3966. Dec(zSig);
  3967. shortShift64Left( 0, zSig, 1, term0, term1 );
  3968. term1 := term1 or 1;
  3969. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3970. End;
  3971. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3972. End;
  3973. End;
  3974. shift32RightJamming( zSig, 1, zSig );
  3975. roundAndPack:
  3976. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3977. End;
  3978. {*
  3979. -------------------------------------------------------------------------------
  3980. Returns 1 if the single-precision floating-point value `a' is equal to
  3981. the corresponding value `b', and 0 otherwise. The comparison is performed
  3982. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3983. -------------------------------------------------------------------------------
  3984. *}
  3985. Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3986. Begin
  3987. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3988. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3989. ) then
  3990. Begin
  3991. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3992. Begin
  3993. float_raise( float_flag_invalid );
  3994. End;
  3995. float32_eq := 0;
  3996. exit;
  3997. End;
  3998. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3999. End;
  4000. {*
  4001. -------------------------------------------------------------------------------
  4002. Returns 1 if the single-precision floating-point value `a' is less than
  4003. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4004. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4005. Arithmetic.
  4006. -------------------------------------------------------------------------------
  4007. *}
  4008. Function float32_le( a: float32rec; b : float32rec ):flag;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4009. var
  4010. aSign, bSign: flag;
  4011. Begin
  4012. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  4013. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  4014. ) then
  4015. Begin
  4016. float_raise( float_flag_invalid );
  4017. float32_le := 0;
  4018. exit;
  4019. End;
  4020. aSign := extractFloat32Sign( a.float32 );
  4021. bSign := extractFloat32Sign( b.float32 );
  4022. if ( aSign <> bSign ) then
  4023. Begin
  4024. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  4025. exit;
  4026. End;
  4027. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  4028. End;
  4029. {*
  4030. -------------------------------------------------------------------------------
  4031. Returns 1 if the single-precision floating-point value `a' is less than
  4032. the corresponding value `b', and 0 otherwise. The comparison is performed
  4033. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4034. -------------------------------------------------------------------------------
  4035. *}
  4036. Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4037. var
  4038. aSign, bSign: flag;
  4039. Begin
  4040. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  4041. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  4042. ) then
  4043. Begin
  4044. float_raise( float_flag_invalid );
  4045. float32_lt :=0;
  4046. exit;
  4047. End;
  4048. aSign := extractFloat32Sign( a.float32 );
  4049. bSign := extractFloat32Sign( b.float32 );
  4050. if ( aSign <> bSign ) then
  4051. Begin
  4052. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  4053. exit;
  4054. End;
  4055. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  4056. End;
  4057. {*
  4058. -------------------------------------------------------------------------------
  4059. Returns 1 if the single-precision floating-point value `a' is equal to
  4060. the corresponding value `b', and 0 otherwise. The invalid exception is
  4061. raised if either operand is a NaN. Otherwise, the comparison is performed
  4062. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4063. -------------------------------------------------------------------------------
  4064. *}
  4065. Function float32_eq_signaling( a: float32; b: float32) : flag;
  4066. Begin
  4067. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  4068. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  4069. ) then
  4070. Begin
  4071. float_raise( float_flag_invalid );
  4072. float32_eq_signaling := 0;
  4073. exit;
  4074. End;
  4075. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  4076. End;
  4077. {*
  4078. -------------------------------------------------------------------------------
  4079. Returns 1 if the single-precision floating-point value `a' is less than or
  4080. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4081. cause an exception. Otherwise, the comparison is performed according to the
  4082. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4083. -------------------------------------------------------------------------------
  4084. *}
  4085. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  4086. Var
  4087. aSign, bSign: flag;
  4088. Begin
  4089. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4090. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4091. ) then
  4092. Begin
  4093. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4094. Begin
  4095. float_raise( float_flag_invalid );
  4096. End;
  4097. float32_le_quiet := 0;
  4098. exit;
  4099. End;
  4100. aSign := extractFloat32Sign( a );
  4101. bSign := extractFloat32Sign( b );
  4102. if ( aSign <> bSign ) then
  4103. Begin
  4104. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  4105. exit;
  4106. End;
  4107. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  4108. End;
  4109. {*
  4110. -------------------------------------------------------------------------------
  4111. Returns 1 if the single-precision floating-point value `a' is less than
  4112. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4113. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4114. Standard for Binary Floating-Point Arithmetic.
  4115. -------------------------------------------------------------------------------
  4116. *}
  4117. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4118. Var
  4119. aSign, bSign: flag;
  4120. Begin
  4121. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4122. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4123. ) then
  4124. Begin
  4125. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4126. Begin
  4127. float_raise( float_flag_invalid );
  4128. End;
  4129. float32_lt_quiet := 0;
  4130. exit;
  4131. End;
  4132. aSign := extractFloat32Sign( a );
  4133. bSign := extractFloat32Sign( b );
  4134. if ( aSign <> bSign ) then
  4135. Begin
  4136. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4137. exit;
  4138. End;
  4139. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4140. End;
  4141. {*
  4142. -------------------------------------------------------------------------------
  4143. Returns the result of converting the double-precision floating-point value
  4144. `a' to the 32-bit two's complement integer format. The conversion is
  4145. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4146. Arithmetic---which means in particular that the conversion is rounded
  4147. according to the current rounding mode. If `a' is a NaN, the largest
  4148. positive integer is returned. Otherwise, if the conversion overflows, the
  4149. largest integer with the same sign as `a' is returned.
  4150. -------------------------------------------------------------------------------
  4151. *}
  4152. Function float64_to_int32(a: float64): int32;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4153. var
  4154. aSign: flag;
  4155. aExp, shiftCount: int16;
  4156. aSig0, aSig1, absZ, aSigExtra: bits32;
  4157. z: int32;
  4158. roundingMode: TFPURoundingMode;
  4159. label invalid;
  4160. Begin
  4161. aSig1 := extractFloat64Frac1( a );
  4162. aSig0 := extractFloat64Frac0( a );
  4163. aExp := extractFloat64Exp( a );
  4164. aSign := extractFloat64Sign( a );
  4165. shiftCount := aExp - $413;
  4166. if ( 0 <= shiftCount ) then
  4167. Begin
  4168. if ( $41E < aExp ) then
  4169. Begin
  4170. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4171. aSign := 0;
  4172. goto invalid;
  4173. End;
  4174. shortShift64Left(
  4175. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4176. if ( $80000000 < absZ ) then
  4177. goto invalid;
  4178. End
  4179. else
  4180. Begin
  4181. aSig1 := flag( aSig1 <> 0 );
  4182. if ( aExp < $3FE ) then
  4183. Begin
  4184. aSigExtra := aExp OR aSig0 OR aSig1;
  4185. absZ := 0;
  4186. End
  4187. else
  4188. Begin
  4189. aSig0 := aSig0 OR $00100000;
  4190. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4191. absZ := aSig0 shr ( - shiftCount );
  4192. End;
  4193. End;
  4194. roundingMode := softfloat_rounding_mode;
  4195. if ( roundingMode = float_round_nearest_even ) then
  4196. Begin
  4197. if ( sbits32(aSigExtra) < 0 ) then
  4198. Begin
  4199. Inc(absZ);
  4200. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4201. absZ := absZ and not 1;
  4202. End;
  4203. if aSign <> 0 then
  4204. z := - absZ
  4205. else
  4206. z := absZ;
  4207. End
  4208. else
  4209. Begin
  4210. aSigExtra := bits32( aSigExtra <> 0 );
  4211. if ( aSign <> 0) then
  4212. Begin
  4213. z := - ( absZ
  4214. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4215. End
  4216. else
  4217. Begin
  4218. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4219. End
  4220. End;
  4221. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4222. Begin
  4223. invalid:
  4224. float_raise( float_flag_invalid );
  4225. if (aSign <> 0 ) then
  4226. float64_to_int32 := sbits32 ($80000000)
  4227. else
  4228. float64_to_int32 := $7FFFFFFF;
  4229. exit;
  4230. End;
  4231. if ( aSigExtra <> 0) then
  4232. set_inexact_flag;
  4233. float64_to_int32 := z;
  4234. End;
  4235. {*
  4236. -------------------------------------------------------------------------------
  4237. Returns the result of converting the double-precision floating-point value
  4238. `a' to the 32-bit two's complement integer format. The conversion is
  4239. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4240. Arithmetic, except that the conversion is always rounded toward zero.
  4241. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4242. the conversion overflows, the largest integer with the same sign as `a' is
  4243. returned.
  4244. -------------------------------------------------------------------------------
  4245. *}
  4246. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4247. {$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4248. Var
  4249. aSign: flag;
  4250. aExp, shiftCount: int16;
  4251. aSig0, aSig1, absZ, aSigExtra: bits32;
  4252. z: int32;
  4253. label invalid;
  4254. Begin
  4255. aSig1 := extractFloat64Frac1( a );
  4256. aSig0 := extractFloat64Frac0( a );
  4257. aExp := extractFloat64Exp( a );
  4258. aSign := extractFloat64Sign( a );
  4259. shiftCount := aExp - $413;
  4260. if ( 0 <= shiftCount ) then
  4261. Begin
  4262. if ( $41E < aExp ) then
  4263. Begin
  4264. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4265. aSign := 0;
  4266. goto invalid;
  4267. End;
  4268. shortShift64Left(
  4269. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4270. End
  4271. else
  4272. Begin
  4273. if ( aExp < $3FF ) then
  4274. Begin
  4275. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4276. Begin
  4277. set_inexact_flag;
  4278. End;
  4279. float64_to_int32_round_to_zero := 0;
  4280. exit;
  4281. End;
  4282. aSig0 := aSig0 or $00100000;
  4283. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4284. absZ := aSig0 shr ( - shiftCount );
  4285. End;
  4286. if aSign <> 0 then
  4287. z := - absZ
  4288. else
  4289. z := absZ;
  4290. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4291. Begin
  4292. invalid:
  4293. float_raise( float_flag_invalid );
  4294. if (aSign <> 0) then
  4295. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4296. else
  4297. float64_to_int32_round_to_zero := $7FFFFFFF;
  4298. exit;
  4299. End;
  4300. if ( aSigExtra <> 0) then
  4301. set_inexact_flag;
  4302. float64_to_int32_round_to_zero := z;
  4303. End;
  4304. {*----------------------------------------------------------------------------
  4305. | Returns the result of converting the double-precision floating-point value
  4306. | `a' to the 64-bit two's complement integer format. The conversion is
  4307. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4308. | Arithmetic---which means in particular that the conversion is rounded
  4309. | according to the current rounding mode. If `a' is a NaN, the largest
  4310. | positive integer is returned. Otherwise, if the conversion overflows, the
  4311. | largest integer with the same sign as `a' is returned.
  4312. *----------------------------------------------------------------------------*}
  4313. function float64_to_int64( a: float64 ): int64;
  4314. var
  4315. aSign: flag;
  4316. aExp, shiftCount: int16;
  4317. aSig, aSigExtra: bits64;
  4318. begin
  4319. aSig := extractFloat64Frac( a );
  4320. aExp := extractFloat64Exp( a );
  4321. aSign := extractFloat64Sign( a );
  4322. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4323. shiftCount := $433 - aExp;
  4324. if ( shiftCount <= 0 ) then begin
  4325. if ( $43E < aExp ) then begin
  4326. float_raise( float_flag_invalid );
  4327. if ( ( aSign = 0 )
  4328. or ( ( aExp = $7FF )
  4329. and ( aSig <> $0010000000000000 ) )
  4330. ) then begin
  4331. result := $7FFFFFFFFFFFFFFF;
  4332. exit;
  4333. end;
  4334. result := $8000000000000000;
  4335. exit;
  4336. end;
  4337. aSigExtra := 0;
  4338. aSig := aSig shl ( - shiftCount );
  4339. end
  4340. else
  4341. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4342. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4343. end;
  4344. {*----------------------------------------------------------------------------
  4345. | Returns the result of converting the double-precision floating-point value
  4346. | `a' to the 64-bit two's complement integer format. The conversion is
  4347. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4348. | Arithmetic, except that the conversion is always rounded toward zero.
  4349. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4350. | the conversion overflows, the largest integer with the same sign as `a' is
  4351. | returned.
  4352. *----------------------------------------------------------------------------*}
  4353. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4354. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4355. var
  4356. aSign: flag;
  4357. aExp, shiftCount: int16;
  4358. aSig: bits64;
  4359. z: int64;
  4360. begin
  4361. aSig := extractFloat64Frac( a );
  4362. aExp := extractFloat64Exp( a );
  4363. aSign := extractFloat64Sign( a );
  4364. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4365. shiftCount := aExp - $433;
  4366. if ( 0 <= shiftCount ) then begin
  4367. if ( $43E <= aExp ) then begin
  4368. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4369. float_raise( float_flag_invalid );
  4370. if ( ( aSign = 0 )
  4371. or ( ( aExp = $7FF )
  4372. and ( aSig <> $0010000000000000 ) )
  4373. ) then begin
  4374. result := $7FFFFFFFFFFFFFFF;
  4375. exit;
  4376. end;
  4377. end;
  4378. result := $8000000000000000;
  4379. exit;
  4380. end;
  4381. z := aSig shl shiftCount;
  4382. end
  4383. else begin
  4384. if ( aExp < $3FE ) then begin
  4385. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4386. result := 0;
  4387. exit;
  4388. end;
  4389. z := aSig shr ( - shiftCount );
  4390. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4391. set_inexact_flag;
  4392. end;
  4393. if ( aSign <> 0 ) then z := - z;
  4394. result := z;
  4395. end;
  4396. {*
  4397. -------------------------------------------------------------------------------
  4398. Returns the result of converting the double-precision floating-point value
  4399. `a' to the single-precision floating-point format. The conversion is
  4400. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4401. Arithmetic.
  4402. -------------------------------------------------------------------------------
  4403. *}
  4404. Function float64_to_float32(a: float64 ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4405. Var
  4406. aSign: flag;
  4407. aExp: int16;
  4408. aSig0, aSig1, zSig: bits32;
  4409. allZero: bits32;
  4410. tmp : CommonNanT;
  4411. Begin
  4412. aSig1 := extractFloat64Frac1( a );
  4413. aSig0 := extractFloat64Frac0( a );
  4414. aExp := extractFloat64Exp( a );
  4415. aSign := extractFloat64Sign( a );
  4416. if ( aExp = $7FF ) then
  4417. Begin
  4418. if ( aSig0 OR aSig1 ) <> 0 then
  4419. Begin
  4420. tmp:=float64ToCommonNaN(a);
  4421. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4422. exit;
  4423. End;
  4424. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4425. exit;
  4426. End;
  4427. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4428. if ( aExp <> 0) then
  4429. zSig := zSig OR $40000000;
  4430. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4431. End;
  4432. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4433. {*----------------------------------------------------------------------------
  4434. | Returns the result of converting the double-precision floating-point value
  4435. | `a' to the extended double-precision floating-point format. The conversion
  4436. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4437. | Arithmetic.
  4438. *----------------------------------------------------------------------------*}
  4439. function float64_to_floatx80( a: float64 ): floatx80;
  4440. var
  4441. aSign: flag;
  4442. aExp: int16;
  4443. aSig: bits64;
  4444. begin
  4445. aSig := extractFloat64Frac( a );
  4446. aExp := extractFloat64Exp( a );
  4447. aSign := extractFloat64Sign( a );
  4448. if ( aExp = $7FF ) then begin
  4449. if ( aSig <> 0 ) then begin
  4450. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4451. exit;
  4452. end;
  4453. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4454. exit;
  4455. end;
  4456. if ( aExp = 0 ) then begin
  4457. if ( aSig = 0 ) then begin
  4458. result := packFloatx80( aSign, 0, 0 );
  4459. exit;
  4460. end;
  4461. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4462. end;
  4463. result :=
  4464. packFloatx80(
  4465. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4466. end;
  4467. {$endif FPC_SOFTFLOAT_FLOATX80}
  4468. {*
  4469. -------------------------------------------------------------------------------
  4470. Rounds the double-precision floating-point value `a' to an integer,
  4471. and returns the result as a double-precision floating-point value. The
  4472. operation is performed according to the IEC/IEEE Standard for Binary
  4473. Floating-Point Arithmetic.
  4474. -------------------------------------------------------------------------------
  4475. *}
  4476. function float64_round_to_int(a: float64) : Float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4477. Var
  4478. aSign: flag;
  4479. aExp: int16;
  4480. lastBitMask, roundBitsMask: bits32;
  4481. roundingMode: TFPURoundingMode;
  4482. z: float64;
  4483. Begin
  4484. aExp := extractFloat64Exp( a );
  4485. if ( $413 <= aExp ) then
  4486. Begin
  4487. if ( $433 <= aExp ) then
  4488. Begin
  4489. if ( ( aExp = $7FF )
  4490. AND
  4491. (
  4492. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4493. ) <>0)
  4494. ) then
  4495. Begin
  4496. propagateFloat64NaN( a, a, result );
  4497. exit;
  4498. End;
  4499. result := a;
  4500. exit;
  4501. End;
  4502. lastBitMask := 1;
  4503. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4504. roundBitsMask := lastBitMask - 1;
  4505. z := a;
  4506. roundingMode := softfloat_rounding_mode;
  4507. if ( roundingMode = float_round_nearest_even ) then
  4508. Begin
  4509. if ( lastBitMask <> 0) then
  4510. Begin
  4511. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4512. if ( ( z.low and roundBitsMask ) = 0 ) then
  4513. z.low := z.low and not lastBitMask;
  4514. End
  4515. else
  4516. Begin
  4517. if ( sbits32 (z.low) < 0 ) then
  4518. Begin
  4519. Inc(z.high);
  4520. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4521. z.high := z.high and not 1;
  4522. End;
  4523. End;
  4524. End
  4525. else if ( roundingMode <> float_round_to_zero ) then
  4526. Begin
  4527. if ( extractFloat64Sign( z )
  4528. xor flag( roundingMode = float_round_up ) )<> 0 then
  4529. Begin
  4530. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4531. End;
  4532. End;
  4533. z.low := z.low and not roundBitsMask;
  4534. End
  4535. else
  4536. Begin
  4537. if ( aExp <= $3FE ) then
  4538. Begin
  4539. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4540. Begin
  4541. result := a;
  4542. exit;
  4543. End;
  4544. set_inexact_flag;
  4545. aSign := extractFloat64Sign( a );
  4546. case ( softfloat_rounding_mode ) of
  4547. float_round_nearest_even:
  4548. Begin
  4549. if ( ( aExp = $3FE )
  4550. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4551. ) then
  4552. Begin
  4553. packFloat64( aSign, $3FF, 0, 0, result );
  4554. exit;
  4555. End;
  4556. End;
  4557. float_round_down:
  4558. Begin
  4559. if aSign<>0 then
  4560. packFloat64( 1, $3FF, 0, 0, result )
  4561. else
  4562. packFloat64( 0, 0, 0, 0, result );
  4563. exit;
  4564. End;
  4565. float_round_up:
  4566. Begin
  4567. if aSign <> 0 then
  4568. packFloat64( 1, 0, 0, 0, result )
  4569. else
  4570. packFloat64( 0, $3FF, 0, 0, result );
  4571. exit;
  4572. End;
  4573. end;
  4574. packFloat64( aSign, 0, 0, 0, result );
  4575. exit;
  4576. End;
  4577. lastBitMask := 1;
  4578. lastBitMask := lastBitMask shl ($413 - aExp);
  4579. roundBitsMask := lastBitMask - 1;
  4580. z.low := 0;
  4581. z.high := a.high;
  4582. roundingMode := softfloat_rounding_mode;
  4583. if ( roundingMode = float_round_nearest_even ) then
  4584. Begin
  4585. z.high := z.high + lastBitMask shr 1;
  4586. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4587. Begin
  4588. z.high := z.high and not lastBitMask;
  4589. End;
  4590. End
  4591. else if ( roundingMode <> float_round_to_zero ) then
  4592. Begin
  4593. if ( extractFloat64Sign( z )
  4594. xor flag( roundingMode = float_round_up ) )<> 0 then
  4595. Begin
  4596. z.high := z.high or bits32( a.low <> 0 );
  4597. z.high := z.high + roundBitsMask;
  4598. End;
  4599. End;
  4600. z.high := z.high and not roundBitsMask;
  4601. End;
  4602. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4603. Begin
  4604. set_inexact_flag;
  4605. End;
  4606. result := z;
  4607. End;
  4608. {*
  4609. -------------------------------------------------------------------------------
  4610. Returns the result of adding the absolute values of the double-precision
  4611. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4612. before being returned. `zSign' is ignored if the result is a NaN.
  4613. The addition is performed according to the IEC/IEEE Standard for Binary
  4614. Floating-Point Arithmetic.
  4615. -------------------------------------------------------------------------------
  4616. *}
  4617. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4618. Var
  4619. aExp, bExp, zExp: int16;
  4620. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4621. expDiff: int16;
  4622. label shiftRight1;
  4623. label roundAndPack;
  4624. Begin
  4625. aSig1 := extractFloat64Frac1( a );
  4626. aSig0 := extractFloat64Frac0( a );
  4627. aExp := extractFloat64Exp( a );
  4628. bSig1 := extractFloat64Frac1( b );
  4629. bSig0 := extractFloat64Frac0( b );
  4630. bExp := extractFloat64Exp( b );
  4631. expDiff := aExp - bExp;
  4632. if ( 0 < expDiff ) then
  4633. Begin
  4634. if ( aExp = $7FF ) then
  4635. Begin
  4636. if ( aSig0 OR aSig1 ) <> 0 then
  4637. Begin
  4638. propagateFloat64NaN( a, b, out );
  4639. exit;
  4640. end;
  4641. out := a;
  4642. exit;
  4643. End;
  4644. if ( bExp = 0 ) then
  4645. Begin
  4646. Dec(expDiff);
  4647. End
  4648. else
  4649. Begin
  4650. bSig0 := bSig0 or $00100000;
  4651. End;
  4652. shift64ExtraRightJamming(
  4653. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4654. zExp := aExp;
  4655. End
  4656. else if ( expDiff < 0 ) then
  4657. Begin
  4658. if ( bExp = $7FF ) then
  4659. Begin
  4660. if ( bSig0 OR bSig1 ) <> 0 then
  4661. Begin
  4662. propagateFloat64NaN( a, b, out );
  4663. exit;
  4664. End;
  4665. packFloat64( zSign, $7FF, 0, 0, out );
  4666. exit;
  4667. End;
  4668. if ( aExp = 0 ) then
  4669. Begin
  4670. Inc(expDiff);
  4671. End
  4672. else
  4673. Begin
  4674. aSig0 := aSig0 or $00100000;
  4675. End;
  4676. shift64ExtraRightJamming(
  4677. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4678. zExp := bExp;
  4679. End
  4680. else
  4681. Begin
  4682. if ( aExp = $7FF ) then
  4683. Begin
  4684. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4685. Begin
  4686. propagateFloat64NaN( a, b, out );
  4687. exit;
  4688. End;
  4689. out := a;
  4690. exit;
  4691. End;
  4692. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4693. if ( aExp = 0 ) then
  4694. Begin
  4695. packFloat64( zSign, 0, zSig0, zSig1, out );
  4696. exit;
  4697. End;
  4698. zSig2 := 0;
  4699. zSig0 := zSig0 or $00200000;
  4700. zExp := aExp;
  4701. goto shiftRight1;
  4702. End;
  4703. aSig0 := aSig0 or $00100000;
  4704. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4705. Dec(zExp);
  4706. if ( zSig0 < $00200000 ) then
  4707. goto roundAndPack;
  4708. Inc(zExp);
  4709. shiftRight1:
  4710. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4711. roundAndPack:
  4712. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4713. End;
  4714. {*
  4715. -------------------------------------------------------------------------------
  4716. Returns the result of subtracting the absolute values of the double-
  4717. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4718. difference is negated before being returned. `zSign' is ignored if the
  4719. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4720. Standard for Binary Floating-Point Arithmetic.
  4721. -------------------------------------------------------------------------------
  4722. *}
  4723. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4724. Var
  4725. aExp, bExp, zExp: int16;
  4726. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4727. expDiff: int16;
  4728. z: float64;
  4729. label aExpBigger;
  4730. label bExpBigger;
  4731. label aBigger;
  4732. label bBigger;
  4733. label normalizeRoundAndPack;
  4734. Begin
  4735. aSig1 := extractFloat64Frac1( a );
  4736. aSig0 := extractFloat64Frac0( a );
  4737. aExp := extractFloat64Exp( a );
  4738. bSig1 := extractFloat64Frac1( b );
  4739. bSig0 := extractFloat64Frac0( b );
  4740. bExp := extractFloat64Exp( b );
  4741. expDiff := aExp - bExp;
  4742. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4743. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4744. if ( 0 < expDiff ) then goto aExpBigger;
  4745. if ( expDiff < 0 ) then goto bExpBigger;
  4746. if ( aExp = $7FF ) then
  4747. Begin
  4748. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4749. Begin
  4750. propagateFloat64NaN( a, b, out );
  4751. exit;
  4752. End;
  4753. float_raise( float_flag_invalid );
  4754. z.low := float64_default_nan_low;
  4755. z.high := float64_default_nan_high;
  4756. out := z;
  4757. exit;
  4758. End;
  4759. if ( aExp = 0 ) then
  4760. Begin
  4761. aExp := 1;
  4762. bExp := 1;
  4763. End;
  4764. if ( bSig0 < aSig0 ) then goto aBigger;
  4765. if ( aSig0 < bSig0 ) then goto bBigger;
  4766. if ( bSig1 < aSig1 ) then goto aBigger;
  4767. if ( aSig1 < bSig1 ) then goto bBigger;
  4768. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4769. exit;
  4770. bExpBigger:
  4771. if ( bExp = $7FF ) then
  4772. Begin
  4773. if ( bSig0 OR bSig1 ) <> 0 then
  4774. Begin
  4775. propagateFloat64NaN( a, b, out );
  4776. exit;
  4777. End;
  4778. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4779. exit;
  4780. End;
  4781. if ( aExp = 0 ) then
  4782. Begin
  4783. Inc(expDiff);
  4784. End
  4785. else
  4786. Begin
  4787. aSig0 := aSig0 or $40000000;
  4788. End;
  4789. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4790. bSig0 := bSig0 or $40000000;
  4791. bBigger:
  4792. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4793. zExp := bExp;
  4794. zSign := zSign xor 1;
  4795. goto normalizeRoundAndPack;
  4796. aExpBigger:
  4797. if ( aExp = $7FF ) then
  4798. Begin
  4799. if ( aSig0 OR aSig1 ) <> 0 then
  4800. Begin
  4801. propagateFloat64NaN( a, b, out );
  4802. exit;
  4803. End;
  4804. out := a;
  4805. exit;
  4806. End;
  4807. if ( bExp = 0 ) then
  4808. Begin
  4809. Dec(expDiff);
  4810. End
  4811. else
  4812. Begin
  4813. bSig0 := bSig0 or $40000000;
  4814. End;
  4815. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4816. aSig0 := aSig0 or $40000000;
  4817. aBigger:
  4818. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4819. zExp := aExp;
  4820. normalizeRoundAndPack:
  4821. Dec(zExp);
  4822. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4823. End;
  4824. {*
  4825. -------------------------------------------------------------------------------
  4826. Returns the result of adding the double-precision floating-point values `a'
  4827. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4828. Binary Floating-Point Arithmetic.
  4829. -------------------------------------------------------------------------------
  4830. *}
  4831. Function float64_add( a: float64; b : float64) : Float64;
  4832. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4833. Var
  4834. aSign, bSign: flag;
  4835. Begin
  4836. aSign := extractFloat64Sign( a );
  4837. bSign := extractFloat64Sign( b );
  4838. if ( aSign = bSign ) then
  4839. Begin
  4840. addFloat64Sigs( a, b, aSign, result );
  4841. End
  4842. else
  4843. Begin
  4844. subFloat64Sigs( a, b, aSign, result );
  4845. End;
  4846. End;
  4847. {*
  4848. -------------------------------------------------------------------------------
  4849. Returns the result of subtracting the double-precision floating-point values
  4850. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4851. for Binary Floating-Point Arithmetic.
  4852. -------------------------------------------------------------------------------
  4853. *}
  4854. Function float64_sub(a: float64; b : float64) : Float64;
  4855. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4856. Var
  4857. aSign, bSign: flag;
  4858. Begin
  4859. aSign := extractFloat64Sign( a );
  4860. bSign := extractFloat64Sign( b );
  4861. if ( aSign = bSign ) then
  4862. Begin
  4863. subFloat64Sigs( a, b, aSign, result );
  4864. End
  4865. else
  4866. Begin
  4867. addFloat64Sigs( a, b, aSign, result );
  4868. End;
  4869. End;
  4870. {*
  4871. -------------------------------------------------------------------------------
  4872. Returns the result of multiplying the double-precision floating-point values
  4873. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4874. for Binary Floating-Point Arithmetic.
  4875. -------------------------------------------------------------------------------
  4876. *}
  4877. Function float64_mul( a: float64; b:float64) : Float64;
  4878. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4879. Var
  4880. aSign, bSign, zSign: flag;
  4881. aExp, bExp, zExp: int16;
  4882. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4883. z: float64;
  4884. label invalid;
  4885. Begin
  4886. aSig1 := extractFloat64Frac1( a );
  4887. aSig0 := extractFloat64Frac0( a );
  4888. aExp := extractFloat64Exp( a );
  4889. aSign := extractFloat64Sign( a );
  4890. bSig1 := extractFloat64Frac1( b );
  4891. bSig0 := extractFloat64Frac0( b );
  4892. bExp := extractFloat64Exp( b );
  4893. bSign := extractFloat64Sign( b );
  4894. zSign := aSign xor bSign;
  4895. if ( aExp = $7FF ) then
  4896. Begin
  4897. if ( (( aSig0 OR aSig1 ) <>0)
  4898. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4899. Begin
  4900. propagateFloat64NaN( a, b, result );
  4901. exit;
  4902. End;
  4903. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4904. packFloat64( zSign, $7FF, 0, 0, result );
  4905. exit;
  4906. End;
  4907. if ( bExp = $7FF ) then
  4908. Begin
  4909. if ( bSig0 OR bSig1 )<> 0 then
  4910. Begin
  4911. propagateFloat64NaN( a, b, result );
  4912. exit;
  4913. End;
  4914. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4915. Begin
  4916. invalid:
  4917. float_raise( float_flag_invalid );
  4918. z.low := float64_default_nan_low;
  4919. z.high := float64_default_nan_high;
  4920. result := z;
  4921. exit;
  4922. End;
  4923. packFloat64( zSign, $7FF, 0, 0, result );
  4924. exit;
  4925. End;
  4926. if ( aExp = 0 ) then
  4927. Begin
  4928. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4929. Begin
  4930. packFloat64( zSign, 0, 0, 0, result );
  4931. exit;
  4932. End;
  4933. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4934. End;
  4935. if ( bExp = 0 ) then
  4936. Begin
  4937. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4938. Begin
  4939. packFloat64( zSign, 0, 0, 0, result );
  4940. exit;
  4941. End;
  4942. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4943. End;
  4944. zExp := aExp + bExp - $400;
  4945. aSig0 := aSig0 or $00100000;
  4946. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4947. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4948. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4949. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4950. if ( $00200000 <= zSig0 ) then
  4951. Begin
  4952. shift64ExtraRightJamming(
  4953. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4954. Inc(zExp);
  4955. End;
  4956. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4957. End;
  4958. {*
  4959. -------------------------------------------------------------------------------
  4960. Returns the result of dividing the double-precision floating-point value `a'
  4961. by the corresponding value `b'. The operation is performed according to the
  4962. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4963. -------------------------------------------------------------------------------
  4964. *}
  4965. Function float64_div(a: float64; b : float64) : Float64;
  4966. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4967. Var
  4968. aSign, bSign, zSign: flag;
  4969. aExp, bExp, zExp: int16;
  4970. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4971. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4972. z: float64;
  4973. label invalid;
  4974. Begin
  4975. aSig1 := extractFloat64Frac1( a );
  4976. aSig0 := extractFloat64Frac0( a );
  4977. aExp := extractFloat64Exp( a );
  4978. aSign := extractFloat64Sign( a );
  4979. bSig1 := extractFloat64Frac1( b );
  4980. bSig0 := extractFloat64Frac0( b );
  4981. bExp := extractFloat64Exp( b );
  4982. bSign := extractFloat64Sign( b );
  4983. zSign := aSign xor bSign;
  4984. if ( aExp = $7FF ) then
  4985. Begin
  4986. if ( aSig0 OR aSig1 )<> 0 then
  4987. Begin
  4988. propagateFloat64NaN( a, b, result );
  4989. exit;
  4990. end;
  4991. if ( bExp = $7FF ) then
  4992. Begin
  4993. if ( bSig0 OR bSig1 )<>0 then
  4994. Begin
  4995. propagateFloat64NaN( a, b, result );
  4996. exit;
  4997. End;
  4998. goto invalid;
  4999. End;
  5000. packFloat64( zSign, $7FF, 0, 0, result );
  5001. exit;
  5002. End;
  5003. if ( bExp = $7FF ) then
  5004. Begin
  5005. if ( bSig0 OR bSig1 )<> 0 then
  5006. Begin
  5007. propagateFloat64NaN( a, b, result );
  5008. exit;
  5009. End;
  5010. packFloat64( zSign, 0, 0, 0, result );
  5011. exit;
  5012. End;
  5013. if ( bExp = 0 ) then
  5014. Begin
  5015. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5016. Begin
  5017. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5018. Begin
  5019. invalid:
  5020. float_raise( float_flag_invalid );
  5021. z.low := float64_default_nan_low;
  5022. z.high := float64_default_nan_high;
  5023. result := z;
  5024. exit;
  5025. End;
  5026. float_raise( float_flag_divbyzero );
  5027. packFloat64( zSign, $7FF, 0, 0, result );
  5028. exit;
  5029. End;
  5030. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5031. End;
  5032. if ( aExp = 0 ) then
  5033. Begin
  5034. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5035. Begin
  5036. packFloat64( zSign, 0, 0, 0, result );
  5037. exit;
  5038. End;
  5039. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5040. End;
  5041. zExp := aExp - bExp + $3FD;
  5042. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  5043. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5044. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  5045. Begin
  5046. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  5047. Inc(zExp);
  5048. End;
  5049. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5050. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  5051. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  5052. while ( sbits32 (rem0) < 0 ) do
  5053. Begin
  5054. Dec(zSig0);
  5055. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  5056. End;
  5057. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  5058. if ( ( zSig1 and $3FF ) <= 4 ) then
  5059. Begin
  5060. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  5061. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  5062. while ( sbits32 (rem1) < 0 ) do
  5063. Begin
  5064. Dec(zSig1);
  5065. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  5066. End;
  5067. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5068. End;
  5069. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  5070. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  5071. End;
  5072. {*
  5073. -------------------------------------------------------------------------------
  5074. Returns the remainder of the double-precision floating-point value `a'
  5075. with respect to the corresponding value `b'. The operation is performed
  5076. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5077. -------------------------------------------------------------------------------
  5078. *}
  5079. Function float64_rem(a: float64; b : float64) : float64;
  5080. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  5081. Var
  5082. aSign, zSign: flag;
  5083. aExp, bExp, expDiff: int16;
  5084. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  5085. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  5086. sigMean0: sbits32;
  5087. z: float64;
  5088. label invalid;
  5089. Begin
  5090. aSig1 := extractFloat64Frac1( a );
  5091. aSig0 := extractFloat64Frac0( a );
  5092. aExp := extractFloat64Exp( a );
  5093. aSign := extractFloat64Sign( a );
  5094. bSig1 := extractFloat64Frac1( b );
  5095. bSig0 := extractFloat64Frac0( b );
  5096. bExp := extractFloat64Exp( b );
  5097. if ( aExp = $7FF ) then
  5098. Begin
  5099. if ((( aSig0 OR aSig1 )<>0)
  5100. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  5101. Begin
  5102. propagateFloat64NaN( a, b, result );
  5103. exit;
  5104. End;
  5105. goto invalid;
  5106. End;
  5107. if ( bExp = $7FF ) then
  5108. Begin
  5109. if ( bSig0 OR bSig1 ) <> 0 then
  5110. Begin
  5111. propagateFloat64NaN( a, b, result );
  5112. exit;
  5113. End;
  5114. result := a;
  5115. exit;
  5116. End;
  5117. if ( bExp = 0 ) then
  5118. Begin
  5119. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5120. Begin
  5121. invalid:
  5122. float_raise( float_flag_invalid );
  5123. z.low := float64_default_nan_low;
  5124. z.high := float64_default_nan_high;
  5125. result := z;
  5126. exit;
  5127. End;
  5128. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5129. End;
  5130. if ( aExp = 0 ) then
  5131. Begin
  5132. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5133. Begin
  5134. result := a;
  5135. exit;
  5136. End;
  5137. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5138. End;
  5139. expDiff := aExp - bExp;
  5140. if ( expDiff < -1 ) then
  5141. Begin
  5142. result := a;
  5143. exit;
  5144. End;
  5145. shortShift64Left(
  5146. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5147. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5148. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5149. if ( q )<>0 then
  5150. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5151. expDiff := expDiff - 32;
  5152. while ( 0 < expDiff ) do
  5153. Begin
  5154. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5155. if 4 < q then
  5156. q:= q - 4
  5157. else
  5158. q := 0;
  5159. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5160. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5161. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5162. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5163. expDiff := expDiff - 29;
  5164. End;
  5165. if ( -32 < expDiff ) then
  5166. Begin
  5167. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5168. if 4 < q then
  5169. q := q - 4
  5170. else
  5171. q := 0;
  5172. q := q shr (- expDiff);
  5173. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5174. expDiff := expDiff + 24;
  5175. if ( expDiff < 0 ) then
  5176. Begin
  5177. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5178. End
  5179. else
  5180. Begin
  5181. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5182. End;
  5183. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5184. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5185. End
  5186. else
  5187. Begin
  5188. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5189. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5190. End;
  5191. Repeat
  5192. alternateASig0 := aSig0;
  5193. alternateASig1 := aSig1;
  5194. Inc(q);
  5195. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5196. Until not ( 0 <= sbits32 (aSig0) );
  5197. add64(
  5198. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5199. if ( ( sigMean0 < 0 )
  5200. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5201. Begin
  5202. aSig0 := alternateASig0;
  5203. aSig1 := alternateASig1;
  5204. End;
  5205. zSign := flag( sbits32 (aSig0) < 0 );
  5206. if ( zSign <> 0 ) then
  5207. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5208. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5209. End;
  5210. {*
  5211. -------------------------------------------------------------------------------
  5212. Returns the square root of the double-precision floating-point value `a'.
  5213. The operation is performed according to the IEC/IEEE Standard for Binary
  5214. Floating-Point Arithmetic.
  5215. -------------------------------------------------------------------------------
  5216. *}
  5217. function float64_sqrt( a: float64 ): float64;
  5218. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5219. Var
  5220. aSign: flag;
  5221. aExp, zExp: int16;
  5222. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5223. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5224. label invalid;
  5225. Begin
  5226. aSig1 := extractFloat64Frac1( a );
  5227. aSig0 := extractFloat64Frac0( a );
  5228. aExp := extractFloat64Exp( a );
  5229. aSign := extractFloat64Sign( a );
  5230. if ( aExp = $7FF ) then
  5231. Begin
  5232. if ( aSig0 OR aSig1 ) <> 0 then
  5233. Begin
  5234. propagateFloat64NaN( a, a, result );
  5235. exit;
  5236. End;
  5237. if ( aSign = 0) then
  5238. Begin
  5239. result := a;
  5240. exit;
  5241. End;
  5242. goto invalid;
  5243. End;
  5244. if ( aSign <> 0 ) then
  5245. Begin
  5246. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5247. Begin
  5248. result := a;
  5249. exit;
  5250. End;
  5251. invalid:
  5252. float_raise( float_flag_invalid );
  5253. result.low := float64_default_nan_low;
  5254. result.high := float64_default_nan_high;
  5255. exit;
  5256. End;
  5257. if ( aExp = 0 ) then
  5258. Begin
  5259. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5260. Begin
  5261. packFloat64( 0, 0, 0, 0, result );
  5262. exit;
  5263. End;
  5264. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5265. End;
  5266. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5267. aSig0 := aSig0 or $00100000;
  5268. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5269. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5270. if ( zSig0 = 0 ) then
  5271. zSig0 := $7FFFFFFF;
  5272. doubleZSig0 := zSig0 + zSig0;
  5273. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5274. mul32To64( zSig0, zSig0, term0, term1 );
  5275. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5276. while ( sbits32 (rem0) < 0 ) do
  5277. Begin
  5278. Dec(zSig0);
  5279. doubleZSig0 := doubleZSig0 - 2;
  5280. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5281. End;
  5282. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5283. if ( ( zSig1 and $1FF ) <= 5 ) then
  5284. Begin
  5285. if ( zSig1 = 0 ) then
  5286. zSig1 := 1;
  5287. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5288. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5289. mul32To64( zSig1, zSig1, term2, term3 );
  5290. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5291. while ( sbits32 (rem1) < 0 ) do
  5292. Begin
  5293. Dec(zSig1);
  5294. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5295. term3 := term3 or 1;
  5296. term2 := term2 or doubleZSig0;
  5297. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5298. End;
  5299. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5300. End;
  5301. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5302. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5303. End;
  5304. {*
  5305. -------------------------------------------------------------------------------
  5306. Returns 1 if the double-precision floating-point value `a' is equal to
  5307. the corresponding value `b', and 0 otherwise. The comparison is performed
  5308. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5309. -------------------------------------------------------------------------------
  5310. *}
  5311. Function float64_eq(a: float64; b: float64): flag;
  5312. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5313. Begin
  5314. if
  5315. (
  5316. ( extractFloat64Exp( a ) = $7FF )
  5317. AND
  5318. (
  5319. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5320. )
  5321. )
  5322. OR (
  5323. ( extractFloat64Exp( b ) = $7FF )
  5324. AND (
  5325. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5326. )
  5327. )
  5328. ) then
  5329. Begin
  5330. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5331. float_raise( float_flag_invalid );
  5332. float64_eq := 0;
  5333. exit;
  5334. End;
  5335. float64_eq := flag(
  5336. ( a.low = b.low )
  5337. AND ( ( a.high = b.high )
  5338. OR ( ( a.low = 0 )
  5339. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5340. ));
  5341. End;
  5342. {*
  5343. -------------------------------------------------------------------------------
  5344. Returns 1 if the double-precision floating-point value `a' is less than
  5345. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5346. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5347. Arithmetic.
  5348. -------------------------------------------------------------------------------
  5349. *}
  5350. Function float64_le(a: float64;b: float64): flag;
  5351. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5352. Var
  5353. aSign, bSign: flag;
  5354. Begin
  5355. if
  5356. (
  5357. ( extractFloat64Exp( a ) = $7FF )
  5358. AND
  5359. (
  5360. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5361. )
  5362. )
  5363. OR (
  5364. ( extractFloat64Exp( b ) = $7FF )
  5365. AND (
  5366. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5367. )
  5368. )
  5369. ) then
  5370. Begin
  5371. float_raise( float_flag_invalid );
  5372. float64_le := 0;
  5373. exit;
  5374. End;
  5375. aSign := extractFloat64Sign( a );
  5376. bSign := extractFloat64Sign( b );
  5377. if ( aSign <> bSign ) then
  5378. Begin
  5379. float64_le := flag(
  5380. (aSign <> 0)
  5381. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5382. = 0 ));
  5383. exit;
  5384. End;
  5385. if aSign <> 0 then
  5386. float64_le := le64( b.high, b.low, a.high, a.low )
  5387. else
  5388. float64_le := le64( a.high, a.low, b.high, b.low );
  5389. End;
  5390. {*
  5391. -------------------------------------------------------------------------------
  5392. Returns 1 if the double-precision floating-point value `a' is less than
  5393. the corresponding value `b', and 0 otherwise. The comparison is performed
  5394. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5395. -------------------------------------------------------------------------------
  5396. *}
  5397. Function float64_lt(a: float64;b: float64): flag;
  5398. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5399. Var
  5400. aSign, bSign: flag;
  5401. Begin
  5402. if
  5403. (
  5404. ( extractFloat64Exp( a ) = $7FF )
  5405. AND
  5406. (
  5407. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5408. )
  5409. )
  5410. OR (
  5411. ( extractFloat64Exp( b ) = $7FF )
  5412. AND (
  5413. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5414. )
  5415. )
  5416. ) then
  5417. Begin
  5418. float_raise( float_flag_invalid );
  5419. float64_lt := 0;
  5420. exit;
  5421. End;
  5422. aSign := extractFloat64Sign( a );
  5423. bSign := extractFloat64Sign( b );
  5424. if ( aSign <> bSign ) then
  5425. Begin
  5426. float64_lt := flag(
  5427. (aSign <> 0)
  5428. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5429. <> 0 ));
  5430. exit;
  5431. End;
  5432. if aSign <> 0 then
  5433. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5434. else
  5435. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5436. End;
  5437. {*
  5438. -------------------------------------------------------------------------------
  5439. Returns 1 if the double-precision floating-point value `a' is equal to
  5440. the corresponding value `b', and 0 otherwise. The invalid exception is
  5441. raised if either operand is a NaN. Otherwise, the comparison is performed
  5442. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5443. -------------------------------------------------------------------------------
  5444. *}
  5445. Function float64_eq_signaling( a: float64; b: float64): flag;
  5446. Begin
  5447. if
  5448. (
  5449. ( extractFloat64Exp( a ) = $7FF )
  5450. AND
  5451. (
  5452. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5453. )
  5454. )
  5455. OR (
  5456. ( extractFloat64Exp( b ) = $7FF )
  5457. AND (
  5458. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5459. )
  5460. )
  5461. ) then
  5462. Begin
  5463. float_raise( float_flag_invalid );
  5464. float64_eq_signaling := 0;
  5465. exit;
  5466. End;
  5467. float64_eq_signaling := flag(
  5468. ( a.low = b.low )
  5469. AND ( ( a.high = b.high )
  5470. OR ( ( a.low = 0 )
  5471. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5472. ));
  5473. End;
  5474. {*
  5475. -------------------------------------------------------------------------------
  5476. Returns 1 if the double-precision floating-point value `a' is less than or
  5477. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5478. cause an exception. Otherwise, the comparison is performed according to the
  5479. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5480. -------------------------------------------------------------------------------
  5481. *}
  5482. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5483. Var
  5484. aSign, bSign : flag;
  5485. Begin
  5486. if
  5487. (
  5488. ( extractFloat64Exp( a ) = $7FF )
  5489. AND
  5490. (
  5491. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5492. )
  5493. )
  5494. OR (
  5495. ( extractFloat64Exp( b ) = $7FF )
  5496. AND (
  5497. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5498. )
  5499. )
  5500. ) then
  5501. Begin
  5502. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5503. float_raise( float_flag_invalid );
  5504. float64_le_quiet := 0;
  5505. exit;
  5506. End;
  5507. aSign := extractFloat64Sign( a );
  5508. bSign := extractFloat64Sign( b );
  5509. if ( aSign <> bSign ) then
  5510. Begin
  5511. float64_le_quiet := flag
  5512. ((aSign <> 0)
  5513. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5514. = 0 ));
  5515. exit;
  5516. End;
  5517. if aSign <> 0 then
  5518. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5519. else
  5520. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5521. End;
  5522. {*
  5523. -------------------------------------------------------------------------------
  5524. Returns 1 if the double-precision floating-point value `a' is less than
  5525. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5526. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5527. Standard for Binary Floating-Point Arithmetic.
  5528. -------------------------------------------------------------------------------
  5529. *}
  5530. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5531. Var
  5532. aSign, bSign: flag;
  5533. Begin
  5534. if
  5535. (
  5536. ( extractFloat64Exp( a ) = $7FF )
  5537. AND
  5538. (
  5539. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5540. )
  5541. )
  5542. OR (
  5543. ( extractFloat64Exp( b ) = $7FF )
  5544. AND (
  5545. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5546. )
  5547. )
  5548. ) then
  5549. Begin
  5550. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5551. float_raise( float_flag_invalid );
  5552. float64_lt_quiet := 0;
  5553. exit;
  5554. End;
  5555. aSign := extractFloat64Sign( a );
  5556. bSign := extractFloat64Sign( b );
  5557. if ( aSign <> bSign ) then
  5558. Begin
  5559. float64_lt_quiet := flag(
  5560. (aSign<>0)
  5561. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5562. <> 0 ));
  5563. exit;
  5564. End;
  5565. If aSign <> 0 then
  5566. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5567. else
  5568. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5569. End;
  5570. {*----------------------------------------------------------------------------
  5571. | Returns the result of converting the 64-bit two's complement integer `a'
  5572. | to the single-precision floating-point format. The conversion is performed
  5573. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5574. *----------------------------------------------------------------------------*}
  5575. function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  5576. var
  5577. zSign : flag;
  5578. absA : uint64;
  5579. shiftCount: int8;
  5580. Begin
  5581. if ( a = 0 ) then
  5582. begin
  5583. int64_to_float32.float32 := 0;
  5584. exit;
  5585. end;
  5586. if a < 0 then
  5587. zSign := flag(TRUE)
  5588. else
  5589. zSign := flag(FALSE);
  5590. if zSign<>0 then
  5591. absA := -a
  5592. else
  5593. absA := a;
  5594. shiftCount := countLeadingZeros64( absA ) - 40;
  5595. if ( 0 <= shiftCount ) then
  5596. begin
  5597. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5598. end
  5599. else
  5600. begin
  5601. shiftCount := shiftCount + 7;
  5602. if ( shiftCount < 0 ) then
  5603. shift64RightJamming( absA, - shiftCount, absA )
  5604. else
  5605. absA := absA shl shiftCount;
  5606. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5607. end;
  5608. End;
  5609. {*----------------------------------------------------------------------------
  5610. | Returns the result of converting the 64-bit two's complement integer `a'
  5611. | to the single-precision floating-point format. The conversion is performed
  5612. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5613. | Unisgned version.
  5614. *----------------------------------------------------------------------------*}
  5615. function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  5616. var
  5617. absA : uint64;
  5618. shiftCount: int8;
  5619. Begin
  5620. if ( a = 0 ) then
  5621. begin
  5622. qword_to_float32.float32 := 0;
  5623. exit;
  5624. end;
  5625. absA := a;
  5626. shiftCount := countLeadingZeros64( absA ) - 40;
  5627. if ( 0 <= shiftCount ) then
  5628. begin
  5629. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5630. end
  5631. else
  5632. begin
  5633. shiftCount := shiftCount + 7;
  5634. if ( shiftCount < 0 ) then
  5635. shift64RightJamming( absA, - shiftCount, absA )
  5636. else
  5637. absA := absA shl shiftCount;
  5638. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5639. end;
  5640. End;
  5641. {*----------------------------------------------------------------------------
  5642. | Returns the result of converting the 64-bit two's complement integer `a'
  5643. | to the double-precision floating-point format. The conversion is performed
  5644. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5645. *----------------------------------------------------------------------------*}
  5646. function qword_to_float64( a: qword ): float64;
  5647. {$ifdef FPC_IS_SYSTEM}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5648. var
  5649. shiftCount: int8;
  5650. Begin
  5651. if ( a = 0 ) then
  5652. result := packFloat64( 0, 0, 0 )
  5653. else
  5654. begin
  5655. shiftCount := countLeadingZeros64(a) - 1;
  5656. { numbers with <= 53 significant bits are converted exactly }
  5657. if (shiftCount > 9) then
  5658. result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
  5659. else if (shiftCount>=0) then
  5660. result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
  5661. else
  5662. begin
  5663. { the only possible negative value is -1, in case bit 63 of 'a' is set }
  5664. shift64RightJamming(a, 1, a);
  5665. result := roundAndPackFloat64(0, $43d, a);
  5666. end;
  5667. end;
  5668. End;
  5669. {*----------------------------------------------------------------------------
  5670. | Returns the result of converting the 64-bit two's complement integer `a'
  5671. | to the double-precision floating-point format. The conversion is performed
  5672. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5673. *----------------------------------------------------------------------------*}
  5674. function int64_to_float64( a: int64 ): float64;
  5675. {$ifdef FPC_IS_SYSTEM}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5676. Begin
  5677. if ( a = 0 ) then
  5678. result := packFloat64( 0, 0, 0 )
  5679. else if (a = int64($8000000000000000)) then
  5680. result := packFloat64( 1, $43e, 0 )
  5681. else if (a < 0) then
  5682. result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  5683. else
  5684. result := normalizeRoundAndPackFloat64( 0, $43c, a );
  5685. End;
  5686. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5687. {*----------------------------------------------------------------------------
  5688. | Returns the result of converting the 64-bit two's complement integer `a'
  5689. | to the extended double-precision floating-point format. The conversion
  5690. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5691. | Arithmetic.
  5692. *----------------------------------------------------------------------------*}
  5693. function int64_to_floatx80( a: int64 ): floatx80;
  5694. var
  5695. zSign: flag;
  5696. absA: uint64;
  5697. shiftCount: int8;
  5698. begin
  5699. if ( a = 0 ) then begin
  5700. result := packFloatx80( 0, 0, 0 );
  5701. exit;
  5702. end;
  5703. zSign := ord( a < 0 );
  5704. if zSign <> 0 then absA := - a else absA := a;
  5705. shiftCount := countLeadingZeros64( absA );
  5706. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5707. end;
  5708. {*----------------------------------------------------------------------------
  5709. | Returns the result of converting the 64-bit two's complement integer `a'
  5710. | to the extended double-precision floating-point format. The conversion
  5711. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5712. | Arithmetic.
  5713. | Unsigned version.
  5714. *----------------------------------------------------------------------------*}
  5715. function qword_to_floatx80( a: qword ): floatx80;
  5716. var
  5717. absA: bits64;
  5718. shiftCount: int8;
  5719. begin
  5720. if ( a = 0 ) then begin
  5721. result := packFloatx80( 0, 0, 0 );
  5722. exit;
  5723. end;
  5724. absA := a;
  5725. shiftCount := countLeadingZeros64( absA );
  5726. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5727. end;
  5728. {$endif FPC_SOFTFLOAT_FLOATX80}
  5729. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5730. {*----------------------------------------------------------------------------
  5731. | Returns the result of converting the 64-bit two's complement integer `a' to
  5732. | the quadruple-precision floating-point format. The conversion is performed
  5733. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5734. *----------------------------------------------------------------------------*}
  5735. function int64_to_float128( a: int64 ): float128;
  5736. var
  5737. zSign: flag;
  5738. absA: uint64;
  5739. shiftCount: int8;
  5740. zExp: int32;
  5741. zSig0, zSig1: bits64;
  5742. begin
  5743. if ( a = 0 ) then begin
  5744. result := packFloat128( 0, 0, 0, 0 );
  5745. exit;
  5746. end;
  5747. zSign := ord( a < 0 );
  5748. if zSign <> 0 then absA := - a else absA := a;
  5749. shiftCount := countLeadingZeros64( absA ) + 49;
  5750. zExp := $406E - shiftCount;
  5751. if ( 64 <= shiftCount ) then begin
  5752. zSig1 := 0;
  5753. zSig0 := absA;
  5754. dec( shiftCount, 64 );
  5755. end
  5756. else begin
  5757. zSig1 := absA;
  5758. zSig0 := 0;
  5759. end;
  5760. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5761. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5762. end;
  5763. {*----------------------------------------------------------------------------
  5764. | Returns the result of converting the 64-bit two's complement integer `a' to
  5765. | the quadruple-precision floating-point format. The conversion is performed
  5766. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5767. | Unsigned version.
  5768. *----------------------------------------------------------------------------*}
  5769. function qword_to_float128( a: qword ): float128;
  5770. var
  5771. absA: bits64;
  5772. shiftCount: int8;
  5773. zExp: int32;
  5774. zSig0, zSig1: bits64;
  5775. begin
  5776. if ( a = 0 ) then begin
  5777. result := packFloat128( 0, 0, 0, 0 );
  5778. exit;
  5779. end;
  5780. absA := a;
  5781. shiftCount := countLeadingZeros64( absA ) + 49;
  5782. zExp := $406E - shiftCount;
  5783. if ( 64 <= shiftCount ) then begin
  5784. zSig1 := 0;
  5785. zSig0 := absA;
  5786. dec( shiftCount, 64 );
  5787. end
  5788. else begin
  5789. zSig1 := absA;
  5790. zSig0 := 0;
  5791. end;
  5792. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5793. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5794. end;
  5795. {$endif FPC_SOFTFLOAT_FLOAT128}
  5796. {*----------------------------------------------------------------------------
  5797. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5798. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5799. | Otherwise, returns 0.
  5800. *----------------------------------------------------------------------------*}
  5801. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5802. begin
  5803. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5804. end;
  5805. {*----------------------------------------------------------------------------
  5806. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5807. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5808. | Otherwise, returns 0.
  5809. *----------------------------------------------------------------------------*}
  5810. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5811. begin
  5812. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5813. end;
  5814. {*----------------------------------------------------------------------------
  5815. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5816. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5817. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5818. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5819. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5820. | the most-significant bit of the extra result, and the other 63 bits of the
  5821. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5822. | were all zero. This extra result is stored in the location pointed to by
  5823. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5824. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5825. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5826. | fixed-point value is shifted right by the number of bits given in `count',
  5827. | and the integer part of the result is returned at the locations pointed to
  5828. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5829. | corrupted as described above, and is returned at the location pointed to by
  5830. | `z2Ptr'.)
  5831. *----------------------------------------------------------------------------*}
  5832. procedure shift128ExtraRightJamming(
  5833. a0: bits64;
  5834. a1: bits64;
  5835. a2: bits64;
  5836. count: int16;
  5837. var z0Ptr: bits64;
  5838. var z1Ptr: bits64;
  5839. var z2Ptr: bits64);
  5840. var
  5841. z0, z1, z2: bits64;
  5842. negCount: int8;
  5843. begin
  5844. negCount := ( - count ) and 63;
  5845. if ( count = 0 ) then
  5846. begin
  5847. z2 := a2;
  5848. z1 := a1;
  5849. z0 := a0;
  5850. end
  5851. else begin
  5852. if ( count < 64 ) then
  5853. begin
  5854. z2 := a1 shl negCount;
  5855. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5856. z0 := a0 shr count;
  5857. end
  5858. else begin
  5859. if ( count = 64 ) then
  5860. begin
  5861. z2 := a1;
  5862. z1 := a0;
  5863. end
  5864. else begin
  5865. a2 := a2 or a1;
  5866. if ( count < 128 ) then
  5867. begin
  5868. z2 := a0 shl negCount;
  5869. z1 := a0 shr ( count and 63 );
  5870. end
  5871. else begin
  5872. if ( count = 128 ) then
  5873. z2 := a0
  5874. else
  5875. z2 := ord( a0 <> 0 );
  5876. z1 := 0;
  5877. end;
  5878. end;
  5879. z0 := 0;
  5880. end;
  5881. z2 := z2 or ord( a2 <> 0 );
  5882. end;
  5883. z2Ptr := z2;
  5884. z1Ptr := z1;
  5885. z0Ptr := z0;
  5886. end;
  5887. {*----------------------------------------------------------------------------
  5888. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5889. | _plus_ the number of bits given in `count'. The shifted result is at most
  5890. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5891. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5892. | shifted off is the most-significant bit of the extra result, and the other
  5893. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5894. | bits shifted off were all zero. This extra result is stored in the location
  5895. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5896. | (This routine makes more sense if `a0' and `a1' are considered to form
  5897. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5898. | point value is shifted right by the number of bits given in `count', and
  5899. | the integer part of the result is returned at the location pointed to by
  5900. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5901. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5902. *----------------------------------------------------------------------------*}
  5903. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5904. var
  5905. z0, z1: bits64;
  5906. negCount: int8;
  5907. begin
  5908. negCount := ( - count ) and 63;
  5909. if ( count = 0 ) then
  5910. begin
  5911. z1 := a1;
  5912. z0 := a0;
  5913. end
  5914. else if ( count < 64 ) then
  5915. begin
  5916. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5917. z0 := a0 shr count;
  5918. end
  5919. else begin
  5920. if ( count = 64 ) then
  5921. begin
  5922. z1 := a0 or ord( a1 <> 0 );
  5923. end
  5924. else begin
  5925. z1 := ord( ( a0 or a1 ) <> 0 );
  5926. end;
  5927. z0 := 0;
  5928. end;
  5929. z1Ptr := z1;
  5930. z0Ptr := z0;
  5931. end;
  5932. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5933. {*----------------------------------------------------------------------------
  5934. | Returns the fraction bits of the extended double-precision floating-point
  5935. | value `a'.
  5936. *----------------------------------------------------------------------------*}
  5937. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5938. begin
  5939. result:=a.low;
  5940. end;
  5941. {*----------------------------------------------------------------------------
  5942. | Returns the exponent bits of the extended double-precision floating-point
  5943. | value `a'.
  5944. *----------------------------------------------------------------------------*}
  5945. function extractFloatx80Exp(a : floatx80): int32;inline;
  5946. begin
  5947. result:=a.high and $7FFF;
  5948. end;
  5949. {*----------------------------------------------------------------------------
  5950. | Returns the sign bit of the extended double-precision floating-point value
  5951. | `a'.
  5952. *----------------------------------------------------------------------------*}
  5953. function extractFloatx80Sign(a : floatx80): flag;inline;
  5954. begin
  5955. result:=a.high shr 15;
  5956. end;
  5957. {*----------------------------------------------------------------------------
  5958. | Normalizes the subnormal extended double-precision floating-point value
  5959. | represented by the denormalized significand `aSig'. The normalized exponent
  5960. | and significand are stored at the locations pointed to by `zExpPtr' and
  5961. | `zSigPtr', respectively.
  5962. *----------------------------------------------------------------------------*}
  5963. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5964. var
  5965. shiftCount: int8;
  5966. begin
  5967. shiftCount := countLeadingZeros64( aSig );
  5968. zSigPtr := aSig shl shiftCount;
  5969. zExpPtr := 1 - shiftCount;
  5970. end;
  5971. {*----------------------------------------------------------------------------
  5972. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5973. | extended double-precision floating-point value, returning the result.
  5974. *----------------------------------------------------------------------------*}
  5975. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5976. var
  5977. z: floatx80;
  5978. begin
  5979. z.low := zSig;
  5980. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5981. result:=z;
  5982. end;
  5983. {*----------------------------------------------------------------------------
  5984. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5985. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5986. | and returns the proper extended double-precision floating-point value
  5987. | corresponding to the abstract input. Ordinarily, the abstract value is
  5988. | rounded and packed into the extended double-precision format, with the
  5989. | inexact exception raised if the abstract input cannot be represented
  5990. | exactly. However, if the abstract value is too large, the overflow and
  5991. | inexact exceptions are raised and an infinity or maximal finite value is
  5992. | returned. If the abstract value is too small, the input value is rounded to
  5993. | a subnormal number, and the underflow and inexact exceptions are raised if
  5994. | the abstract input cannot be represented exactly as a subnormal extended
  5995. | double-precision floating-point number.
  5996. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5997. | number of bits as single or double precision, respectively. Otherwise, the
  5998. | result is rounded to the full precision of the extended double-precision
  5999. | format.
  6000. | The input significand must be normalized or smaller. If the input
  6001. | significand is not normalized, `zExp' must be 0; in that case, the result
  6002. | returned is a subnormal number, and it must not require rounding. The
  6003. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  6004. | Floating-Point Arithmetic.
  6005. *----------------------------------------------------------------------------*}
  6006. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6007. var
  6008. roundingMode: TFPURoundingMode;
  6009. roundNearestEven, increment, isTiny: flag;
  6010. roundIncrement, roundMask, roundBits: int64;
  6011. label
  6012. precision80, overflow;
  6013. begin
  6014. roundingMode := softfloat_rounding_mode;
  6015. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  6016. if ( roundingPrecision = 80 ) then
  6017. goto precision80;
  6018. if ( roundingPrecision = 64 ) then
  6019. begin
  6020. roundIncrement := int64( $0000000000000400 );
  6021. roundMask := int64( $00000000000007FF );
  6022. end
  6023. else if ( roundingPrecision = 32 ) then
  6024. begin
  6025. roundIncrement := int64( $0000008000000000 );
  6026. roundMask := int64( $000000FFFFFFFFFF );
  6027. end
  6028. else begin
  6029. goto precision80;
  6030. end;
  6031. zSig0 := zSig0 or ord( zSig1 <> 0 );
  6032. if ( not (roundNearestEven<>0) ) then
  6033. begin
  6034. if ( roundingMode = float_round_to_zero ) then
  6035. begin
  6036. roundIncrement := 0;
  6037. end
  6038. else begin
  6039. roundIncrement := roundMask;
  6040. if ( zSign<>0 ) then
  6041. begin
  6042. if ( roundingMode = float_round_up ) then
  6043. roundIncrement := 0;
  6044. end
  6045. else begin
  6046. if ( roundingMode = float_round_down ) then
  6047. roundIncrement := 0;
  6048. end;
  6049. end;
  6050. end;
  6051. roundBits := zSig0 and roundMask;
  6052. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6053. if ( ( $7FFE < zExp )
  6054. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  6055. ) then begin
  6056. goto overflow;
  6057. end;
  6058. if ( zExp <= 0 ) then begin
  6059. isTiny := ord (
  6060. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6061. or ( zExp < 0 )
  6062. or ( zSig0 <= zSig0 + roundIncrement ) );
  6063. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  6064. zExp := 0;
  6065. roundBits := zSig0 and roundMask;
  6066. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6067. if ( roundBits <> 0 ) then set_inexact_flag;
  6068. inc( zSig0, roundIncrement );
  6069. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6070. roundIncrement := roundMask + 1;
  6071. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6072. roundMask := roundMask or roundIncrement;
  6073. end;
  6074. zSig0 := zSig0 and not roundMask;
  6075. result:=packFloatx80( zSign, zExp, zSig0 );
  6076. exit;
  6077. end;
  6078. end;
  6079. if ( roundBits <> 0 ) then set_inexact_flag;
  6080. inc( zSig0, roundIncrement );
  6081. if ( zSig0 < roundIncrement ) then begin
  6082. inc(zExp);
  6083. zSig0 := bits64( $8000000000000000 );
  6084. end;
  6085. roundIncrement := roundMask + 1;
  6086. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6087. roundMask := roundMask or roundIncrement;
  6088. end;
  6089. zSig0 := zSig0 and not roundMask;
  6090. if ( zSig0 = 0 ) then zExp := 0;
  6091. result:=packFloatx80( zSign, zExp, zSig0 );
  6092. exit;
  6093. precision80:
  6094. increment := ord ( sbits64( zSig1 ) < 0 );
  6095. if ( roundNearestEven = 0 ) then begin
  6096. if ( roundingMode = float_round_to_zero ) then begin
  6097. increment := 0;
  6098. end
  6099. else begin
  6100. if ( zSign <> 0 ) then begin
  6101. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6102. end
  6103. else begin
  6104. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6105. end;
  6106. end;
  6107. end;
  6108. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6109. if ( ( $7FFE < zExp )
  6110. or ( ( zExp = $7FFE )
  6111. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6112. and ( increment <> 0 )
  6113. )
  6114. ) then begin
  6115. roundMask := 0;
  6116. overflow:
  6117. float_raise( [float_flag_overflow,float_flag_inexact] );
  6118. if ( ( roundingMode = float_round_to_zero )
  6119. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6120. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6121. ) then begin
  6122. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6123. exit;
  6124. end;
  6125. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6126. exit;
  6127. end;
  6128. if ( zExp <= 0 ) then begin
  6129. isTiny := ord(
  6130. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6131. or ( zExp < 0 )
  6132. or ( increment = 0 )
  6133. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6134. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6135. zExp := 0;
  6136. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6137. if ( zSig1 <> 0 ) then set_inexact_flag;
  6138. if ( roundNearestEven <> 0 ) then begin
  6139. increment := ord( sbits64( zSig1 ) < 0 );
  6140. end
  6141. else begin
  6142. if ( zSign <> 0 ) then begin
  6143. increment := ord( roundingMode = float_round_down ) and zSig1;
  6144. end
  6145. else begin
  6146. increment := ord( roundingMode = float_round_up ) and zSig1;
  6147. end;
  6148. end;
  6149. if ( increment <> 0 ) then begin
  6150. inc(zSig0);
  6151. zSig0 :=
  6152. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6153. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6154. end;
  6155. result:=packFloatx80( zSign, zExp, zSig0 );
  6156. exit;
  6157. end;
  6158. end;
  6159. if ( zSig1 <> 0 ) then set_inexact_flag;
  6160. if ( increment <> 0 ) then begin
  6161. inc(zSig0);
  6162. if ( zSig0 = 0 ) then begin
  6163. inc(zExp);
  6164. zSig0 := bits64( $8000000000000000 );
  6165. end
  6166. else begin
  6167. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6168. end;
  6169. end
  6170. else begin
  6171. if ( zSig0 = 0 ) then zExp := 0;
  6172. end;
  6173. result:=packFloatx80( zSign, zExp, zSig0 );
  6174. end;
  6175. {*----------------------------------------------------------------------------
  6176. | Takes an abstract floating-point value having sign `zSign', exponent
  6177. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6178. | and returns the proper extended double-precision floating-point value
  6179. | corresponding to the abstract input. This routine is just like
  6180. | `roundAndPackFloatx80' except that the input significand does not have to be
  6181. | normalized.
  6182. *----------------------------------------------------------------------------*}
  6183. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6184. var
  6185. shiftCount: int8;
  6186. begin
  6187. if ( zSig0 = 0 ) then begin
  6188. zSig0 := zSig1;
  6189. zSig1 := 0;
  6190. dec( zExp, 64 );
  6191. end;
  6192. shiftCount := countLeadingZeros64( zSig0 );
  6193. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6194. zExp := zExp - shiftCount;
  6195. result :=
  6196. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6197. end;
  6198. {*----------------------------------------------------------------------------
  6199. | Returns the result of converting the extended double-precision floating-
  6200. | point value `a' to the 32-bit two's complement integer format. The
  6201. | conversion is performed according to the IEC/IEEE Standard for Binary
  6202. | Floating-Point Arithmetic---which means in particular that the conversion
  6203. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6204. | largest positive integer is returned. Otherwise, if the conversion
  6205. | overflows, the largest integer with the same sign as `a' is returned.
  6206. *----------------------------------------------------------------------------*}
  6207. function floatx80_to_int32(a: floatx80): int32;
  6208. var
  6209. aSign: flag;
  6210. aExp, shiftCount: int32;
  6211. aSig: bits64;
  6212. begin
  6213. aSig := extractFloatx80Frac( a );
  6214. aExp := extractFloatx80Exp( a );
  6215. aSign := extractFloatx80Sign( a );
  6216. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6217. shiftCount := $4037 - aExp;
  6218. if ( shiftCount <= 0 ) then shiftCount := 1;
  6219. shift64RightJamming( aSig, shiftCount, aSig );
  6220. result := roundAndPackInt32( aSign, aSig );
  6221. end;
  6222. {*----------------------------------------------------------------------------
  6223. | Returns the result of converting the extended double-precision floating-
  6224. | point value `a' to the 32-bit two's complement integer format. The
  6225. | conversion is performed according to the IEC/IEEE Standard for Binary
  6226. | Floating-Point Arithmetic, except that the conversion is always rounded
  6227. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6228. | Otherwise, if the conversion overflows, the largest integer with the same
  6229. | sign as `a' is returned.
  6230. *----------------------------------------------------------------------------*}
  6231. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6232. var
  6233. aSign: flag;
  6234. aExp, shiftCount: int32;
  6235. aSig, savedASig: bits64;
  6236. z: int32;
  6237. label
  6238. invalid;
  6239. begin
  6240. aSig := extractFloatx80Frac( a );
  6241. aExp := extractFloatx80Exp( a );
  6242. aSign := extractFloatx80Sign( a );
  6243. if ( $401E < aExp ) then begin
  6244. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6245. goto invalid;
  6246. end
  6247. else if ( aExp < $3FFF ) then begin
  6248. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6249. result := 0;
  6250. exit;
  6251. end;
  6252. shiftCount := $403E - aExp;
  6253. savedASig := aSig;
  6254. aSig := aSig shr shiftCount;
  6255. z := aSig;
  6256. if ( aSign <> 0 ) then z := - z;
  6257. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6258. invalid:
  6259. float_raise( float_flag_invalid );
  6260. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6261. exit;
  6262. end;
  6263. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6264. set_inexact_flag;
  6265. end;
  6266. result := z;
  6267. end;
  6268. {*----------------------------------------------------------------------------
  6269. | Returns the result of converting the extended double-precision floating-
  6270. | point value `a' to the 64-bit two's complement integer format. The
  6271. | conversion is performed according to the IEC/IEEE Standard for Binary
  6272. | Floating-Point Arithmetic---which means in particular that the conversion
  6273. | is rounded according to the current rounding mode. If `a' is a NaN,
  6274. | the largest positive integer is returned. Otherwise, if the conversion
  6275. | overflows, the largest integer with the same sign as `a' is returned.
  6276. *----------------------------------------------------------------------------*}
  6277. function floatx80_to_int64(a: floatx80): int64;
  6278. var
  6279. aSign: flag;
  6280. aExp, shiftCount: int32;
  6281. aSig, aSigExtra: bits64;
  6282. begin
  6283. aSig := extractFloatx80Frac( a );
  6284. aExp := extractFloatx80Exp( a );
  6285. aSign := extractFloatx80Sign( a );
  6286. shiftCount := $403E - aExp;
  6287. if ( shiftCount <= 0 ) then begin
  6288. if ( shiftCount <> 0 ) then begin
  6289. float_raise( float_flag_invalid );
  6290. if ( ( aSign = 0 )
  6291. or ( ( aExp = $7FFF )
  6292. and ( aSig <> bits64( $8000000000000000 ) ) )
  6293. ) then begin
  6294. result := $7FFFFFFFFFFFFFFF;
  6295. exit;
  6296. end;
  6297. result := $8000000000000000;
  6298. exit;
  6299. end;
  6300. aSigExtra := 0;
  6301. end
  6302. else begin
  6303. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6304. end;
  6305. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6306. end;
  6307. {*----------------------------------------------------------------------------
  6308. | Returns the result of converting the extended double-precision floating-
  6309. | point value `a' to the 64-bit two's complement integer format. The
  6310. | conversion is performed according to the IEC/IEEE Standard for Binary
  6311. | Floating-Point Arithmetic, except that the conversion is always rounded
  6312. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6313. | Otherwise, if the conversion overflows, the largest integer with the same
  6314. | sign as `a' is returned.
  6315. *----------------------------------------------------------------------------*}
  6316. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6317. var
  6318. aSign: flag;
  6319. aExp, shiftCount: int32;
  6320. aSig: bits64;
  6321. z: int64;
  6322. begin
  6323. aSig := extractFloatx80Frac( a );
  6324. aExp := extractFloatx80Exp( a );
  6325. aSign := extractFloatx80Sign( a );
  6326. shiftCount := aExp - $403E;
  6327. if ( 0 <= shiftCount ) then begin
  6328. aSig := $7FFFFFFFFFFFFFFF;
  6329. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6330. float_raise( float_flag_invalid );
  6331. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6332. result := $7FFFFFFFFFFFFFFF;
  6333. exit;
  6334. end;
  6335. end;
  6336. result := $8000000000000000;
  6337. exit;
  6338. end
  6339. else if ( aExp < $3FFF ) then begin
  6340. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6341. result := 0;
  6342. exit;
  6343. end;
  6344. z := aSig shr ( - shiftCount );
  6345. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6346. set_inexact_flag;
  6347. end;
  6348. if ( aSign <> 0 ) then z := - z;
  6349. result := z;
  6350. end;
  6351. {*----------------------------------------------------------------------------
  6352. | The pattern for a default generated extended double-precision NaN. The
  6353. | `high' and `low' values hold the most- and least-significant bits,
  6354. | respectively.
  6355. *----------------------------------------------------------------------------*}
  6356. const
  6357. floatx80_default_nan_high = $FFFF;
  6358. floatx80_default_nan_low = bits64( $C000000000000000 );
  6359. {*----------------------------------------------------------------------------
  6360. | Returns 1 if the extended double-precision floating-point value `a' is a
  6361. | signaling NaN; otherwise returns 0.
  6362. *----------------------------------------------------------------------------*}
  6363. function floatx80_is_signaling_nan(a : floatx80): flag;
  6364. var
  6365. aLow: bits64;
  6366. begin
  6367. aLow := a.low and not $4000000000000000;
  6368. result := ord(
  6369. ( a.high and $7FFF = $7FFF )
  6370. and ( bits64( aLow shl 1 ) <> 0 )
  6371. and ( a.low = aLow ) );
  6372. end;
  6373. {*----------------------------------------------------------------------------
  6374. | Returns the result of converting the extended double-precision floating-
  6375. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6376. | invalid exception is raised.
  6377. *----------------------------------------------------------------------------*}
  6378. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6379. var
  6380. z: commonNaNT;
  6381. begin
  6382. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6383. z.sign := a.high shr 15;
  6384. z.low := 0;
  6385. z.high := a.low shl 1;
  6386. result := z;
  6387. end;
  6388. {*----------------------------------------------------------------------------
  6389. | Returns 1 if the extended double-precision floating-point value `a' is a
  6390. | NaN; otherwise returns 0.
  6391. *----------------------------------------------------------------------------*}
  6392. function floatx80_is_nan(a : floatx80 ): flag;
  6393. begin
  6394. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6395. end;
  6396. {*----------------------------------------------------------------------------
  6397. | Takes two extended double-precision floating-point values `a' and `b', one
  6398. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6399. | `b' is a signaling NaN, the invalid exception is raised.
  6400. *----------------------------------------------------------------------------*}
  6401. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6402. var
  6403. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6404. label
  6405. returnLargerSignificand;
  6406. begin
  6407. aIsNaN := floatx80_is_nan( a );
  6408. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6409. bIsNaN := floatx80_is_nan( b );
  6410. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6411. a.low := a.low or $C000000000000000;
  6412. b.low := b.low or $C000000000000000;
  6413. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6414. if aIsSignalingNaN <> 0 then begin
  6415. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6416. if bIsNaN <> 0 then result := b else result := a;
  6417. exit;
  6418. end
  6419. else if aIsNaN <>0 then begin
  6420. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6421. result := a;
  6422. exit;
  6423. end;
  6424. returnLargerSignificand:
  6425. if ( a.low < b.low ) then begin
  6426. result := b;
  6427. exit;
  6428. end;
  6429. if ( b.low < a.low ) then begin
  6430. result := a;
  6431. exit;
  6432. end;
  6433. if a.high < b.high then result := a else result := b;
  6434. exit;
  6435. end
  6436. else
  6437. result := b;
  6438. end;
  6439. {*----------------------------------------------------------------------------
  6440. | Returns the result of converting the extended double-precision floating-
  6441. | point value `a' to the single-precision floating-point format. The
  6442. | conversion is performed according to the IEC/IEEE Standard for Binary
  6443. | Floating-Point Arithmetic.
  6444. *----------------------------------------------------------------------------*}
  6445. function floatx80_to_float32(a: floatx80): float32;
  6446. var
  6447. aSign: flag;
  6448. aExp: int32;
  6449. aSig: bits64;
  6450. begin
  6451. aSig := extractFloatx80Frac( a );
  6452. aExp := extractFloatx80Exp( a );
  6453. aSign := extractFloatx80Sign( a );
  6454. if ( aExp = $7FFF ) then begin
  6455. if bits64( aSig shl 1 ) <> 0 then begin
  6456. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6457. exit;
  6458. end;
  6459. result := packFloat32( aSign, $FF, 0 );
  6460. exit;
  6461. end;
  6462. shift64RightJamming( aSig, 33, aSig );
  6463. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6464. result := roundAndPackFloat32( aSign, aExp, aSig );
  6465. end;
  6466. {*----------------------------------------------------------------------------
  6467. | Returns the result of converting the extended double-precision floating-
  6468. | point value `a' to the double-precision floating-point format. The
  6469. | conversion is performed according to the IEC/IEEE Standard for Binary
  6470. | Floating-Point Arithmetic.
  6471. *----------------------------------------------------------------------------*}
  6472. function floatx80_to_float64(a: floatx80): float64;
  6473. var
  6474. aSign: flag;
  6475. aExp: int32;
  6476. aSig, zSig: bits64;
  6477. begin
  6478. aSig := extractFloatx80Frac( a );
  6479. aExp := extractFloatx80Exp( a );
  6480. aSign := extractFloatx80Sign( a );
  6481. if ( aExp = $7FFF ) then begin
  6482. if bits64( aSig shl 1 ) <> 0 then begin
  6483. result:=commonNaNToFloat64(floatx80ToCommonNaN(a));
  6484. exit;
  6485. end;
  6486. result := packFloat64( aSign, $7FF, 0 );
  6487. exit;
  6488. end;
  6489. shift64RightJamming( aSig, 1, zSig );
  6490. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6491. result := roundAndPackFloat64( aSign, aExp, zSig );
  6492. end;
  6493. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6494. {*----------------------------------------------------------------------------
  6495. | Returns the result of converting the extended double-precision floating-
  6496. | point value `a' to the quadruple-precision floating-point format. The
  6497. | conversion is performed according to the IEC/IEEE Standard for Binary
  6498. | Floating-Point Arithmetic.
  6499. *----------------------------------------------------------------------------*}
  6500. function floatx80_to_float128(a: floatx80): float128;
  6501. var
  6502. aSign: flag;
  6503. aExp: int16;
  6504. aSig, zSig0, zSig1: bits64;
  6505. begin
  6506. aSig := extractFloatx80Frac( a );
  6507. aExp := extractFloatx80Exp( a );
  6508. aSign := extractFloatx80Sign( a );
  6509. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6510. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6511. exit;
  6512. end;
  6513. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6514. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6515. end;
  6516. {$endif FPC_SOFTFLOAT_FLOAT128}
  6517. {*----------------------------------------------------------------------------
  6518. | Rounds the extended double-precision floating-point value `a' to an integer,
  6519. | and Returns the result as an extended quadruple-precision floating-point
  6520. | value. The operation is performed according to the IEC/IEEE Standard for
  6521. | Binary Floating-Point Arithmetic.
  6522. *----------------------------------------------------------------------------*}
  6523. function floatx80_round_to_int(a: floatx80): floatx80;
  6524. var
  6525. aSign: flag;
  6526. aExp: int32;
  6527. lastBitMask, roundBitsMask: bits64;
  6528. roundingMode: TFPURoundingMode;
  6529. z: floatx80;
  6530. begin
  6531. aExp := extractFloatx80Exp( a );
  6532. if ( $403E <= aExp ) then begin
  6533. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6534. result := propagateFloatx80NaN( a, a );
  6535. exit;
  6536. end;
  6537. result := a;
  6538. exit;
  6539. end;
  6540. if ( aExp < $3FFF ) then begin
  6541. if ( ( aExp = 0 )
  6542. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6543. result := a;
  6544. exit;
  6545. end;
  6546. set_inexact_flag;
  6547. aSign := extractFloatx80Sign( a );
  6548. case softfloat_rounding_mode of
  6549. float_round_nearest_even:
  6550. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6551. ) then begin
  6552. result :=
  6553. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6554. exit;
  6555. end;
  6556. float_round_down: begin
  6557. if aSign <> 0 then
  6558. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6559. else
  6560. result := packFloatx80( 0, 0, 0 );
  6561. exit;
  6562. end;
  6563. float_round_up: begin
  6564. if aSign <> 0 then
  6565. result := packFloatx80( 1, 0, 0 )
  6566. else
  6567. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6568. exit;
  6569. end;
  6570. end;
  6571. result := packFloatx80( aSign, 0, 0 );
  6572. exit;
  6573. end;
  6574. lastBitMask := 1;
  6575. lastBitMask := lastBitMask shl ( $403E - aExp );
  6576. roundBitsMask := lastBitMask - 1;
  6577. z := a;
  6578. roundingMode := softfloat_rounding_mode;
  6579. if ( roundingMode = float_round_nearest_even ) then begin
  6580. inc( z.low, lastBitMask shr 1 );
  6581. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6582. end
  6583. else if ( roundingMode <> float_round_to_zero ) then begin
  6584. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6585. inc( z.low, roundBitsMask );
  6586. end;
  6587. end;
  6588. z.low := z.low and not roundBitsMask;
  6589. if ( z.low = 0 ) then begin
  6590. inc(z.high);
  6591. z.low := bits64( $8000000000000000 );
  6592. end;
  6593. if ( z.low <> a.low ) then set_inexact_flag;
  6594. result := z;
  6595. end;
  6596. {*----------------------------------------------------------------------------
  6597. | Returns the result of adding the absolute values of the extended double-
  6598. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6599. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6600. | The addition is performed according to the IEC/IEEE Standard for Binary
  6601. | Floating-Point Arithmetic.
  6602. *----------------------------------------------------------------------------*}
  6603. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6604. var
  6605. aExp, bExp, zExp: int32;
  6606. aSig, bSig, zSig0, zSig1: bits64;
  6607. expDiff: int32;
  6608. label
  6609. shiftRight1, roundAndPack;
  6610. begin
  6611. aSig := extractFloatx80Frac( a );
  6612. aExp := extractFloatx80Exp( a );
  6613. bSig := extractFloatx80Frac( b );
  6614. bExp := extractFloatx80Exp( b );
  6615. expDiff := aExp - bExp;
  6616. if ( 0 < expDiff ) then begin
  6617. if ( aExp = $7FFF ) then begin
  6618. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6619. result := propagateFloatx80NaN( a, b );
  6620. exit;
  6621. end;
  6622. result := a;
  6623. exit;
  6624. end;
  6625. if ( bExp = 0 ) then dec(expDiff);
  6626. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6627. zExp := aExp;
  6628. end
  6629. else if ( expDiff < 0 ) then begin
  6630. if ( bExp = $7FFF ) then begin
  6631. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6632. result := propagateFloatx80NaN( a, b );
  6633. exit;
  6634. end;
  6635. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6636. exit;
  6637. end;
  6638. if ( aExp = 0 ) then inc(expDiff);
  6639. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6640. zExp := bExp;
  6641. end
  6642. else begin
  6643. if ( aExp = $7FFF ) then begin
  6644. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6645. result := propagateFloatx80NaN( a, b );
  6646. exit;
  6647. end;
  6648. result := a;
  6649. exit;
  6650. end;
  6651. zSig1 := 0;
  6652. zSig0 := aSig + bSig;
  6653. if ( aExp = 0 ) then begin
  6654. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6655. goto roundAndPack;
  6656. end;
  6657. zExp := aExp;
  6658. goto shiftRight1;
  6659. end;
  6660. zSig0 := aSig + bSig;
  6661. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6662. shiftRight1:
  6663. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6664. zSig0 := zSig0 or $8000000000000000;
  6665. inc(zExp);
  6666. roundAndPack:
  6667. result :=
  6668. roundAndPackFloatx80(
  6669. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6670. end;
  6671. {*----------------------------------------------------------------------------
  6672. | Returns the result of subtracting the absolute values of the extended
  6673. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6674. | difference is negated before being returned. `zSign' is ignored if the
  6675. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6676. | Standard for Binary Floating-Point Arithmetic.
  6677. *----------------------------------------------------------------------------*}
  6678. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6679. var
  6680. aExp, bExp, zExp: int32;
  6681. aSig, bSig, zSig0, zSig1: bits64;
  6682. expDiff: int32;
  6683. z: floatx80;
  6684. label
  6685. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6686. begin
  6687. aSig := extractFloatx80Frac( a );
  6688. aExp := extractFloatx80Exp( a );
  6689. bSig := extractFloatx80Frac( b );
  6690. bExp := extractFloatx80Exp( b );
  6691. expDiff := aExp - bExp;
  6692. if ( 0 < expDiff ) then goto aExpBigger;
  6693. if ( expDiff < 0 ) then goto bExpBigger;
  6694. if ( aExp = $7FFF ) then begin
  6695. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6696. result := propagateFloatx80NaN( a, b );
  6697. exit;
  6698. end;
  6699. float_raise( float_flag_invalid );
  6700. z.low := floatx80_default_nan_low;
  6701. z.high := floatx80_default_nan_high;
  6702. result := z;
  6703. exit;
  6704. end;
  6705. if ( aExp = 0 ) then begin
  6706. aExp := 1;
  6707. bExp := 1;
  6708. end;
  6709. zSig1 := 0;
  6710. if ( bSig < aSig ) then goto aBigger;
  6711. if ( aSig < bSig ) then goto bBigger;
  6712. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6713. exit;
  6714. bExpBigger:
  6715. if ( bExp = $7FFF ) then begin
  6716. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6717. result := propagateFloatx80NaN( a, b );
  6718. exit;
  6719. end;
  6720. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6721. exit;
  6722. end;
  6723. if ( aExp = 0 ) then inc(expDiff);
  6724. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6725. bBigger:
  6726. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6727. zExp := bExp;
  6728. zSign := zSign xor 1;
  6729. goto normalizeRoundAndPack;
  6730. aExpBigger:
  6731. if ( aExp = $7FFF ) then begin
  6732. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6733. result := propagateFloatx80NaN( a, b );
  6734. exit;
  6735. end;
  6736. result := a;
  6737. exit;
  6738. end;
  6739. if ( bExp = 0 ) then dec(expDiff);
  6740. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6741. aBigger:
  6742. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6743. zExp := aExp;
  6744. normalizeRoundAndPack:
  6745. result :=
  6746. normalizeRoundAndPackFloatx80(
  6747. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6748. end;
  6749. {*----------------------------------------------------------------------------
  6750. | Returns the result of adding the extended double-precision floating-point
  6751. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6752. | Standard for Binary Floating-Point Arithmetic.
  6753. *----------------------------------------------------------------------------*}
  6754. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6755. var
  6756. aSign, bSign: flag;
  6757. begin
  6758. aSign := extractFloatx80Sign( a );
  6759. bSign := extractFloatx80Sign( b );
  6760. if ( aSign = bSign ) then begin
  6761. result := addFloatx80Sigs( a, b, aSign );
  6762. end
  6763. else begin
  6764. result := subFloatx80Sigs( a, b, aSign );
  6765. end;
  6766. end;
  6767. {*----------------------------------------------------------------------------
  6768. | Returns the result of subtracting the extended double-precision floating-
  6769. | point values `a' and `b'. The operation is performed according to the
  6770. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6771. *----------------------------------------------------------------------------*}
  6772. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6773. var
  6774. aSign, bSign: flag;
  6775. begin
  6776. aSign := extractFloatx80Sign( a );
  6777. bSign := extractFloatx80Sign( b );
  6778. if ( aSign = bSign ) then begin
  6779. result := subFloatx80Sigs( a, b, aSign );
  6780. end
  6781. else begin
  6782. result := addFloatx80Sigs( a, b, aSign );
  6783. end;
  6784. end;
  6785. {*----------------------------------------------------------------------------
  6786. | Returns the result of multiplying the extended double-precision floating-
  6787. | point values `a' and `b'. The operation is performed according to the
  6788. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6789. *----------------------------------------------------------------------------*}
  6790. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6791. var
  6792. aSign, bSign, zSign: flag;
  6793. aExp, bExp, zExp: int32;
  6794. aSig, bSig, zSig0, zSig1: bits64;
  6795. z: floatx80;
  6796. label
  6797. invalid;
  6798. begin
  6799. aSig := extractFloatx80Frac( a );
  6800. aExp := extractFloatx80Exp( a );
  6801. aSign := extractFloatx80Sign( a );
  6802. bSig := extractFloatx80Frac( b );
  6803. bExp := extractFloatx80Exp( b );
  6804. bSign := extractFloatx80Sign( b );
  6805. zSign := aSign xor bSign;
  6806. if ( aExp = $7FFF ) then begin
  6807. if ( bits64( aSig shl 1 ) <> 0 )
  6808. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6809. result := propagateFloatx80NaN( a, b );
  6810. exit;
  6811. end;
  6812. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6813. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6814. exit;
  6815. end;
  6816. if ( bExp = $7FFF ) then begin
  6817. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6818. result := propagateFloatx80NaN( a, b );
  6819. exit;
  6820. end;
  6821. if ( ( aExp or aSig ) = 0 ) then begin
  6822. invalid:
  6823. float_raise( float_flag_invalid );
  6824. z.low := floatx80_default_nan_low;
  6825. z.high := floatx80_default_nan_high;
  6826. result := z;
  6827. exit;
  6828. end;
  6829. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6830. exit;
  6831. end;
  6832. if ( aExp = 0 ) then begin
  6833. if ( aSig = 0 ) then begin
  6834. result := packFloatx80( zSign, 0, 0 );
  6835. exit;
  6836. end;
  6837. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6838. end;
  6839. if ( bExp = 0 ) then begin
  6840. if ( bSig = 0 ) then begin
  6841. result := packFloatx80( zSign, 0, 0 );
  6842. exit;
  6843. end;
  6844. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6845. end;
  6846. zExp := aExp + bExp - $3FFE;
  6847. mul64To128( aSig, bSig, zSig0, zSig1 );
  6848. if 0 < sbits64( zSig0 ) then begin
  6849. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6850. dec(zExp);
  6851. end;
  6852. result :=
  6853. roundAndPackFloatx80(
  6854. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6855. end;
  6856. {*----------------------------------------------------------------------------
  6857. | Returns the result of dividing the extended double-precision floating-point
  6858. | value `a' by the corresponding value `b'. The operation is performed
  6859. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6860. *----------------------------------------------------------------------------*}
  6861. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6862. var
  6863. aSign, bSign, zSign: flag;
  6864. aExp, bExp, zExp: int32;
  6865. aSig, bSig, zSig0, zSig1: bits64;
  6866. rem0, rem1, rem2, term0, term1, term2: bits64;
  6867. z: floatx80;
  6868. label
  6869. invalid;
  6870. begin
  6871. aSig := extractFloatx80Frac( a );
  6872. aExp := extractFloatx80Exp( a );
  6873. aSign := extractFloatx80Sign( a );
  6874. bSig := extractFloatx80Frac( b );
  6875. bExp := extractFloatx80Exp( b );
  6876. bSign := extractFloatx80Sign( b );
  6877. zSign := aSign xor bSign;
  6878. if ( aExp = $7FFF ) then begin
  6879. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6880. result := propagateFloatx80NaN( a, b );
  6881. exit;
  6882. end;
  6883. if ( bExp = $7FFF ) then begin
  6884. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6885. result := propagateFloatx80NaN( a, b );
  6886. exit;
  6887. end;
  6888. goto invalid;
  6889. end;
  6890. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6891. exit;
  6892. end;
  6893. if ( bExp = $7FFF ) then begin
  6894. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6895. result := propagateFloatx80NaN( a, b );
  6896. exit;
  6897. end;
  6898. result := packFloatx80( zSign, 0, 0 );
  6899. exit;
  6900. end;
  6901. if ( bExp = 0 ) then begin
  6902. if ( bSig = 0 ) then begin
  6903. if ( ( aExp or aSig ) = 0 ) then begin
  6904. invalid:
  6905. float_raise( float_flag_invalid );
  6906. z.low := floatx80_default_nan_low;
  6907. z.high := floatx80_default_nan_high;
  6908. result := z;
  6909. exit;
  6910. end;
  6911. float_raise( float_flag_divbyzero );
  6912. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6913. exit;
  6914. end;
  6915. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6916. end;
  6917. if ( aExp = 0 ) then begin
  6918. if ( aSig = 0 ) then begin
  6919. result := packFloatx80( zSign, 0, 0 );
  6920. exit;
  6921. end;
  6922. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6923. end;
  6924. zExp := aExp - bExp + $3FFE;
  6925. rem1 := 0;
  6926. if ( bSig <= aSig ) then begin
  6927. shift128Right( aSig, 0, 1, aSig, rem1 );
  6928. inc(zExp);
  6929. end;
  6930. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6931. mul64To128( bSig, zSig0, term0, term1 );
  6932. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6933. while ( sbits64( rem0 ) < 0 ) do begin
  6934. dec(zSig0);
  6935. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6936. end;
  6937. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6938. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6939. mul64To128( bSig, zSig1, term1, term2 );
  6940. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6941. while ( sbits64( rem1 ) < 0 ) do begin
  6942. dec(zSig1);
  6943. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6944. end;
  6945. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6946. end;
  6947. result :=
  6948. roundAndPackFloatx80(
  6949. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6950. end;
  6951. {*----------------------------------------------------------------------------
  6952. | Returns the remainder of the extended double-precision floating-point value
  6953. | `a' with respect to the corresponding value `b'. The operation is performed
  6954. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6955. *----------------------------------------------------------------------------*}
  6956. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6957. var
  6958. aSign, zSign: flag;
  6959. aExp, bExp, expDiff: int32;
  6960. aSig0, aSig1, bSig: bits64;
  6961. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6962. z: floatx80;
  6963. label
  6964. invalid;
  6965. begin
  6966. aSig0 := extractFloatx80Frac( a );
  6967. aExp := extractFloatx80Exp( a );
  6968. aSign := extractFloatx80Sign( a );
  6969. bSig := extractFloatx80Frac( b );
  6970. bExp := extractFloatx80Exp( b );
  6971. if ( aExp = $7FFF ) then begin
  6972. if ( bits64( aSig0 shl 1 ) <> 0 )
  6973. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6974. result := propagateFloatx80NaN( a, b );
  6975. exit;
  6976. end;
  6977. goto invalid;
  6978. end;
  6979. if ( bExp = $7FFF ) then begin
  6980. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6981. result := propagateFloatx80NaN( a, b );
  6982. exit;
  6983. end;
  6984. result := a;
  6985. exit;
  6986. end;
  6987. if ( bExp = 0 ) then begin
  6988. if ( bSig = 0 ) then begin
  6989. invalid:
  6990. float_raise( float_flag_invalid );
  6991. z.low := floatx80_default_nan_low;
  6992. z.high := floatx80_default_nan_high;
  6993. result := z;
  6994. exit;
  6995. end;
  6996. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6997. end;
  6998. if ( aExp = 0 ) then begin
  6999. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  7000. result := a;
  7001. exit;
  7002. end;
  7003. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7004. end;
  7005. bSig := bSig or $8000000000000000;
  7006. zSign := aSign;
  7007. expDiff := aExp - bExp;
  7008. aSig1 := 0;
  7009. if ( expDiff < 0 ) then begin
  7010. if ( expDiff < -1 ) then begin
  7011. result := a;
  7012. exit;
  7013. end;
  7014. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  7015. expDiff := 0;
  7016. end;
  7017. q := ord( bSig <= aSig0 );
  7018. if ( q <> 0 ) then dec( aSig0, bSig );
  7019. dec( expDiff, 64 );
  7020. while ( 0 < expDiff ) do begin
  7021. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7022. if ( 2 < q ) then q := q - 2 else q := 0;
  7023. mul64To128( bSig, q, term0, term1 );
  7024. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7025. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  7026. dec( expDiff, 62 );
  7027. end;
  7028. inc( expDiff, 64 );
  7029. if ( 0 < expDiff ) then begin
  7030. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7031. if ( 2 < q ) then q:= q - 2 else q := 0;
  7032. q := q shr ( 64 - expDiff );
  7033. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  7034. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7035. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  7036. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  7037. inc(q);
  7038. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7039. end;
  7040. end
  7041. else begin
  7042. term1 := 0;
  7043. term0 := bSig;
  7044. end;
  7045. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  7046. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7047. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7048. and ( q and 1 <> 0 ) )
  7049. then begin
  7050. aSig0 := alternateASig0;
  7051. aSig1 := alternateASig1;
  7052. zSign := ord( zSign = 0 );
  7053. end;
  7054. result :=
  7055. normalizeRoundAndPackFloatx80(
  7056. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  7057. end;
  7058. {*----------------------------------------------------------------------------
  7059. | Returns the square root of the extended double-precision floating-point
  7060. | value `a'. The operation is performed according to the IEC/IEEE Standard
  7061. | for Binary Floating-Point Arithmetic.
  7062. *----------------------------------------------------------------------------*}
  7063. function floatx80_sqrt(a: floatx80): floatx80;
  7064. var
  7065. aSign: flag;
  7066. aExp, zExp: int32;
  7067. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7068. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7069. z: floatx80;
  7070. label
  7071. invalid;
  7072. begin
  7073. aSig0 := extractFloatx80Frac( a );
  7074. aExp := extractFloatx80Exp( a );
  7075. aSign := extractFloatx80Sign( a );
  7076. if ( aExp = $7FFF ) then begin
  7077. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7078. result := propagateFloatx80NaN( a, a );
  7079. exit;
  7080. end;
  7081. if ( aSign = 0 ) then begin
  7082. result := a;
  7083. exit;
  7084. end;
  7085. goto invalid;
  7086. end;
  7087. if ( aSign <> 0 ) then begin
  7088. if ( ( aExp or aSig0 ) = 0 ) then begin
  7089. result := a;
  7090. exit;
  7091. end;
  7092. invalid:
  7093. float_raise( float_flag_invalid );
  7094. z.low := floatx80_default_nan_low;
  7095. z.high := floatx80_default_nan_high;
  7096. result := z;
  7097. exit;
  7098. end;
  7099. if ( aExp = 0 ) then begin
  7100. if ( aSig0 = 0 ) then begin
  7101. result := packFloatx80( 0, 0, 0 );
  7102. exit;
  7103. end;
  7104. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7105. end;
  7106. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  7107. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  7108. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7109. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7110. doubleZSig0 := zSig0 shl 1;
  7111. mul64To128( zSig0, zSig0, term0, term1 );
  7112. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7113. while ( sbits64( rem0 ) < 0 ) do begin
  7114. dec(zSig0);
  7115. dec( doubleZSig0, 2 );
  7116. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7117. end;
  7118. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7119. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7120. if ( zSig1 = 0 ) then zSig1 := 1;
  7121. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7122. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7123. mul64To128( zSig1, zSig1, term2, term3 );
  7124. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7125. while ( sbits64( rem1 ) < 0 ) do begin
  7126. dec(zSig1);
  7127. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7128. term3 := term3 or 1;
  7129. term2 := term2 or doubleZSig0;
  7130. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7131. end;
  7132. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7133. end;
  7134. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7135. zSig0 := zSig0 or doubleZSig0;
  7136. result :=
  7137. roundAndPackFloatx80(
  7138. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7139. end;
  7140. {*----------------------------------------------------------------------------
  7141. | Returns 1 if the extended double-precision floating-point value `a' is
  7142. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7143. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7144. | Arithmetic.
  7145. *----------------------------------------------------------------------------*}
  7146. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7147. begin
  7148. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7149. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7150. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7151. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7152. ) then begin
  7153. if ( floatx80_is_signaling_nan( a )
  7154. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7155. float_raise( float_flag_invalid );
  7156. end;
  7157. result := 0;
  7158. exit;
  7159. end;
  7160. result := ord(
  7161. ( a.low = b.low )
  7162. and ( ( a.high = b.high )
  7163. or ( ( a.low = 0 )
  7164. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7165. ) );
  7166. end;
  7167. {*----------------------------------------------------------------------------
  7168. | Returns 1 if the extended double-precision floating-point value `a' is
  7169. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7170. | comparison is performed according to the IEC/IEEE Standard for Binary
  7171. | Floating-Point Arithmetic.
  7172. *----------------------------------------------------------------------------*}
  7173. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7174. var
  7175. aSign, bSign: flag;
  7176. begin
  7177. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7178. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7179. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7180. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7181. then begin
  7182. float_raise( float_flag_invalid );
  7183. result := 0;
  7184. exit;
  7185. end;
  7186. aSign := extractFloatx80Sign( a );
  7187. bSign := extractFloatx80Sign( b );
  7188. if ( aSign <> bSign ) then begin
  7189. result := ord(
  7190. ( aSign <> 0 )
  7191. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7192. exit;
  7193. end;
  7194. if aSign<>0 then
  7195. result := le128( b.high, b.low, a.high, a.low )
  7196. else
  7197. result := le128( a.high, a.low, b.high, b.low );
  7198. end;
  7199. {*----------------------------------------------------------------------------
  7200. | Returns 1 if the extended double-precision floating-point value `a' is
  7201. | less than the corresponding value `b', and 0 otherwise. The comparison
  7202. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7203. | Arithmetic.
  7204. *----------------------------------------------------------------------------*}
  7205. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7206. var
  7207. aSign, bSign: flag;
  7208. begin
  7209. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7210. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7211. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7212. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7213. then begin
  7214. float_raise( float_flag_invalid );
  7215. result := 0;
  7216. exit;
  7217. end;
  7218. aSign := extractFloatx80Sign( a );
  7219. bSign := extractFloatx80Sign( b );
  7220. if ( aSign <> bSign ) then begin
  7221. result := ord(
  7222. ( aSign <> 0 )
  7223. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7224. exit;
  7225. end;
  7226. if aSign <> 0 then
  7227. result := lt128( b.high, b.low, a.high, a.low )
  7228. else
  7229. result := lt128( a.high, a.low, b.high, b.low );
  7230. end;
  7231. {*----------------------------------------------------------------------------
  7232. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7233. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7234. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7235. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7236. *----------------------------------------------------------------------------*}
  7237. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7238. begin
  7239. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7240. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7241. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7242. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7243. then begin
  7244. float_raise( float_flag_invalid );
  7245. result := 0;
  7246. exit;
  7247. end;
  7248. result := ord(
  7249. ( a.low = b.low )
  7250. and ( ( a.high = b.high )
  7251. or ( ( a.low = 0 )
  7252. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7253. ) );
  7254. end;
  7255. {*----------------------------------------------------------------------------
  7256. | Returns 1 if the extended double-precision floating-point value `a' is less
  7257. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7258. | do not cause an exception. Otherwise, the comparison is performed according
  7259. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7260. *----------------------------------------------------------------------------*}
  7261. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7262. var
  7263. aSign, bSign: flag;
  7264. begin
  7265. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7266. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7267. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7268. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7269. then begin
  7270. if ( floatx80_is_signaling_nan( a )
  7271. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7272. float_raise( float_flag_invalid );
  7273. end;
  7274. result := 0;
  7275. exit;
  7276. end;
  7277. aSign := extractFloatx80Sign( a );
  7278. bSign := extractFloatx80Sign( b );
  7279. if ( aSign <> bSign ) then begin
  7280. result := ord(
  7281. ( aSign <> 0 )
  7282. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7283. exit;
  7284. end;
  7285. if aSign <> 0 then
  7286. result := le128( b.high, b.low, a.high, a.low )
  7287. else
  7288. result := le128( a.high, a.low, b.high, b.low );
  7289. end;
  7290. {*----------------------------------------------------------------------------
  7291. | Returns 1 if the extended double-precision floating-point value `a' is less
  7292. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7293. | an exception. Otherwise, the comparison is performed according to the
  7294. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7295. *----------------------------------------------------------------------------*}
  7296. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7297. var
  7298. aSign, bSign: flag;
  7299. begin
  7300. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7301. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7302. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7303. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7304. then begin
  7305. if ( floatx80_is_signaling_nan( a )
  7306. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7307. float_raise( float_flag_invalid );
  7308. end;
  7309. result := 0;
  7310. exit;
  7311. end;
  7312. aSign := extractFloatx80Sign( a );
  7313. bSign := extractFloatx80Sign( b );
  7314. if ( aSign <> bSign ) then begin
  7315. result := ord(
  7316. ( aSign <> 0 )
  7317. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7318. exit;
  7319. end;
  7320. if aSign <> 0 then
  7321. result := lt128( b.high, b.low, a.high, a.low )
  7322. else
  7323. result := lt128( a.high, a.low, b.high, b.low );
  7324. end;
  7325. {$endif FPC_SOFTFLOAT_FLOATX80}
  7326. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7327. {*----------------------------------------------------------------------------
  7328. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7329. | floating-point value `a'.
  7330. *----------------------------------------------------------------------------*}
  7331. function extractFloat128Frac1(a : float128): bits64;
  7332. begin
  7333. result:=a.low;
  7334. end;
  7335. {*----------------------------------------------------------------------------
  7336. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7337. | floating-point value `a'.
  7338. *----------------------------------------------------------------------------*}
  7339. function extractFloat128Frac0(a : float128): bits64;
  7340. begin
  7341. result:=a.high and int64($0000FFFFFFFFFFFF);
  7342. end;
  7343. {*----------------------------------------------------------------------------
  7344. | Returns the exponent bits of the quadruple-precision floating-point value
  7345. | `a'.
  7346. *----------------------------------------------------------------------------*}
  7347. function extractFloat128Exp(a : float128): int32;
  7348. begin
  7349. result:=( a.high shr 48 ) and $7FFF;
  7350. end;
  7351. {*----------------------------------------------------------------------------
  7352. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7353. *----------------------------------------------------------------------------*}
  7354. function extractFloat128Sign(a : float128): flag;
  7355. begin
  7356. result:=a.high shr 63;
  7357. end;
  7358. {*----------------------------------------------------------------------------
  7359. | Normalizes the subnormal quadruple-precision floating-point value
  7360. | represented by the denormalized significand formed by the concatenation of
  7361. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7362. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7363. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7364. | least significant 64 bits of the normalized significand are stored at the
  7365. | location pointed to by `zSig1Ptr'.
  7366. *----------------------------------------------------------------------------*}
  7367. procedure normalizeFloat128Subnormal(
  7368. aSig0: bits64;
  7369. aSig1: bits64;
  7370. var zExpPtr: int32;
  7371. var zSig0Ptr: bits64;
  7372. var zSig1Ptr: bits64);
  7373. var
  7374. shiftCount: int8;
  7375. begin
  7376. if ( aSig0 = 0 ) then
  7377. begin
  7378. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7379. if ( shiftCount < 0 ) then
  7380. begin
  7381. zSig0Ptr := aSig1 shr ( - shiftCount );
  7382. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7383. end
  7384. else begin
  7385. zSig0Ptr := aSig1 shl shiftCount;
  7386. zSig1Ptr := 0;
  7387. end;
  7388. zExpPtr := - shiftCount - 63;
  7389. end
  7390. else begin
  7391. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7392. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7393. zExpPtr := 1 - shiftCount;
  7394. end;
  7395. end;
  7396. {*----------------------------------------------------------------------------
  7397. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7398. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7399. | floating-point value, returning the result. After being shifted into the
  7400. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7401. | added together to form the most significant 32 bits of the result. This
  7402. | means that any integer portion of `zSig0' will be added into the exponent.
  7403. | Since a properly normalized significand will have an integer portion equal
  7404. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7405. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7406. | significand.
  7407. *----------------------------------------------------------------------------*}
  7408. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7409. var
  7410. z: float128;
  7411. begin
  7412. z.low := zSig1;
  7413. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7414. result:=z;
  7415. end;
  7416. {*----------------------------------------------------------------------------
  7417. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7418. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7419. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7420. | corresponding to the abstract input. Ordinarily, the abstract value is
  7421. | simply rounded and packed into the quadruple-precision format, with the
  7422. | inexact exception raised if the abstract input cannot be represented
  7423. | exactly. However, if the abstract value is too large, the overflow and
  7424. | inexact exceptions are raised and an infinity or maximal finite value is
  7425. | returned. If the abstract value is too small, the input value is rounded to
  7426. | a subnormal number, and the underflow and inexact exceptions are raised if
  7427. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7428. | precision floating-point number.
  7429. | The input significand must be normalized or smaller. If the input
  7430. | significand is not normalized, `zExp' must be 0; in that case, the result
  7431. | returned is a subnormal number, and it must not require rounding. In the
  7432. | usual case that the input significand is normalized, `zExp' must be 1 less
  7433. | than the ``true'' floating-point exponent. The handling of underflow and
  7434. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7435. *----------------------------------------------------------------------------*}
  7436. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7437. var
  7438. roundingMode: TFPURoundingMode;
  7439. roundNearestEven, increment, isTiny: flag;
  7440. begin
  7441. roundingMode := softfloat_rounding_mode;
  7442. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7443. increment := ord( sbits64(zSig2) < 0 );
  7444. if ( roundNearestEven=0 ) then
  7445. begin
  7446. if ( roundingMode = float_round_to_zero ) then
  7447. begin
  7448. increment := 0;
  7449. end
  7450. else begin
  7451. if ( zSign<>0 ) then
  7452. begin
  7453. increment := ord( roundingMode = float_round_down ) and zSig2;
  7454. end
  7455. else begin
  7456. increment := ord( roundingMode = float_round_up ) and zSig2;
  7457. end;
  7458. end;
  7459. end;
  7460. if ( $7FFD <= bits32(zExp) ) then
  7461. begin
  7462. if ( ord( $7FFD < zExp )
  7463. or ( ord( zExp = $7FFD )
  7464. and eq128(
  7465. int64( $0001FFFFFFFFFFFF ),
  7466. bits64( $FFFFFFFFFFFFFFFF ),
  7467. zSig0,
  7468. zSig1
  7469. )
  7470. and increment
  7471. )
  7472. )<>0 then
  7473. begin
  7474. float_raise( [float_flag_overflow,float_flag_inexact] );
  7475. if ( ord( roundingMode = float_round_to_zero )
  7476. or ( zSign and ord( roundingMode = float_round_up ) )
  7477. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7478. )<>0 then
  7479. begin
  7480. result :=
  7481. packFloat128(
  7482. zSign,
  7483. $7FFE,
  7484. int64( $0000FFFFFFFFFFFF ),
  7485. bits64( $FFFFFFFFFFFFFFFF )
  7486. );
  7487. exit;
  7488. end;
  7489. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7490. exit;
  7491. end;
  7492. if ( zExp < 0 ) then
  7493. begin
  7494. isTiny :=
  7495. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7496. or ( zExp < -1 )
  7497. or not( increment<>0 )
  7498. or boolean(lt128(
  7499. zSig0,
  7500. zSig1,
  7501. int64( $0001FFFFFFFFFFFF ),
  7502. bits64( $FFFFFFFFFFFFFFFF )
  7503. )));
  7504. shift128ExtraRightJamming(
  7505. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7506. zExp := 0;
  7507. if ( isTiny and zSig2 )<>0 then
  7508. float_raise( float_flag_underflow );
  7509. if ( roundNearestEven<>0 ) then
  7510. begin
  7511. increment := ord( sbits64(zSig2) < 0 );
  7512. end
  7513. else begin
  7514. if ( zSign<>0 ) then
  7515. begin
  7516. increment := ord( roundingMode = float_round_down ) and zSig2;
  7517. end
  7518. else begin
  7519. increment := ord( roundingMode = float_round_up ) and zSig2;
  7520. end;
  7521. end;
  7522. end;
  7523. end;
  7524. if ( zSig2<>0 ) then
  7525. set_inexact_flag;
  7526. if ( increment<>0 ) then
  7527. begin
  7528. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7529. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7530. end
  7531. else begin
  7532. if ( ( zSig0 or zSig1 ) = 0 ) then
  7533. zExp := 0;
  7534. end;
  7535. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7536. end;
  7537. {*----------------------------------------------------------------------------
  7538. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7539. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7540. | returns the proper quadruple-precision floating-point value corresponding
  7541. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7542. | except that the input significand has fewer bits and does not have to be
  7543. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7544. | point exponent.
  7545. *----------------------------------------------------------------------------*}
  7546. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7547. var
  7548. shiftCount: int8;
  7549. zSig2: bits64;
  7550. begin
  7551. if ( zSig0 = 0 ) then
  7552. begin
  7553. zSig0 := zSig1;
  7554. zSig1 := 0;
  7555. dec(zExp, 64);
  7556. end;
  7557. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7558. if ( 0 <= shiftCount ) then
  7559. begin
  7560. zSig2 := 0;
  7561. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7562. end
  7563. else begin
  7564. shift128ExtraRightJamming(
  7565. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7566. end;
  7567. dec(zExp, shiftCount);
  7568. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7569. end;
  7570. {*----------------------------------------------------------------------------
  7571. | Returns the result of converting the quadruple-precision floating-point
  7572. | value `a' to the 32-bit two's complement integer format. The conversion
  7573. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7574. | Arithmetic---which means in particular that the conversion is rounded
  7575. | according to the current rounding mode. If `a' is a NaN, the largest
  7576. | positive integer is returned. Otherwise, if the conversion overflows, the
  7577. | largest integer with the same sign as `a' is returned.
  7578. *----------------------------------------------------------------------------*}
  7579. function float128_to_int32(a: float128): int32;
  7580. var
  7581. aSign: flag;
  7582. aExp, shiftCount: int32;
  7583. aSig0, aSig1: bits64;
  7584. begin
  7585. aSig1 := extractFloat128Frac1( a );
  7586. aSig0 := extractFloat128Frac0( a );
  7587. aExp := extractFloat128Exp( a );
  7588. aSign := extractFloat128Sign( a );
  7589. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7590. aSign := 0;
  7591. if ( aExp<>0 ) then
  7592. aSig0 := aSig0 or int64( $0001000000000000 );
  7593. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7594. shiftCount := $4028 - aExp;
  7595. if ( 0 < shiftCount ) then
  7596. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7597. result := roundAndPackInt32( aSign, aSig0 );
  7598. end;
  7599. {*----------------------------------------------------------------------------
  7600. | Returns the result of converting the quadruple-precision floating-point
  7601. | value `a' to the 32-bit two's complement integer format. The conversion
  7602. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7603. | Arithmetic, except that the conversion is always rounded toward zero. If
  7604. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7605. | conversion overflows, the largest integer with the same sign as `a' is
  7606. | returned.
  7607. *----------------------------------------------------------------------------*}
  7608. function float128_to_int32_round_to_zero(a: float128): int32;
  7609. var
  7610. aSign: flag;
  7611. aExp, shiftCount: int32;
  7612. aSig0, aSig1, savedASig: bits64;
  7613. z: int32;
  7614. label
  7615. invalid;
  7616. begin
  7617. aSig1 := extractFloat128Frac1( a );
  7618. aSig0 := extractFloat128Frac0( a );
  7619. aExp := extractFloat128Exp( a );
  7620. aSign := extractFloat128Sign( a );
  7621. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7622. if ( $401E < aExp ) then
  7623. begin
  7624. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7625. aSign := 0;
  7626. goto invalid;
  7627. end
  7628. else if ( aExp < $3FFF ) then
  7629. begin
  7630. if ( aExp or aSig0 )<>0 then
  7631. set_inexact_flag;
  7632. result := 0;
  7633. exit;
  7634. end;
  7635. aSig0 := aSig0 or int64( $0001000000000000 );
  7636. shiftCount := $402F - aExp;
  7637. savedASig := aSig0;
  7638. aSig0 := aSig0 shr shiftCount;
  7639. z := aSig0;
  7640. if ( aSign )<>0 then
  7641. z := - z;
  7642. if ( ord( z < 0 ) xor aSign )<>0 then
  7643. begin
  7644. invalid:
  7645. float_raise( float_flag_invalid );
  7646. if aSign<>0 then
  7647. result:= int32( $80000000 )
  7648. else
  7649. result:=$7FFFFFFF;
  7650. exit;
  7651. end;
  7652. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7653. begin
  7654. set_inexact_flag;
  7655. end;
  7656. result := z;
  7657. end;
  7658. {*----------------------------------------------------------------------------
  7659. | Returns the result of converting the quadruple-precision floating-point
  7660. | value `a' to the 64-bit two's complement integer format. The conversion
  7661. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7662. | Arithmetic---which means in particular that the conversion is rounded
  7663. | according to the current rounding mode. If `a' is a NaN, the largest
  7664. | positive integer is returned. Otherwise, if the conversion overflows, the
  7665. | largest integer with the same sign as `a' is returned.
  7666. *----------------------------------------------------------------------------*}
  7667. function float128_to_int64(a: float128): int64;
  7668. var
  7669. aSign: flag;
  7670. aExp, shiftCount: int32;
  7671. aSig0, aSig1: bits64;
  7672. begin
  7673. aSig1 := extractFloat128Frac1( a );
  7674. aSig0 := extractFloat128Frac0( a );
  7675. aExp := extractFloat128Exp( a );
  7676. aSign := extractFloat128Sign( a );
  7677. if ( aExp<>0 ) then
  7678. aSig0 := aSig0 or int64( $0001000000000000 );
  7679. shiftCount := $402F - aExp;
  7680. if ( shiftCount <= 0 ) then
  7681. begin
  7682. if ( $403E < aExp ) then
  7683. begin
  7684. float_raise( float_flag_invalid );
  7685. if ( (aSign=0)
  7686. or ( ( aExp = $7FFF )
  7687. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7688. )
  7689. ) then
  7690. begin
  7691. result := int64( $7FFFFFFFFFFFFFFF );
  7692. exit;
  7693. end;
  7694. result := int64( $8000000000000000 );
  7695. exit;
  7696. end;
  7697. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7698. end
  7699. else begin
  7700. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7701. end;
  7702. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7703. end;
  7704. {*----------------------------------------------------------------------------
  7705. | Returns the result of converting the quadruple-precision floating-point
  7706. | value `a' to the 64-bit two's complement integer format. The conversion
  7707. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7708. | Arithmetic, except that the conversion is always rounded toward zero.
  7709. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7710. | the conversion overflows, the largest integer with the same sign as `a' is
  7711. | returned.
  7712. *----------------------------------------------------------------------------*}
  7713. function float128_to_int64_round_to_zero(a: float128): int64;
  7714. var
  7715. aSign: flag;
  7716. aExp, shiftCount: int32;
  7717. aSig0, aSig1: bits64;
  7718. z: int64;
  7719. begin
  7720. aSig1 := extractFloat128Frac1( a );
  7721. aSig0 := extractFloat128Frac0( a );
  7722. aExp := extractFloat128Exp( a );
  7723. aSign := extractFloat128Sign( a );
  7724. if ( aExp<>0 ) then
  7725. aSig0 := aSig0 or int64( $0001000000000000 );
  7726. shiftCount := aExp - $402F;
  7727. if ( 0 < shiftCount ) then
  7728. begin
  7729. if ( $403E <= aExp ) then
  7730. begin
  7731. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7732. if ( ( a.high = bits64( $C03E000000000000 ) )
  7733. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7734. begin
  7735. if ( aSig1<>0 ) then
  7736. set_inexact_flag;
  7737. end
  7738. else begin
  7739. float_raise( float_flag_invalid );
  7740. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7741. begin
  7742. result := int64( $7FFFFFFFFFFFFFFF );
  7743. exit;
  7744. end;
  7745. end;
  7746. result := int64( $8000000000000000 );
  7747. exit;
  7748. end;
  7749. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7750. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7751. begin
  7752. set_inexact_flag;
  7753. end;
  7754. end
  7755. else begin
  7756. if ( aExp < $3FFF ) then
  7757. begin
  7758. if ( aExp or aSig0 or aSig1 )<>0 then
  7759. begin
  7760. set_inexact_flag;
  7761. end;
  7762. result := 0;
  7763. exit;
  7764. end;
  7765. z := aSig0 shr ( - shiftCount );
  7766. if ( (aSig1<>0)
  7767. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7768. begin
  7769. set_inexact_flag;
  7770. end;
  7771. end;
  7772. if ( aSign<>0 ) then
  7773. z := - z;
  7774. result := z;
  7775. end;
  7776. {*----------------------------------------------------------------------------
  7777. | Returns the result of converting the quadruple-precision floating-point
  7778. | value `a' to the single-precision floating-point format. The conversion
  7779. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7780. | Arithmetic.
  7781. *----------------------------------------------------------------------------*}
  7782. function float128_to_float32(a: float128): float32;
  7783. var
  7784. aSign: flag;
  7785. aExp: int32;
  7786. aSig0, aSig1: bits64;
  7787. zSig: bits32;
  7788. begin
  7789. aSig1 := extractFloat128Frac1( a );
  7790. aSig0 := extractFloat128Frac0( a );
  7791. aExp := extractFloat128Exp( a );
  7792. aSign := extractFloat128Sign( a );
  7793. if ( aExp = $7FFF ) then
  7794. begin
  7795. if ( aSig0 or aSig1 )<>0 then
  7796. begin
  7797. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7798. exit;
  7799. end;
  7800. result := packFloat32( aSign, $FF, 0 );
  7801. exit;
  7802. end;
  7803. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7804. shift64RightJamming( aSig0, 18, aSig0 );
  7805. zSig := aSig0;
  7806. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7807. begin
  7808. zSig := zSig or $40000000;
  7809. dec(aExp,$3F81);
  7810. end;
  7811. result := roundAndPackFloat32( aSign, aExp, zSig );
  7812. end;
  7813. {*----------------------------------------------------------------------------
  7814. | Returns the result of converting the quadruple-precision floating-point
  7815. | value `a' to the double-precision floating-point format. The conversion
  7816. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7817. | Arithmetic.
  7818. *----------------------------------------------------------------------------*}
  7819. function float128_to_float64(a: float128): float64;
  7820. var
  7821. aSign: flag;
  7822. aExp: int32;
  7823. aSig0, aSig1: bits64;
  7824. begin
  7825. aSig1 := extractFloat128Frac1( a );
  7826. aSig0 := extractFloat128Frac0( a );
  7827. aExp := extractFloat128Exp( a );
  7828. aSign := extractFloat128Sign( a );
  7829. if ( aExp = $7FFF ) then
  7830. begin
  7831. if ( aSig0 or aSig1 )<>0 then
  7832. begin
  7833. result:=commonNaNToFloat64(float128ToCommonNaN(a));
  7834. exit;
  7835. end;
  7836. result:=packFloat64( aSign, $7FF, 0);
  7837. exit;
  7838. end;
  7839. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7840. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7841. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7842. begin
  7843. aSig0 := aSig0 or int64( $4000000000000000 );
  7844. dec(aExp,$3C01);
  7845. end;
  7846. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7847. end;
  7848. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7849. {*----------------------------------------------------------------------------
  7850. | Returns the result of converting the quadruple-precision floating-point
  7851. | value `a' to the extended double-precision floating-point format. The
  7852. | conversion is performed according to the IEC/IEEE Standard for Binary
  7853. | Floating-Point Arithmetic.
  7854. *----------------------------------------------------------------------------*}
  7855. function float128_to_floatx80(a: float128): floatx80;
  7856. var
  7857. aSign: flag;
  7858. aExp: int32;
  7859. aSig0, aSig1: bits64;
  7860. begin
  7861. aSig1 := extractFloat128Frac1( a );
  7862. aSig0 := extractFloat128Frac0( a );
  7863. aExp := extractFloat128Exp( a );
  7864. aSign := extractFloat128Sign( a );
  7865. if ( aExp = $7FFF ) then begin
  7866. if ( aSig0 or aSig1 <> 0 ) then begin
  7867. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7868. exit;
  7869. end;
  7870. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7871. exit;
  7872. end;
  7873. if ( aExp = 0 ) then begin
  7874. if ( ( aSig0 or aSig1 ) = 0 ) then
  7875. begin
  7876. result := packFloatx80( aSign, 0, 0 );
  7877. exit;
  7878. end;
  7879. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7880. end
  7881. else begin
  7882. aSig0 := aSig0 or int64( $0001000000000000 );
  7883. end;
  7884. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7885. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7886. end;
  7887. {$endif FPC_SOFTFLOAT_FLOATX80}
  7888. {*----------------------------------------------------------------------------
  7889. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7890. | Returns the result as a quadruple-precision floating-point value. The
  7891. | operation is performed according to the IEC/IEEE Standard for Binary
  7892. | Floating-Point Arithmetic.
  7893. *----------------------------------------------------------------------------*}
  7894. function float128_round_to_int(a: float128): float128;
  7895. var
  7896. aSign: flag;
  7897. aExp: int32;
  7898. lastBitMask, roundBitsMask: bits64;
  7899. roundingMode: TFPURoundingMode;
  7900. z: float128;
  7901. begin
  7902. aExp := extractFloat128Exp( a );
  7903. if ( $402F <= aExp ) then
  7904. begin
  7905. if ( $406F <= aExp ) then
  7906. begin
  7907. if ( ( aExp = $7FFF )
  7908. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7909. ) then
  7910. begin
  7911. result := propagateFloat128NaN( a, a );
  7912. exit;
  7913. end;
  7914. result := a;
  7915. exit;
  7916. end;
  7917. lastBitMask := 1;
  7918. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7919. roundBitsMask := lastBitMask - 1;
  7920. z := a;
  7921. roundingMode := softfloat_rounding_mode;
  7922. if ( roundingMode = float_round_nearest_even ) then
  7923. begin
  7924. if ( lastBitMask )<>0 then
  7925. begin
  7926. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7927. if ( ( z.low and roundBitsMask ) = 0 ) then
  7928. z.low := z.low and not(lastBitMask);
  7929. end
  7930. else begin
  7931. if ( sbits64(z.low) < 0 ) then
  7932. begin
  7933. inc(z.high);
  7934. if ( bits64( z.low shl 1 ) = 0 ) then
  7935. z.high := z.high and not bits64( 1 );
  7936. end;
  7937. end;
  7938. end
  7939. else if ( roundingMode <> float_round_to_zero ) then
  7940. begin
  7941. if ( extractFloat128Sign( z )
  7942. xor ord( roundingMode = float_round_up ) )<>0 then
  7943. begin
  7944. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7945. end;
  7946. end;
  7947. z.low := z.low and not(roundBitsMask);
  7948. end
  7949. else begin
  7950. if ( aExp < $3FFF ) then
  7951. begin
  7952. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7953. begin
  7954. result := a;
  7955. exit;
  7956. end;
  7957. set_inexact_flag;
  7958. aSign := extractFloat128Sign( a );
  7959. case softfloat_rounding_mode of
  7960. float_round_nearest_even:
  7961. if ( ( aExp = $3FFE )
  7962. and ( (extractFloat128Frac0( a )<>0)
  7963. or (extractFloat128Frac1( a )<>0) )
  7964. ) then begin
  7965. begin
  7966. result := packFloat128( aSign, $3FFF, 0, 0 );
  7967. exit;
  7968. end;
  7969. end;
  7970. float_round_down:
  7971. begin
  7972. if aSign<>0 then
  7973. result:=packFloat128( 1, $3FFF, 0, 0 )
  7974. else
  7975. result:=packFloat128( 0, 0, 0, 0 );
  7976. exit;
  7977. end;
  7978. float_round_up:
  7979. begin
  7980. if aSign<>0 then
  7981. result := packFloat128( 1, 0, 0, 0 )
  7982. else
  7983. result:=packFloat128( 0, $3FFF, 0, 0 );
  7984. exit;
  7985. end;
  7986. end;
  7987. result := packFloat128( aSign, 0, 0, 0 );
  7988. exit;
  7989. end;
  7990. lastBitMask := 1;
  7991. lastBitMask := lastBitMask shl ($402F - aExp);
  7992. roundBitsMask := lastBitMask - 1;
  7993. z.low := 0;
  7994. z.high := a.high;
  7995. roundingMode := softfloat_rounding_mode;
  7996. if ( roundingMode = float_round_nearest_even ) then begin
  7997. inc(z.high,lastBitMask shr 1);
  7998. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7999. z.high := z.high and not(lastBitMask);
  8000. end;
  8001. end
  8002. else if ( roundingMode <> float_round_to_zero ) then begin
  8003. if ( (extractFloat128Sign( z )<>0)
  8004. xor ( roundingMode = float_round_up ) ) then begin
  8005. z.high := z.high or ord( a.low <> 0 );
  8006. z.high := z.high+roundBitsMask;
  8007. end;
  8008. end;
  8009. z.high := z.high and not(roundBitsMask);
  8010. end;
  8011. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  8012. set_inexact_flag;
  8013. end;
  8014. result := z;
  8015. end;
  8016. {*----------------------------------------------------------------------------
  8017. | Returns the result of adding the absolute values of the quadruple-precision
  8018. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  8019. | before being returned. `zSign' is ignored if the result is a NaN.
  8020. | The addition is performed according to the IEC/IEEE Standard for Binary
  8021. | Floating-Point Arithmetic.
  8022. *----------------------------------------------------------------------------*}
  8023. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  8024. var
  8025. aExp, bExp, zExp: int32;
  8026. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8027. expDiff: int32;
  8028. label
  8029. shiftRight1,roundAndPack;
  8030. begin
  8031. aSig1 := extractFloat128Frac1( a );
  8032. aSig0 := extractFloat128Frac0( a );
  8033. aExp := extractFloat128Exp( a );
  8034. bSig1 := extractFloat128Frac1( b );
  8035. bSig0 := extractFloat128Frac0( b );
  8036. bExp := extractFloat128Exp( b );
  8037. expDiff := aExp - bExp;
  8038. if ( 0 < expDiff ) then begin
  8039. if ( aExp = $7FFF ) then begin
  8040. if ( aSig0 or aSig1 )<>0 then
  8041. begin
  8042. result := propagateFloat128NaN( a, b );
  8043. exit;
  8044. end;
  8045. result := a;
  8046. exit;
  8047. end;
  8048. if ( bExp = 0 ) then begin
  8049. dec(expDiff);
  8050. end
  8051. else begin
  8052. bSig0 := bSig0 or int64( $0001000000000000 );
  8053. end;
  8054. shift128ExtraRightJamming(
  8055. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  8056. zExp := aExp;
  8057. end
  8058. else if ( expDiff < 0 ) then begin
  8059. if ( bExp = $7FFF ) then begin
  8060. if ( bSig0 or bSig1 )<>0 then
  8061. begin
  8062. result := propagateFloat128NaN( a, b );
  8063. exit;
  8064. end;
  8065. result := packFloat128( zSign, $7FFF, 0, 0 );
  8066. exit;
  8067. end;
  8068. if ( aExp = 0 ) then begin
  8069. inc(expDiff);
  8070. end
  8071. else begin
  8072. aSig0 := aSig0 or int64( $0001000000000000 );
  8073. end;
  8074. shift128ExtraRightJamming(
  8075. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8076. zExp := bExp;
  8077. end
  8078. else begin
  8079. if ( aExp = $7FFF ) then begin
  8080. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8081. result := propagateFloat128NaN( a, b );
  8082. exit;
  8083. end;
  8084. result := a;
  8085. exit;
  8086. end;
  8087. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8088. if ( aExp = 0 ) then
  8089. begin
  8090. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8091. exit;
  8092. end;
  8093. zSig2 := 0;
  8094. zSig0 := zSig0 or int64( $0002000000000000 );
  8095. zExp := aExp;
  8096. goto shiftRight1;
  8097. end;
  8098. aSig0 := aSig0 or int64( $0001000000000000 );
  8099. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8100. dec(zExp);
  8101. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8102. inc(zExp);
  8103. shiftRight1:
  8104. shift128ExtraRightJamming(
  8105. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8106. roundAndPack:
  8107. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8108. end;
  8109. {*----------------------------------------------------------------------------
  8110. | Returns the result of subtracting the absolute values of the quadruple-
  8111. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8112. | difference is negated before being returned. `zSign' is ignored if the
  8113. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8114. | Standard for Binary Floating-Point Arithmetic.
  8115. *----------------------------------------------------------------------------*}
  8116. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8117. var
  8118. aExp, bExp, zExp: int32;
  8119. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8120. expDiff: int32;
  8121. z: float128;
  8122. label
  8123. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8124. begin
  8125. aSig1 := extractFloat128Frac1( a );
  8126. aSig0 := extractFloat128Frac0( a );
  8127. aExp := extractFloat128Exp( a );
  8128. bSig1 := extractFloat128Frac1( b );
  8129. bSig0 := extractFloat128Frac0( b );
  8130. bExp := extractFloat128Exp( b );
  8131. expDiff := aExp - bExp;
  8132. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8133. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8134. if ( 0 < expDiff ) then goto aExpBigger;
  8135. if ( expDiff < 0 ) then goto bExpBigger;
  8136. if ( aExp = $7FFF ) then begin
  8137. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8138. result := propagateFloat128NaN( a, b );
  8139. exit;
  8140. end;
  8141. float_raise( float_flag_invalid );
  8142. z.low := float128_default_nan_low;
  8143. z.high := float128_default_nan_high;
  8144. result := z;
  8145. exit;
  8146. end;
  8147. if ( aExp = 0 ) then begin
  8148. aExp := 1;
  8149. bExp := 1;
  8150. end;
  8151. if ( bSig0 < aSig0 ) then goto aBigger;
  8152. if ( aSig0 < bSig0 ) then goto bBigger;
  8153. if ( bSig1 < aSig1 ) then goto aBigger;
  8154. if ( aSig1 < bSig1 ) then goto bBigger;
  8155. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8156. exit;
  8157. bExpBigger:
  8158. if ( bExp = $7FFF ) then begin
  8159. if ( bSig0 or bSig1 )<>0 then
  8160. begin
  8161. result := propagateFloat128NaN( a, b );
  8162. exit;
  8163. end;
  8164. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8165. exit;
  8166. end;
  8167. if ( aExp = 0 ) then begin
  8168. inc(expDiff);
  8169. end
  8170. else begin
  8171. aSig0 := aSig0 or int64( $4000000000000000 );
  8172. end;
  8173. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8174. bSig0 := bSig0 or int64( $4000000000000000 );
  8175. bBigger:
  8176. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8177. zExp := bExp;
  8178. zSign := zSign xor 1;
  8179. goto normalizeRoundAndPack;
  8180. aExpBigger:
  8181. if ( aExp = $7FFF ) then begin
  8182. if ( aSig0 or aSig1 )<>0 then
  8183. begin
  8184. result := propagateFloat128NaN( a, b );
  8185. exit;
  8186. end;
  8187. result := a;
  8188. exit;
  8189. end;
  8190. if ( bExp = 0 ) then begin
  8191. dec(expDiff);
  8192. end
  8193. else begin
  8194. bSig0 := bSig0 or int64( $4000000000000000 );
  8195. end;
  8196. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8197. aSig0 := aSig0 or int64( $4000000000000000 );
  8198. aBigger:
  8199. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8200. zExp := aExp;
  8201. normalizeRoundAndPack:
  8202. dec(zExp);
  8203. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8204. end;
  8205. {*----------------------------------------------------------------------------
  8206. | Returns the result of adding the quadruple-precision floating-point values
  8207. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8208. | for Binary Floating-Point Arithmetic.
  8209. *----------------------------------------------------------------------------*}
  8210. function float128_add(a: float128; b: float128): float128;
  8211. var
  8212. aSign, bSign: flag;
  8213. begin
  8214. aSign := extractFloat128Sign( a );
  8215. bSign := extractFloat128Sign( b );
  8216. if ( aSign = bSign ) then begin
  8217. result := addFloat128Sigs( a, b, aSign );
  8218. end
  8219. else begin
  8220. result := subFloat128Sigs( a, b, aSign );
  8221. end;
  8222. end;
  8223. {*----------------------------------------------------------------------------
  8224. | Returns the result of subtracting the quadruple-precision floating-point
  8225. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8226. | Standard for Binary Floating-Point Arithmetic.
  8227. *----------------------------------------------------------------------------*}
  8228. function float128_sub(a: float128; b: float128): float128;
  8229. var
  8230. aSign, bSign: flag;
  8231. begin
  8232. aSign := extractFloat128Sign( a );
  8233. bSign := extractFloat128Sign( b );
  8234. if ( aSign = bSign ) then begin
  8235. result := subFloat128Sigs( a, b, aSign );
  8236. end
  8237. else begin
  8238. result := addFloat128Sigs( a, b, aSign );
  8239. end;
  8240. end;
  8241. {*----------------------------------------------------------------------------
  8242. | Returns the result of multiplying the quadruple-precision floating-point
  8243. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8244. | Standard for Binary Floating-Point Arithmetic.
  8245. *----------------------------------------------------------------------------*}
  8246. function float128_mul(a: float128; b: float128): float128;
  8247. var
  8248. aSign, bSign, zSign: flag;
  8249. aExp, bExp, zExp: int32;
  8250. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8251. z: float128;
  8252. label
  8253. invalid;
  8254. begin
  8255. aSig1 := extractFloat128Frac1( a );
  8256. aSig0 := extractFloat128Frac0( a );
  8257. aExp := extractFloat128Exp( a );
  8258. aSign := extractFloat128Sign( a );
  8259. bSig1 := extractFloat128Frac1( b );
  8260. bSig0 := extractFloat128Frac0( b );
  8261. bExp := extractFloat128Exp( b );
  8262. bSign := extractFloat128Sign( b );
  8263. zSign := aSign xor bSign;
  8264. if ( aExp = $7FFF ) then begin
  8265. if ( (( aSig0 or aSig1 )<>0)
  8266. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8267. result := propagateFloat128NaN( a, b );
  8268. exit;
  8269. end;
  8270. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8271. result := packFloat128( zSign, $7FFF, 0, 0 );
  8272. exit;
  8273. end;
  8274. if ( bExp = $7FFF ) then begin
  8275. if ( bSig0 or bSig1 )<>0 then
  8276. begin
  8277. result := propagateFloat128NaN( a, b );
  8278. exit;
  8279. end;
  8280. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8281. invalid:
  8282. float_raise( float_flag_invalid );
  8283. z.low := float128_default_nan_low;
  8284. z.high := float128_default_nan_high;
  8285. result := z;
  8286. exit;
  8287. end;
  8288. result := packFloat128( zSign, $7FFF, 0, 0 );
  8289. exit;
  8290. end;
  8291. if ( aExp = 0 ) then begin
  8292. if ( ( aSig0 or aSig1 ) = 0 ) then
  8293. begin
  8294. result := packFloat128( zSign, 0, 0, 0 );
  8295. exit;
  8296. end;
  8297. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8298. end;
  8299. if ( bExp = 0 ) then begin
  8300. if ( ( bSig0 or bSig1 ) = 0 ) then
  8301. begin
  8302. result := packFloat128( zSign, 0, 0, 0 );
  8303. exit;
  8304. end;
  8305. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8306. end;
  8307. zExp := aExp + bExp - $4000;
  8308. aSig0 := aSig0 or int64( $0001000000000000 );
  8309. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8310. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8311. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8312. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8313. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8314. shift128ExtraRightJamming(
  8315. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8316. inc(zExp);
  8317. end;
  8318. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8319. end;
  8320. {*----------------------------------------------------------------------------
  8321. | Returns the result of dividing the quadruple-precision floating-point value
  8322. | `a' by the corresponding value `b'. The operation is performed according to
  8323. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8324. *----------------------------------------------------------------------------*}
  8325. function float128_div(a: float128; b: float128): float128;
  8326. var
  8327. aSign, bSign, zSign: flag;
  8328. aExp, bExp, zExp: int32;
  8329. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8330. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8331. z: float128;
  8332. label
  8333. invalid;
  8334. begin
  8335. aSig1 := extractFloat128Frac1( a );
  8336. aSig0 := extractFloat128Frac0( a );
  8337. aExp := extractFloat128Exp( a );
  8338. aSign := extractFloat128Sign( a );
  8339. bSig1 := extractFloat128Frac1( b );
  8340. bSig0 := extractFloat128Frac0( b );
  8341. bExp := extractFloat128Exp( b );
  8342. bSign := extractFloat128Sign( b );
  8343. zSign := aSign xor bSign;
  8344. if ( aExp = $7FFF ) then begin
  8345. if ( aSig0 or aSig1 )<>0 then
  8346. begin
  8347. result := propagateFloat128NaN( a, b );
  8348. exit;
  8349. end;
  8350. if ( bExp = $7FFF ) then begin
  8351. if ( bSig0 or bSig1 )<>0 then
  8352. begin
  8353. result := propagateFloat128NaN( a, b );
  8354. exit;
  8355. end;
  8356. goto invalid;
  8357. end;
  8358. result := packFloat128( zSign, $7FFF, 0, 0 );
  8359. exit;
  8360. end;
  8361. if ( bExp = $7FFF ) then begin
  8362. if ( bSig0 or bSig1 )<>0 then
  8363. begin
  8364. result := propagateFloat128NaN( a, b );
  8365. exit;
  8366. end;
  8367. result := packFloat128( zSign, 0, 0, 0 );
  8368. exit;
  8369. end;
  8370. if ( bExp = 0 ) then begin
  8371. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8372. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8373. invalid:
  8374. float_raise( float_flag_invalid );
  8375. z.low := float128_default_nan_low;
  8376. z.high := float128_default_nan_high;
  8377. result := z;
  8378. exit;
  8379. end;
  8380. float_raise( float_flag_divbyzero );
  8381. result := packFloat128( zSign, $7FFF, 0, 0 );
  8382. exit;
  8383. end;
  8384. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8385. end;
  8386. if ( aExp = 0 ) then begin
  8387. if ( ( aSig0 or aSig1 ) = 0 ) then
  8388. begin
  8389. result := packFloat128( zSign, 0, 0, 0 );
  8390. exit;
  8391. end;
  8392. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8393. end;
  8394. zExp := aExp - bExp + $3FFD;
  8395. shortShift128Left(
  8396. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8397. shortShift128Left(
  8398. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8399. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8400. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8401. inc(zExp);
  8402. end;
  8403. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8404. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8405. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8406. while ( sbits64(rem0) < 0 ) do begin
  8407. dec(zSig0);
  8408. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8409. end;
  8410. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8411. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8412. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8413. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8414. while ( sbits64(rem1) < 0 ) do begin
  8415. dec(zSig1);
  8416. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8417. end;
  8418. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8419. end;
  8420. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8421. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8422. end;
  8423. {*----------------------------------------------------------------------------
  8424. | Returns the remainder of the quadruple-precision floating-point value `a'
  8425. | with respect to the corresponding value `b'. The operation is performed
  8426. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8427. *----------------------------------------------------------------------------*}
  8428. function float128_rem(a: float128; b: float128): float128;
  8429. var
  8430. aSign, zSign: flag;
  8431. aExp, bExp, expDiff: int32;
  8432. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8433. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8434. sigMean0: sbits64;
  8435. z: float128;
  8436. label
  8437. invalid;
  8438. begin
  8439. aSig1 := extractFloat128Frac1( a );
  8440. aSig0 := extractFloat128Frac0( a );
  8441. aExp := extractFloat128Exp( a );
  8442. aSign := extractFloat128Sign( a );
  8443. bSig1 := extractFloat128Frac1( b );
  8444. bSig0 := extractFloat128Frac0( b );
  8445. bExp := extractFloat128Exp( b );
  8446. if ( aExp = $7FFF ) then begin
  8447. if ( (( aSig0 or aSig1 )<>0)
  8448. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8449. result := propagateFloat128NaN( a, b );
  8450. exit;
  8451. end;
  8452. goto invalid;
  8453. end;
  8454. if ( bExp = $7FFF ) then begin
  8455. if ( bSig0 or bSig1 )<>0 then
  8456. begin
  8457. result := propagateFloat128NaN( a, b );
  8458. exit;
  8459. end;
  8460. result := a;
  8461. exit;
  8462. end;
  8463. if ( bExp = 0 ) then begin
  8464. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8465. invalid:
  8466. float_raise( float_flag_invalid );
  8467. z.low := float128_default_nan_low;
  8468. z.high := float128_default_nan_high;
  8469. result := z;
  8470. exit;
  8471. end;
  8472. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8473. end;
  8474. if ( aExp = 0 ) then begin
  8475. if ( ( aSig0 or aSig1 ) = 0 ) then
  8476. begin
  8477. result := a;
  8478. exit;
  8479. end;
  8480. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8481. end;
  8482. expDiff := aExp - bExp;
  8483. if ( expDiff < -1 ) then
  8484. begin
  8485. result := a;
  8486. exit;
  8487. end;
  8488. shortShift128Left(
  8489. aSig0 or int64( $0001000000000000 ),
  8490. aSig1,
  8491. 15 - ord( expDiff < 0 ),
  8492. aSig0,
  8493. aSig1
  8494. );
  8495. shortShift128Left(
  8496. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8497. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8498. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8499. dec(expDiff,64);
  8500. while ( 0 < expDiff ) do begin
  8501. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8502. if ( 4 < q ) then
  8503. q := q - 4
  8504. else
  8505. q := 0;
  8506. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8507. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8508. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8509. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8510. dec(expDiff,61);
  8511. end;
  8512. if ( -64 < expDiff ) then begin
  8513. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8514. if ( 4 < q ) then
  8515. q := q - 4
  8516. else
  8517. q := 0;
  8518. q := q shr (- expDiff);
  8519. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8520. inc(expDiff,52);
  8521. if ( expDiff < 0 ) then begin
  8522. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8523. end
  8524. else begin
  8525. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8526. end;
  8527. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8528. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8529. end
  8530. else begin
  8531. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8532. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8533. end;
  8534. repeat
  8535. alternateASig0 := aSig0;
  8536. alternateASig1 := aSig1;
  8537. inc(q);
  8538. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8539. until not( 0 <= sbits64(aSig0) );
  8540. add128(
  8541. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8542. if ( ( sigMean0 < 0 )
  8543. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8544. aSig0 := alternateASig0;
  8545. aSig1 := alternateASig1;
  8546. end;
  8547. zSign := ord( sbits64(aSig0) < 0 );
  8548. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8549. result :=
  8550. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8551. end;
  8552. {*----------------------------------------------------------------------------
  8553. | Returns the square root of the quadruple-precision floating-point value `a'.
  8554. | The operation is performed according to the IEC/IEEE Standard for Binary
  8555. | Floating-Point Arithmetic.
  8556. *----------------------------------------------------------------------------*}
  8557. function float128_sqrt(a: float128): float128;
  8558. var
  8559. aSign: flag;
  8560. aExp, zExp: int32;
  8561. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8562. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8563. z: float128;
  8564. label
  8565. invalid;
  8566. begin
  8567. aSig1 := extractFloat128Frac1( a );
  8568. aSig0 := extractFloat128Frac0( a );
  8569. aExp := extractFloat128Exp( a );
  8570. aSign := extractFloat128Sign( a );
  8571. if ( aExp = $7FFF ) then begin
  8572. if ( aSig0 or aSig1 )<>0 then
  8573. begin
  8574. result := propagateFloat128NaN( a, a );
  8575. exit;
  8576. end;
  8577. if ( aSign=0 ) then
  8578. begin
  8579. result := a;
  8580. exit;
  8581. end;
  8582. goto invalid;
  8583. end;
  8584. if ( aSign<>0 ) then begin
  8585. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8586. begin
  8587. result := a;
  8588. exit;
  8589. end;
  8590. invalid:
  8591. float_raise( float_flag_invalid );
  8592. z.low := float128_default_nan_low;
  8593. z.high := float128_default_nan_high;
  8594. result := z;
  8595. exit;
  8596. end;
  8597. if ( aExp = 0 ) then begin
  8598. if ( ( aSig0 or aSig1 ) = 0 ) then
  8599. begin
  8600. result := packFloat128( 0, 0, 0, 0 );
  8601. exit;
  8602. end;
  8603. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8604. end;
  8605. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8606. aSig0 := aSig0 or int64( $0001000000000000 );
  8607. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8608. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8609. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8610. doubleZSig0 := zSig0 shl 1;
  8611. mul64To128( zSig0, zSig0, term0, term1 );
  8612. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8613. while ( sbits64(rem0) < 0 ) do begin
  8614. dec(zSig0);
  8615. dec(doubleZSig0,2);
  8616. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8617. end;
  8618. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8619. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8620. if ( zSig1 = 0 ) then zSig1 := 1;
  8621. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8622. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8623. mul64To128( zSig1, zSig1, term2, term3 );
  8624. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8625. while ( sbits64(rem1) < 0 ) do begin
  8626. dec(zSig1);
  8627. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8628. term3 := term3 or 1;
  8629. term2 := term2 or doubleZSig0;
  8630. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8631. end;
  8632. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8633. end;
  8634. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8635. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8636. end;
  8637. {*----------------------------------------------------------------------------
  8638. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8639. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8640. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8641. *----------------------------------------------------------------------------*}
  8642. function float128_eq(a: float128; b: float128): flag;
  8643. begin
  8644. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8645. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8646. or ( ( extractFloat128Exp( b ) = $7FFF )
  8647. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8648. ) then begin
  8649. if ( (float128_is_signaling_nan( a )<>0)
  8650. or (float128_is_signaling_nan( b )<>0) ) then begin
  8651. float_raise( float_flag_invalid );
  8652. end;
  8653. result := 0;
  8654. exit;
  8655. end;
  8656. result := ord(
  8657. ( a.low = b.low )
  8658. and ( ( a.high = b.high )
  8659. or ( ( a.low = 0 )
  8660. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8661. ));
  8662. end;
  8663. {*----------------------------------------------------------------------------
  8664. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8665. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8666. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8667. | Arithmetic.
  8668. *----------------------------------------------------------------------------*}
  8669. function float128_le(a: float128; b: float128): flag;
  8670. var
  8671. aSign, bSign: flag;
  8672. begin
  8673. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8674. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8675. or ( ( extractFloat128Exp( b ) = $7FFF )
  8676. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8677. ) then begin
  8678. float_raise( float_flag_invalid );
  8679. result := 0;
  8680. exit;
  8681. end;
  8682. aSign := extractFloat128Sign( a );
  8683. bSign := extractFloat128Sign( b );
  8684. if ( aSign <> bSign ) then begin
  8685. result := ord(
  8686. (aSign<>0)
  8687. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8688. = 0 ));
  8689. exit;
  8690. end;
  8691. if aSign<>0 then
  8692. result := le128( b.high, b.low, a.high, a.low )
  8693. else
  8694. result := le128( a.high, a.low, b.high, b.low );
  8695. end;
  8696. {*----------------------------------------------------------------------------
  8697. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8698. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8699. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8700. *----------------------------------------------------------------------------*}
  8701. function float128_lt(a: float128; b: float128): flag;
  8702. var
  8703. aSign, bSign: flag;
  8704. begin
  8705. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8706. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8707. or ( ( extractFloat128Exp( b ) = $7FFF )
  8708. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8709. ) then begin
  8710. float_raise( float_flag_invalid );
  8711. result := 0;
  8712. exit;
  8713. end;
  8714. aSign := extractFloat128Sign( a );
  8715. bSign := extractFloat128Sign( b );
  8716. if ( aSign <> bSign ) then begin
  8717. result := ord(
  8718. (aSign<>0)
  8719. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8720. <> 0 ));
  8721. exit;
  8722. end;
  8723. if aSign<>0 then
  8724. result := lt128( b.high, b.low, a.high, a.low )
  8725. else
  8726. result := lt128( a.high, a.low, b.high, b.low );
  8727. end;
  8728. {*----------------------------------------------------------------------------
  8729. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8730. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8731. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8732. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8733. *----------------------------------------------------------------------------*}
  8734. function float128_eq_signaling(a: float128; b: float128): flag;
  8735. begin
  8736. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8737. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8738. or ( ( extractFloat128Exp( b ) = $7FFF )
  8739. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8740. ) then begin
  8741. float_raise( float_flag_invalid );
  8742. result := 0;
  8743. exit;
  8744. end;
  8745. result := ord(
  8746. ( a.low = b.low )
  8747. and ( ( a.high = b.high )
  8748. or ( ( a.low = 0 )
  8749. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8750. ));
  8751. end;
  8752. {*----------------------------------------------------------------------------
  8753. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8754. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8755. | cause an exception. Otherwise, the comparison is performed according to the
  8756. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8757. *----------------------------------------------------------------------------*}
  8758. function float128_le_quiet(a: float128; b: float128): flag;
  8759. var
  8760. aSign, bSign: flag;
  8761. begin
  8762. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8763. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8764. or ( ( extractFloat128Exp( b ) = $7FFF )
  8765. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8766. ) then begin
  8767. if ( (float128_is_signaling_nan( a )<>0)
  8768. or (float128_is_signaling_nan( b )<>0) ) then begin
  8769. float_raise( float_flag_invalid );
  8770. end;
  8771. result := 0;
  8772. exit;
  8773. end;
  8774. aSign := extractFloat128Sign( a );
  8775. bSign := extractFloat128Sign( b );
  8776. if ( aSign <> bSign ) then begin
  8777. result := ord(
  8778. (aSign<>0)
  8779. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8780. = 0 ));
  8781. exit;
  8782. end;
  8783. if aSign<>0 then
  8784. result := le128( b.high, b.low, a.high, a.low )
  8785. else
  8786. result := le128( a.high, a.low, b.high, b.low );
  8787. end;
  8788. {*----------------------------------------------------------------------------
  8789. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8790. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8791. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8792. | Standard for Binary Floating-Point Arithmetic.
  8793. *----------------------------------------------------------------------------*}
  8794. function float128_lt_quiet(a: float128; b: float128): flag;
  8795. var
  8796. aSign, bSign: flag;
  8797. begin
  8798. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8799. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8800. or ( ( extractFloat128Exp( b ) = $7FFF )
  8801. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8802. ) then begin
  8803. if ( (float128_is_signaling_nan( a )<>0)
  8804. or (float128_is_signaling_nan( b )<>0) ) then begin
  8805. float_raise( float_flag_invalid );
  8806. end;
  8807. result := 0;
  8808. exit;
  8809. end;
  8810. aSign := extractFloat128Sign( a );
  8811. bSign := extractFloat128Sign( b );
  8812. if ( aSign <> bSign ) then begin
  8813. result := ord(
  8814. (aSign<>0)
  8815. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8816. <> 0 ));
  8817. exit;
  8818. end;
  8819. if aSign<>0 then
  8820. result:=lt128( b.high, b.low, a.high, a.low )
  8821. else
  8822. result:=lt128( a.high, a.low, b.high, b.low );
  8823. end;
  8824. {----------------------------------------------------------------------------
  8825. | Returns the result of converting the double-precision floating-point value
  8826. | `a' to the quadruple-precision floating-point format. The conversion is
  8827. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8828. | Arithmetic.
  8829. *----------------------------------------------------------------------------}
  8830. function float64_to_float128( a : float64) : float128;
  8831. var
  8832. aSign : flag;
  8833. aExp : int16;
  8834. aSig, zSig0, zSig1 : bits64;
  8835. begin
  8836. aSig := extractFloat64Frac( a );
  8837. aExp := extractFloat64Exp( a );
  8838. aSign := extractFloat64Sign( a );
  8839. if ( aExp = $7FF ) then begin
  8840. if ( aSig<>0 ) then begin
  8841. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8842. exit;
  8843. end;
  8844. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8845. exit;
  8846. end;
  8847. if ( aExp = 0 ) then begin
  8848. if ( aSig = 0 ) then
  8849. begin
  8850. result:=packFloat128( aSign, 0, 0, 0 );
  8851. exit;
  8852. end;
  8853. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8854. dec(aExp);
  8855. end;
  8856. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8857. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8858. end;
  8859. {$endif FPC_SOFTFLOAT_FLOAT128}
  8860. {$endif not(defined(fpc_softfpu_interface))}
  8861. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8862. end.
  8863. {$ifdef FPC}
  8864. { restore context modified at implmentation start
  8865. to possibly re-enable range and overflow checking explicitly}
  8866. {$pop}
  8867. {$endif FPC}
  8868. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}