softfpu.pp 328 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768277827882798280828182828283828482858286828782888289829082918292829382948295829682978298829983008301830283038304830583068307830883098310831183128313831483158316831783188319832083218322832383248325832683278328832983308331833283338334833583368337833883398340834183428343834483458346834783488349835083518352835383548355835683578358835983608361836283638364836583668367836883698370837183728373837483758376837783788379838083818382838383848385838683878388838983908391839283938394839583968397839883998400840184028403840484058406840784088409841084118412841384148415841684178418841984208421842284238424842584268427842884298430843184328433843484358436843784388439844084418442844384448445844684478448844984508451845284538454845584568457845884598460846184628463846484658466846784688469847084718472847384748475847684778478847984808481848284838484848584868487848884898490849184928493849484958496849784988499850085018502850385048505850685078508850985108511851285138514851585168517851885198520852185228523852485258526852785288529853085318532853385348535853685378538853985408541854285438544854585468547854885498550855185528553855485558556855785588559856085618562856385648565856685678568856985708571857285738574857585768577857885798580858185828583858485858586858785888589859085918592859385948595859685978598859986008601860286038604860586068607860886098610861186128613861486158616861786188619862086218622862386248625862686278628862986308631863286338634863586368637863886398640864186428643864486458646864786488649865086518652865386548655865686578658865986608661866286638664866586668667866886698670867186728673867486758676867786788679868086818682868386848685868686878688868986908691869286938694869586968697869886998700870187028703870487058706870787088709871087118712871387148715871687178718871987208721872287238724872587268727872887298730873187328733873487358736873787388739874087418742874387448745874687478748874987508751875287538754875587568757875887598760876187628763876487658766876787688769877087718772877387748775877687778778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927792789279928092819282928392849285928692879288928992909291929292939294929592969297929892999300930193029303930493059306930793089309931093119312931393149315931693179318931993209321932293239324932593269327932893299330933193329333933493359336933793389339934093419342934393449345934693479348934993509351935293539354935593569357935893599360936193629363936493659366936793689369937093719372937393749375937693779378937993809381938293839384938593869387938893899390939193929393939493959396939793989399940094019402940394049405940694079408940994109411941294139414941594169417941894199420942194229423942494259426942794289429
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { $define FPC_SOFTFLOAT_FLOATX80_FUNCS}
  60. {$ifdef FPC_SOFTFLOAT_FLOATX80_TRIG}
  61. {$define FPC_SOFTFLOAT_FLOAT128}
  62. {$endif FPC_SOFTFLOAT_FLOATX80_TRIG}
  63. { the softfpu unit can be also embedded directly into the system unit }
  64. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  65. {$mode objfpc}
  66. unit softfpu;
  67. { Overflow checking must be disabled,
  68. since some operations expect overflow!
  69. }
  70. {$Q-}
  71. {$goto on}
  72. interface
  73. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  74. {$if not(defined(fpc_softfpu_implementation))}
  75. {
  76. -------------------------------------------------------------------------------
  77. Software IEC/IEEE floating-point types.
  78. -------------------------------------------------------------------------------
  79. }
  80. TYPE
  81. {$ifndef FPC_SYSTEM_HAS_float32}
  82. float32 = longword;
  83. {$define FPC_SYSTEM_HAS_float32}
  84. {$endif ndef FPC_SYSTEM_HAS_float32}
  85. { we use here a record in the function header because
  86. the record allows bitwise conversion to single }
  87. float32rec = record
  88. float32 : float32;
  89. end;
  90. flag = byte;
  91. bits8 = byte;
  92. sbits8 = shortint;
  93. bits16 = word;
  94. sbits16 = smallint;
  95. sbits32 = longint;
  96. Bit32s = sbits32;
  97. bits32 = longword;
  98. Bit32u = bits32;
  99. {$ifndef fpc}
  100. qword = int64;
  101. {$endif}
  102. { now part of the system unit
  103. uint64 = qword;
  104. }
  105. bits64 = qword;
  106. Bit64u = bits64;
  107. sbits64 = int64;
  108. Bit64s = sbits64;
  109. {$ifdef ENDIAN_LITTLE}
  110. {$ifndef FPC_SYSTEM_HAS_float64}
  111. float64 = record
  112. case byte of
  113. // force the record to be aligned like a double
  114. // else *_to_double will fail for cpus like sparc
  115. // and avoid expensive unpacking/packing operations
  116. 1: (dummy : double);
  117. 2: (low,high : bits32);
  118. end;
  119. {$endif ndef FPC_SYSTEM_HAS_float64}
  120. floatx80 = record
  121. case byte of
  122. // force the record to be aligned like a double
  123. // else *_to_double will fail for cpus like sparc
  124. // and avoid expensive unpacking/packing operations
  125. 1: (dummy : extended);
  126. 2: (low : qword;high : word);
  127. end;
  128. float128 = record
  129. case byte of
  130. // force the record to be aligned like a double
  131. // else *_to_double will fail for cpus like sparc
  132. // and avoid expensive unpacking/packing operations
  133. 1: (dummy : qword);
  134. 2: (low,high : qword);
  135. end;
  136. {$else}
  137. {$ifndef FPC_SYSTEM_HAS_float64}
  138. float64 = record
  139. case byte of
  140. // force the record to be aligned like a double
  141. // else *_to_double will fail for cpus like sparc
  142. 1: (dummy : double);
  143. 2: (high,low : bits32);
  144. end;
  145. {$endif ndef FPC_SYSTEM_HAS_float64}
  146. floatx80 = record
  147. case byte of
  148. // force the record to be aligned like a double
  149. // else *_to_double will fail for cpus like sparc
  150. // and avoid expensive unpacking/packing operations
  151. 1: (dummy : qword);
  152. 2: (high : word;low : qword);
  153. end;
  154. float128 = record
  155. case byte of
  156. // force the record to be aligned like a double
  157. // else *_to_double will fail for cpus like sparc
  158. // and avoid expensive unpacking/packing operations
  159. 1: (dummy : qword);
  160. 2: (high : qword;low : qword);
  161. end;
  162. {$endif}
  163. pfloat128 = ^float128;
  164. pfloatx80 = ^floatx80;
  165. {$define FPC_SYSTEM_HAS_float64}
  166. {*
  167. -------------------------------------------------------------------------------
  168. Returns 1 if the double-precision floating-point value `a' is less than
  169. the corresponding value `b', and 0 otherwise. The comparison is performed
  170. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  171. -------------------------------------------------------------------------------
  172. *}
  173. Function float64_lt(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  174. {*
  175. -------------------------------------------------------------------------------
  176. Returns 1 if the double-precision floating-point value `a' is less than
  177. or equal to the corresponding value `b', and 0 otherwise. The comparison
  178. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  179. Arithmetic.
  180. -------------------------------------------------------------------------------
  181. *}
  182. Function float64_le(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  183. {*
  184. -------------------------------------------------------------------------------
  185. Returns 1 if the double-precision floating-point value `a' is equal to
  186. the corresponding value `b', and 0 otherwise. The comparison is performed
  187. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  188. -------------------------------------------------------------------------------
  189. *}
  190. Function float64_eq(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  191. {*
  192. -------------------------------------------------------------------------------
  193. Returns the square root of the double-precision floating-point value `a'.
  194. The operation is performed according to the IEC/IEEE Standard for Binary
  195. Floating-Point Arithmetic.
  196. -------------------------------------------------------------------------------
  197. *}
  198. function float64_sqrt( a: float64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  199. {*
  200. -------------------------------------------------------------------------------
  201. Returns the remainder of the double-precision floating-point value `a'
  202. with respect to the corresponding value `b'. The operation is performed
  203. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  204. -------------------------------------------------------------------------------
  205. *}
  206. Function float64_rem(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  207. {*
  208. -------------------------------------------------------------------------------
  209. Returns the result of dividing the double-precision floating-point value `a'
  210. by the corresponding value `b'. The operation is performed according to the
  211. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  212. -------------------------------------------------------------------------------
  213. *}
  214. Function float64_div(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  215. {*
  216. -------------------------------------------------------------------------------
  217. Returns the result of multiplying the double-precision floating-point values
  218. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  219. for Binary Floating-Point Arithmetic.
  220. -------------------------------------------------------------------------------
  221. *}
  222. Function float64_mul( a: float64; b:float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  223. {*
  224. -------------------------------------------------------------------------------
  225. Returns the result of subtracting the double-precision floating-point values
  226. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  227. for Binary Floating-Point Arithmetic.
  228. -------------------------------------------------------------------------------
  229. *}
  230. Function float64_sub(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  231. {*
  232. -------------------------------------------------------------------------------
  233. Returns the result of adding the double-precision floating-point values `a'
  234. and `b'. The operation is performed according to the IEC/IEEE Standard for
  235. Binary Floating-Point Arithmetic.
  236. -------------------------------------------------------------------------------
  237. *}
  238. Function float64_add( a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  239. {*
  240. -------------------------------------------------------------------------------
  241. Rounds the double-precision floating-point value `a' to an integer,
  242. and returns the result as a double-precision floating-point value. The
  243. operation is performed according to the IEC/IEEE Standard for Binary
  244. Floating-Point Arithmetic.
  245. -------------------------------------------------------------------------------
  246. *}
  247. Function float64_round_to_int(a: float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  248. {*
  249. -------------------------------------------------------------------------------
  250. Returns the result of converting the double-precision floating-point value
  251. `a' to the single-precision floating-point format. The conversion is
  252. performed according to the IEC/IEEE Standard for Binary Floating-Point
  253. Arithmetic.
  254. -------------------------------------------------------------------------------
  255. *}
  256. Function float64_to_float32(a: float64) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  257. {*
  258. -------------------------------------------------------------------------------
  259. Returns the result of converting the double-precision floating-point value
  260. `a' to the 32-bit two's complement integer format. The conversion is
  261. performed according to the IEC/IEEE Standard for Binary Floating-Point
  262. Arithmetic, except that the conversion is always rounded toward zero.
  263. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  264. the conversion overflows, the largest integer with the same sign as `a' is
  265. returned.
  266. -------------------------------------------------------------------------------
  267. *}
  268. Function float64_to_int32_round_to_zero(a: float64 ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  269. {*
  270. -------------------------------------------------------------------------------
  271. Returns the result of converting the double-precision floating-point value
  272. `a' to the 32-bit two's complement integer format. The conversion is
  273. performed according to the IEC/IEEE Standard for Binary Floating-Point
  274. Arithmetic---which means in particular that the conversion is rounded
  275. according to the current rounding mode. If `a' is a NaN, the largest
  276. positive integer is returned. Otherwise, if the conversion overflows, the
  277. largest integer with the same sign as `a' is returned.
  278. -------------------------------------------------------------------------------
  279. *}
  280. Function float64_to_int32(a: float64): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  281. {*
  282. -------------------------------------------------------------------------------
  283. Returns 1 if the single-precision floating-point value `a' is less than
  284. the corresponding value `b', and 0 otherwise. The comparison is performed
  285. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  286. -------------------------------------------------------------------------------
  287. *}
  288. Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  289. {*
  290. -------------------------------------------------------------------------------
  291. Returns 1 if the single-precision floating-point value `a' is less than
  292. or equal to the corresponding value `b', and 0 otherwise. The comparison
  293. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  294. Arithmetic.
  295. -------------------------------------------------------------------------------
  296. *}
  297. Function float32_le( a: float32rec; b : float32rec ):flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  298. {*
  299. -------------------------------------------------------------------------------
  300. Returns 1 if the single-precision floating-point value `a' is equal to
  301. the corresponding value `b', and 0 otherwise. The comparison is performed
  302. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  303. -------------------------------------------------------------------------------
  304. *}
  305. Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  306. {*
  307. -------------------------------------------------------------------------------
  308. Returns the square root of the single-precision floating-point value `a'.
  309. The operation is performed according to the IEC/IEEE Standard for Binary
  310. Floating-Point Arithmetic.
  311. -------------------------------------------------------------------------------
  312. *}
  313. Function float32_sqrt(a: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  314. {*
  315. -------------------------------------------------------------------------------
  316. Returns the remainder of the single-precision floating-point value `a'
  317. with respect to the corresponding value `b'. The operation is performed
  318. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  319. -------------------------------------------------------------------------------
  320. *}
  321. Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  322. {*
  323. -------------------------------------------------------------------------------
  324. Returns the result of dividing the single-precision floating-point value `a'
  325. by the corresponding value `b'. The operation is performed according to the
  326. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  327. -------------------------------------------------------------------------------
  328. *}
  329. Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  330. {*
  331. -------------------------------------------------------------------------------
  332. Returns the result of multiplying the single-precision floating-point values
  333. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  334. for Binary Floating-Point Arithmetic.
  335. -------------------------------------------------------------------------------
  336. *}
  337. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  338. {*
  339. -------------------------------------------------------------------------------
  340. Returns the result of subtracting the single-precision floating-point values
  341. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  342. for Binary Floating-Point Arithmetic.
  343. -------------------------------------------------------------------------------
  344. *}
  345. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  346. {*
  347. -------------------------------------------------------------------------------
  348. Returns the result of adding the single-precision floating-point values `a'
  349. and `b'. The operation is performed according to the IEC/IEEE Standard for
  350. Binary Floating-Point Arithmetic.
  351. -------------------------------------------------------------------------------
  352. *}
  353. Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  354. {*
  355. -------------------------------------------------------------------------------
  356. Rounds the single-precision floating-point value `a' to an integer,
  357. and returns the result as a single-precision floating-point value. The
  358. operation is performed according to the IEC/IEEE Standard for Binary
  359. Floating-Point Arithmetic.
  360. -------------------------------------------------------------------------------
  361. *}
  362. Function float32_round_to_int( a: float32rec): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  363. {*
  364. -------------------------------------------------------------------------------
  365. Returns the result of converting the single-precision floating-point value
  366. `a' to the double-precision floating-point format. The conversion is
  367. performed according to the IEC/IEEE Standard for Binary Floating-Point
  368. Arithmetic.
  369. -------------------------------------------------------------------------------
  370. *}
  371. Function float32_to_float64( a : float32rec) : Float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  372. {*
  373. -------------------------------------------------------------------------------
  374. Returns the result of converting the single-precision floating-point value
  375. `a' to the 32-bit two's complement integer format. The conversion is
  376. performed according to the IEC/IEEE Standard for Binary Floating-Point
  377. Arithmetic, except that the conversion is always rounded toward zero.
  378. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  379. the conversion overflows, the largest integer with the same sign as `a' is
  380. returned.
  381. -------------------------------------------------------------------------------
  382. *}
  383. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  384. {*
  385. -------------------------------------------------------------------------------
  386. Returns the result of converting the single-precision floating-point value
  387. `a' to the 32-bit two's complement integer format. The conversion is
  388. performed according to the IEC/IEEE Standard for Binary Floating-Point
  389. Arithmetic---which means in particular that the conversion is rounded
  390. according to the current rounding mode. If `a' is a NaN, the largest
  391. positive integer is returned. Otherwise, if the conversion overflows, the
  392. largest integer with the same sign as `a' is returned.
  393. -------------------------------------------------------------------------------
  394. *}
  395. Function float32_to_int32( a : float32rec) : int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  396. {*
  397. -------------------------------------------------------------------------------
  398. Returns the result of converting the 32-bit two's complement integer `a' to
  399. the double-precision floating-point format. The conversion is performed
  400. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  401. -------------------------------------------------------------------------------
  402. *}
  403. Function int32_to_float64( a: int32) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  404. {*
  405. -------------------------------------------------------------------------------
  406. Returns the result of converting the 32-bit two's complement integer `a' to
  407. the single-precision floating-point format. The conversion is performed
  408. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  409. -------------------------------------------------------------------------------
  410. *}
  411. Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  412. {*----------------------------------------------------------------------------
  413. | Returns the result of converting the 64-bit two's complement integer `a'
  414. | to the double-precision floating-point format. The conversion is performed
  415. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  416. *----------------------------------------------------------------------------*}
  417. Function int64_to_float64( a: int64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  418. Function qword_to_float64( a: qword ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  419. {*----------------------------------------------------------------------------
  420. | Returns the result of converting the 64-bit two's complement integer `a'
  421. | to the single-precision floating-point format. The conversion is performed
  422. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  423. *----------------------------------------------------------------------------*}
  424. Function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  425. Function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  426. // +++
  427. function float32_to_int64( a: float32 ): int64;
  428. function float32_to_int64_round_to_zero( a: float32 ): int64;
  429. function float32_eq_signaling( a: float32; b: float32) : flag;
  430. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  431. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  432. function float32_is_signaling_nan( a : float32 ): flag;
  433. function float32_is_nan( a : float32 ): flag;
  434. function float64_to_int64( a: float64 ): int64;
  435. function float64_to_int64_round_to_zero( a: float64 ): int64;
  436. function float64_eq_signaling( a: float64; b: float64): flag;
  437. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  438. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  439. function float64_is_signaling_nan( a : float64 ): flag;
  440. function float64_is_nan( a : float64 ): flag;
  441. // ===
  442. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  443. {*----------------------------------------------------------------------------
  444. | Extended double-precision rounding precision
  445. *----------------------------------------------------------------------------*}
  446. var // threadvar!?
  447. floatx80_rounding_precision : int8 = 80;
  448. function int32_to_floatx80( a: int32 ): floatx80;
  449. function int64_to_floatx80( a: int64 ): floatx80;
  450. function qword_to_floatx80( a: qword ): floatx80;
  451. function float32_to_floatx80( a: float32 ): floatx80;
  452. function float64_to_floatx80( a: float64 ): floatx80;
  453. function floatx80_to_int32( a: floatx80 ): int32;
  454. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  455. function floatx80_to_int64( a: floatx80 ): int64;
  456. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  457. function floatx80_to_float32( a: floatx80 ): float32;
  458. function floatx80_to_float64( a: floatx80 ): float64;
  459. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  460. function floatx80_to_float128( a: floatx80 ): float128;
  461. {$endif FPC_SOFTFLOAT_FLOAT128}
  462. function floatx80_round_to_int( a: floatx80 ): floatx80;
  463. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  464. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  465. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  466. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  467. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  468. function floatx80_sqrt( a: floatx80 ): floatx80;
  469. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  470. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  471. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  472. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  473. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  474. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  475. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  476. function floatx80_is_nan(a : floatx80 ): flag;
  477. {$endif FPC_SOFTFLOAT_FLOATX80}
  478. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  479. function int32_to_float128( a: int32 ): float128;
  480. function int64_to_float128( a: int64 ): float128;
  481. function qword_to_float128( a: qword ): float128;
  482. function float32_to_float128( a: float32 ): float128;
  483. function float128_is_nan( a : float128): flag;
  484. function float128_is_signaling_nan( a : float128): flag;
  485. function float128_to_int32(a: float128): int32;
  486. function float128_to_int32_round_to_zero(a: float128): int32;
  487. function float128_to_int64(a: float128): int64;
  488. function float128_to_int64_round_to_zero(a: float128): int64;
  489. function float128_to_float32(a: float128): float32;
  490. function float128_to_float64(a: float128): float64;
  491. function float64_to_float128( a : float64) : float128;
  492. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  493. function float128_to_floatx80(a: float128): floatx80;
  494. {$endif FPC_SOFTFLOAT_FLOATX80}
  495. function float128_round_to_int(a: float128): float128;
  496. function float128_add(a: float128; b: float128): float128;
  497. function float128_sub(a: float128; b: float128): float128;
  498. function float128_mul(a: float128; b: float128): float128;
  499. function float128_div(a: float128; b: float128): float128;
  500. function float128_rem(a: float128; b: float128): float128;
  501. function float128_sqrt(a: float128): float128;
  502. function float128_eq(a: float128; b: float128): flag;
  503. function float128_le(a: float128; b: float128): flag;
  504. function float128_lt(a: float128; b: float128): flag;
  505. function float128_eq_signaling(a: float128; b: float128): flag;
  506. function float128_le_quiet(a: float128; b: float128): flag;
  507. function float128_lt_quiet(a: float128; b: float128): flag;
  508. {$endif FPC_SOFTFLOAT_FLOAT128}
  509. CONST
  510. {-------------------------------------------------------------------------------
  511. Software IEC/IEEE floating-point underflow tininess-detection mode.
  512. -------------------------------------------------------------------------------
  513. *}
  514. float_tininess_after_rounding = 0;
  515. float_tininess_before_rounding = 1;
  516. {*
  517. -------------------------------------------------------------------------------
  518. Underflow tininess-detection mode, statically initialized to default value.
  519. (The declaration in `softfloat.h' must match the `int8' type here.)
  520. -------------------------------------------------------------------------------
  521. *}
  522. var // threadvar!?
  523. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  524. {$endif not(defined(fpc_softfpu_implementation))}
  525. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  526. implementation
  527. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  528. {$if not(defined(fpc_softfpu_interface))}
  529. {$ifdef FPC}
  530. { disable range and overflow checking explicitly }
  531. { This might be more essential for x80 and 128-bit
  532. floating point types and could, maybe be
  533. restricted to code handle floatx80 and float128 }
  534. {$push}
  535. {$R-}
  536. {$Q-}
  537. {$endif FPC}
  538. (*****************************************************************************)
  539. (*----------------------------------------------------------------------------*)
  540. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  541. (* division and square root approximations. (Can be specialized to target if *)
  542. (* desired.) *)
  543. (* ---------------------------------------------------------------------------*)
  544. (*****************************************************************************)
  545. { This procedure serves as a single access point to softfloat_exception_flags.
  546. It also helps to reduce code size a bit because softfloat_exception_flags is
  547. a threadvar. }
  548. procedure set_inexact_flag;
  549. begin
  550. include(softfloat_exception_flags,float_flag_inexact);
  551. end;
  552. {*----------------------------------------------------------------------------
  553. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  554. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  555. | input. If `zSign' is 1, the input is negated before being converted to an
  556. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  557. | is simply rounded to an integer, with the inexact exception raised if the
  558. | input cannot be represented exactly as an integer. However, if the fixed-
  559. | point input is too large, the invalid exception is raised and the largest
  560. | positive or negative integer is returned.
  561. *----------------------------------------------------------------------------*}
  562. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  563. var
  564. roundingMode: TFPURoundingMode;
  565. roundNearestEven: boolean;
  566. roundIncrement, roundBits: int8;
  567. z: int32;
  568. begin
  569. roundingMode := softfloat_rounding_mode;
  570. roundNearestEven := (roundingMode = float_round_nearest_even);
  571. roundIncrement := $40;
  572. if not roundNearestEven then
  573. begin
  574. if ( roundingMode = float_round_to_zero ) then
  575. begin
  576. roundIncrement := 0;
  577. end
  578. else begin
  579. roundIncrement := $7F;
  580. if ( zSign<>0 ) then
  581. begin
  582. if ( roundingMode = float_round_up ) then
  583. roundIncrement := 0;
  584. end
  585. else begin
  586. if ( roundingMode = float_round_down ) then
  587. roundIncrement := 0;
  588. end;
  589. end;
  590. end;
  591. roundBits := lo(absZ) and $7F;
  592. absZ := ( absZ + roundIncrement ) shr 7;
  593. absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
  594. z := absZ;
  595. if ( zSign<>0 ) then
  596. z := - z;
  597. if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  598. begin
  599. float_raise( float_flag_invalid );
  600. if zSign<>0 then
  601. result:=sbits32($80000000)
  602. else
  603. result:=$7FFFFFFF;
  604. exit;
  605. end;
  606. if ( roundBits<>0 ) then
  607. set_inexact_flag;
  608. result:=z;
  609. end;
  610. {*----------------------------------------------------------------------------
  611. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  612. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  613. | and returns the properly rounded 64-bit integer corresponding to the input.
  614. | If `zSign' is 1, the input is negated before being converted to an integer.
  615. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  616. | the inexact exception raised if the input cannot be represented exactly as
  617. | an integer. However, if the fixed-point input is too large, the invalid
  618. | exception is raised and the largest positive or negative integer is
  619. | returned.
  620. *----------------------------------------------------------------------------*}
  621. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  622. var
  623. roundingMode: TFPURoundingMode;
  624. roundNearestEven, increment: flag;
  625. z: int64;
  626. label
  627. overflow;
  628. begin
  629. roundingMode := softfloat_rounding_mode;
  630. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  631. increment := ord( sbits64(absZ1) < 0 );
  632. if ( roundNearestEven=0 ) then
  633. begin
  634. if ( roundingMode = float_round_to_zero ) then
  635. begin
  636. increment := 0;
  637. end
  638. else begin
  639. if ( zSign<>0 ) then
  640. begin
  641. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  642. end
  643. else begin
  644. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  645. end;
  646. end;
  647. end;
  648. if ( increment<>0 ) then
  649. begin
  650. inc(absZ0);
  651. if ( absZ0 = 0 ) then
  652. goto overflow;
  653. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  654. end;
  655. z := absZ0;
  656. if ( zSign<>0 ) then
  657. z := - z;
  658. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  659. begin
  660. overflow:
  661. float_raise( float_flag_invalid );
  662. if zSign<>0 then
  663. result:=int64($8000000000000000)
  664. else
  665. result:=int64($7FFFFFFFFFFFFFFF);
  666. exit;
  667. end;
  668. if ( absZ1<>0 ) then
  669. set_inexact_flag;
  670. result:=z;
  671. end;
  672. {*
  673. -------------------------------------------------------------------------------
  674. Shifts `a' right by the number of bits given in `count'. If any nonzero
  675. bits are shifted off, they are ``jammed'' into the least significant bit of
  676. the result by setting the least significant bit to 1. The value of `count'
  677. can be arbitrarily large; in particular, if `count' is greater than 32, the
  678. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  679. The result is stored in the location pointed to by `zPtr'.
  680. -------------------------------------------------------------------------------
  681. *}
  682. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  683. var
  684. z: Bits32;
  685. Begin
  686. if ( count = 0 ) then
  687. z := a
  688. else
  689. if ( count < 32 ) then
  690. Begin
  691. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  692. End
  693. else
  694. Begin
  695. z := bits32( a <> 0 );
  696. End;
  697. zPtr := z;
  698. End;
  699. {*----------------------------------------------------------------------------
  700. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  701. | number of bits given in `count'. Any bits shifted off are lost. The value
  702. | of `count' can be arbitrarily large; in particular, if `count' is greater
  703. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  704. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  705. *----------------------------------------------------------------------------*}
  706. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  707. var
  708. z0, z1: bits64;
  709. negCount: int8;
  710. begin
  711. negCount := ( - count ) and 63;
  712. if ( count = 0 ) then
  713. begin
  714. z1 := a1;
  715. z0 := a0;
  716. end
  717. else if ( count < 64 ) then
  718. begin
  719. z1 := ( a0 shl negCount ) or ( a1 shr count );
  720. z0 := a0 shr count;
  721. end
  722. else
  723. begin
  724. if ( count < 128 ) then
  725. z1 := a0 shr ( count and 63 )
  726. else
  727. z1 := 0;
  728. z0 := 0;
  729. end;
  730. z1Ptr := z1;
  731. z0Ptr := z0;
  732. end;
  733. {*----------------------------------------------------------------------------
  734. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  735. | number of bits given in `count'. If any nonzero bits are shifted off, they
  736. | are ``jammed'' into the least significant bit of the result by setting the
  737. | least significant bit to 1. The value of `count' can be arbitrarily large;
  738. | in particular, if `count' is greater than 128, the result will be either
  739. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  740. | nonzero. The result is broken into two 64-bit pieces which are stored at
  741. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  742. *----------------------------------------------------------------------------*}
  743. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  744. var
  745. z0,z1 : bits64;
  746. negCount : int8;
  747. begin
  748. negCount := ( - count ) and 63;
  749. if ( count = 0 ) then begin
  750. z1 := a1;
  751. z0 := a0;
  752. end
  753. else if ( count < 64 ) then begin
  754. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  755. z0 := a0 shr count;
  756. end
  757. else begin
  758. if ( count = 64 ) then begin
  759. z1 := a0 or ord( a1 <> 0 );
  760. end
  761. else if ( count < 128 ) then begin
  762. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  763. end
  764. else begin
  765. z1 := ord( ( a0 or a1 ) <> 0 );
  766. end;
  767. z0 := 0;
  768. end;
  769. z1Ptr := z1;
  770. z0Ptr := z0;
  771. end;
  772. {*
  773. -------------------------------------------------------------------------------
  774. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  775. number of bits given in `count'. Any bits shifted off are lost. The value
  776. of `count' can be arbitrarily large; in particular, if `count' is greater
  777. than 64, the result will be 0. The result is broken into two 32-bit pieces
  778. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  779. -------------------------------------------------------------------------------
  780. *}
  781. Procedure
  782. shift64Right(
  783. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  784. Var
  785. z0, z1: bits32;
  786. negCount : int8;
  787. Begin
  788. negCount := ( - count ) AND 31;
  789. if ( count = 0 ) then
  790. Begin
  791. z1 := a1;
  792. z0 := a0;
  793. End
  794. else if ( count < 32 ) then
  795. Begin
  796. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  797. z0 := a0 shr count;
  798. End
  799. else
  800. Begin
  801. if (count < 64) then
  802. z1 := ( a0 shr ( count AND 31 ) )
  803. else
  804. z1 := 0;
  805. z0 := 0;
  806. End;
  807. z1Ptr := z1;
  808. z0Ptr := z0;
  809. End;
  810. {*
  811. -------------------------------------------------------------------------------
  812. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  813. number of bits given in `count'. If any nonzero bits are shifted off, they
  814. are ``jammed'' into the least significant bit of the result by setting the
  815. least significant bit to 1. The value of `count' can be arbitrarily large;
  816. in particular, if `count' is greater than 64, the result will be either 0
  817. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  818. nonzero. The result is broken into two 32-bit pieces which are stored at
  819. the locations pointed to by `z0Ptr' and `z1Ptr'.
  820. -------------------------------------------------------------------------------
  821. *}
  822. Procedure
  823. shift64RightJamming(
  824. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  825. VAR
  826. z0, z1 : bits32;
  827. negCount : int8;
  828. Begin
  829. negCount := ( - count ) AND 31;
  830. if ( count = 0 ) then
  831. Begin
  832. z1 := a1;
  833. z0 := a0;
  834. End
  835. else
  836. if ( count < 32 ) then
  837. Begin
  838. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  839. z0 := a0 shr count;
  840. End
  841. else
  842. Begin
  843. if ( count = 32 ) then
  844. Begin
  845. z1 := a0 OR bits32( a1 <> 0 );
  846. End
  847. else
  848. if ( count < 64 ) Then
  849. Begin
  850. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  851. End
  852. else
  853. Begin
  854. z1 := bits32( ( a0 OR a1 ) <> 0 );
  855. End;
  856. z0 := 0;
  857. End;
  858. z1Ptr := z1;
  859. z0Ptr := z0;
  860. End;
  861. {*----------------------------------------------------------------------------
  862. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  863. | bits are shifted off, they are ``jammed'' into the least significant bit of
  864. | the result by setting the least significant bit to 1. The value of `count'
  865. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  866. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  867. | The result is stored in the location pointed to by `zPtr'.
  868. *----------------------------------------------------------------------------*}
  869. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  870. var
  871. z: bits64;
  872. begin
  873. if ( count = 0 ) then
  874. begin
  875. z := a;
  876. end
  877. else if ( count < 64 ) then
  878. begin
  879. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  880. end
  881. else
  882. begin
  883. z := ord( a <> 0 );
  884. end;
  885. zPtr := z;
  886. end;
  887. {$if not defined(shift64ExtraRightJamming)}
  888. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  889. overload;
  890. forward;
  891. {$endif}
  892. {*
  893. -------------------------------------------------------------------------------
  894. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  895. by 32 _plus_ the number of bits given in `count'. The shifted result is
  896. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  897. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  898. off form a third 32-bit result as follows: The _last_ bit shifted off is
  899. the most-significant bit of the extra result, and the other 31 bits of the
  900. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  901. were all zero. This extra result is stored in the location pointed to by
  902. `z2Ptr'. The value of `count' can be arbitrarily large.
  903. (This routine makes more sense if `a0', `a1', and `a2' are considered
  904. to form a fixed-point value with binary point between `a1' and `a2'. This
  905. fixed-point value is shifted right by the number of bits given in `count',
  906. and the integer part of the result is returned at the locations pointed to
  907. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  908. corrupted as described above, and is returned at the location pointed to by
  909. `z2Ptr'.)
  910. -------------------------------------------------------------------------------
  911. }
  912. Procedure
  913. shift64ExtraRightJamming(
  914. a0: bits32;
  915. a1: bits32;
  916. a2: bits32;
  917. count: int16;
  918. VAR z0Ptr: bits32;
  919. VAR z1Ptr: bits32;
  920. VAR z2Ptr: bits32
  921. ); overload;
  922. Var
  923. z0, z1, z2: bits32;
  924. negCount : int8;
  925. Begin
  926. negCount := ( - count ) AND 31;
  927. if ( count = 0 ) then
  928. Begin
  929. z2 := a2;
  930. z1 := a1;
  931. z0 := a0;
  932. End
  933. else
  934. Begin
  935. if ( count < 32 ) Then
  936. Begin
  937. z2 := a1 shl negCount;
  938. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  939. z0 := a0 shr count;
  940. End
  941. else
  942. Begin
  943. if ( count = 32 ) then
  944. Begin
  945. z2 := a1;
  946. z1 := a0;
  947. End
  948. else
  949. Begin
  950. a2 := a2 or a1;
  951. if ( count < 64 ) then
  952. Begin
  953. z2 := a0 shl negCount;
  954. z1 := a0 shr ( count AND 31 );
  955. End
  956. else
  957. Begin
  958. if count = 64 then
  959. z2 := a0
  960. else
  961. z2 := bits32(a0 <> 0);
  962. z1 := 0;
  963. End;
  964. End;
  965. z0 := 0;
  966. End;
  967. z2 := z2 or bits32( a2 <> 0 );
  968. End;
  969. z2Ptr := z2;
  970. z1Ptr := z1;
  971. z0Ptr := z0;
  972. End;
  973. {*
  974. -------------------------------------------------------------------------------
  975. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  976. number of bits given in `count'. Any bits shifted off are lost. The value
  977. of `count' must be less than 32. The result is broken into two 32-bit
  978. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  979. -------------------------------------------------------------------------------
  980. *}
  981. Procedure
  982. shortShift64Left(
  983. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  984. Begin
  985. z1Ptr := a1 shl count;
  986. if count = 0 then
  987. z0Ptr := a0
  988. else
  989. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  990. End;
  991. {*
  992. -------------------------------------------------------------------------------
  993. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  994. by the number of bits given in `count'. Any bits shifted off are lost.
  995. The value of `count' must be less than 32. The result is broken into three
  996. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  997. `z1Ptr', and `z2Ptr'.
  998. -------------------------------------------------------------------------------
  999. *}
  1000. Procedure
  1001. shortShift96Left(
  1002. a0: bits32;
  1003. a1: bits32;
  1004. a2: bits32;
  1005. count: int16;
  1006. VAR z0Ptr: bits32;
  1007. VAR z1Ptr: bits32;
  1008. VAR z2Ptr: bits32
  1009. );
  1010. Var
  1011. z0, z1, z2: bits32;
  1012. negCount: int8;
  1013. Begin
  1014. z2 := a2 shl count;
  1015. z1 := a1 shl count;
  1016. z0 := a0 shl count;
  1017. if ( 0 < count ) then
  1018. Begin
  1019. negCount := ( ( - count ) AND 31 );
  1020. z1 := z1 or (a2 shr negCount);
  1021. z0 := z0 or (a1 shr negCount);
  1022. End;
  1023. z2Ptr := z2;
  1024. z1Ptr := z1;
  1025. z0Ptr := z0;
  1026. End;
  1027. {*----------------------------------------------------------------------------
  1028. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1029. | number of bits given in `count'. Any bits shifted off are lost. The value
  1030. | of `count' must be less than 64. The result is broken into two 64-bit
  1031. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1032. *----------------------------------------------------------------------------*}
  1033. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1034. begin
  1035. z1Ptr := a1 shl count;
  1036. if count=0 then
  1037. z0Ptr:=a0
  1038. else
  1039. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1040. end;
  1041. {*
  1042. -------------------------------------------------------------------------------
  1043. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1044. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1045. any carry out is lost. The result is broken into two 32-bit pieces which
  1046. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1047. -------------------------------------------------------------------------------
  1048. *}
  1049. Procedure
  1050. add64(
  1051. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1052. Var
  1053. z1: bits32;
  1054. Begin
  1055. z1 := a1 + b1;
  1056. z1Ptr := z1;
  1057. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1058. End;
  1059. {*
  1060. -------------------------------------------------------------------------------
  1061. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1062. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1063. modulo 2^96, so any carry out is lost. The result is broken into three
  1064. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1065. `z1Ptr', and `z2Ptr'.
  1066. -------------------------------------------------------------------------------
  1067. *}
  1068. Procedure
  1069. add96(
  1070. a0: bits32;
  1071. a1: bits32;
  1072. a2: bits32;
  1073. b0: bits32;
  1074. b1: bits32;
  1075. b2: bits32;
  1076. VAR z0Ptr: bits32;
  1077. VAR z1Ptr: bits32;
  1078. VAR z2Ptr: bits32
  1079. );
  1080. var
  1081. z0, z1, z2: bits32;
  1082. carry0, carry1: int8;
  1083. Begin
  1084. z2 := a2 + b2;
  1085. carry1 := int8( z2 < a2 );
  1086. z1 := a1 + b1;
  1087. carry0 := int8( z1 < a1 );
  1088. z0 := a0 + b0;
  1089. z1 := z1 + carry1;
  1090. z0 := z0 + bits32( z1 < carry1 );
  1091. z0 := z0 + carry0;
  1092. z2Ptr := z2;
  1093. z1Ptr := z1;
  1094. z0Ptr := z0;
  1095. End;
  1096. {*----------------------------------------------------------------------------
  1097. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1098. | by the number of bits given in `count'. Any bits shifted off are lost.
  1099. | The value of `count' must be less than 64. The result is broken into three
  1100. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1101. | `z1Ptr', and `z2Ptr'.
  1102. *----------------------------------------------------------------------------*}
  1103. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1104. var
  1105. z0, z1, z2 : bits64;
  1106. negCount : int8;
  1107. begin
  1108. z2 := a2 shl count;
  1109. z1 := a1 shl count;
  1110. z0 := a0 shl count;
  1111. if ( 0 < count ) then
  1112. begin
  1113. negCount := ( ( - count ) and 63 );
  1114. z1 := z1 or (a2 shr negCount);
  1115. z0 := z0 or (a1 shr negCount);
  1116. end;
  1117. z2Ptr := z2;
  1118. z1Ptr := z1;
  1119. z0Ptr := z0;
  1120. end;
  1121. {*----------------------------------------------------------------------------
  1122. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1123. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1124. | any carry out is lost. The result is broken into two 64-bit pieces which
  1125. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1126. *----------------------------------------------------------------------------*}
  1127. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1128. var
  1129. z1 : bits64;
  1130. begin
  1131. z1 := a1 + b1;
  1132. z1Ptr := z1;
  1133. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1134. end;
  1135. {*----------------------------------------------------------------------------
  1136. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1137. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1138. | modulo 2^192, so any carry out is lost. The result is broken into three
  1139. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1140. | `z1Ptr', and `z2Ptr'.
  1141. *----------------------------------------------------------------------------*}
  1142. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1143. var
  1144. z0, z1, z2 : bits64;
  1145. carry0, carry1 : int8;
  1146. begin
  1147. z2 := a2 + b2;
  1148. carry1 := ord( z2 < a2 );
  1149. z1 := a1 + b1;
  1150. carry0 := ord( z1 < a1 );
  1151. z0 := a0 + b0;
  1152. inc(z1, carry1);
  1153. inc(z0, ord( z1 < carry1 ));
  1154. inc(z0, carry0);
  1155. z2Ptr := z2;
  1156. z1Ptr := z1;
  1157. z0Ptr := z0;
  1158. end;
  1159. {*
  1160. -------------------------------------------------------------------------------
  1161. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1162. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1163. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1164. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1165. `z1Ptr'.
  1166. -------------------------------------------------------------------------------
  1167. *}
  1168. Procedure
  1169. sub64(
  1170. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1171. Begin
  1172. z1Ptr := a1 - b1;
  1173. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1174. End;
  1175. {*
  1176. -------------------------------------------------------------------------------
  1177. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1178. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1179. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1180. into three 32-bit pieces which are stored at the locations pointed to by
  1181. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1182. -------------------------------------------------------------------------------
  1183. *}
  1184. Procedure
  1185. sub96(
  1186. a0:bits32;
  1187. a1:bits32;
  1188. a2:bits32;
  1189. b0:bits32;
  1190. b1:bits32;
  1191. b2:bits32;
  1192. VAR z0Ptr:bits32;
  1193. VAR z1Ptr:bits32;
  1194. VAR z2Ptr:bits32
  1195. );
  1196. Var
  1197. z0, z1, z2: bits32;
  1198. borrow0, borrow1: int8;
  1199. Begin
  1200. z2 := a2 - b2;
  1201. borrow1 := int8( a2 < b2 );
  1202. z1 := a1 - b1;
  1203. borrow0 := int8( a1 < b1 );
  1204. z0 := a0 - b0;
  1205. z0 := z0 - bits32( z1 < borrow1 );
  1206. z1 := z1 - borrow1;
  1207. z0 := z0 -borrow0;
  1208. z2Ptr := z2;
  1209. z1Ptr := z1;
  1210. z0Ptr := z0;
  1211. End;
  1212. {*----------------------------------------------------------------------------
  1213. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1214. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1215. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1216. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1217. | `z1Ptr'.
  1218. *----------------------------------------------------------------------------*}
  1219. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1220. begin
  1221. z1Ptr := a1 - b1;
  1222. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1223. end;
  1224. {*----------------------------------------------------------------------------
  1225. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1226. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1227. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1228. | result is broken into three 64-bit pieces which are stored at the locations
  1229. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1230. *----------------------------------------------------------------------------*}
  1231. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1232. var
  1233. z0, z1, z2 : bits64;
  1234. borrow0, borrow1 : int8;
  1235. begin
  1236. z2 := a2 - b2;
  1237. borrow1 := ord( a2 < b2 );
  1238. z1 := a1 - b1;
  1239. borrow0 := ord( a1 < b1 );
  1240. z0 := a0 - b0;
  1241. dec(z0, ord( z1 < borrow1 ));
  1242. dec(z1, borrow1);
  1243. dec(z0, borrow0);
  1244. z2Ptr := z2;
  1245. z1Ptr := z1;
  1246. z0Ptr := z0;
  1247. end;
  1248. {*
  1249. -------------------------------------------------------------------------------
  1250. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1251. into two 32-bit pieces which are stored at the locations pointed to by
  1252. `z0Ptr' and `z1Ptr'.
  1253. -------------------------------------------------------------------------------
  1254. *}
  1255. {$IFDEF SOFTFPU_COMPILER_MUL32TO64}
  1256. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1257. var
  1258. tmp: qword;
  1259. begin
  1260. tmp:=qword(a) * b;
  1261. z0ptr:=hi(tmp);
  1262. z1ptr:=lo(tmp);
  1263. end;
  1264. {$ELSE}
  1265. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1266. :bits32 );
  1267. Var
  1268. aHigh, aLow, bHigh, bLow: bits16;
  1269. z0, zMiddleA, zMiddleB, z1: bits32;
  1270. Begin
  1271. aLow := bits16(a);
  1272. aHigh := a shr 16;
  1273. bLow := bits16(b);
  1274. bHigh := b shr 16;
  1275. z1 := ( bits32( aLow) ) * bLow;
  1276. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1277. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1278. z0 := ( bits32 (aHigh) ) * bHigh;
  1279. zMiddleA := zMiddleA + zMiddleB;
  1280. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1281. zMiddleA := zmiddleA shl 16;
  1282. z1 := z1 + zMiddleA;
  1283. z0 := z0 + bits32( z1 < zMiddleA );
  1284. z1Ptr := z1;
  1285. z0Ptr := z0;
  1286. End;
  1287. {$ENDIF}
  1288. {*
  1289. -------------------------------------------------------------------------------
  1290. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1291. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1292. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1293. `z2Ptr'.
  1294. -------------------------------------------------------------------------------
  1295. *}
  1296. Procedure
  1297. mul64By32To96(
  1298. a0:bits32;
  1299. a1:bits32;
  1300. b:bits32;
  1301. VAR z0Ptr:bits32;
  1302. VAR z1Ptr:bits32;
  1303. VAR z2Ptr:bits32
  1304. );
  1305. Var
  1306. z0, z1, z2, more1: bits32;
  1307. Begin
  1308. mul32To64( a1, b, z1, z2 );
  1309. mul32To64( a0, b, z0, more1 );
  1310. add64( z0, more1, 0, z1, z0, z1 );
  1311. z2Ptr := z2;
  1312. z1Ptr := z1;
  1313. z0Ptr := z0;
  1314. End;
  1315. {*
  1316. -------------------------------------------------------------------------------
  1317. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1318. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1319. product. The product is broken into four 32-bit pieces which are stored at
  1320. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1321. -------------------------------------------------------------------------------
  1322. *}
  1323. Procedure
  1324. mul64To128(
  1325. a0:bits32;
  1326. a1:bits32;
  1327. b0:bits32;
  1328. b1:bits32;
  1329. VAR z0Ptr:bits32;
  1330. VAR z1Ptr:bits32;
  1331. VAR z2Ptr:bits32;
  1332. VAR z3Ptr:bits32
  1333. );
  1334. Var
  1335. z0, z1, z2, z3: bits32;
  1336. more1, more2: bits32;
  1337. Begin
  1338. mul32To64( a1, b1, z2, z3 );
  1339. mul32To64( a1, b0, z1, more2 );
  1340. add64( z1, more2, 0, z2, z1, z2 );
  1341. mul32To64( a0, b0, z0, more1 );
  1342. add64( z0, more1, 0, z1, z0, z1 );
  1343. mul32To64( a0, b1, more1, more2 );
  1344. add64( more1, more2, 0, z2, more1, z2 );
  1345. add64( z0, z1, 0, more1, z0, z1 );
  1346. z3Ptr := z3;
  1347. z2Ptr := z2;
  1348. z1Ptr := z1;
  1349. z0Ptr := z0;
  1350. End;
  1351. {*----------------------------------------------------------------------------
  1352. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1353. | into two 64-bit pieces which are stored at the locations pointed to by
  1354. | `z0Ptr' and `z1Ptr'.
  1355. *----------------------------------------------------------------------------*}
  1356. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1357. var
  1358. aHigh, aLow, bHigh, bLow : bits32;
  1359. z0, zMiddleA, zMiddleB, z1 : bits64;
  1360. begin
  1361. aLow := a;
  1362. aHigh := a shr 32;
  1363. bLow := b;
  1364. bHigh := b shr 32;
  1365. z1 := ( bits64(aLow) ) * bLow;
  1366. zMiddleA := ( bits64( aLow )) * bHigh;
  1367. zMiddleB := ( bits64( aHigh )) * bLow;
  1368. z0 := ( bits64(aHigh) ) * bHigh;
  1369. inc(zMiddleA, zMiddleB);
  1370. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1371. zMiddleA := zMiddleA shl 32;
  1372. inc(z1, zMiddleA);
  1373. inc(z0, ord( z1 < zMiddleA ));
  1374. z1Ptr := z1;
  1375. z0Ptr := z0;
  1376. end;
  1377. {*----------------------------------------------------------------------------
  1378. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1379. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1380. | product. The product is broken into four 64-bit pieces which are stored at
  1381. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1382. *----------------------------------------------------------------------------*}
  1383. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1384. var
  1385. z0,z1,z2,z3,more1,more2 : bits64;
  1386. begin
  1387. mul64To128( a1, b1, z2, z3 );
  1388. mul64To128( a1, b0, z1, more2 );
  1389. add128( z1, more2, 0, z2, z1, z2 );
  1390. mul64To128( a0, b0, z0, more1 );
  1391. add128( z0, more1, 0, z1, z0, z1 );
  1392. mul64To128( a0, b1, more1, more2 );
  1393. add128( more1, more2, 0, z2, more1, z2 );
  1394. add128( z0, z1, 0, more1, z0, z1 );
  1395. z3Ptr := z3;
  1396. z2Ptr := z2;
  1397. z1Ptr := z1;
  1398. z0Ptr := z0;
  1399. end;
  1400. {*----------------------------------------------------------------------------
  1401. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1402. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1403. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1404. | `z2Ptr'.
  1405. *----------------------------------------------------------------------------*}
  1406. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1407. var
  1408. z0, z1, z2, more1 : bits64;
  1409. begin
  1410. mul64To128( a1, b, z1, z2 );
  1411. mul64To128( a0, b, z0, more1 );
  1412. add128( z0, more1, 0, z1, z0, z1 );
  1413. z2Ptr := z2;
  1414. z1Ptr := z1;
  1415. z0Ptr := z0;
  1416. end;
  1417. {*----------------------------------------------------------------------------
  1418. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1419. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1420. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1421. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1422. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1423. | unsigned integer is returned.
  1424. *----------------------------------------------------------------------------*}
  1425. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1426. var
  1427. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1428. begin
  1429. if ( b <= a0 ) then
  1430. begin
  1431. result:=qword( $FFFFFFFFFFFFFFFF );
  1432. exit;
  1433. end;
  1434. b0 := b shr 32;
  1435. if ( b0 shl 32 <= a0 ) then
  1436. z:=qword( $FFFFFFFF00000000 )
  1437. else
  1438. z:=( a0 div b0 ) shl 32;
  1439. mul64To128( b, z, term0, term1 );
  1440. sub128( a0, a1, term0, term1, rem0, rem1 );
  1441. while ( ( sbits64(rem0) ) < 0 ) do begin
  1442. dec(z,qword( $100000000 ));
  1443. b1 := b shl 32;
  1444. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1445. end;
  1446. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1447. if ( b0 shl 32 <= rem0 ) then
  1448. z:=z or $FFFFFFFF
  1449. else
  1450. z:=z or rem0 div b0;
  1451. result:=z;
  1452. end;
  1453. {*
  1454. -------------------------------------------------------------------------------
  1455. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1456. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1457. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1458. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1459. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1460. unsigned integer is returned.
  1461. -------------------------------------------------------------------------------
  1462. *}
  1463. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1464. Var
  1465. b0, b1: bits32;
  1466. rem0, rem1, term0, term1: bits32;
  1467. z: bits32;
  1468. Begin
  1469. if ( b <= a0 ) then
  1470. Begin
  1471. estimateDiv64To32 := $FFFFFFFF;
  1472. exit;
  1473. End;
  1474. b0 := b shr 16;
  1475. if ( b0 shl 16 <= a0 ) then
  1476. z:= $FFFF0000
  1477. else
  1478. z:= ( a0 div b0 ) shl 16;
  1479. mul32To64( b, z, term0, term1 );
  1480. sub64( a0, a1, term0, term1, rem0, rem1 );
  1481. while ( ( sbits32 (rem0) ) < 0 ) do
  1482. Begin
  1483. z := z - $10000;
  1484. b1 := b shl 16;
  1485. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1486. End;
  1487. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1488. if ( b0 shl 16 <= rem0 ) then
  1489. z := z or $FFFF
  1490. else
  1491. z := z or (rem0 div b0);
  1492. estimateDiv64To32 := z;
  1493. End;
  1494. {*
  1495. -------------------------------------------------------------------------------
  1496. Returns an approximation to the square root of the 32-bit significand given
  1497. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1498. `aExp' (the least significant bit) is 1, the integer returned approximates
  1499. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1500. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1501. case, the approximation returned lies strictly within +/-2 of the exact
  1502. value.
  1503. -------------------------------------------------------------------------------
  1504. *}
  1505. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1506. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1507. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1508. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1509. );
  1510. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1511. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1512. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1513. );
  1514. Var
  1515. index: int8;
  1516. z: bits32;
  1517. Begin
  1518. index := ( a shr 27 ) AND 15;
  1519. if ( aExp AND 1 ) <> 0 then
  1520. Begin
  1521. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1522. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1523. a := a shr 1;
  1524. End
  1525. else
  1526. Begin
  1527. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1528. z := a div z + z;
  1529. if ( $20000 <= z ) then
  1530. z := $FFFF8000
  1531. else
  1532. z := ( z shl 15 );
  1533. if ( z <= a ) then
  1534. Begin
  1535. estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
  1536. exit;
  1537. End;
  1538. End;
  1539. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1540. End;
  1541. {*
  1542. -------------------------------------------------------------------------------
  1543. Returns the number of leading 0 bits before the most-significant 1 bit of
  1544. `a'. If `a' is zero, 32 is returned.
  1545. -------------------------------------------------------------------------------
  1546. *}
  1547. Function countLeadingZeros32( a:bits32 ): int8;
  1548. const countLeadingZerosHigh:array[0..255] of int8 = (
  1549. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1550. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1551. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1552. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1553. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1554. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1555. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1556. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1557. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1558. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1559. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1560. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1561. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1562. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1563. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1564. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1565. );
  1566. Var
  1567. shiftCount: int8;
  1568. Begin
  1569. shiftCount := 0;
  1570. if ( a < $10000 ) then
  1571. Begin
  1572. shiftCount := shiftcount + 16;
  1573. a := a shl 16;
  1574. End;
  1575. if ( a < $1000000 ) then
  1576. Begin
  1577. shiftCount := shiftcount + 8;
  1578. a := a shl 8;
  1579. end;
  1580. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1581. countLeadingZeros32:= shiftCount;
  1582. End;
  1583. {*----------------------------------------------------------------------------
  1584. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1585. | `a'. If `a' is zero, 64 is returned.
  1586. *----------------------------------------------------------------------------*}
  1587. function countLeadingZeros64( a : bits64): int8;
  1588. var
  1589. shiftcount : int8;
  1590. Begin
  1591. shiftCount := 0;
  1592. if ( a < bits64(bits64(1) shl 32 )) then
  1593. shiftCount := shiftcount + 32
  1594. else
  1595. a := a shr 32;
  1596. shiftCount := shiftCount + countLeadingZeros32( a );
  1597. countLeadingZeros64:= shiftCount;
  1598. End;
  1599. {*
  1600. -------------------------------------------------------------------------------
  1601. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1602. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1603. Otherwise, returns 0.
  1604. -------------------------------------------------------------------------------
  1605. *}
  1606. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1607. Begin
  1608. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1609. End;
  1610. {*
  1611. -------------------------------------------------------------------------------
  1612. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1613. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1614. returns 0.
  1615. -------------------------------------------------------------------------------
  1616. *}
  1617. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1618. Begin
  1619. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1620. End;
  1621. const
  1622. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1623. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1624. (*****************************************************************************)
  1625. (* End Low-Level arithmetic *)
  1626. (*****************************************************************************)
  1627. {*----------------------------------------------------------------------------
  1628. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1629. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1630. | returns 0.
  1631. *----------------------------------------------------------------------------*}
  1632. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1633. begin
  1634. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1635. end;
  1636. {*
  1637. -------------------------------------------------------------------------------
  1638. Functions and definitions to determine: (1) whether tininess for underflow
  1639. is detected before or after rounding by default, (2) what (if anything)
  1640. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1641. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1642. are propagated from function inputs to output. These details are ENDIAN
  1643. specific
  1644. -------------------------------------------------------------------------------
  1645. *}
  1646. {$IFDEF ENDIAN_LITTLE}
  1647. {*
  1648. -------------------------------------------------------------------------------
  1649. Internal canonical NaN format.
  1650. -------------------------------------------------------------------------------
  1651. *}
  1652. TYPE
  1653. commonNaNT = record
  1654. high, low : bits32;
  1655. sign: flag;
  1656. end;
  1657. {*
  1658. -------------------------------------------------------------------------------
  1659. The pattern for a default generated single-precision NaN.
  1660. -------------------------------------------------------------------------------
  1661. *}
  1662. const float32_default_nan = $FFC00000;
  1663. {*
  1664. -------------------------------------------------------------------------------
  1665. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1666. otherwise returns 0.
  1667. -------------------------------------------------------------------------------
  1668. *}
  1669. Function float32_is_nan( a : float32 ): flag;
  1670. Begin
  1671. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1672. End;
  1673. {*
  1674. -------------------------------------------------------------------------------
  1675. Returns 1 if the single-precision floating-point value `a' is a signaling
  1676. NaN; otherwise returns 0.
  1677. -------------------------------------------------------------------------------
  1678. *}
  1679. Function float32_is_signaling_nan( a : float32 ): flag;
  1680. Begin
  1681. float32_is_signaling_nan := flag
  1682. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1683. End;
  1684. {*
  1685. -------------------------------------------------------------------------------
  1686. Returns the result of converting the single-precision floating-point NaN
  1687. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1688. exception is raised.
  1689. -------------------------------------------------------------------------------
  1690. *}
  1691. function float32ToCommonNaN(a: float32) : commonNaNT;
  1692. var
  1693. z : commonNaNT ;
  1694. Begin
  1695. if ( float32_is_signaling_nan( a ) <> 0) then
  1696. float_raise( float_flag_invalid );
  1697. z.sign := a shr 31;
  1698. z.low := 0;
  1699. z.high := a shl 9;
  1700. result := z;
  1701. End;
  1702. {*
  1703. -------------------------------------------------------------------------------
  1704. Returns the result of converting the canonical NaN `a' to the single-
  1705. precision floating-point format.
  1706. -------------------------------------------------------------------------------
  1707. *}
  1708. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1709. Begin
  1710. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1711. End;
  1712. {*
  1713. -------------------------------------------------------------------------------
  1714. Takes two single-precision floating-point values `a' and `b', one of which
  1715. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1716. signaling NaN, the invalid exception is raised.
  1717. -------------------------------------------------------------------------------
  1718. *}
  1719. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1720. Var
  1721. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1722. label returnLargerSignificand;
  1723. Begin
  1724. aIsNaN := float32_is_nan( a );
  1725. aIsSignalingNaN := float32_is_signaling_nan( a );
  1726. bIsNaN := float32_is_nan( b );
  1727. bIsSignalingNaN := float32_is_signaling_nan( b );
  1728. a := a or $00400000;
  1729. b := b or $00400000;
  1730. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1731. float_raise( float_flag_invalid );
  1732. if ( aIsSignalingNaN )<> 0 then
  1733. Begin
  1734. if ( bIsSignalingNaN ) <> 0 then
  1735. goto returnLargerSignificand;
  1736. if bIsNan <> 0 then
  1737. propagateFloat32NaN := b
  1738. else
  1739. propagateFloat32NaN := a;
  1740. exit;
  1741. End
  1742. else if ( aIsNaN <> 0) then
  1743. Begin
  1744. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1745. Begin
  1746. propagateFloat32NaN := a;
  1747. exit;
  1748. End;
  1749. returnLargerSignificand:
  1750. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1751. Begin
  1752. propagateFloat32NaN := b;
  1753. exit;
  1754. End;
  1755. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1756. Begin
  1757. propagateFloat32NaN := a;
  1758. End;
  1759. if a < b then
  1760. propagateFloat32NaN := a
  1761. else
  1762. propagateFloat32NaN := b;
  1763. exit;
  1764. End
  1765. else
  1766. Begin
  1767. propagateFloat32NaN := b;
  1768. exit;
  1769. End;
  1770. End;
  1771. {*
  1772. -------------------------------------------------------------------------------
  1773. The pattern for a default generated double-precision NaN. The `high' and
  1774. `low' values hold the most- and least-significant bits, respectively.
  1775. -------------------------------------------------------------------------------
  1776. *}
  1777. const
  1778. float64_default_nan_high = $FFF80000;
  1779. float64_default_nan_low = $00000000;
  1780. {*
  1781. -------------------------------------------------------------------------------
  1782. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1783. otherwise returns 0.
  1784. -------------------------------------------------------------------------------
  1785. *}
  1786. Function float64_is_nan( a : float64 ) : flag;
  1787. Begin
  1788. float64_is_nan :=
  1789. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1790. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1791. End;
  1792. {*
  1793. -------------------------------------------------------------------------------
  1794. Returns 1 if the double-precision floating-point value `a' is a signaling
  1795. NaN; otherwise returns 0.
  1796. -------------------------------------------------------------------------------
  1797. *}
  1798. Function float64_is_signaling_nan( a : float64 ): flag;
  1799. Begin
  1800. float64_is_signaling_nan :=
  1801. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1802. and ( a.low or ( a.high and $0007FFFF ) );
  1803. End;
  1804. {*
  1805. -------------------------------------------------------------------------------
  1806. Returns the result of converting the double-precision floating-point NaN
  1807. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1808. exception is raised.
  1809. -------------------------------------------------------------------------------
  1810. *}
  1811. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1812. Var
  1813. z : commonNaNT;
  1814. Begin
  1815. if ( float64_is_signaling_nan( a )<>0 ) then
  1816. float_raise( float_flag_invalid );
  1817. z.sign := a.high shr 31;
  1818. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1819. result := z;
  1820. End;
  1821. {*
  1822. -------------------------------------------------------------------------------
  1823. Returns the result of converting the canonical NaN `a' to the double-
  1824. precision floating-point format.
  1825. -------------------------------------------------------------------------------
  1826. *}
  1827. function commonNaNToFloat64( a : commonNaNT) : float64;
  1828. Var
  1829. z: float64;
  1830. Begin
  1831. shift64Right( a.high, a.low, 12, z.high, z.low );
  1832. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1833. result := z;
  1834. End;
  1835. {*
  1836. -------------------------------------------------------------------------------
  1837. Takes two double-precision floating-point values `a' and `b', one of which
  1838. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1839. signaling NaN, the invalid exception is raised.
  1840. -------------------------------------------------------------------------------
  1841. *}
  1842. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1843. Var
  1844. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1845. label returnLargerSignificand;
  1846. Begin
  1847. aIsNaN := float64_is_nan( a );
  1848. aIsSignalingNaN := float64_is_signaling_nan( a );
  1849. bIsNaN := float64_is_nan( b );
  1850. bIsSignalingNaN := float64_is_signaling_nan( b );
  1851. a.high := a.high or $00080000;
  1852. b.high := b.high or $00080000;
  1853. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1854. float_raise( float_flag_invalid );
  1855. if ( aIsSignalingNaN )<>0 then
  1856. Begin
  1857. if ( bIsSignalingNaN )<>0 then
  1858. goto returnLargerSignificand;
  1859. if bIsNan <> 0 then
  1860. c := b
  1861. else
  1862. c := a;
  1863. exit;
  1864. End
  1865. else if ( aIsNaN )<> 0 then
  1866. Begin
  1867. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1868. Begin
  1869. c := a;
  1870. exit;
  1871. End;
  1872. returnLargerSignificand:
  1873. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1874. Begin
  1875. c := b;
  1876. exit;
  1877. End;
  1878. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1879. Begin
  1880. c := a;
  1881. exit;
  1882. End;
  1883. if a.high < b.high then
  1884. c := a
  1885. else
  1886. c := b;
  1887. exit;
  1888. End
  1889. else
  1890. Begin
  1891. c := b;
  1892. exit;
  1893. End;
  1894. End;
  1895. {*----------------------------------------------------------------------------
  1896. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1897. | otherwise returns 0.
  1898. *----------------------------------------------------------------------------*}
  1899. function float128_is_nan( a : float128): flag;
  1900. begin
  1901. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1902. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1903. end;
  1904. {*----------------------------------------------------------------------------
  1905. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1906. | signaling NaN; otherwise returns 0.
  1907. *----------------------------------------------------------------------------*}
  1908. function float128_is_signaling_nan( a : float128): flag;
  1909. begin
  1910. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1911. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1912. end;
  1913. {*----------------------------------------------------------------------------
  1914. | Returns the result of converting the quadruple-precision floating-point NaN
  1915. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1916. | exception is raised.
  1917. *----------------------------------------------------------------------------*}
  1918. function float128ToCommonNaN( a : float128): commonNaNT;
  1919. var
  1920. z: commonNaNT;
  1921. qhigh,qlow : qword;
  1922. begin
  1923. if ( float128_is_signaling_nan( a )<>0) then
  1924. float_raise( float_flag_invalid );
  1925. z.sign := a.high shr 63;
  1926. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1927. z.high:=qhigh shr 32;
  1928. z.low:=qhigh and $ffffffff;
  1929. result:=z;
  1930. end;
  1931. {*----------------------------------------------------------------------------
  1932. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1933. | precision floating-point format.
  1934. *----------------------------------------------------------------------------*}
  1935. function commonNaNToFloat128( a : commonNaNT): float128;
  1936. var
  1937. z: float128;
  1938. begin
  1939. shift128Right( a.high, a.low, 16, z.high, z.low );
  1940. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1941. result:=z;
  1942. end;
  1943. {*----------------------------------------------------------------------------
  1944. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1945. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1946. | `b' is a signaling NaN, the invalid exception is raised.
  1947. *----------------------------------------------------------------------------*}
  1948. function propagateFloat128NaN( a: float128; b : float128): float128;
  1949. var
  1950. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1951. label
  1952. returnLargerSignificand;
  1953. begin
  1954. aIsNaN := float128_is_nan( a );
  1955. aIsSignalingNaN := float128_is_signaling_nan( a );
  1956. bIsNaN := float128_is_nan( b );
  1957. bIsSignalingNaN := float128_is_signaling_nan( b );
  1958. a.high := a.high or int64( $0000800000000000 );
  1959. b.high := b.high or int64( $0000800000000000 );
  1960. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1961. float_raise( float_flag_invalid );
  1962. if ( aIsSignalingNaN )<>0 then
  1963. begin
  1964. if ( bIsSignalingNaN )<>0 then
  1965. goto returnLargerSignificand;
  1966. if bIsNaN<>0 then
  1967. result := b
  1968. else
  1969. result := a;
  1970. exit;
  1971. end
  1972. else if ( aIsNaN )<>0 then
  1973. begin
  1974. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1975. begin
  1976. result := a;
  1977. exit;
  1978. end;
  1979. returnLargerSignificand:
  1980. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1981. begin
  1982. result := b;
  1983. exit;
  1984. end;
  1985. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1986. begin
  1987. result := a;
  1988. exit
  1989. end;
  1990. if ( a.high < b.high ) then
  1991. result := a
  1992. else
  1993. result := b;
  1994. exit;
  1995. end
  1996. else
  1997. result:=b;
  1998. end;
  1999. {$ELSE}
  2000. { Big endian code }
  2001. (*----------------------------------------------------------------------------
  2002. | Internal canonical NaN format.
  2003. *----------------------------------------------------------------------------*)
  2004. type
  2005. commonNANT = record
  2006. high, low : bits32;
  2007. sign : flag;
  2008. end;
  2009. (*----------------------------------------------------------------------------
  2010. | The pattern for a default generated single-precision NaN.
  2011. *----------------------------------------------------------------------------*)
  2012. const float32_default_nan = $7FFFFFFF;
  2013. (*----------------------------------------------------------------------------
  2014. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  2015. | otherwise returns 0.
  2016. *----------------------------------------------------------------------------*)
  2017. function float32_is_nan(a: float32): flag;
  2018. begin
  2019. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2020. end;
  2021. (*----------------------------------------------------------------------------
  2022. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2023. | NaN; otherwise returns 0.
  2024. *----------------------------------------------------------------------------*)
  2025. function float32_is_signaling_nan(a: float32):flag;
  2026. begin
  2027. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2028. end;
  2029. (*----------------------------------------------------------------------------
  2030. | Returns the result of converting the single-precision floating-point NaN
  2031. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2032. | exception is raised.
  2033. *----------------------------------------------------------------------------*)
  2034. function float32ToCommonNaN( a: float32) : commonNaNT;
  2035. var
  2036. z: commonNANT;
  2037. begin
  2038. if float32_is_signaling_nan(a)<>0 then
  2039. float_raise(float_flag_invalid);
  2040. z.sign := a shr 31;
  2041. z.low := 0;
  2042. z.high := a shl 9;
  2043. result:=z;
  2044. end;
  2045. (*----------------------------------------------------------------------------
  2046. | Returns the result of converting the canonical NaN `a' to the single-
  2047. | precision floating-point format.
  2048. *----------------------------------------------------------------------------*)
  2049. function CommonNanToFloat32(a : CommonNaNT): float32;
  2050. begin
  2051. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2052. end;
  2053. (*----------------------------------------------------------------------------
  2054. | Takes two single-precision floating-point values `a' and `b', one of which
  2055. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2056. | signaling NaN, the invalid exception is raised.
  2057. *----------------------------------------------------------------------------*)
  2058. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2059. var
  2060. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2061. begin
  2062. aIsNaN := float32_is_nan( a );
  2063. aIsSignalingNaN := float32_is_signaling_nan( a );
  2064. bIsNaN := float32_is_nan( b );
  2065. bIsSignalingNaN := float32_is_signaling_nan( b );
  2066. a := a or $00400000;
  2067. b := b or $00400000;
  2068. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2069. float_raise( float_flag_invalid );
  2070. if bIsSignalingNaN<>0 then
  2071. propagateFloat32Nan := b
  2072. else if aIsSignalingNan<>0 then
  2073. propagateFloat32Nan := a
  2074. else if bIsNan<>0 then
  2075. propagateFloat32Nan := b
  2076. else
  2077. propagateFloat32Nan := a;
  2078. end;
  2079. (*----------------------------------------------------------------------------
  2080. | The pattern for a default generated double-precision NaN. The `high' and
  2081. | `low' values hold the most- and least-significant bits, respectively.
  2082. *----------------------------------------------------------------------------*)
  2083. const
  2084. float64_default_nan_high = $7FFFFFFF;
  2085. float64_default_nan_low = $FFFFFFFF;
  2086. (*----------------------------------------------------------------------------
  2087. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2088. | otherwise returns 0.
  2089. *----------------------------------------------------------------------------*)
  2090. function float64_is_nan(a: float64): flag;
  2091. begin
  2092. float64_is_nan := flag (
  2093. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2094. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2095. end;
  2096. (*----------------------------------------------------------------------------
  2097. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2098. | NaN; otherwise returns 0.
  2099. *----------------------------------------------------------------------------*)
  2100. function float64_is_signaling_nan( a:float64): flag;
  2101. begin
  2102. float64_is_signaling_nan := flag(
  2103. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2104. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2105. end;
  2106. (*----------------------------------------------------------------------------
  2107. | Returns the result of converting the double-precision floating-point NaN
  2108. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2109. | exception is raised.
  2110. *----------------------------------------------------------------------------*)
  2111. function float64ToCommonNaN( a : float64) : commonNaNT;
  2112. var
  2113. z : commonNaNT;
  2114. begin
  2115. if ( float64_is_signaling_nan( a )<>0 ) then
  2116. float_raise( float_flag_invalid );
  2117. z.sign := a.high shr 31;
  2118. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2119. result:=z;
  2120. end;
  2121. (*----------------------------------------------------------------------------
  2122. | Returns the result of converting the canonical NaN `a' to the double-
  2123. | precision floating-point format.
  2124. *----------------------------------------------------------------------------*)
  2125. function commonNaNToFloat64( a : commonNaNT): float64;
  2126. var
  2127. z: float64;
  2128. begin
  2129. shift64Right( a.high, a.low, 12, z.high, z.low );
  2130. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2131. result:=z;
  2132. end;
  2133. (*----------------------------------------------------------------------------
  2134. | Takes two double-precision floating-point values `a' and `b', one of which
  2135. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2136. | signaling NaN, the invalid exception is raised.
  2137. *----------------------------------------------------------------------------*)
  2138. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2139. var
  2140. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2141. begin
  2142. aIsNaN := float64_is_nan( a );
  2143. aIsSignalingNaN := float64_is_signaling_nan( a );
  2144. bIsNaN := float64_is_nan( b );
  2145. bIsSignalingNaN := float64_is_signaling_nan( b );
  2146. a.high := a.high or $00080000;
  2147. b.high := b.high or $00080000;
  2148. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2149. float_raise( float_flag_invalid );
  2150. if bIsSignalingNaN<>0 then
  2151. c := b
  2152. else if aIsSignalingNan<>0 then
  2153. c := a
  2154. else if bIsNan<>0 then
  2155. c := b
  2156. else
  2157. c := a;
  2158. end;
  2159. {*----------------------------------------------------------------------------
  2160. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  2161. | otherwise returns 0.
  2162. *----------------------------------------------------------------------------*}
  2163. function float128_is_nan( a : float128): flag;
  2164. begin
  2165. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  2166. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  2167. end;
  2168. {*----------------------------------------------------------------------------
  2169. | Returns 1 if the quadruple-precision floating-point value `a' is a
  2170. | signaling NaN; otherwise returns 0.
  2171. *----------------------------------------------------------------------------*}
  2172. function float128_is_signaling_nan( a : float128): flag;
  2173. begin
  2174. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  2175. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  2176. end;
  2177. {*----------------------------------------------------------------------------
  2178. | Returns the result of converting the quadruple-precision floating-point NaN
  2179. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2180. | exception is raised.
  2181. *----------------------------------------------------------------------------*}
  2182. function float128ToCommonNaN( a : float128): commonNaNT;
  2183. var
  2184. z: commonNaNT;
  2185. qhigh,qlow : qword;
  2186. begin
  2187. if ( float128_is_signaling_nan( a )<>0) then
  2188. float_raise( float_flag_invalid );
  2189. z.sign := a.high shr 63;
  2190. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  2191. z.high:=qhigh shr 32;
  2192. z.low:=qhigh and $ffffffff;
  2193. result:=z;
  2194. end;
  2195. {*----------------------------------------------------------------------------
  2196. | Returns the result of converting the canonical NaN `a' to the quadruple-
  2197. | precision floating-point format.
  2198. *----------------------------------------------------------------------------*}
  2199. function commonNaNToFloat128( a : commonNaNT): float128;
  2200. var
  2201. z: float128;
  2202. begin
  2203. shift128Right( a.high, a.low, 16, z.high, z.low );
  2204. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  2205. result:=z;
  2206. end;
  2207. {*----------------------------------------------------------------------------
  2208. | Takes two quadruple-precision floating-point values `a' and `b', one of
  2209. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  2210. | `b' is a signaling NaN, the invalid exception is raised.
  2211. *----------------------------------------------------------------------------*}
  2212. function propagateFloat128NaN( a: float128; b : float128): float128;
  2213. var
  2214. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2215. label
  2216. returnLargerSignificand;
  2217. begin
  2218. aIsNaN := float128_is_nan( a );
  2219. aIsSignalingNaN := float128_is_signaling_nan( a );
  2220. bIsNaN := float128_is_nan( b );
  2221. bIsSignalingNaN := float128_is_signaling_nan( b );
  2222. a.high := a.high or int64( $0000800000000000 );
  2223. b.high := b.high or int64( $0000800000000000 );
  2224. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2225. float_raise( float_flag_invalid );
  2226. if ( aIsSignalingNaN )<>0 then
  2227. begin
  2228. if ( bIsSignalingNaN )<>0 then
  2229. goto returnLargerSignificand;
  2230. if bIsNaN<>0 then
  2231. result := b
  2232. else
  2233. result := a;
  2234. exit;
  2235. end
  2236. else if ( aIsNaN )<>0 then
  2237. begin
  2238. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  2239. begin
  2240. result := a;
  2241. exit;
  2242. end;
  2243. returnLargerSignificand:
  2244. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  2245. begin
  2246. result := b;
  2247. exit;
  2248. end;
  2249. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  2250. begin
  2251. result := a;
  2252. exit
  2253. end;
  2254. if ( a.high < b.high ) then
  2255. result := a
  2256. else
  2257. result := b;
  2258. exit;
  2259. end
  2260. else
  2261. result:=b;
  2262. end;
  2263. {$ENDIF}
  2264. (****************************************************************************)
  2265. (* END ENDIAN SPECIFIC CODE *)
  2266. (****************************************************************************)
  2267. {*
  2268. -------------------------------------------------------------------------------
  2269. Returns the fraction bits of the single-precision floating-point value `a'.
  2270. -------------------------------------------------------------------------------
  2271. *}
  2272. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2273. Begin
  2274. ExtractFloat32Frac := A AND $007FFFFF;
  2275. End;
  2276. {*
  2277. -------------------------------------------------------------------------------
  2278. Returns the exponent bits of the single-precision floating-point value `a'.
  2279. -------------------------------------------------------------------------------
  2280. *}
  2281. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2282. Begin
  2283. extractFloat32Exp := (a shr 23) AND $FF;
  2284. End;
  2285. {*
  2286. -------------------------------------------------------------------------------
  2287. Returns the sign bit of the single-precision floating-point value `a'.
  2288. -------------------------------------------------------------------------------
  2289. *}
  2290. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2291. Begin
  2292. extractFloat32Sign := a shr 31;
  2293. End;
  2294. {*
  2295. -------------------------------------------------------------------------------
  2296. Normalizes the subnormal single-precision floating-point value represented
  2297. by the denormalized significand `aSig'. The normalized exponent and
  2298. significand are stored at the locations pointed to by `zExpPtr' and
  2299. `zSigPtr', respectively.
  2300. -------------------------------------------------------------------------------
  2301. *}
  2302. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2303. Var
  2304. ShiftCount : BYTE;
  2305. Begin
  2306. shiftCount := countLeadingZeros32( aSig ) - 8;
  2307. zSigPtr := aSig shl shiftCount;
  2308. zExpPtr := 1 - shiftCount;
  2309. End;
  2310. {*
  2311. -------------------------------------------------------------------------------
  2312. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2313. single-precision floating-point value, returning the result. After being
  2314. shifted into the proper positions, the three fields are simply added
  2315. together to form the result. This means that any integer portion of `zSig'
  2316. will be added into the exponent. Since a properly normalized significand
  2317. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2318. than the desired result exponent whenever `zSig' is a complete, normalized
  2319. significand.
  2320. -------------------------------------------------------------------------------
  2321. *}
  2322. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2323. Begin
  2324. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2325. + zSig;
  2326. End;
  2327. {*
  2328. -------------------------------------------------------------------------------
  2329. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2330. and significand `zSig', and returns the proper single-precision floating-
  2331. point value corresponding to the abstract input. Ordinarily, the abstract
  2332. value is simply rounded and packed into the single-precision format, with
  2333. the inexact exception raised if the abstract input cannot be represented
  2334. exactly. However, if the abstract value is too large, the overflow and
  2335. inexact exceptions are raised and an infinity or maximal finite value is
  2336. returned. If the abstract value is too small, the input value is rounded to
  2337. a subnormal number, and the underflow and inexact exceptions are raised if
  2338. the abstract input cannot be represented exactly as a subnormal single-
  2339. precision floating-point number.
  2340. The input significand `zSig' has its binary point between bits 30
  2341. and 29, which is 7 bits to the left of the usual location. This shifted
  2342. significand must be normalized or smaller. If `zSig' is not normalized,
  2343. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2344. and it must not require rounding. In the usual case that `zSig' is
  2345. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2346. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2347. Binary Floating-Point Arithmetic.
  2348. -------------------------------------------------------------------------------
  2349. *}
  2350. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2351. Var
  2352. roundingMode : TFPURoundingMode;
  2353. roundNearestEven : boolean;
  2354. roundIncrement, roundBits : BYTE;
  2355. IsTiny : boolean;
  2356. Begin
  2357. roundingMode := softfloat_rounding_mode;
  2358. roundNearestEven := (roundingMode = float_round_nearest_even);
  2359. roundIncrement := $40;
  2360. if not roundNearestEven then
  2361. Begin
  2362. if ( roundingMode = float_round_to_zero ) Then
  2363. Begin
  2364. roundIncrement := 0;
  2365. End
  2366. else
  2367. Begin
  2368. roundIncrement := $7F;
  2369. if ( zSign <> 0 ) then
  2370. Begin
  2371. if roundingMode = float_round_up then roundIncrement := 0;
  2372. End
  2373. else
  2374. Begin
  2375. if roundingMode = float_round_down then roundIncrement := 0;
  2376. End;
  2377. End
  2378. End;
  2379. roundBits := zSig AND $7F;
  2380. if ($FD <= bits16 (zExp) ) then
  2381. Begin
  2382. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2383. Begin
  2384. float_raise( [float_flag_overflow,float_flag_inexact] );
  2385. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2386. exit;
  2387. End;
  2388. if ( zExp < 0 ) then
  2389. Begin
  2390. isTiny :=
  2391. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2392. OR ( zExp < -1 )
  2393. OR ( (zSig + roundIncrement) < $80000000 );
  2394. shift32RightJamming( zSig, - zExp, zSig );
  2395. zExp := 0;
  2396. roundBits := zSig AND $7F;
  2397. if ( isTiny and (roundBits<>0) ) then
  2398. float_raise( float_flag_underflow );
  2399. End;
  2400. End;
  2401. if ( roundBits )<> 0 then
  2402. set_inexact_flag;
  2403. zSig := ( zSig + roundIncrement ) shr 7;
  2404. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2405. if ( zSig = 0 ) then zExp := 0;
  2406. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2407. End;
  2408. {*
  2409. -------------------------------------------------------------------------------
  2410. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2411. and significand `zSig', and returns the proper single-precision floating-
  2412. point value corresponding to the abstract input. This routine is just like
  2413. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2414. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2415. floating-point exponent.
  2416. -------------------------------------------------------------------------------
  2417. *}
  2418. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2419. Var
  2420. ShiftCount : int8;
  2421. Begin
  2422. shiftCount := countLeadingZeros32( zSig ) - 1;
  2423. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2424. End;
  2425. {*
  2426. -------------------------------------------------------------------------------
  2427. Returns the most-significant 20 fraction bits of the double-precision
  2428. floating-point value `a'.
  2429. -------------------------------------------------------------------------------
  2430. *}
  2431. Function extractFloat64Frac0(a: float64): bits32; inline;
  2432. Begin
  2433. extractFloat64Frac0 := a.high and $000FFFFF;
  2434. End;
  2435. {*
  2436. -------------------------------------------------------------------------------
  2437. Returns the least-significant 32 fraction bits of the double-precision
  2438. floating-point value `a'.
  2439. -------------------------------------------------------------------------------
  2440. *}
  2441. Function extractFloat64Frac1(a: float64): bits32; inline;
  2442. Begin
  2443. extractFloat64Frac1 := a.low;
  2444. End;
  2445. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2446. Function extractFloat64Frac(a: float64): bits64; inline;
  2447. Begin
  2448. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2449. End;
  2450. {*
  2451. -------------------------------------------------------------------------------
  2452. Returns the exponent bits of the double-precision floating-point value `a'.
  2453. -------------------------------------------------------------------------------
  2454. *}
  2455. Function extractFloat64Exp(a: float64): int16; inline;
  2456. Begin
  2457. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2458. End;
  2459. {*
  2460. -------------------------------------------------------------------------------
  2461. Returns the sign bit of the double-precision floating-point value `a'.
  2462. -------------------------------------------------------------------------------
  2463. *}
  2464. Function extractFloat64Sign(a: float64) : flag; inline;
  2465. Begin
  2466. extractFloat64Sign := a.high shr 31;
  2467. End;
  2468. {*
  2469. -------------------------------------------------------------------------------
  2470. Normalizes the subnormal double-precision floating-point value represented
  2471. by the denormalized significand formed by the concatenation of `aSig0' and
  2472. `aSig1'. The normalized exponent is stored at the location pointed to by
  2473. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2474. stored at the location pointed to by `zSig0Ptr', and the least significant
  2475. 32 bits of the normalized significand are stored at the location pointed to
  2476. by `zSig1Ptr'.
  2477. -------------------------------------------------------------------------------
  2478. *}
  2479. Procedure normalizeFloat64Subnormal(
  2480. aSig0: bits32;
  2481. aSig1: bits32;
  2482. VAR zExpPtr : Int16;
  2483. VAR zSig0Ptr : Bits32;
  2484. VAR zSig1Ptr : Bits32
  2485. );
  2486. Var
  2487. ShiftCount : Int8;
  2488. Begin
  2489. if ( aSig0 = 0 ) then
  2490. Begin
  2491. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2492. if ( shiftCount < 0 ) then
  2493. Begin
  2494. zSig0Ptr := aSig1 shr ( - shiftCount );
  2495. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2496. End
  2497. else
  2498. Begin
  2499. zSig0Ptr := aSig1 shl shiftCount;
  2500. zSig1Ptr := 0;
  2501. End;
  2502. zExpPtr := - shiftCount - 31;
  2503. End
  2504. else
  2505. Begin
  2506. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2507. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2508. zExpPtr := 1 - shiftCount;
  2509. End;
  2510. End;
  2511. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2512. var
  2513. shiftCount : int8;
  2514. begin
  2515. shiftCount := countLeadingZeros64( aSig ) - 11;
  2516. zSigPtr := aSig shl shiftCount;
  2517. zExpPtr := 1 - shiftCount;
  2518. end;
  2519. {*
  2520. -------------------------------------------------------------------------------
  2521. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2522. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2523. point value, returning the result. After being shifted into the proper
  2524. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2525. together to form the most significant 32 bits of the result. This means
  2526. that any integer portion of `zSig0' will be added into the exponent. Since
  2527. a properly normalized significand will have an integer portion equal to 1,
  2528. the `zExp' input should be 1 less than the desired result exponent whenever
  2529. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2530. -------------------------------------------------------------------------------
  2531. *}
  2532. Procedure
  2533. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2534. var
  2535. z: Float64;
  2536. Begin
  2537. z.low := zSig1;
  2538. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2539. c := z;
  2540. End;
  2541. {*----------------------------------------------------------------------------
  2542. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2543. | double-precision floating-point value, returning the result. After being
  2544. | shifted into the proper positions, the three fields are simply added
  2545. | together to form the result. This means that any integer portion of `zSig'
  2546. | will be added into the exponent. Since a properly normalized significand
  2547. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2548. | than the desired result exponent whenever `zSig' is a complete, normalized
  2549. | significand.
  2550. *----------------------------------------------------------------------------*}
  2551. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2552. begin
  2553. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2554. end;
  2555. {*
  2556. -------------------------------------------------------------------------------
  2557. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2558. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2559. and `zSig2', and returns the proper double-precision floating-point value
  2560. corresponding to the abstract input. Ordinarily, the abstract value is
  2561. simply rounded and packed into the double-precision format, with the inexact
  2562. exception raised if the abstract input cannot be represented exactly.
  2563. However, if the abstract value is too large, the overflow and inexact
  2564. exceptions are raised and an infinity or maximal finite value is returned.
  2565. If the abstract value is too small, the input value is rounded to a
  2566. subnormal number, and the underflow and inexact exceptions are raised if the
  2567. abstract input cannot be represented exactly as a subnormal double-precision
  2568. floating-point number.
  2569. The input significand must be normalized or smaller. If the input
  2570. significand is not normalized, `zExp' must be 0; in that case, the result
  2571. returned is a subnormal number, and it must not require rounding. In the
  2572. usual case that the input significand is normalized, `zExp' must be 1 less
  2573. than the ``true'' floating-point exponent. The handling of underflow and
  2574. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2575. -------------------------------------------------------------------------------
  2576. *}
  2577. Procedure
  2578. roundAndPackFloat64(
  2579. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2580. Var
  2581. roundingMode : TFPURoundingMode;
  2582. roundNearestEven, increment, isTiny : Flag;
  2583. Begin
  2584. roundingMode := softfloat_rounding_mode;
  2585. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2586. increment := flag( sbits32 (zSig2) < 0 );
  2587. if ( roundNearestEven = flag(FALSE) ) then
  2588. Begin
  2589. if ( roundingMode = float_round_to_zero ) then
  2590. increment := 0
  2591. else
  2592. Begin
  2593. if ( zSign )<> 0 then
  2594. Begin
  2595. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2596. End
  2597. else
  2598. Begin
  2599. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2600. End
  2601. End
  2602. End;
  2603. if ( $7FD <= bits16 (zExp) ) then
  2604. Begin
  2605. if (( $7FD < zExp )
  2606. or (( zExp = $7FD )
  2607. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2608. and (increment<>0)
  2609. )
  2610. ) then
  2611. Begin
  2612. float_raise( [float_flag_overflow,float_flag_inexact] );
  2613. if (( roundingMode = float_round_to_zero )
  2614. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2615. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2616. ) then
  2617. Begin
  2618. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2619. exit;
  2620. End;
  2621. packFloat64( zSign, $7FF, 0, 0, c );
  2622. exit;
  2623. End;
  2624. if ( zExp < 0 ) then
  2625. Begin
  2626. isTiny :=
  2627. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2628. or flag( zExp < -1 )
  2629. or flag(increment = 0)
  2630. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2631. shift64ExtraRightJamming(
  2632. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2633. zExp := 0;
  2634. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2635. if ( roundNearestEven )<>0 then
  2636. Begin
  2637. increment := flag( sbits32 (zSig2) < 0 );
  2638. End
  2639. else
  2640. Begin
  2641. if ( zSign )<>0 then
  2642. Begin
  2643. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2644. End
  2645. else
  2646. Begin
  2647. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2648. End
  2649. End;
  2650. End;
  2651. End;
  2652. if ( zSig2 )<>0 then
  2653. set_inexact_flag;
  2654. if ( increment )<>0 then
  2655. Begin
  2656. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2657. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2658. End
  2659. else
  2660. Begin
  2661. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2662. End;
  2663. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2664. End;
  2665. {*----------------------------------------------------------------------------
  2666. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2667. | and significand `zSig', and returns the proper double-precision floating-
  2668. | point value corresponding to the abstract input. Ordinarily, the abstract
  2669. | value is simply rounded and packed into the double-precision format, with
  2670. | the inexact exception raised if the abstract input cannot be represented
  2671. | exactly. However, if the abstract value is too large, the overflow and
  2672. | inexact exceptions are raised and an infinity or maximal finite value is
  2673. | returned. If the abstract value is too small, the input value is rounded
  2674. | to a subnormal number, and the underflow and inexact exceptions are raised
  2675. | if the abstract input cannot be represented exactly as a subnormal double-
  2676. | precision floating-point number.
  2677. | The input significand `zSig' has its binary point between bits 62
  2678. | and 61, which is 10 bits to the left of the usual location. This shifted
  2679. | significand must be normalized or smaller. If `zSig' is not normalized,
  2680. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2681. | and it must not require rounding. In the usual case that `zSig' is
  2682. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2683. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2684. | Binary Floating-Point Arithmetic.
  2685. *----------------------------------------------------------------------------*}
  2686. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2687. var
  2688. roundingMode: TFPURoundingMode;
  2689. roundNearestEven: flag;
  2690. roundIncrement, roundBits: int16;
  2691. isTiny: flag;
  2692. begin
  2693. roundingMode := softfloat_rounding_mode;
  2694. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2695. roundIncrement := $200;
  2696. if ( roundNearestEven=0 ) then
  2697. begin
  2698. if ( roundingMode = float_round_to_zero ) then
  2699. begin
  2700. roundIncrement := 0;
  2701. end
  2702. else begin
  2703. roundIncrement := $3FF;
  2704. if ( zSign<>0 ) then
  2705. begin
  2706. if ( roundingMode = float_round_up ) then
  2707. roundIncrement := 0;
  2708. end
  2709. else begin
  2710. if ( roundingMode = float_round_down ) then
  2711. roundIncrement := 0;
  2712. end
  2713. end
  2714. end;
  2715. roundBits := zSig and $3FF;
  2716. if ( $7FD <= bits16(zExp) ) then
  2717. begin
  2718. if ( ( $7FD < zExp )
  2719. or ( ( zExp = $7FD )
  2720. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2721. ) then
  2722. begin
  2723. float_raise( [float_flag_overflow,float_flag_inexact] );
  2724. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2725. exit;
  2726. end;
  2727. if ( zExp < 0 ) then
  2728. begin
  2729. isTiny := ord(
  2730. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2731. or ( zExp < -1 )
  2732. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2733. shift64RightJamming( zSig, - zExp, zSig );
  2734. zExp := 0;
  2735. roundBits := zSig and $3FF;
  2736. if ( isTiny and roundBits )<>0 then
  2737. float_raise( float_flag_underflow );
  2738. end
  2739. end;
  2740. if ( roundBits<>0 ) then
  2741. set_inexact_flag;
  2742. zSig := ( zSig + roundIncrement ) shr 10;
  2743. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2744. if ( zSig = 0 ) then
  2745. zExp := 0;
  2746. result:=packFloat64( zSign, zExp, zSig );
  2747. end;
  2748. {*
  2749. -------------------------------------------------------------------------------
  2750. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2751. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2752. returns the proper double-precision floating-point value corresponding
  2753. to the abstract input. This routine is just like `roundAndPackFloat64'
  2754. except that the input significand has fewer bits and does not have to be
  2755. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2756. point exponent.
  2757. -------------------------------------------------------------------------------
  2758. *}
  2759. Procedure
  2760. normalizeRoundAndPackFloat64(
  2761. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2762. Var
  2763. shiftCount : int8;
  2764. zSig2 : bits32;
  2765. Begin
  2766. if ( zSig0 = 0 ) then
  2767. Begin
  2768. zSig0 := zSig1;
  2769. zSig1 := 0;
  2770. zExp := zExp -32;
  2771. End;
  2772. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2773. if ( 0 <= shiftCount ) then
  2774. Begin
  2775. zSig2 := 0;
  2776. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2777. End
  2778. else
  2779. Begin
  2780. shift64ExtraRightJamming
  2781. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2782. End;
  2783. zExp := zExp - shiftCount;
  2784. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2785. End;
  2786. {*
  2787. ----------------------------------------------------------------------------
  2788. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2789. and significand `zSig', and returns the proper double-precision floating-
  2790. point value corresponding to the abstract input. This routine is just like
  2791. `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  2792. Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2793. floating-point exponent.
  2794. ----------------------------------------------------------------------------
  2795. *}
  2796. function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  2797. var
  2798. shiftCount: int8;
  2799. begin
  2800. shiftCount := countLeadingZeros64( zSig ) - 1;
  2801. result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  2802. end;
  2803. {*
  2804. -------------------------------------------------------------------------------
  2805. Returns the result of converting the 32-bit two's complement integer `a' to
  2806. the single-precision floating-point format. The conversion is performed
  2807. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2808. -------------------------------------------------------------------------------
  2809. *}
  2810. Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2811. Var
  2812. zSign : Flag;
  2813. Begin
  2814. if ( a = 0 ) then
  2815. Begin
  2816. int32_to_float32.float32 := 0;
  2817. exit;
  2818. End;
  2819. if ( a = sbits32 ($80000000) ) then
  2820. Begin
  2821. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2822. exit;
  2823. end;
  2824. zSign := flag( a < 0 );
  2825. If zSign<>0 then
  2826. a := -a;
  2827. int32_to_float32.float32:=
  2828. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2829. End;
  2830. {*
  2831. -------------------------------------------------------------------------------
  2832. Returns the result of converting the 32-bit two's complement integer `a' to
  2833. the double-precision floating-point format. The conversion is performed
  2834. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2835. -------------------------------------------------------------------------------
  2836. *}
  2837. Function int32_to_float64( a: int32) : float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2838. var
  2839. zSign : flag;
  2840. absA : bits32;
  2841. shiftCount : int8;
  2842. zSig0, zSig1 : bits32;
  2843. Begin
  2844. if ( a = 0 ) then
  2845. Begin
  2846. packFloat64( 0, 0, 0, 0, result );
  2847. exit;
  2848. end;
  2849. zSign := flag( a < 0 );
  2850. if ZSign<>0 then
  2851. AbsA := -a
  2852. else
  2853. AbsA := a;
  2854. shiftCount := countLeadingZeros32( absA ) - 11;
  2855. if ( 0 <= shiftCount ) then
  2856. Begin
  2857. zSig0 := absA shl shiftCount;
  2858. zSig1 := 0;
  2859. End
  2860. else
  2861. Begin
  2862. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2863. End;
  2864. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2865. End;
  2866. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2867. {$if not defined(packFloatx80)}
  2868. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2869. forward;
  2870. {$endif}
  2871. {*----------------------------------------------------------------------------
  2872. | Returns the result of converting the 32-bit two's complement integer `a'
  2873. | to the extended double-precision floating-point format. The conversion
  2874. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2875. | Arithmetic.
  2876. *----------------------------------------------------------------------------*}
  2877. function int32_to_floatx80( a: int32 ): floatx80;
  2878. var
  2879. zSign: flag;
  2880. absA: uint32;
  2881. shiftCount: int8;
  2882. zSig: bits64;
  2883. begin
  2884. if ( a = 0 ) then begin
  2885. result := packFloatx80( 0, 0, 0 );
  2886. exit;
  2887. end;
  2888. zSign := ord( a < 0 );
  2889. if zSign <> 0 then absA := - a else absA := a;
  2890. shiftCount := countLeadingZeros32( absA ) + 32;
  2891. zSig := absA;
  2892. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2893. end;
  2894. {$endif FPC_SOFTFLOAT_FLOATX80}
  2895. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2896. {$if not defined(packFloat128)}
  2897. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2898. forward;
  2899. {$endif}
  2900. {*----------------------------------------------------------------------------
  2901. | Returns the result of converting the 32-bit two's complement integer `a' to
  2902. | the quadruple-precision floating-point format. The conversion is performed
  2903. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2904. *----------------------------------------------------------------------------*}
  2905. function int32_to_float128( a: int32 ): float128;
  2906. var
  2907. zSign: flag;
  2908. absA: uint32;
  2909. shiftCount: int8;
  2910. zSig0: bits64;
  2911. begin
  2912. if ( a = 0 ) then begin
  2913. result := packFloat128( 0, 0, 0, 0 );
  2914. exit;
  2915. end;
  2916. zSign := ord( a < 0 );
  2917. if zSign <> 0 then absA := - a else absA := a;
  2918. shiftCount := countLeadingZeros32( absA ) + 17;
  2919. zSig0 := absA;
  2920. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2921. end;
  2922. {$endif FPC_SOFTFLOAT_FLOAT128}
  2923. {*
  2924. -------------------------------------------------------------------------------
  2925. Returns the result of converting the single-precision floating-point value
  2926. `a' to the 32-bit two's complement integer format. The conversion is
  2927. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2928. Arithmetic---which means in particular that the conversion is rounded
  2929. according to the current rounding mode. If `a' is a NaN, the largest
  2930. positive integer is returned. Otherwise, if the conversion overflows, the
  2931. largest integer with the same sign as `a' is returned.
  2932. -------------------------------------------------------------------------------
  2933. *}
  2934. Function float32_to_int32( a : float32rec) : int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2935. Var
  2936. aSign: flag;
  2937. aExp, shiftCount: int16;
  2938. aSig, aSigExtra: bits32;
  2939. z: int32;
  2940. roundingMode: TFPURoundingMode;
  2941. Begin
  2942. aSig := extractFloat32Frac( a.float32 );
  2943. aExp := extractFloat32Exp( a.float32 );
  2944. aSign := extractFloat32Sign( a.float32 );
  2945. shiftCount := aExp - $96;
  2946. if ( 0 <= shiftCount ) then
  2947. Begin
  2948. if ( $9E <= aExp ) then
  2949. Begin
  2950. if ( a.float32 <> $CF000000 ) then
  2951. Begin
  2952. float_raise( float_flag_invalid );
  2953. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2954. Begin
  2955. float32_to_int32 := $7FFFFFFF;
  2956. exit;
  2957. End;
  2958. End;
  2959. float32_to_int32 := sbits32 ($80000000);
  2960. exit;
  2961. End;
  2962. z := ( aSig or $00800000 ) shl shiftCount;
  2963. if ( aSign<>0 ) then z := - z;
  2964. End
  2965. else
  2966. Begin
  2967. if ( aExp < $7E ) then
  2968. Begin
  2969. aSigExtra := aExp OR aSig;
  2970. z := 0;
  2971. End
  2972. else
  2973. Begin
  2974. aSig := aSig OR $00800000;
  2975. aSigExtra := aSig shl ( shiftCount and 31 );
  2976. z := aSig shr ( - shiftCount );
  2977. End;
  2978. if ( aSigExtra<>0 ) then
  2979. set_inexact_flag;
  2980. roundingMode := softfloat_rounding_mode;
  2981. if ( roundingMode = float_round_nearest_even ) then
  2982. Begin
  2983. if ( sbits32 (aSigExtra) < 0 ) then
  2984. Begin
  2985. Inc(z);
  2986. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2987. z := z and not 1;
  2988. End;
  2989. if ( aSign<>0 ) then
  2990. z := - z;
  2991. End
  2992. else
  2993. Begin
  2994. aSigExtra := flag( aSigExtra <> 0 );
  2995. if ( aSign<>0 ) then
  2996. Begin
  2997. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2998. z := - z;
  2999. End
  3000. else
  3001. Begin
  3002. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  3003. End
  3004. End;
  3005. End;
  3006. float32_to_int32 := z;
  3007. End;
  3008. {*
  3009. -------------------------------------------------------------------------------
  3010. Returns the result of converting the single-precision floating-point value
  3011. `a' to the 32-bit two's complement integer format. The conversion is
  3012. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3013. Arithmetic, except that the conversion is always rounded toward zero.
  3014. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3015. the conversion overflows, the largest integer with the same sign as `a' is
  3016. returned.
  3017. -------------------------------------------------------------------------------
  3018. *}
  3019. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3020. Var
  3021. aSign : flag;
  3022. aExp, shiftCount : int16;
  3023. aSig : bits32;
  3024. z : int32;
  3025. Begin
  3026. aSig := extractFloat32Frac( a.float32 );
  3027. aExp := extractFloat32Exp( a.float32 );
  3028. aSign := extractFloat32Sign( a.float32 );
  3029. shiftCount := aExp - $9E;
  3030. if ( 0 <= shiftCount ) then
  3031. Begin
  3032. if ( a.float32 <> $CF000000 ) then
  3033. Begin
  3034. float_raise( float_flag_invalid );
  3035. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  3036. Begin
  3037. float32_to_int32_round_to_zero := $7FFFFFFF;
  3038. exit;
  3039. end;
  3040. End;
  3041. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  3042. exit;
  3043. End
  3044. else
  3045. if ( aExp <= $7E ) then
  3046. Begin
  3047. if ( aExp or aSig )<>0 then
  3048. set_inexact_flag;
  3049. float32_to_int32_round_to_zero := 0;
  3050. exit;
  3051. End;
  3052. aSig := ( aSig or $00800000 ) shl 8;
  3053. z := aSig shr ( - shiftCount );
  3054. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  3055. Begin
  3056. set_inexact_flag;
  3057. End;
  3058. if ( aSign<>0 ) then z := - z;
  3059. float32_to_int32_round_to_zero := z;
  3060. End;
  3061. {*----------------------------------------------------------------------------
  3062. | Returns the result of converting the single-precision floating-point value
  3063. | `a' to the 64-bit two's complement integer format. The conversion is
  3064. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3065. | Arithmetic---which means in particular that the conversion is rounded
  3066. | according to the current rounding mode. If `a' is a NaN, the largest
  3067. | positive integer is returned. Otherwise, if the conversion overflows, the
  3068. | largest integer with the same sign as `a' is returned.
  3069. *----------------------------------------------------------------------------*}
  3070. function float32_to_int64( a: float32 ): int64;
  3071. var
  3072. aSign: flag;
  3073. aExp, shiftCount: int16;
  3074. aSig: bits32;
  3075. aSig64, aSigExtra: bits64;
  3076. begin
  3077. aSig := extractFloat32Frac( a );
  3078. aExp := extractFloat32Exp( a );
  3079. aSign := extractFloat32Sign( a );
  3080. shiftCount := $BE - aExp;
  3081. if ( shiftCount < 0 ) then begin
  3082. float_raise( float_flag_invalid );
  3083. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3084. result := $7FFFFFFFFFFFFFFF;
  3085. exit;
  3086. end;
  3087. result := $8000000000000000;
  3088. exit;
  3089. end;
  3090. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  3091. aSig64 := aSig;
  3092. aSig64 := aSig64 shl 40;
  3093. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  3094. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  3095. end;
  3096. {*----------------------------------------------------------------------------
  3097. | Returns the result of converting the single-precision floating-point value
  3098. | `a' to the 64-bit two's complement integer format. The conversion is
  3099. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3100. | Arithmetic, except that the conversion is always rounded toward zero. If
  3101. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  3102. | conversion overflows, the largest integer with the same sign as `a' is
  3103. | returned.
  3104. *----------------------------------------------------------------------------*}
  3105. function float32_to_int64_round_to_zero( a: float32 ): int64;
  3106. var
  3107. aSign: flag;
  3108. aExp, shiftCount: int16;
  3109. aSig: bits32;
  3110. aSig64: bits64;
  3111. z: int64;
  3112. begin
  3113. aSig := extractFloat32Frac( a );
  3114. aExp := extractFloat32Exp( a );
  3115. aSign := extractFloat32Sign( a );
  3116. shiftCount := aExp - $BE;
  3117. if ( 0 <= shiftCount ) then begin
  3118. if ( a <> $DF000000 ) then begin
  3119. float_raise( float_flag_invalid );
  3120. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3121. result := $7FFFFFFFFFFFFFFF;
  3122. exit;
  3123. end;
  3124. end;
  3125. result := $8000000000000000;
  3126. exit;
  3127. end
  3128. else if ( aExp <= $7E ) then begin
  3129. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3130. result := 0;
  3131. exit;
  3132. end;
  3133. aSig64 := aSig or $00800000;
  3134. aSig64 := aSig64 shl 40;
  3135. z := aSig64 shr ( - shiftCount );
  3136. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3137. set_inexact_flag;
  3138. if ( aSign <> 0 ) then z := - z;
  3139. result := z;
  3140. end;
  3141. {*
  3142. -------------------------------------------------------------------------------
  3143. Returns the result of converting the single-precision floating-point value
  3144. `a' to the double-precision floating-point format. The conversion is
  3145. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3146. Arithmetic.
  3147. -------------------------------------------------------------------------------
  3148. *}
  3149. Function float32_to_float64( a : float32rec) : Float64;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3150. Var
  3151. aSign : flag;
  3152. aExp : int16;
  3153. aSig, zSig0, zSig1: bits32;
  3154. tmp : CommonNanT;
  3155. Begin
  3156. aSig := extractFloat32Frac( a.float32 );
  3157. aExp := extractFloat32Exp( a.float32 );
  3158. aSign := extractFloat32Sign( a.float32 );
  3159. if ( aExp = $FF ) then
  3160. Begin
  3161. if ( aSig<>0 ) then
  3162. Begin
  3163. tmp:=float32ToCommonNaN(a.float32);
  3164. result:=commonNaNToFloat64(tmp);
  3165. exit;
  3166. End;
  3167. packFloat64( aSign, $7FF, 0, 0, result);
  3168. exit;
  3169. End;
  3170. if ( aExp = 0 ) then
  3171. Begin
  3172. if ( aSig = 0 ) then
  3173. Begin
  3174. packFloat64( aSign, 0, 0, 0, result );
  3175. exit;
  3176. end;
  3177. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3178. Dec(aExp);
  3179. End;
  3180. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3181. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3182. End;
  3183. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3184. {*----------------------------------------------------------------------------
  3185. | Returns the result of converting the canonical NaN `a' to the extended
  3186. | double-precision floating-point format.
  3187. *----------------------------------------------------------------------------*}
  3188. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3189. var
  3190. z : floatx80;
  3191. begin
  3192. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3193. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3194. result := z;
  3195. end;
  3196. {*----------------------------------------------------------------------------
  3197. | Returns the result of converting the single-precision floating-point value
  3198. | `a' to the extended double-precision floating-point format. The conversion
  3199. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3200. | Arithmetic.
  3201. *----------------------------------------------------------------------------*}
  3202. function float32_to_floatx80( a: float32 ): floatx80;
  3203. var
  3204. aSign: flag;
  3205. aExp: int16;
  3206. aSig: bits32;
  3207. tmp: commonNaNT;
  3208. begin
  3209. aSig := extractFloat32Frac( a );
  3210. aExp := extractFloat32Exp( a );
  3211. aSign := extractFloat32Sign( a );
  3212. if ( aExp = $FF ) then begin
  3213. if ( aSig <> 0 ) then begin
  3214. tmp:=float32ToCommonNaN(a);
  3215. result := commonNaNToFloatx80( tmp );
  3216. exit;
  3217. end;
  3218. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3219. exit;
  3220. end;
  3221. if ( aExp = 0 ) then begin
  3222. if ( aSig = 0 ) then begin
  3223. result := packFloatx80( aSign, 0, 0 );
  3224. exit;
  3225. end;
  3226. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3227. end;
  3228. aSig := aSig or $00800000;
  3229. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3230. end;
  3231. {$endif FPC_SOFTFLOAT_FLOATX80}
  3232. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3233. {*----------------------------------------------------------------------------
  3234. | Returns the result of converting the single-precision floating-point value
  3235. | `a' to the double-precision floating-point format. The conversion is
  3236. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3237. | Arithmetic.
  3238. *----------------------------------------------------------------------------*}
  3239. function float32_to_float128( a: float32 ): float128;
  3240. var
  3241. aSign: flag;
  3242. aExp: int16;
  3243. aSig: bits32;
  3244. tmp: commonNaNT;
  3245. begin
  3246. aSig := extractFloat32Frac( a );
  3247. aExp := extractFloat32Exp( a );
  3248. aSign := extractFloat32Sign( a );
  3249. if ( aExp = $FF ) then begin
  3250. if ( aSig <> 0 ) then begin
  3251. tmp:=float32ToCommonNaN(a);
  3252. result := commonNaNToFloat128( tmp );
  3253. exit;
  3254. end;
  3255. result := packFloat128( aSign, $7FFF, 0, 0 );
  3256. exit;
  3257. end;
  3258. if ( aExp = 0 ) then begin
  3259. if ( aSig = 0 ) then begin
  3260. result := packFloat128( aSign, 0, 0, 0 );
  3261. exit;
  3262. end;
  3263. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3264. dec( aExp );
  3265. end;
  3266. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3267. end;
  3268. {$endif FPC_SOFTFLOAT_FLOAT128}
  3269. {*
  3270. -------------------------------------------------------------------------------
  3271. Rounds the single-precision floating-point value `a' to an integer,
  3272. and returns the result as a single-precision floating-point value. The
  3273. operation is performed according to the IEC/IEEE Standard for Binary
  3274. Floating-Point Arithmetic.
  3275. -------------------------------------------------------------------------------
  3276. *}
  3277. Function float32_round_to_int( a: float32rec): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3278. Var
  3279. aSign: flag;
  3280. aExp: int16;
  3281. lastBitMask, roundBitsMask: bits32;
  3282. roundingMode: TFPURoundingMode;
  3283. z: float32;
  3284. Begin
  3285. aExp := extractFloat32Exp( a.float32 );
  3286. if ( $96 <= aExp ) then
  3287. Begin
  3288. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3289. Begin
  3290. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3291. exit;
  3292. End;
  3293. float32_round_to_int:=a;
  3294. exit;
  3295. End;
  3296. if ( aExp <= $7E ) then
  3297. Begin
  3298. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3299. Begin
  3300. float32_round_to_int:=a;
  3301. exit;
  3302. end;
  3303. set_inexact_flag;
  3304. aSign := extractFloat32Sign( a.float32 );
  3305. case ( softfloat_rounding_mode ) of
  3306. float_round_nearest_even:
  3307. Begin
  3308. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3309. Begin
  3310. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3311. exit;
  3312. End;
  3313. End;
  3314. float_round_down:
  3315. Begin
  3316. if aSign <> 0 then
  3317. float32_round_to_int.float32 := $BF800000
  3318. else
  3319. float32_round_to_int.float32 := 0;
  3320. exit;
  3321. End;
  3322. float_round_up:
  3323. Begin
  3324. if aSign <> 0 then
  3325. float32_round_to_int.float32 := $80000000
  3326. else
  3327. float32_round_to_int.float32 := $3F800000;
  3328. exit;
  3329. End;
  3330. end;
  3331. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3332. exit;
  3333. End;
  3334. lastBitMask := 1;
  3335. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3336. lastBitMask := lastBitMask shl ($96 - aExp);
  3337. roundBitsMask := lastBitMask - 1;
  3338. z := a.float32;
  3339. roundingMode := softfloat_rounding_mode;
  3340. if ( roundingMode = float_round_nearest_even ) then
  3341. Begin
  3342. z := z + (lastBitMask shr 1);
  3343. if ( ( z and roundBitsMask ) = 0 ) then
  3344. z := z and not lastBitMask;
  3345. End
  3346. else if ( roundingMode <> float_round_to_zero ) then
  3347. Begin
  3348. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3349. Begin
  3350. z := z + roundBitsMask;
  3351. End;
  3352. End;
  3353. z := z and not roundBitsMask;
  3354. if ( z <> a.float32 ) then
  3355. set_inexact_flag;
  3356. float32_round_to_int.float32 := z;
  3357. End;
  3358. {*
  3359. -------------------------------------------------------------------------------
  3360. Returns the result of adding the absolute values of the single-precision
  3361. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3362. before being returned. `zSign' is ignored if the result is a NaN.
  3363. The addition is performed according to the IEC/IEEE Standard for Binary
  3364. Floating-Point Arithmetic.
  3365. -------------------------------------------------------------------------------
  3366. *}
  3367. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3368. Var
  3369. aExp, bExp, zExp: int16;
  3370. aSig, bSig, zSig: bits32;
  3371. expDiff: int16;
  3372. label roundAndPack;
  3373. Begin
  3374. aSig:=extractFloat32Frac( a );
  3375. aExp:=extractFloat32Exp( a );
  3376. bSig:=extractFloat32Frac( b );
  3377. bExp := extractFloat32Exp( b );
  3378. expDiff := aExp - bExp;
  3379. aSig := aSig shl 6;
  3380. bSig := bSig shl 6;
  3381. if ( 0 < expDiff ) then
  3382. Begin
  3383. if ( aExp = $FF ) then
  3384. Begin
  3385. if ( aSig <> 0) then
  3386. Begin
  3387. addFloat32Sigs := propagateFloat32NaN( a, b );
  3388. exit;
  3389. End;
  3390. addFloat32Sigs := a;
  3391. exit;
  3392. End;
  3393. if ( bExp = 0 ) then
  3394. Begin
  3395. Dec(expDiff);
  3396. End
  3397. else
  3398. Begin
  3399. bSig := bSig or $20000000;
  3400. End;
  3401. shift32RightJamming( bSig, expDiff, bSig );
  3402. zExp := aExp;
  3403. End
  3404. else
  3405. If ( expDiff < 0 ) then
  3406. Begin
  3407. if ( bExp = $FF ) then
  3408. Begin
  3409. if ( bSig<>0 ) then
  3410. Begin
  3411. addFloat32Sigs := propagateFloat32NaN( a, b );
  3412. exit;
  3413. end;
  3414. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3415. exit;
  3416. End;
  3417. if ( aExp = 0 ) then
  3418. Begin
  3419. Inc(expDiff);
  3420. End
  3421. else
  3422. Begin
  3423. aSig := aSig OR $20000000;
  3424. End;
  3425. shift32RightJamming( aSig, - expDiff, aSig );
  3426. zExp := bExp;
  3427. End
  3428. else
  3429. Begin
  3430. if ( aExp = $FF ) then
  3431. Begin
  3432. if ( aSig OR bSig )<> 0 then
  3433. Begin
  3434. addFloat32Sigs := propagateFloat32NaN( a, b );
  3435. exit;
  3436. end;
  3437. addFloat32Sigs := a;
  3438. exit;
  3439. End;
  3440. if ( aExp = 0 ) then
  3441. Begin
  3442. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3443. exit;
  3444. end;
  3445. zSig := $40000000 + aSig + bSig;
  3446. zExp := aExp;
  3447. goto roundAndPack;
  3448. End;
  3449. aSig := aSig OR $20000000;
  3450. zSig := ( aSig + bSig ) shl 1;
  3451. Dec(zExp);
  3452. if ( sbits32 (zSig) < 0 ) then
  3453. Begin
  3454. zSig := aSig + bSig;
  3455. Inc(zExp);
  3456. End;
  3457. roundAndPack:
  3458. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3459. End;
  3460. {*
  3461. -------------------------------------------------------------------------------
  3462. Returns the result of subtracting the absolute values of the single-
  3463. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3464. difference is negated before being returned. `zSign' is ignored if the
  3465. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3466. Standard for Binary Floating-Point Arithmetic.
  3467. -------------------------------------------------------------------------------
  3468. *}
  3469. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3470. Var
  3471. aExp, bExp, zExp: int16;
  3472. aSig, bSig, zSig: bits32;
  3473. expDiff : int16;
  3474. label aExpBigger;
  3475. label bExpBigger;
  3476. label aBigger;
  3477. label bBigger;
  3478. label normalizeRoundAndPack;
  3479. Begin
  3480. aSig := extractFloat32Frac( a );
  3481. aExp := extractFloat32Exp( a );
  3482. bSig := extractFloat32Frac( b );
  3483. bExp := extractFloat32Exp( b );
  3484. expDiff := aExp - bExp;
  3485. aSig := aSig shl 7;
  3486. bSig := bSig shl 7;
  3487. if ( 0 < expDiff ) then goto aExpBigger;
  3488. if ( expDiff < 0 ) then goto bExpBigger;
  3489. if ( aExp = $FF ) then
  3490. Begin
  3491. if ( aSig OR bSig )<> 0 then
  3492. Begin
  3493. subFloat32Sigs := propagateFloat32NaN( a, b );
  3494. exit;
  3495. End;
  3496. float_raise( float_flag_invalid );
  3497. subFloat32Sigs := float32_default_nan;
  3498. exit;
  3499. End;
  3500. if ( aExp = 0 ) then
  3501. Begin
  3502. aExp := 1;
  3503. bExp := 1;
  3504. End;
  3505. if ( bSig < aSig ) Then goto aBigger;
  3506. if ( aSig < bSig ) Then goto bBigger;
  3507. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3508. exit;
  3509. bExpBigger:
  3510. if ( bExp = $FF ) then
  3511. Begin
  3512. if ( bSig<>0 ) then
  3513. Begin
  3514. subFloat32Sigs := propagateFloat32NaN( a, b );
  3515. exit;
  3516. End;
  3517. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3518. exit;
  3519. End;
  3520. if ( aExp = 0 ) then
  3521. Begin
  3522. Inc(expDiff);
  3523. End
  3524. else
  3525. Begin
  3526. aSig := aSig OR $40000000;
  3527. End;
  3528. shift32RightJamming( aSig, - expDiff, aSig );
  3529. bSig := bSig OR $40000000;
  3530. bBigger:
  3531. zSig := bSig - aSig;
  3532. zExp := bExp;
  3533. zSign := zSign xor 1;
  3534. goto normalizeRoundAndPack;
  3535. aExpBigger:
  3536. if ( aExp = $FF ) then
  3537. Begin
  3538. if ( aSig <> 0) then
  3539. Begin
  3540. subFloat32Sigs := propagateFloat32NaN( a, b );
  3541. exit;
  3542. End;
  3543. subFloat32Sigs := a;
  3544. exit;
  3545. End;
  3546. if ( bExp = 0 ) then
  3547. Begin
  3548. Dec(expDiff);
  3549. End
  3550. else
  3551. Begin
  3552. bSig := bSig OR $40000000;
  3553. End;
  3554. shift32RightJamming( bSig, expDiff, bSig );
  3555. aSig := aSig OR $40000000;
  3556. aBigger:
  3557. zSig := aSig - bSig;
  3558. zExp := aExp;
  3559. normalizeRoundAndPack:
  3560. Dec(zExp);
  3561. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3562. End;
  3563. {*
  3564. -------------------------------------------------------------------------------
  3565. Returns the result of adding the single-precision floating-point values `a'
  3566. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3567. Binary Floating-Point Arithmetic.
  3568. -------------------------------------------------------------------------------
  3569. *}
  3570. Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3571. Var
  3572. aSign, bSign: Flag;
  3573. Begin
  3574. aSign := extractFloat32Sign( a.float32 );
  3575. bSign := extractFloat32Sign( b.float32 );
  3576. if ( aSign = bSign ) then
  3577. Begin
  3578. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3579. End
  3580. else
  3581. Begin
  3582. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3583. End;
  3584. End;
  3585. {*
  3586. -------------------------------------------------------------------------------
  3587. Returns the result of subtracting the single-precision floating-point values
  3588. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3589. for Binary Floating-Point Arithmetic.
  3590. -------------------------------------------------------------------------------
  3591. *}
  3592. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3593. Var
  3594. aSign, bSign: flag;
  3595. Begin
  3596. aSign := extractFloat32Sign( a.float32 );
  3597. bSign := extractFloat32Sign( b.float32 );
  3598. if ( aSign = bSign ) then
  3599. Begin
  3600. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3601. End
  3602. else
  3603. Begin
  3604. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3605. End;
  3606. End;
  3607. {*
  3608. -------------------------------------------------------------------------------
  3609. Returns the result of multiplying the single-precision floating-point values
  3610. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3611. for Binary Floating-Point Arithmetic.
  3612. -------------------------------------------------------------------------------
  3613. *}
  3614. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3615. Var
  3616. aSign, bSign, zSign: flag;
  3617. aExp, bExp, zExp : int16;
  3618. aSig, bSig, zSig0, zSig1: bits32;
  3619. Begin
  3620. aSig := extractFloat32Frac( a.float32 );
  3621. aExp := extractFloat32Exp( a.float32 );
  3622. aSign := extractFloat32Sign( a.float32 );
  3623. bSig := extractFloat32Frac( b.float32 );
  3624. bExp := extractFloat32Exp( b.float32 );
  3625. bSign := extractFloat32Sign( b.float32 );
  3626. zSign := aSign xor bSign;
  3627. if ( aExp = $FF ) then
  3628. Begin
  3629. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3630. Begin
  3631. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3632. exit;
  3633. End;
  3634. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3635. Begin
  3636. float_raise( float_flag_invalid );
  3637. float32_mul.float32 := float32_default_nan;
  3638. exit;
  3639. End;
  3640. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3641. exit;
  3642. End;
  3643. if ( bExp = $FF ) then
  3644. Begin
  3645. if ( bSig <> 0 ) then
  3646. Begin
  3647. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3648. exit;
  3649. End;
  3650. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3651. Begin
  3652. float_raise( float_flag_invalid );
  3653. float32_mul.float32 := float32_default_nan;
  3654. exit;
  3655. End;
  3656. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3657. exit;
  3658. End;
  3659. if ( aExp = 0 ) then
  3660. Begin
  3661. if ( aSig = 0 ) then
  3662. Begin
  3663. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3664. exit;
  3665. End;
  3666. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3667. End;
  3668. if ( bExp = 0 ) then
  3669. Begin
  3670. if ( bSig = 0 ) then
  3671. Begin
  3672. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3673. exit;
  3674. End;
  3675. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3676. End;
  3677. zExp := aExp + bExp - $7F;
  3678. aSig := ( aSig OR $00800000 ) shl 7;
  3679. bSig := ( bSig OR $00800000 ) shl 8;
  3680. mul32To64( aSig, bSig, zSig0, zSig1 );
  3681. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3682. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3683. Begin
  3684. zSig0 := zSig0 shl 1;
  3685. Dec(zExp);
  3686. End;
  3687. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3688. End;
  3689. {*
  3690. -------------------------------------------------------------------------------
  3691. Returns the result of dividing the single-precision floating-point value `a'
  3692. by the corresponding value `b'. The operation is performed according to the
  3693. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3694. -------------------------------------------------------------------------------
  3695. *}
  3696. Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3697. Var
  3698. aSign, bSign, zSign: flag;
  3699. aExp, bExp, zExp: int16;
  3700. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3701. Begin
  3702. aSig := extractFloat32Frac( a.float32 );
  3703. aExp := extractFloat32Exp( a.float32 );
  3704. aSign := extractFloat32Sign( a.float32 );
  3705. bSig := extractFloat32Frac( b.float32 );
  3706. bExp := extractFloat32Exp( b.float32 );
  3707. bSign := extractFloat32Sign( b.float32 );
  3708. zSign := aSign xor bSign;
  3709. if ( aExp = $FF ) then
  3710. Begin
  3711. if ( aSig <> 0 ) then
  3712. Begin
  3713. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3714. exit;
  3715. End;
  3716. if ( bExp = $FF ) then
  3717. Begin
  3718. if ( bSig <> 0) then
  3719. Begin
  3720. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3721. exit;
  3722. End;
  3723. float_raise( float_flag_invalid );
  3724. float32_div.float32 := float32_default_nan;
  3725. exit;
  3726. End;
  3727. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3728. exit;
  3729. End;
  3730. if ( bExp = $FF ) then
  3731. Begin
  3732. if ( bSig <> 0) then
  3733. Begin
  3734. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3735. exit;
  3736. End;
  3737. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3738. exit;
  3739. End;
  3740. if ( bExp = 0 ) Then
  3741. Begin
  3742. if ( bSig = 0 ) Then
  3743. Begin
  3744. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3745. Begin
  3746. float_raise( float_flag_invalid );
  3747. float32_div.float32 := float32_default_nan;
  3748. exit;
  3749. End;
  3750. float_raise( float_flag_divbyzero );
  3751. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3752. exit;
  3753. End;
  3754. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3755. End;
  3756. if ( aExp = 0 ) Then
  3757. Begin
  3758. if ( aSig = 0 ) Then
  3759. Begin
  3760. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3761. exit;
  3762. End;
  3763. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3764. End;
  3765. zExp := aExp - bExp + $7D;
  3766. aSig := ( aSig OR $00800000 ) shl 7;
  3767. bSig := ( bSig OR $00800000 ) shl 8;
  3768. if ( bSig <= ( aSig + aSig ) ) then
  3769. Begin
  3770. aSig := aSig shr 1;
  3771. Inc(zExp);
  3772. End;
  3773. zSig := estimateDiv64To32( aSig, 0, bSig );
  3774. if ( ( zSig and $3F ) <= 2 ) then
  3775. Begin
  3776. mul32To64( bSig, zSig, term0, term1 );
  3777. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3778. while ( sbits32 (rem0) < 0 ) do
  3779. Begin
  3780. Dec(zSig);
  3781. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3782. End;
  3783. zSig := zSig or bits32( rem1 <> 0 );
  3784. End;
  3785. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3786. End;
  3787. {*
  3788. -------------------------------------------------------------------------------
  3789. Returns the remainder of the single-precision floating-point value `a'
  3790. with respect to the corresponding value `b'. The operation is performed
  3791. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3792. -------------------------------------------------------------------------------
  3793. *}
  3794. Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3795. Var
  3796. aSign, zSign: flag;
  3797. aExp, bExp, expDiff: int16;
  3798. aSig, bSig, q, alternateASig: bits32;
  3799. sigMean: sbits32;
  3800. Begin
  3801. aSig := extractFloat32Frac( a.float32 );
  3802. aExp := extractFloat32Exp( a.float32 );
  3803. aSign := extractFloat32Sign( a.float32 );
  3804. bSig := extractFloat32Frac( b.float32 );
  3805. bExp := extractFloat32Exp( b.float32 );
  3806. if ( aExp = $FF ) then
  3807. Begin
  3808. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3809. Begin
  3810. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3811. exit;
  3812. End;
  3813. float_raise( float_flag_invalid );
  3814. float32_rem.float32 := float32_default_nan;
  3815. exit;
  3816. End;
  3817. if ( bExp = $FF ) then
  3818. Begin
  3819. if ( bSig <> 0 ) then
  3820. Begin
  3821. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3822. exit;
  3823. End;
  3824. float32_rem := a;
  3825. exit;
  3826. End;
  3827. if ( bExp = 0 ) then
  3828. Begin
  3829. if ( bSig = 0 ) then
  3830. Begin
  3831. float_raise( float_flag_invalid );
  3832. float32_rem.float32 := float32_default_nan;
  3833. exit;
  3834. End;
  3835. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3836. End;
  3837. if ( aExp = 0 ) then
  3838. Begin
  3839. if ( aSig = 0 ) then
  3840. Begin
  3841. float32_rem := a;
  3842. exit;
  3843. End;
  3844. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3845. End;
  3846. expDiff := aExp - bExp;
  3847. aSig := ( aSig OR $00800000 ) shl 8;
  3848. bSig := ( bSig OR $00800000 ) shl 8;
  3849. if ( expDiff < 0 ) then
  3850. Begin
  3851. if ( expDiff < -1 ) then
  3852. Begin
  3853. float32_rem := a;
  3854. exit;
  3855. End;
  3856. aSig := aSig shr 1;
  3857. End;
  3858. q := bits32( bSig <= aSig );
  3859. if ( q <> 0) then
  3860. aSig := aSig - bSig;
  3861. expDiff := expDiff - 32;
  3862. while ( 0 < expDiff ) do
  3863. Begin
  3864. q := estimateDiv64To32( aSig, 0, bSig );
  3865. if (2 < q) then
  3866. q := q - 2
  3867. else
  3868. q := 0;
  3869. aSig := - ( ( bSig shr 2 ) * q );
  3870. expDiff := expDiff - 30;
  3871. End;
  3872. expDiff := expDiff + 32;
  3873. if ( 0 < expDiff ) then
  3874. Begin
  3875. q := estimateDiv64To32( aSig, 0, bSig );
  3876. if (2 < q) then
  3877. q := q - 2
  3878. else
  3879. q := 0;
  3880. q := q shr (32 - expDiff);
  3881. bSig := bSig shr 2;
  3882. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3883. End
  3884. else
  3885. Begin
  3886. aSig := aSig shr 2;
  3887. bSig := bSig shr 2;
  3888. End;
  3889. Repeat
  3890. alternateASig := aSig;
  3891. Inc(q);
  3892. aSig := aSig - bSig;
  3893. Until not ( 0 <= sbits32 (aSig) );
  3894. sigMean := aSig + alternateASig;
  3895. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3896. Begin
  3897. aSig := alternateASig;
  3898. End;
  3899. zSign := flag( sbits32 (aSig) < 0 );
  3900. if ( zSign<>0 ) then
  3901. aSig := - aSig;
  3902. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3903. End;
  3904. {*
  3905. -------------------------------------------------------------------------------
  3906. Returns the square root of the single-precision floating-point value `a'.
  3907. The operation is performed according to the IEC/IEEE Standard for Binary
  3908. Floating-Point Arithmetic.
  3909. -------------------------------------------------------------------------------
  3910. *}
  3911. Function float32_sqrt(a: float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3912. Var
  3913. aSign : flag;
  3914. aExp, zExp : int16;
  3915. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3916. label roundAndPack;
  3917. Begin
  3918. aSig := extractFloat32Frac( a.float32 );
  3919. aExp := extractFloat32Exp( a.float32 );
  3920. aSign := extractFloat32Sign( a.float32 );
  3921. if ( aExp = $FF ) then
  3922. Begin
  3923. if ( aSig <> 0) then
  3924. Begin
  3925. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3926. exit;
  3927. End;
  3928. if ( aSign = 0) then
  3929. Begin
  3930. float32_sqrt := a;
  3931. exit;
  3932. End;
  3933. float_raise( float_flag_invalid );
  3934. float32_sqrt.float32 := float32_default_nan;
  3935. exit;
  3936. End;
  3937. if ( aSign <> 0) then
  3938. Begin
  3939. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3940. Begin
  3941. float32_sqrt := a;
  3942. exit;
  3943. End;
  3944. float_raise( float_flag_invalid );
  3945. float32_sqrt.float32 := float32_default_nan;
  3946. exit;
  3947. End;
  3948. if ( aExp = 0 ) then
  3949. Begin
  3950. if ( aSig = 0 ) then
  3951. Begin
  3952. float32_sqrt.float32 := 0;
  3953. exit;
  3954. End;
  3955. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3956. End;
  3957. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3958. aSig := ( aSig OR $00800000 ) shl 8;
  3959. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3960. if ( ( zSig and $7F ) <= 5 ) then
  3961. Begin
  3962. if ( zSig < 2 ) then
  3963. Begin
  3964. zSig := $7FFFFFFF;
  3965. goto roundAndPack;
  3966. End
  3967. else
  3968. Begin
  3969. aSig := aSig shr (aExp and 1);
  3970. mul32To64( zSig, zSig, term0, term1 );
  3971. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3972. while ( sbits32 (rem0) < 0 ) do
  3973. Begin
  3974. Dec(zSig);
  3975. shortShift64Left( 0, zSig, 1, term0, term1 );
  3976. term1 := term1 or 1;
  3977. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3978. End;
  3979. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3980. End;
  3981. End;
  3982. shift32RightJamming( zSig, 1, zSig );
  3983. roundAndPack:
  3984. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3985. End;
  3986. {*
  3987. -------------------------------------------------------------------------------
  3988. Returns 1 if the single-precision floating-point value `a' is equal to
  3989. the corresponding value `b', and 0 otherwise. The comparison is performed
  3990. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3991. -------------------------------------------------------------------------------
  3992. *}
  3993. Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3994. Begin
  3995. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3996. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3997. ) then
  3998. Begin
  3999. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  4000. Begin
  4001. float_raise( float_flag_invalid );
  4002. End;
  4003. float32_eq := 0;
  4004. exit;
  4005. End;
  4006. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  4007. End;
  4008. {*
  4009. -------------------------------------------------------------------------------
  4010. Returns 1 if the single-precision floating-point value `a' is less than
  4011. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4012. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4013. Arithmetic.
  4014. -------------------------------------------------------------------------------
  4015. *}
  4016. Function float32_le( a: float32rec; b : float32rec ):flag;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4017. var
  4018. aSign, bSign: flag;
  4019. Begin
  4020. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  4021. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  4022. ) then
  4023. Begin
  4024. float_raise( float_flag_invalid );
  4025. float32_le := 0;
  4026. exit;
  4027. End;
  4028. aSign := extractFloat32Sign( a.float32 );
  4029. bSign := extractFloat32Sign( b.float32 );
  4030. if ( aSign <> bSign ) then
  4031. Begin
  4032. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  4033. exit;
  4034. End;
  4035. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  4036. End;
  4037. {*
  4038. -------------------------------------------------------------------------------
  4039. Returns 1 if the single-precision floating-point value `a' is less than
  4040. the corresponding value `b', and 0 otherwise. The comparison is performed
  4041. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4042. -------------------------------------------------------------------------------
  4043. *}
  4044. Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4045. var
  4046. aSign, bSign: flag;
  4047. Begin
  4048. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  4049. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  4050. ) then
  4051. Begin
  4052. float_raise( float_flag_invalid );
  4053. float32_lt :=0;
  4054. exit;
  4055. End;
  4056. aSign := extractFloat32Sign( a.float32 );
  4057. bSign := extractFloat32Sign( b.float32 );
  4058. if ( aSign <> bSign ) then
  4059. Begin
  4060. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  4061. exit;
  4062. End;
  4063. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  4064. End;
  4065. {*
  4066. -------------------------------------------------------------------------------
  4067. Returns 1 if the single-precision floating-point value `a' is equal to
  4068. the corresponding value `b', and 0 otherwise. The invalid exception is
  4069. raised if either operand is a NaN. Otherwise, the comparison is performed
  4070. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4071. -------------------------------------------------------------------------------
  4072. *}
  4073. Function float32_eq_signaling( a: float32; b: float32) : flag;
  4074. Begin
  4075. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  4076. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  4077. ) then
  4078. Begin
  4079. float_raise( float_flag_invalid );
  4080. float32_eq_signaling := 0;
  4081. exit;
  4082. End;
  4083. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  4084. End;
  4085. {*
  4086. -------------------------------------------------------------------------------
  4087. Returns 1 if the single-precision floating-point value `a' is less than or
  4088. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4089. cause an exception. Otherwise, the comparison is performed according to the
  4090. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4091. -------------------------------------------------------------------------------
  4092. *}
  4093. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  4094. Var
  4095. aSign, bSign: flag;
  4096. Begin
  4097. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4098. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4099. ) then
  4100. Begin
  4101. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4102. Begin
  4103. float_raise( float_flag_invalid );
  4104. End;
  4105. float32_le_quiet := 0;
  4106. exit;
  4107. End;
  4108. aSign := extractFloat32Sign( a );
  4109. bSign := extractFloat32Sign( b );
  4110. if ( aSign <> bSign ) then
  4111. Begin
  4112. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  4113. exit;
  4114. End;
  4115. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  4116. End;
  4117. {*
  4118. -------------------------------------------------------------------------------
  4119. Returns 1 if the single-precision floating-point value `a' is less than
  4120. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4121. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4122. Standard for Binary Floating-Point Arithmetic.
  4123. -------------------------------------------------------------------------------
  4124. *}
  4125. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4126. Var
  4127. aSign, bSign: flag;
  4128. Begin
  4129. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4130. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4131. ) then
  4132. Begin
  4133. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4134. Begin
  4135. float_raise( float_flag_invalid );
  4136. End;
  4137. float32_lt_quiet := 0;
  4138. exit;
  4139. End;
  4140. aSign := extractFloat32Sign( a );
  4141. bSign := extractFloat32Sign( b );
  4142. if ( aSign <> bSign ) then
  4143. Begin
  4144. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4145. exit;
  4146. End;
  4147. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4148. End;
  4149. {*
  4150. -------------------------------------------------------------------------------
  4151. Returns the result of converting the double-precision floating-point value
  4152. `a' to the 32-bit two's complement integer format. The conversion is
  4153. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4154. Arithmetic---which means in particular that the conversion is rounded
  4155. according to the current rounding mode. If `a' is a NaN, the largest
  4156. positive integer is returned. Otherwise, if the conversion overflows, the
  4157. largest integer with the same sign as `a' is returned.
  4158. -------------------------------------------------------------------------------
  4159. *}
  4160. Function float64_to_int32(a: float64): int32;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4161. var
  4162. aSign: flag;
  4163. aExp, shiftCount: int16;
  4164. aSig0, aSig1, absZ, aSigExtra: bits32;
  4165. z: int32;
  4166. roundingMode: TFPURoundingMode;
  4167. label invalid;
  4168. Begin
  4169. aSig1 := extractFloat64Frac1( a );
  4170. aSig0 := extractFloat64Frac0( a );
  4171. aExp := extractFloat64Exp( a );
  4172. aSign := extractFloat64Sign( a );
  4173. shiftCount := aExp - $413;
  4174. if ( 0 <= shiftCount ) then
  4175. Begin
  4176. if ( $41E < aExp ) then
  4177. Begin
  4178. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4179. aSign := 0;
  4180. goto invalid;
  4181. End;
  4182. shortShift64Left(
  4183. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4184. if ( $80000000 < absZ ) then
  4185. goto invalid;
  4186. End
  4187. else
  4188. Begin
  4189. aSig1 := flag( aSig1 <> 0 );
  4190. if ( aExp < $3FE ) then
  4191. Begin
  4192. aSigExtra := aExp OR aSig0 OR aSig1;
  4193. absZ := 0;
  4194. End
  4195. else
  4196. Begin
  4197. aSig0 := aSig0 OR $00100000;
  4198. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4199. absZ := aSig0 shr ( - shiftCount );
  4200. End;
  4201. End;
  4202. roundingMode := softfloat_rounding_mode;
  4203. if ( roundingMode = float_round_nearest_even ) then
  4204. Begin
  4205. if ( sbits32(aSigExtra) < 0 ) then
  4206. Begin
  4207. Inc(absZ);
  4208. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4209. absZ := absZ and not 1;
  4210. End;
  4211. if aSign <> 0 then
  4212. z := - absZ
  4213. else
  4214. z := absZ;
  4215. End
  4216. else
  4217. Begin
  4218. aSigExtra := bits32( aSigExtra <> 0 );
  4219. if ( aSign <> 0) then
  4220. Begin
  4221. z := - ( absZ
  4222. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4223. End
  4224. else
  4225. Begin
  4226. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4227. End
  4228. End;
  4229. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4230. Begin
  4231. invalid:
  4232. float_raise( float_flag_invalid );
  4233. if (aSign <> 0 ) then
  4234. float64_to_int32 := sbits32 ($80000000)
  4235. else
  4236. float64_to_int32 := $7FFFFFFF;
  4237. exit;
  4238. End;
  4239. if ( aSigExtra <> 0) then
  4240. set_inexact_flag;
  4241. float64_to_int32 := z;
  4242. End;
  4243. {*
  4244. -------------------------------------------------------------------------------
  4245. Returns the result of converting the double-precision floating-point value
  4246. `a' to the 32-bit two's complement integer format. The conversion is
  4247. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4248. Arithmetic, except that the conversion is always rounded toward zero.
  4249. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4250. the conversion overflows, the largest integer with the same sign as `a' is
  4251. returned.
  4252. -------------------------------------------------------------------------------
  4253. *}
  4254. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4255. {$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4256. Var
  4257. aSign: flag;
  4258. aExp, shiftCount: int16;
  4259. aSig0, aSig1, absZ, aSigExtra: bits32;
  4260. z: int32;
  4261. label invalid;
  4262. Begin
  4263. aSig1 := extractFloat64Frac1( a );
  4264. aSig0 := extractFloat64Frac0( a );
  4265. aExp := extractFloat64Exp( a );
  4266. aSign := extractFloat64Sign( a );
  4267. shiftCount := aExp - $413;
  4268. if ( 0 <= shiftCount ) then
  4269. Begin
  4270. if ( $41E < aExp ) then
  4271. Begin
  4272. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4273. aSign := 0;
  4274. goto invalid;
  4275. End;
  4276. shortShift64Left(
  4277. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4278. End
  4279. else
  4280. Begin
  4281. if ( aExp < $3FF ) then
  4282. Begin
  4283. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4284. Begin
  4285. set_inexact_flag;
  4286. End;
  4287. float64_to_int32_round_to_zero := 0;
  4288. exit;
  4289. End;
  4290. aSig0 := aSig0 or $00100000;
  4291. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4292. absZ := aSig0 shr ( - shiftCount );
  4293. End;
  4294. if aSign <> 0 then
  4295. z := - absZ
  4296. else
  4297. z := absZ;
  4298. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4299. Begin
  4300. invalid:
  4301. float_raise( float_flag_invalid );
  4302. if (aSign <> 0) then
  4303. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4304. else
  4305. float64_to_int32_round_to_zero := $7FFFFFFF;
  4306. exit;
  4307. End;
  4308. if ( aSigExtra <> 0) then
  4309. set_inexact_flag;
  4310. float64_to_int32_round_to_zero := z;
  4311. End;
  4312. {*----------------------------------------------------------------------------
  4313. | Returns the result of converting the double-precision floating-point value
  4314. | `a' to the 64-bit two's complement integer format. The conversion is
  4315. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4316. | Arithmetic---which means in particular that the conversion is rounded
  4317. | according to the current rounding mode. If `a' is a NaN, the largest
  4318. | positive integer is returned. Otherwise, if the conversion overflows, the
  4319. | largest integer with the same sign as `a' is returned.
  4320. *----------------------------------------------------------------------------*}
  4321. function float64_to_int64( a: float64 ): int64;
  4322. var
  4323. aSign: flag;
  4324. aExp, shiftCount: int16;
  4325. aSig, aSigExtra: bits64;
  4326. begin
  4327. aSig := extractFloat64Frac( a );
  4328. aExp := extractFloat64Exp( a );
  4329. aSign := extractFloat64Sign( a );
  4330. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4331. shiftCount := $433 - aExp;
  4332. if ( shiftCount <= 0 ) then begin
  4333. if ( $43E < aExp ) then begin
  4334. float_raise( float_flag_invalid );
  4335. if ( ( aSign = 0 )
  4336. or ( ( aExp = $7FF )
  4337. and ( aSig <> $0010000000000000 ) )
  4338. ) then begin
  4339. result := $7FFFFFFFFFFFFFFF;
  4340. exit;
  4341. end;
  4342. result := $8000000000000000;
  4343. exit;
  4344. end;
  4345. aSigExtra := 0;
  4346. aSig := aSig shl ( - shiftCount );
  4347. end
  4348. else
  4349. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4350. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4351. end;
  4352. {*----------------------------------------------------------------------------
  4353. | Returns the result of converting the double-precision floating-point value
  4354. | `a' to the 64-bit two's complement integer format. The conversion is
  4355. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4356. | Arithmetic, except that the conversion is always rounded toward zero.
  4357. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4358. | the conversion overflows, the largest integer with the same sign as `a' is
  4359. | returned.
  4360. *----------------------------------------------------------------------------*}
  4361. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4362. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4363. var
  4364. aSign: flag;
  4365. aExp, shiftCount: int16;
  4366. aSig: bits64;
  4367. z: int64;
  4368. begin
  4369. aSig := extractFloat64Frac( a );
  4370. aExp := extractFloat64Exp( a );
  4371. aSign := extractFloat64Sign( a );
  4372. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4373. shiftCount := aExp - $433;
  4374. if ( 0 <= shiftCount ) then begin
  4375. if ( $43E <= aExp ) then begin
  4376. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4377. float_raise( float_flag_invalid );
  4378. if ( ( aSign = 0 )
  4379. or ( ( aExp = $7FF )
  4380. and ( aSig <> $0010000000000000 ) )
  4381. ) then begin
  4382. result := $7FFFFFFFFFFFFFFF;
  4383. exit;
  4384. end;
  4385. end;
  4386. result := $8000000000000000;
  4387. exit;
  4388. end;
  4389. z := aSig shl shiftCount;
  4390. end
  4391. else begin
  4392. if ( aExp < $3FE ) then begin
  4393. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4394. result := 0;
  4395. exit;
  4396. end;
  4397. z := aSig shr ( - shiftCount );
  4398. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4399. set_inexact_flag;
  4400. end;
  4401. if ( aSign <> 0 ) then z := - z;
  4402. result := z;
  4403. end;
  4404. {*
  4405. -------------------------------------------------------------------------------
  4406. Returns the result of converting the double-precision floating-point value
  4407. `a' to the single-precision floating-point format. The conversion is
  4408. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4409. Arithmetic.
  4410. -------------------------------------------------------------------------------
  4411. *}
  4412. Function float64_to_float32(a: float64 ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4413. Var
  4414. aSign: flag;
  4415. aExp: int16;
  4416. aSig0, aSig1, zSig: bits32;
  4417. allZero: bits32;
  4418. tmp : CommonNanT;
  4419. Begin
  4420. aSig1 := extractFloat64Frac1( a );
  4421. aSig0 := extractFloat64Frac0( a );
  4422. aExp := extractFloat64Exp( a );
  4423. aSign := extractFloat64Sign( a );
  4424. if ( aExp = $7FF ) then
  4425. Begin
  4426. if ( aSig0 OR aSig1 ) <> 0 then
  4427. Begin
  4428. tmp:=float64ToCommonNaN(a);
  4429. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4430. exit;
  4431. End;
  4432. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4433. exit;
  4434. End;
  4435. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4436. if ( aExp <> 0) then
  4437. zSig := zSig OR $40000000;
  4438. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4439. End;
  4440. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4441. {*----------------------------------------------------------------------------
  4442. | Returns the result of converting the double-precision floating-point value
  4443. | `a' to the extended double-precision floating-point format. The conversion
  4444. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4445. | Arithmetic.
  4446. *----------------------------------------------------------------------------*}
  4447. function float64_to_floatx80( a: float64 ): floatx80;
  4448. var
  4449. aSign: flag;
  4450. aExp: int16;
  4451. aSig: bits64;
  4452. begin
  4453. aSig := extractFloat64Frac( a );
  4454. aExp := extractFloat64Exp( a );
  4455. aSign := extractFloat64Sign( a );
  4456. if ( aExp = $7FF ) then begin
  4457. if ( aSig <> 0 ) then begin
  4458. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4459. exit;
  4460. end;
  4461. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4462. exit;
  4463. end;
  4464. if ( aExp = 0 ) then begin
  4465. if ( aSig = 0 ) then begin
  4466. result := packFloatx80( aSign, 0, 0 );
  4467. exit;
  4468. end;
  4469. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4470. end;
  4471. result :=
  4472. packFloatx80(
  4473. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4474. end;
  4475. {$endif FPC_SOFTFLOAT_FLOATX80}
  4476. {*
  4477. -------------------------------------------------------------------------------
  4478. Rounds the double-precision floating-point value `a' to an integer,
  4479. and returns the result as a double-precision floating-point value. The
  4480. operation is performed according to the IEC/IEEE Standard for Binary
  4481. Floating-Point Arithmetic.
  4482. -------------------------------------------------------------------------------
  4483. *}
  4484. function float64_round_to_int(a: float64) : Float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4485. Var
  4486. aSign: flag;
  4487. aExp: int16;
  4488. lastBitMask, roundBitsMask: bits32;
  4489. roundingMode: TFPURoundingMode;
  4490. z: float64;
  4491. Begin
  4492. aExp := extractFloat64Exp( a );
  4493. if ( $413 <= aExp ) then
  4494. Begin
  4495. if ( $433 <= aExp ) then
  4496. Begin
  4497. if ( ( aExp = $7FF )
  4498. AND
  4499. (
  4500. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4501. ) <>0)
  4502. ) then
  4503. Begin
  4504. propagateFloat64NaN( a, a, result );
  4505. exit;
  4506. End;
  4507. result := a;
  4508. exit;
  4509. End;
  4510. lastBitMask := 1;
  4511. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4512. roundBitsMask := lastBitMask - 1;
  4513. z := a;
  4514. roundingMode := softfloat_rounding_mode;
  4515. if ( roundingMode = float_round_nearest_even ) then
  4516. Begin
  4517. if ( lastBitMask <> 0) then
  4518. Begin
  4519. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4520. if ( ( z.low and roundBitsMask ) = 0 ) then
  4521. z.low := z.low and not lastBitMask;
  4522. End
  4523. else
  4524. Begin
  4525. if ( sbits32 (z.low) < 0 ) then
  4526. Begin
  4527. Inc(z.high);
  4528. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4529. z.high := z.high and not 1;
  4530. End;
  4531. End;
  4532. End
  4533. else if ( roundingMode <> float_round_to_zero ) then
  4534. Begin
  4535. if ( extractFloat64Sign( z )
  4536. xor flag( roundingMode = float_round_up ) )<> 0 then
  4537. Begin
  4538. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4539. End;
  4540. End;
  4541. z.low := z.low and not roundBitsMask;
  4542. End
  4543. else
  4544. Begin
  4545. if ( aExp <= $3FE ) then
  4546. Begin
  4547. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4548. Begin
  4549. result := a;
  4550. exit;
  4551. End;
  4552. set_inexact_flag;
  4553. aSign := extractFloat64Sign( a );
  4554. case ( softfloat_rounding_mode ) of
  4555. float_round_nearest_even:
  4556. Begin
  4557. if ( ( aExp = $3FE )
  4558. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4559. ) then
  4560. Begin
  4561. packFloat64( aSign, $3FF, 0, 0, result );
  4562. exit;
  4563. End;
  4564. End;
  4565. float_round_down:
  4566. Begin
  4567. if aSign<>0 then
  4568. packFloat64( 1, $3FF, 0, 0, result )
  4569. else
  4570. packFloat64( 0, 0, 0, 0, result );
  4571. exit;
  4572. End;
  4573. float_round_up:
  4574. Begin
  4575. if aSign <> 0 then
  4576. packFloat64( 1, 0, 0, 0, result )
  4577. else
  4578. packFloat64( 0, $3FF, 0, 0, result );
  4579. exit;
  4580. End;
  4581. end;
  4582. packFloat64( aSign, 0, 0, 0, result );
  4583. exit;
  4584. End;
  4585. lastBitMask := 1;
  4586. lastBitMask := lastBitMask shl ($413 - aExp);
  4587. roundBitsMask := lastBitMask - 1;
  4588. z.low := 0;
  4589. z.high := a.high;
  4590. roundingMode := softfloat_rounding_mode;
  4591. if ( roundingMode = float_round_nearest_even ) then
  4592. Begin
  4593. z.high := z.high + lastBitMask shr 1;
  4594. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4595. Begin
  4596. z.high := z.high and not lastBitMask;
  4597. End;
  4598. End
  4599. else if ( roundingMode <> float_round_to_zero ) then
  4600. Begin
  4601. if ( extractFloat64Sign( z )
  4602. xor flag( roundingMode = float_round_up ) )<> 0 then
  4603. Begin
  4604. z.high := z.high or bits32( a.low <> 0 );
  4605. z.high := z.high + roundBitsMask;
  4606. End;
  4607. End;
  4608. z.high := z.high and not roundBitsMask;
  4609. End;
  4610. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4611. Begin
  4612. set_inexact_flag;
  4613. End;
  4614. result := z;
  4615. End;
  4616. {*
  4617. -------------------------------------------------------------------------------
  4618. Returns the result of adding the absolute values of the double-precision
  4619. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4620. before being returned. `zSign' is ignored if the result is a NaN.
  4621. The addition is performed according to the IEC/IEEE Standard for Binary
  4622. Floating-Point Arithmetic.
  4623. -------------------------------------------------------------------------------
  4624. *}
  4625. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4626. Var
  4627. aExp, bExp, zExp: int16;
  4628. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4629. expDiff: int16;
  4630. label shiftRight1;
  4631. label roundAndPack;
  4632. Begin
  4633. aSig1 := extractFloat64Frac1( a );
  4634. aSig0 := extractFloat64Frac0( a );
  4635. aExp := extractFloat64Exp( a );
  4636. bSig1 := extractFloat64Frac1( b );
  4637. bSig0 := extractFloat64Frac0( b );
  4638. bExp := extractFloat64Exp( b );
  4639. expDiff := aExp - bExp;
  4640. if ( 0 < expDiff ) then
  4641. Begin
  4642. if ( aExp = $7FF ) then
  4643. Begin
  4644. if ( aSig0 OR aSig1 ) <> 0 then
  4645. Begin
  4646. propagateFloat64NaN( a, b, out );
  4647. exit;
  4648. end;
  4649. out := a;
  4650. exit;
  4651. End;
  4652. if ( bExp = 0 ) then
  4653. Begin
  4654. Dec(expDiff);
  4655. End
  4656. else
  4657. Begin
  4658. bSig0 := bSig0 or $00100000;
  4659. End;
  4660. shift64ExtraRightJamming(
  4661. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4662. zExp := aExp;
  4663. End
  4664. else if ( expDiff < 0 ) then
  4665. Begin
  4666. if ( bExp = $7FF ) then
  4667. Begin
  4668. if ( bSig0 OR bSig1 ) <> 0 then
  4669. Begin
  4670. propagateFloat64NaN( a, b, out );
  4671. exit;
  4672. End;
  4673. packFloat64( zSign, $7FF, 0, 0, out );
  4674. exit;
  4675. End;
  4676. if ( aExp = 0 ) then
  4677. Begin
  4678. Inc(expDiff);
  4679. End
  4680. else
  4681. Begin
  4682. aSig0 := aSig0 or $00100000;
  4683. End;
  4684. shift64ExtraRightJamming(
  4685. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4686. zExp := bExp;
  4687. End
  4688. else
  4689. Begin
  4690. if ( aExp = $7FF ) then
  4691. Begin
  4692. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4693. Begin
  4694. propagateFloat64NaN( a, b, out );
  4695. exit;
  4696. End;
  4697. out := a;
  4698. exit;
  4699. End;
  4700. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4701. if ( aExp = 0 ) then
  4702. Begin
  4703. packFloat64( zSign, 0, zSig0, zSig1, out );
  4704. exit;
  4705. End;
  4706. zSig2 := 0;
  4707. zSig0 := zSig0 or $00200000;
  4708. zExp := aExp;
  4709. goto shiftRight1;
  4710. End;
  4711. aSig0 := aSig0 or $00100000;
  4712. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4713. Dec(zExp);
  4714. if ( zSig0 < $00200000 ) then
  4715. goto roundAndPack;
  4716. Inc(zExp);
  4717. shiftRight1:
  4718. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4719. roundAndPack:
  4720. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4721. End;
  4722. {*
  4723. -------------------------------------------------------------------------------
  4724. Returns the result of subtracting the absolute values of the double-
  4725. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4726. difference is negated before being returned. `zSign' is ignored if the
  4727. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4728. Standard for Binary Floating-Point Arithmetic.
  4729. -------------------------------------------------------------------------------
  4730. *}
  4731. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4732. Var
  4733. aExp, bExp, zExp: int16;
  4734. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4735. expDiff: int16;
  4736. z: float64;
  4737. label aExpBigger;
  4738. label bExpBigger;
  4739. label aBigger;
  4740. label bBigger;
  4741. label normalizeRoundAndPack;
  4742. Begin
  4743. aSig1 := extractFloat64Frac1( a );
  4744. aSig0 := extractFloat64Frac0( a );
  4745. aExp := extractFloat64Exp( a );
  4746. bSig1 := extractFloat64Frac1( b );
  4747. bSig0 := extractFloat64Frac0( b );
  4748. bExp := extractFloat64Exp( b );
  4749. expDiff := aExp - bExp;
  4750. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4751. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4752. if ( 0 < expDiff ) then goto aExpBigger;
  4753. if ( expDiff < 0 ) then goto bExpBigger;
  4754. if ( aExp = $7FF ) then
  4755. Begin
  4756. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4757. Begin
  4758. propagateFloat64NaN( a, b, out );
  4759. exit;
  4760. End;
  4761. float_raise( float_flag_invalid );
  4762. z.low := float64_default_nan_low;
  4763. z.high := float64_default_nan_high;
  4764. out := z;
  4765. exit;
  4766. End;
  4767. if ( aExp = 0 ) then
  4768. Begin
  4769. aExp := 1;
  4770. bExp := 1;
  4771. End;
  4772. if ( bSig0 < aSig0 ) then goto aBigger;
  4773. if ( aSig0 < bSig0 ) then goto bBigger;
  4774. if ( bSig1 < aSig1 ) then goto aBigger;
  4775. if ( aSig1 < bSig1 ) then goto bBigger;
  4776. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4777. exit;
  4778. bExpBigger:
  4779. if ( bExp = $7FF ) then
  4780. Begin
  4781. if ( bSig0 OR bSig1 ) <> 0 then
  4782. Begin
  4783. propagateFloat64NaN( a, b, out );
  4784. exit;
  4785. End;
  4786. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4787. exit;
  4788. End;
  4789. if ( aExp = 0 ) then
  4790. Begin
  4791. Inc(expDiff);
  4792. End
  4793. else
  4794. Begin
  4795. aSig0 := aSig0 or $40000000;
  4796. End;
  4797. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4798. bSig0 := bSig0 or $40000000;
  4799. bBigger:
  4800. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4801. zExp := bExp;
  4802. zSign := zSign xor 1;
  4803. goto normalizeRoundAndPack;
  4804. aExpBigger:
  4805. if ( aExp = $7FF ) then
  4806. Begin
  4807. if ( aSig0 OR aSig1 ) <> 0 then
  4808. Begin
  4809. propagateFloat64NaN( a, b, out );
  4810. exit;
  4811. End;
  4812. out := a;
  4813. exit;
  4814. End;
  4815. if ( bExp = 0 ) then
  4816. Begin
  4817. Dec(expDiff);
  4818. End
  4819. else
  4820. Begin
  4821. bSig0 := bSig0 or $40000000;
  4822. End;
  4823. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4824. aSig0 := aSig0 or $40000000;
  4825. aBigger:
  4826. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4827. zExp := aExp;
  4828. normalizeRoundAndPack:
  4829. Dec(zExp);
  4830. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4831. End;
  4832. {*
  4833. -------------------------------------------------------------------------------
  4834. Returns the result of adding the double-precision floating-point values `a'
  4835. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4836. Binary Floating-Point Arithmetic.
  4837. -------------------------------------------------------------------------------
  4838. *}
  4839. Function float64_add( a: float64; b : float64) : Float64;
  4840. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4841. Var
  4842. aSign, bSign: flag;
  4843. Begin
  4844. aSign := extractFloat64Sign( a );
  4845. bSign := extractFloat64Sign( b );
  4846. if ( aSign = bSign ) then
  4847. Begin
  4848. addFloat64Sigs( a, b, aSign, result );
  4849. End
  4850. else
  4851. Begin
  4852. subFloat64Sigs( a, b, aSign, result );
  4853. End;
  4854. End;
  4855. {*
  4856. -------------------------------------------------------------------------------
  4857. Returns the result of subtracting the double-precision floating-point values
  4858. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4859. for Binary Floating-Point Arithmetic.
  4860. -------------------------------------------------------------------------------
  4861. *}
  4862. Function float64_sub(a: float64; b : float64) : Float64;
  4863. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4864. Var
  4865. aSign, bSign: flag;
  4866. Begin
  4867. aSign := extractFloat64Sign( a );
  4868. bSign := extractFloat64Sign( b );
  4869. if ( aSign = bSign ) then
  4870. Begin
  4871. subFloat64Sigs( a, b, aSign, result );
  4872. End
  4873. else
  4874. Begin
  4875. addFloat64Sigs( a, b, aSign, result );
  4876. End;
  4877. End;
  4878. {*
  4879. -------------------------------------------------------------------------------
  4880. Returns the result of multiplying the double-precision floating-point values
  4881. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4882. for Binary Floating-Point Arithmetic.
  4883. -------------------------------------------------------------------------------
  4884. *}
  4885. Function float64_mul( a: float64; b:float64) : Float64;
  4886. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4887. Var
  4888. aSign, bSign, zSign: flag;
  4889. aExp, bExp, zExp: int16;
  4890. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4891. z: float64;
  4892. label invalid;
  4893. Begin
  4894. aSig1 := extractFloat64Frac1( a );
  4895. aSig0 := extractFloat64Frac0( a );
  4896. aExp := extractFloat64Exp( a );
  4897. aSign := extractFloat64Sign( a );
  4898. bSig1 := extractFloat64Frac1( b );
  4899. bSig0 := extractFloat64Frac0( b );
  4900. bExp := extractFloat64Exp( b );
  4901. bSign := extractFloat64Sign( b );
  4902. zSign := aSign xor bSign;
  4903. if ( aExp = $7FF ) then
  4904. Begin
  4905. if ( (( aSig0 OR aSig1 ) <>0)
  4906. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4907. Begin
  4908. propagateFloat64NaN( a, b, result );
  4909. exit;
  4910. End;
  4911. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4912. packFloat64( zSign, $7FF, 0, 0, result );
  4913. exit;
  4914. End;
  4915. if ( bExp = $7FF ) then
  4916. Begin
  4917. if ( bSig0 OR bSig1 )<> 0 then
  4918. Begin
  4919. propagateFloat64NaN( a, b, result );
  4920. exit;
  4921. End;
  4922. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4923. Begin
  4924. invalid:
  4925. float_raise( float_flag_invalid );
  4926. z.low := float64_default_nan_low;
  4927. z.high := float64_default_nan_high;
  4928. result := z;
  4929. exit;
  4930. End;
  4931. packFloat64( zSign, $7FF, 0, 0, result );
  4932. exit;
  4933. End;
  4934. if ( aExp = 0 ) then
  4935. Begin
  4936. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4937. Begin
  4938. packFloat64( zSign, 0, 0, 0, result );
  4939. exit;
  4940. End;
  4941. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4942. End;
  4943. if ( bExp = 0 ) then
  4944. Begin
  4945. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4946. Begin
  4947. packFloat64( zSign, 0, 0, 0, result );
  4948. exit;
  4949. End;
  4950. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4951. End;
  4952. zExp := aExp + bExp - $400;
  4953. aSig0 := aSig0 or $00100000;
  4954. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4955. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4956. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4957. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4958. if ( $00200000 <= zSig0 ) then
  4959. Begin
  4960. shift64ExtraRightJamming(
  4961. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4962. Inc(zExp);
  4963. End;
  4964. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4965. End;
  4966. {*
  4967. -------------------------------------------------------------------------------
  4968. Returns the result of dividing the double-precision floating-point value `a'
  4969. by the corresponding value `b'. The operation is performed according to the
  4970. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4971. -------------------------------------------------------------------------------
  4972. *}
  4973. Function float64_div(a: float64; b : float64) : Float64;
  4974. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4975. Var
  4976. aSign, bSign, zSign: flag;
  4977. aExp, bExp, zExp: int16;
  4978. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4979. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4980. z: float64;
  4981. label invalid;
  4982. Begin
  4983. aSig1 := extractFloat64Frac1( a );
  4984. aSig0 := extractFloat64Frac0( a );
  4985. aExp := extractFloat64Exp( a );
  4986. aSign := extractFloat64Sign( a );
  4987. bSig1 := extractFloat64Frac1( b );
  4988. bSig0 := extractFloat64Frac0( b );
  4989. bExp := extractFloat64Exp( b );
  4990. bSign := extractFloat64Sign( b );
  4991. zSign := aSign xor bSign;
  4992. if ( aExp = $7FF ) then
  4993. Begin
  4994. if ( aSig0 OR aSig1 )<> 0 then
  4995. Begin
  4996. propagateFloat64NaN( a, b, result );
  4997. exit;
  4998. end;
  4999. if ( bExp = $7FF ) then
  5000. Begin
  5001. if ( bSig0 OR bSig1 )<>0 then
  5002. Begin
  5003. propagateFloat64NaN( a, b, result );
  5004. exit;
  5005. End;
  5006. goto invalid;
  5007. End;
  5008. packFloat64( zSign, $7FF, 0, 0, result );
  5009. exit;
  5010. End;
  5011. if ( bExp = $7FF ) then
  5012. Begin
  5013. if ( bSig0 OR bSig1 )<> 0 then
  5014. Begin
  5015. propagateFloat64NaN( a, b, result );
  5016. exit;
  5017. End;
  5018. packFloat64( zSign, 0, 0, 0, result );
  5019. exit;
  5020. End;
  5021. if ( bExp = 0 ) then
  5022. Begin
  5023. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5024. Begin
  5025. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5026. Begin
  5027. invalid:
  5028. float_raise( float_flag_invalid );
  5029. z.low := float64_default_nan_low;
  5030. z.high := float64_default_nan_high;
  5031. result := z;
  5032. exit;
  5033. End;
  5034. float_raise( float_flag_divbyzero );
  5035. packFloat64( zSign, $7FF, 0, 0, result );
  5036. exit;
  5037. End;
  5038. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5039. End;
  5040. if ( aExp = 0 ) then
  5041. Begin
  5042. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5043. Begin
  5044. packFloat64( zSign, 0, 0, 0, result );
  5045. exit;
  5046. End;
  5047. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5048. End;
  5049. zExp := aExp - bExp + $3FD;
  5050. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  5051. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5052. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  5053. Begin
  5054. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  5055. Inc(zExp);
  5056. End;
  5057. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5058. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  5059. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  5060. while ( sbits32 (rem0) < 0 ) do
  5061. Begin
  5062. Dec(zSig0);
  5063. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  5064. End;
  5065. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  5066. if ( ( zSig1 and $3FF ) <= 4 ) then
  5067. Begin
  5068. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  5069. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  5070. while ( sbits32 (rem1) < 0 ) do
  5071. Begin
  5072. Dec(zSig1);
  5073. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  5074. End;
  5075. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5076. End;
  5077. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  5078. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  5079. End;
  5080. {*
  5081. -------------------------------------------------------------------------------
  5082. Returns the remainder of the double-precision floating-point value `a'
  5083. with respect to the corresponding value `b'. The operation is performed
  5084. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5085. -------------------------------------------------------------------------------
  5086. *}
  5087. Function float64_rem(a: float64; b : float64) : float64;
  5088. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  5089. Var
  5090. aSign, zSign: flag;
  5091. aExp, bExp, expDiff: int16;
  5092. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  5093. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  5094. sigMean0: sbits32;
  5095. z: float64;
  5096. label invalid;
  5097. Begin
  5098. aSig1 := extractFloat64Frac1( a );
  5099. aSig0 := extractFloat64Frac0( a );
  5100. aExp := extractFloat64Exp( a );
  5101. aSign := extractFloat64Sign( a );
  5102. bSig1 := extractFloat64Frac1( b );
  5103. bSig0 := extractFloat64Frac0( b );
  5104. bExp := extractFloat64Exp( b );
  5105. if ( aExp = $7FF ) then
  5106. Begin
  5107. if ((( aSig0 OR aSig1 )<>0)
  5108. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  5109. Begin
  5110. propagateFloat64NaN( a, b, result );
  5111. exit;
  5112. End;
  5113. goto invalid;
  5114. End;
  5115. if ( bExp = $7FF ) then
  5116. Begin
  5117. if ( bSig0 OR bSig1 ) <> 0 then
  5118. Begin
  5119. propagateFloat64NaN( a, b, result );
  5120. exit;
  5121. End;
  5122. result := a;
  5123. exit;
  5124. End;
  5125. if ( bExp = 0 ) then
  5126. Begin
  5127. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5128. Begin
  5129. invalid:
  5130. float_raise( float_flag_invalid );
  5131. z.low := float64_default_nan_low;
  5132. z.high := float64_default_nan_high;
  5133. result := z;
  5134. exit;
  5135. End;
  5136. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5137. End;
  5138. if ( aExp = 0 ) then
  5139. Begin
  5140. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5141. Begin
  5142. result := a;
  5143. exit;
  5144. End;
  5145. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5146. End;
  5147. expDiff := aExp - bExp;
  5148. if ( expDiff < -1 ) then
  5149. Begin
  5150. result := a;
  5151. exit;
  5152. End;
  5153. shortShift64Left(
  5154. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5155. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5156. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5157. if ( q )<>0 then
  5158. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5159. expDiff := expDiff - 32;
  5160. while ( 0 < expDiff ) do
  5161. Begin
  5162. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5163. if 4 < q then
  5164. q:= q - 4
  5165. else
  5166. q := 0;
  5167. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5168. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5169. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5170. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5171. expDiff := expDiff - 29;
  5172. End;
  5173. if ( -32 < expDiff ) then
  5174. Begin
  5175. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5176. if 4 < q then
  5177. q := q - 4
  5178. else
  5179. q := 0;
  5180. q := q shr (- expDiff);
  5181. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5182. expDiff := expDiff + 24;
  5183. if ( expDiff < 0 ) then
  5184. Begin
  5185. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5186. End
  5187. else
  5188. Begin
  5189. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5190. End;
  5191. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5192. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5193. End
  5194. else
  5195. Begin
  5196. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5197. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5198. End;
  5199. Repeat
  5200. alternateASig0 := aSig0;
  5201. alternateASig1 := aSig1;
  5202. Inc(q);
  5203. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5204. Until not ( 0 <= sbits32 (aSig0) );
  5205. add64(
  5206. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5207. if ( ( sigMean0 < 0 )
  5208. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5209. Begin
  5210. aSig0 := alternateASig0;
  5211. aSig1 := alternateASig1;
  5212. End;
  5213. zSign := flag( sbits32 (aSig0) < 0 );
  5214. if ( zSign <> 0 ) then
  5215. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5216. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5217. End;
  5218. {*
  5219. -------------------------------------------------------------------------------
  5220. Returns the square root of the double-precision floating-point value `a'.
  5221. The operation is performed according to the IEC/IEEE Standard for Binary
  5222. Floating-Point Arithmetic.
  5223. -------------------------------------------------------------------------------
  5224. *}
  5225. function float64_sqrt( a: float64 ): float64;
  5226. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5227. Var
  5228. aSign: flag;
  5229. aExp, zExp: int16;
  5230. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5231. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5232. label invalid;
  5233. Begin
  5234. aSig1 := extractFloat64Frac1( a );
  5235. aSig0 := extractFloat64Frac0( a );
  5236. aExp := extractFloat64Exp( a );
  5237. aSign := extractFloat64Sign( a );
  5238. if ( aExp = $7FF ) then
  5239. Begin
  5240. if ( aSig0 OR aSig1 ) <> 0 then
  5241. Begin
  5242. propagateFloat64NaN( a, a, result );
  5243. exit;
  5244. End;
  5245. if ( aSign = 0) then
  5246. Begin
  5247. result := a;
  5248. exit;
  5249. End;
  5250. goto invalid;
  5251. End;
  5252. if ( aSign <> 0 ) then
  5253. Begin
  5254. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5255. Begin
  5256. result := a;
  5257. exit;
  5258. End;
  5259. invalid:
  5260. float_raise( float_flag_invalid );
  5261. result.low := float64_default_nan_low;
  5262. result.high := float64_default_nan_high;
  5263. exit;
  5264. End;
  5265. if ( aExp = 0 ) then
  5266. Begin
  5267. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5268. Begin
  5269. packFloat64( 0, 0, 0, 0, result );
  5270. exit;
  5271. End;
  5272. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5273. End;
  5274. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5275. aSig0 := aSig0 or $00100000;
  5276. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5277. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5278. if ( zSig0 = 0 ) then
  5279. zSig0 := $7FFFFFFF;
  5280. doubleZSig0 := zSig0 + zSig0;
  5281. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5282. mul32To64( zSig0, zSig0, term0, term1 );
  5283. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5284. while ( sbits32 (rem0) < 0 ) do
  5285. Begin
  5286. Dec(zSig0);
  5287. doubleZSig0 := doubleZSig0 - 2;
  5288. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5289. End;
  5290. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5291. if ( ( zSig1 and $1FF ) <= 5 ) then
  5292. Begin
  5293. if ( zSig1 = 0 ) then
  5294. zSig1 := 1;
  5295. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5296. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5297. mul32To64( zSig1, zSig1, term2, term3 );
  5298. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5299. while ( sbits32 (rem1) < 0 ) do
  5300. Begin
  5301. Dec(zSig1);
  5302. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5303. term3 := term3 or 1;
  5304. term2 := term2 or doubleZSig0;
  5305. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5306. End;
  5307. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5308. End;
  5309. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5310. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5311. End;
  5312. {*
  5313. -------------------------------------------------------------------------------
  5314. Returns 1 if the double-precision floating-point value `a' is equal to
  5315. the corresponding value `b', and 0 otherwise. The comparison is performed
  5316. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5317. -------------------------------------------------------------------------------
  5318. *}
  5319. Function float64_eq(a: float64; b: float64): flag;
  5320. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5321. Begin
  5322. if
  5323. (
  5324. ( extractFloat64Exp( a ) = $7FF )
  5325. AND
  5326. (
  5327. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5328. )
  5329. )
  5330. OR (
  5331. ( extractFloat64Exp( b ) = $7FF )
  5332. AND (
  5333. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5334. )
  5335. )
  5336. ) then
  5337. Begin
  5338. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5339. float_raise( float_flag_invalid );
  5340. float64_eq := 0;
  5341. exit;
  5342. End;
  5343. float64_eq := flag(
  5344. ( a.low = b.low )
  5345. AND ( ( a.high = b.high )
  5346. OR ( ( a.low = 0 )
  5347. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5348. ));
  5349. End;
  5350. {*
  5351. -------------------------------------------------------------------------------
  5352. Returns 1 if the double-precision floating-point value `a' is less than
  5353. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5354. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5355. Arithmetic.
  5356. -------------------------------------------------------------------------------
  5357. *}
  5358. Function float64_le(a: float64;b: float64): flag;
  5359. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5360. Var
  5361. aSign, bSign: flag;
  5362. Begin
  5363. if
  5364. (
  5365. ( extractFloat64Exp( a ) = $7FF )
  5366. AND
  5367. (
  5368. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5369. )
  5370. )
  5371. OR (
  5372. ( extractFloat64Exp( b ) = $7FF )
  5373. AND (
  5374. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5375. )
  5376. )
  5377. ) then
  5378. Begin
  5379. float_raise( float_flag_invalid );
  5380. float64_le := 0;
  5381. exit;
  5382. End;
  5383. aSign := extractFloat64Sign( a );
  5384. bSign := extractFloat64Sign( b );
  5385. if ( aSign <> bSign ) then
  5386. Begin
  5387. float64_le := flag(
  5388. (aSign <> 0)
  5389. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5390. = 0 ));
  5391. exit;
  5392. End;
  5393. if aSign <> 0 then
  5394. float64_le := le64( b.high, b.low, a.high, a.low )
  5395. else
  5396. float64_le := le64( a.high, a.low, b.high, b.low );
  5397. End;
  5398. {*
  5399. -------------------------------------------------------------------------------
  5400. Returns 1 if the double-precision floating-point value `a' is less than
  5401. the corresponding value `b', and 0 otherwise. The comparison is performed
  5402. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5403. -------------------------------------------------------------------------------
  5404. *}
  5405. Function float64_lt(a: float64;b: float64): flag;
  5406. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5407. Var
  5408. aSign, bSign: flag;
  5409. Begin
  5410. if
  5411. (
  5412. ( extractFloat64Exp( a ) = $7FF )
  5413. AND
  5414. (
  5415. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5416. )
  5417. )
  5418. OR (
  5419. ( extractFloat64Exp( b ) = $7FF )
  5420. AND (
  5421. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5422. )
  5423. )
  5424. ) then
  5425. Begin
  5426. float_raise( float_flag_invalid );
  5427. float64_lt := 0;
  5428. exit;
  5429. End;
  5430. aSign := extractFloat64Sign( a );
  5431. bSign := extractFloat64Sign( b );
  5432. if ( aSign <> bSign ) then
  5433. Begin
  5434. float64_lt := flag(
  5435. (aSign <> 0)
  5436. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5437. <> 0 ));
  5438. exit;
  5439. End;
  5440. if aSign <> 0 then
  5441. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5442. else
  5443. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5444. End;
  5445. {*
  5446. -------------------------------------------------------------------------------
  5447. Returns 1 if the double-precision floating-point value `a' is equal to
  5448. the corresponding value `b', and 0 otherwise. The invalid exception is
  5449. raised if either operand is a NaN. Otherwise, the comparison is performed
  5450. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5451. -------------------------------------------------------------------------------
  5452. *}
  5453. Function float64_eq_signaling( a: float64; b: float64): flag;
  5454. Begin
  5455. if
  5456. (
  5457. ( extractFloat64Exp( a ) = $7FF )
  5458. AND
  5459. (
  5460. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5461. )
  5462. )
  5463. OR (
  5464. ( extractFloat64Exp( b ) = $7FF )
  5465. AND (
  5466. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5467. )
  5468. )
  5469. ) then
  5470. Begin
  5471. float_raise( float_flag_invalid );
  5472. float64_eq_signaling := 0;
  5473. exit;
  5474. End;
  5475. float64_eq_signaling := flag(
  5476. ( a.low = b.low )
  5477. AND ( ( a.high = b.high )
  5478. OR ( ( a.low = 0 )
  5479. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5480. ));
  5481. End;
  5482. {*
  5483. -------------------------------------------------------------------------------
  5484. Returns 1 if the double-precision floating-point value `a' is less than or
  5485. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5486. cause an exception. Otherwise, the comparison is performed according to the
  5487. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5488. -------------------------------------------------------------------------------
  5489. *}
  5490. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5491. Var
  5492. aSign, bSign : flag;
  5493. Begin
  5494. if
  5495. (
  5496. ( extractFloat64Exp( a ) = $7FF )
  5497. AND
  5498. (
  5499. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5500. )
  5501. )
  5502. OR (
  5503. ( extractFloat64Exp( b ) = $7FF )
  5504. AND (
  5505. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5506. )
  5507. )
  5508. ) then
  5509. Begin
  5510. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5511. float_raise( float_flag_invalid );
  5512. float64_le_quiet := 0;
  5513. exit;
  5514. End;
  5515. aSign := extractFloat64Sign( a );
  5516. bSign := extractFloat64Sign( b );
  5517. if ( aSign <> bSign ) then
  5518. Begin
  5519. float64_le_quiet := flag
  5520. ((aSign <> 0)
  5521. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5522. = 0 ));
  5523. exit;
  5524. End;
  5525. if aSign <> 0 then
  5526. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5527. else
  5528. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5529. End;
  5530. {*
  5531. -------------------------------------------------------------------------------
  5532. Returns 1 if the double-precision floating-point value `a' is less than
  5533. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5534. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5535. Standard for Binary Floating-Point Arithmetic.
  5536. -------------------------------------------------------------------------------
  5537. *}
  5538. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5539. Var
  5540. aSign, bSign: flag;
  5541. Begin
  5542. if
  5543. (
  5544. ( extractFloat64Exp( a ) = $7FF )
  5545. AND
  5546. (
  5547. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5548. )
  5549. )
  5550. OR (
  5551. ( extractFloat64Exp( b ) = $7FF )
  5552. AND (
  5553. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5554. )
  5555. )
  5556. ) then
  5557. Begin
  5558. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5559. float_raise( float_flag_invalid );
  5560. float64_lt_quiet := 0;
  5561. exit;
  5562. End;
  5563. aSign := extractFloat64Sign( a );
  5564. bSign := extractFloat64Sign( b );
  5565. if ( aSign <> bSign ) then
  5566. Begin
  5567. float64_lt_quiet := flag(
  5568. (aSign<>0)
  5569. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5570. <> 0 ));
  5571. exit;
  5572. End;
  5573. If aSign <> 0 then
  5574. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5575. else
  5576. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5577. End;
  5578. {*----------------------------------------------------------------------------
  5579. | Returns the result of converting the 64-bit two's complement integer `a'
  5580. | to the single-precision floating-point format. The conversion is performed
  5581. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5582. *----------------------------------------------------------------------------*}
  5583. function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  5584. var
  5585. zSign : flag;
  5586. absA : uint64;
  5587. shiftCount: int8;
  5588. Begin
  5589. if ( a = 0 ) then
  5590. begin
  5591. int64_to_float32.float32 := 0;
  5592. exit;
  5593. end;
  5594. if a < 0 then
  5595. zSign := flag(TRUE)
  5596. else
  5597. zSign := flag(FALSE);
  5598. if zSign<>0 then
  5599. absA := -a
  5600. else
  5601. absA := a;
  5602. shiftCount := countLeadingZeros64( absA ) - 40;
  5603. if ( 0 <= shiftCount ) then
  5604. begin
  5605. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5606. end
  5607. else
  5608. begin
  5609. shiftCount := shiftCount + 7;
  5610. if ( shiftCount < 0 ) then
  5611. shift64RightJamming( absA, - shiftCount, absA )
  5612. else
  5613. absA := absA shl shiftCount;
  5614. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5615. end;
  5616. End;
  5617. {*----------------------------------------------------------------------------
  5618. | Returns the result of converting the 64-bit two's complement integer `a'
  5619. | to the single-precision floating-point format. The conversion is performed
  5620. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5621. | Unisgned version.
  5622. *----------------------------------------------------------------------------*}
  5623. function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  5624. var
  5625. absA : uint64;
  5626. shiftCount: int8;
  5627. Begin
  5628. if ( a = 0 ) then
  5629. begin
  5630. qword_to_float32.float32 := 0;
  5631. exit;
  5632. end;
  5633. absA := a;
  5634. shiftCount := countLeadingZeros64( absA ) - 40;
  5635. if ( 0 <= shiftCount ) then
  5636. begin
  5637. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5638. end
  5639. else
  5640. begin
  5641. shiftCount := shiftCount + 7;
  5642. if ( shiftCount < 0 ) then
  5643. shift64RightJamming( absA, - shiftCount, absA )
  5644. else
  5645. absA := absA shl shiftCount;
  5646. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5647. end;
  5648. End;
  5649. {*----------------------------------------------------------------------------
  5650. | Returns the result of converting the 64-bit two's complement integer `a'
  5651. | to the double-precision floating-point format. The conversion is performed
  5652. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5653. *----------------------------------------------------------------------------*}
  5654. function qword_to_float64( a: qword ): float64;
  5655. {$ifdef FPC_IS_SYSTEM}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5656. var
  5657. shiftCount: int8;
  5658. Begin
  5659. if ( a = 0 ) then
  5660. result := packFloat64( 0, 0, 0 )
  5661. else
  5662. begin
  5663. shiftCount := countLeadingZeros64(a) - 1;
  5664. { numbers with <= 53 significant bits are converted exactly }
  5665. if (shiftCount > 9) then
  5666. result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
  5667. else if (shiftCount>=0) then
  5668. result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
  5669. else
  5670. begin
  5671. { the only possible negative value is -1, in case bit 63 of 'a' is set }
  5672. shift64RightJamming(a, 1, a);
  5673. result := roundAndPackFloat64(0, $43d, a);
  5674. end;
  5675. end;
  5676. End;
  5677. {*----------------------------------------------------------------------------
  5678. | Returns the result of converting the 64-bit two's complement integer `a'
  5679. | to the double-precision floating-point format. The conversion is performed
  5680. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5681. *----------------------------------------------------------------------------*}
  5682. function int64_to_float64( a: int64 ): float64;
  5683. {$ifdef FPC_IS_SYSTEM}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5684. Begin
  5685. if ( a = 0 ) then
  5686. result := packFloat64( 0, 0, 0 )
  5687. else if (a = int64($8000000000000000)) then
  5688. result := packFloat64( 1, $43e, 0 )
  5689. else if (a < 0) then
  5690. result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  5691. else
  5692. result := normalizeRoundAndPackFloat64( 0, $43c, a );
  5693. End;
  5694. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5695. {*----------------------------------------------------------------------------
  5696. | Returns the result of converting the 64-bit two's complement integer `a'
  5697. | to the extended double-precision floating-point format. The conversion
  5698. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5699. | Arithmetic.
  5700. *----------------------------------------------------------------------------*}
  5701. function int64_to_floatx80( a: int64 ): floatx80;
  5702. var
  5703. zSign: flag;
  5704. absA: uint64;
  5705. shiftCount: int8;
  5706. begin
  5707. if ( a = 0 ) then begin
  5708. result := packFloatx80( 0, 0, 0 );
  5709. exit;
  5710. end;
  5711. zSign := ord( a < 0 );
  5712. if zSign <> 0 then absA := - a else absA := a;
  5713. shiftCount := countLeadingZeros64( absA );
  5714. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5715. end;
  5716. {*----------------------------------------------------------------------------
  5717. | Returns the result of converting the 64-bit two's complement integer `a'
  5718. | to the extended double-precision floating-point format. The conversion
  5719. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5720. | Arithmetic.
  5721. | Unsigned version.
  5722. *----------------------------------------------------------------------------*}
  5723. function qword_to_floatx80( a: qword ): floatx80;
  5724. var
  5725. absA: bits64;
  5726. shiftCount: int8;
  5727. begin
  5728. if ( a = 0 ) then begin
  5729. result := packFloatx80( 0, 0, 0 );
  5730. exit;
  5731. end;
  5732. absA := a;
  5733. shiftCount := countLeadingZeros64( absA );
  5734. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5735. end;
  5736. {$endif FPC_SOFTFLOAT_FLOATX80}
  5737. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5738. {*----------------------------------------------------------------------------
  5739. | Returns the result of converting the 64-bit two's complement integer `a' to
  5740. | the quadruple-precision floating-point format. The conversion is performed
  5741. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5742. *----------------------------------------------------------------------------*}
  5743. function int64_to_float128( a: int64 ): float128;
  5744. var
  5745. zSign: flag;
  5746. absA: uint64;
  5747. shiftCount: int8;
  5748. zExp: int32;
  5749. zSig0, zSig1: bits64;
  5750. begin
  5751. if ( a = 0 ) then begin
  5752. result := packFloat128( 0, 0, 0, 0 );
  5753. exit;
  5754. end;
  5755. zSign := ord( a < 0 );
  5756. if zSign <> 0 then absA := - a else absA := a;
  5757. shiftCount := countLeadingZeros64( absA ) + 49;
  5758. zExp := $406E - shiftCount;
  5759. if ( 64 <= shiftCount ) then begin
  5760. zSig1 := 0;
  5761. zSig0 := absA;
  5762. dec( shiftCount, 64 );
  5763. end
  5764. else begin
  5765. zSig1 := absA;
  5766. zSig0 := 0;
  5767. end;
  5768. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5769. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5770. end;
  5771. {*----------------------------------------------------------------------------
  5772. | Returns the result of converting the 64-bit two's complement integer `a' to
  5773. | the quadruple-precision floating-point format. The conversion is performed
  5774. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5775. | Unsigned version.
  5776. *----------------------------------------------------------------------------*}
  5777. function qword_to_float128( a: qword ): float128;
  5778. var
  5779. absA: bits64;
  5780. shiftCount: int8;
  5781. zExp: int32;
  5782. zSig0, zSig1: bits64;
  5783. begin
  5784. if ( a = 0 ) then begin
  5785. result := packFloat128( 0, 0, 0, 0 );
  5786. exit;
  5787. end;
  5788. absA := a;
  5789. shiftCount := countLeadingZeros64( absA ) + 49;
  5790. zExp := $406E - shiftCount;
  5791. if ( 64 <= shiftCount ) then begin
  5792. zSig1 := 0;
  5793. zSig0 := absA;
  5794. dec( shiftCount, 64 );
  5795. end
  5796. else begin
  5797. zSig1 := absA;
  5798. zSig0 := 0;
  5799. end;
  5800. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5801. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5802. end;
  5803. {$endif FPC_SOFTFLOAT_FLOAT128}
  5804. {*----------------------------------------------------------------------------
  5805. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5806. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5807. | Otherwise, returns 0.
  5808. *----------------------------------------------------------------------------*}
  5809. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5810. begin
  5811. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5812. end;
  5813. {*----------------------------------------------------------------------------
  5814. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5815. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5816. | Otherwise, returns 0.
  5817. *----------------------------------------------------------------------------*}
  5818. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5819. begin
  5820. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5821. end;
  5822. {*----------------------------------------------------------------------------
  5823. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5824. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5825. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5826. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5827. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5828. | the most-significant bit of the extra result, and the other 63 bits of the
  5829. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5830. | were all zero. This extra result is stored in the location pointed to by
  5831. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5832. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5833. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5834. | fixed-point value is shifted right by the number of bits given in `count',
  5835. | and the integer part of the result is returned at the locations pointed to
  5836. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5837. | corrupted as described above, and is returned at the location pointed to by
  5838. | `z2Ptr'.)
  5839. *----------------------------------------------------------------------------*}
  5840. procedure shift128ExtraRightJamming(
  5841. a0: bits64;
  5842. a1: bits64;
  5843. a2: bits64;
  5844. count: int16;
  5845. var z0Ptr: bits64;
  5846. var z1Ptr: bits64;
  5847. var z2Ptr: bits64);
  5848. var
  5849. z0, z1, z2: bits64;
  5850. negCount: int8;
  5851. begin
  5852. negCount := ( - count ) and 63;
  5853. if ( count = 0 ) then
  5854. begin
  5855. z2 := a2;
  5856. z1 := a1;
  5857. z0 := a0;
  5858. end
  5859. else begin
  5860. if ( count < 64 ) then
  5861. begin
  5862. z2 := a1 shl negCount;
  5863. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5864. z0 := a0 shr count;
  5865. end
  5866. else begin
  5867. if ( count = 64 ) then
  5868. begin
  5869. z2 := a1;
  5870. z1 := a0;
  5871. end
  5872. else begin
  5873. a2 := a2 or a1;
  5874. if ( count < 128 ) then
  5875. begin
  5876. z2 := a0 shl negCount;
  5877. z1 := a0 shr ( count and 63 );
  5878. end
  5879. else begin
  5880. if ( count = 128 ) then
  5881. z2 := a0
  5882. else
  5883. z2 := ord( a0 <> 0 );
  5884. z1 := 0;
  5885. end;
  5886. end;
  5887. z0 := 0;
  5888. end;
  5889. z2 := z2 or ord( a2 <> 0 );
  5890. end;
  5891. z2Ptr := z2;
  5892. z1Ptr := z1;
  5893. z0Ptr := z0;
  5894. end;
  5895. {*----------------------------------------------------------------------------
  5896. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5897. | _plus_ the number of bits given in `count'. The shifted result is at most
  5898. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5899. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5900. | shifted off is the most-significant bit of the extra result, and the other
  5901. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5902. | bits shifted off were all zero. This extra result is stored in the location
  5903. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5904. | (This routine makes more sense if `a0' and `a1' are considered to form
  5905. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5906. | point value is shifted right by the number of bits given in `count', and
  5907. | the integer part of the result is returned at the location pointed to by
  5908. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5909. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5910. *----------------------------------------------------------------------------*}
  5911. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5912. var
  5913. z0, z1: bits64;
  5914. negCount: int8;
  5915. begin
  5916. negCount := ( - count ) and 63;
  5917. if ( count = 0 ) then
  5918. begin
  5919. z1 := a1;
  5920. z0 := a0;
  5921. end
  5922. else if ( count < 64 ) then
  5923. begin
  5924. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5925. z0 := a0 shr count;
  5926. end
  5927. else begin
  5928. if ( count = 64 ) then
  5929. begin
  5930. z1 := a0 or ord( a1 <> 0 );
  5931. end
  5932. else begin
  5933. z1 := ord( ( a0 or a1 ) <> 0 );
  5934. end;
  5935. z0 := 0;
  5936. end;
  5937. z1Ptr := z1;
  5938. z0Ptr := z0;
  5939. end;
  5940. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5941. {*----------------------------------------------------------------------------
  5942. | Returns the fraction bits of the extended double-precision floating-point
  5943. | value `a'.
  5944. *----------------------------------------------------------------------------*}
  5945. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5946. begin
  5947. result:=a.low;
  5948. end;
  5949. {*----------------------------------------------------------------------------
  5950. | Returns the exponent bits of the extended double-precision floating-point
  5951. | value `a'.
  5952. *----------------------------------------------------------------------------*}
  5953. function extractFloatx80Exp(a : floatx80): int32;inline;
  5954. begin
  5955. result:=a.high and $7FFF;
  5956. end;
  5957. {*----------------------------------------------------------------------------
  5958. | Returns the sign bit of the extended double-precision floating-point value
  5959. | `a'.
  5960. *----------------------------------------------------------------------------*}
  5961. function extractFloatx80Sign(a : floatx80): flag;inline;
  5962. begin
  5963. result:=a.high shr 15;
  5964. end;
  5965. {*----------------------------------------------------------------------------
  5966. | Normalizes the subnormal extended double-precision floating-point value
  5967. | represented by the denormalized significand `aSig'. The normalized exponent
  5968. | and significand are stored at the locations pointed to by `zExpPtr' and
  5969. | `zSigPtr', respectively.
  5970. *----------------------------------------------------------------------------*}
  5971. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5972. var
  5973. shiftCount: int8;
  5974. begin
  5975. shiftCount := countLeadingZeros64( aSig );
  5976. zSigPtr := aSig shl shiftCount;
  5977. zExpPtr := 1 - shiftCount;
  5978. end;
  5979. {*----------------------------------------------------------------------------
  5980. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5981. | extended double-precision floating-point value, returning the result.
  5982. *----------------------------------------------------------------------------*}
  5983. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5984. var
  5985. z: floatx80;
  5986. begin
  5987. z.low := zSig;
  5988. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5989. result:=z;
  5990. end;
  5991. {*----------------------------------------------------------------------------
  5992. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5993. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5994. | and returns the proper extended double-precision floating-point value
  5995. | corresponding to the abstract input. Ordinarily, the abstract value is
  5996. | rounded and packed into the extended double-precision format, with the
  5997. | inexact exception raised if the abstract input cannot be represented
  5998. | exactly. However, if the abstract value is too large, the overflow and
  5999. | inexact exceptions are raised and an infinity or maximal finite value is
  6000. | returned. If the abstract value is too small, the input value is rounded to
  6001. | a subnormal number, and the underflow and inexact exceptions are raised if
  6002. | the abstract input cannot be represented exactly as a subnormal extended
  6003. | double-precision floating-point number.
  6004. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  6005. | number of bits as single or double precision, respectively. Otherwise, the
  6006. | result is rounded to the full precision of the extended double-precision
  6007. | format.
  6008. | The input significand must be normalized or smaller. If the input
  6009. | significand is not normalized, `zExp' must be 0; in that case, the result
  6010. | returned is a subnormal number, and it must not require rounding. The
  6011. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  6012. | Floating-Point Arithmetic.
  6013. *----------------------------------------------------------------------------*}
  6014. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6015. var
  6016. roundingMode: TFPURoundingMode;
  6017. roundNearestEven, increment, isTiny: flag;
  6018. roundIncrement, roundMask, roundBits: int64;
  6019. label
  6020. precision80, overflow;
  6021. begin
  6022. roundingMode := softfloat_rounding_mode;
  6023. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  6024. if ( roundingPrecision = 80 ) then
  6025. goto precision80;
  6026. if ( roundingPrecision = 64 ) then
  6027. begin
  6028. roundIncrement := int64( $0000000000000400 );
  6029. roundMask := int64( $00000000000007FF );
  6030. end
  6031. else if ( roundingPrecision = 32 ) then
  6032. begin
  6033. roundIncrement := int64( $0000008000000000 );
  6034. roundMask := int64( $000000FFFFFFFFFF );
  6035. end
  6036. else begin
  6037. goto precision80;
  6038. end;
  6039. zSig0 := zSig0 or ord( zSig1 <> 0 );
  6040. if ( not (roundNearestEven<>0) ) then
  6041. begin
  6042. if ( roundingMode = float_round_to_zero ) then
  6043. begin
  6044. roundIncrement := 0;
  6045. end
  6046. else begin
  6047. roundIncrement := roundMask;
  6048. if ( zSign<>0 ) then
  6049. begin
  6050. if ( roundingMode = float_round_up ) then
  6051. roundIncrement := 0;
  6052. end
  6053. else begin
  6054. if ( roundingMode = float_round_down ) then
  6055. roundIncrement := 0;
  6056. end;
  6057. end;
  6058. end;
  6059. roundBits := zSig0 and roundMask;
  6060. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6061. if ( ( $7FFE < zExp )
  6062. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  6063. ) then begin
  6064. goto overflow;
  6065. end;
  6066. if ( zExp <= 0 ) then begin
  6067. isTiny := ord (
  6068. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6069. or ( zExp < 0 )
  6070. or ( zSig0 <= zSig0 + roundIncrement ) );
  6071. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  6072. zExp := 0;
  6073. roundBits := zSig0 and roundMask;
  6074. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6075. if ( roundBits <> 0 ) then set_inexact_flag;
  6076. inc( zSig0, roundIncrement );
  6077. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6078. roundIncrement := roundMask + 1;
  6079. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6080. roundMask := roundMask or roundIncrement;
  6081. end;
  6082. zSig0 := zSig0 and not roundMask;
  6083. result:=packFloatx80( zSign, zExp, zSig0 );
  6084. exit;
  6085. end;
  6086. end;
  6087. if ( roundBits <> 0 ) then set_inexact_flag;
  6088. inc( zSig0, roundIncrement );
  6089. if ( zSig0 < roundIncrement ) then begin
  6090. inc(zExp);
  6091. zSig0 := bits64( $8000000000000000 );
  6092. end;
  6093. roundIncrement := roundMask + 1;
  6094. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6095. roundMask := roundMask or roundIncrement;
  6096. end;
  6097. zSig0 := zSig0 and not roundMask;
  6098. if ( zSig0 = 0 ) then zExp := 0;
  6099. result:=packFloatx80( zSign, zExp, zSig0 );
  6100. exit;
  6101. precision80:
  6102. increment := ord ( sbits64( zSig1 ) < 0 );
  6103. if ( roundNearestEven = 0 ) then begin
  6104. if ( roundingMode = float_round_to_zero ) then begin
  6105. increment := 0;
  6106. end
  6107. else begin
  6108. if ( zSign <> 0 ) then begin
  6109. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6110. end
  6111. else begin
  6112. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6113. end;
  6114. end;
  6115. end;
  6116. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6117. if ( ( $7FFE < zExp )
  6118. or ( ( zExp = $7FFE )
  6119. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6120. and ( increment <> 0 )
  6121. )
  6122. ) then begin
  6123. roundMask := 0;
  6124. overflow:
  6125. float_raise( [float_flag_overflow,float_flag_inexact] );
  6126. if ( ( roundingMode = float_round_to_zero )
  6127. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6128. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6129. ) then begin
  6130. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6131. exit;
  6132. end;
  6133. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6134. exit;
  6135. end;
  6136. if ( zExp <= 0 ) then begin
  6137. isTiny := ord(
  6138. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6139. or ( zExp < 0 )
  6140. or ( increment = 0 )
  6141. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6142. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6143. zExp := 0;
  6144. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6145. if ( zSig1 <> 0 ) then set_inexact_flag;
  6146. if ( roundNearestEven <> 0 ) then begin
  6147. increment := ord( sbits64( zSig1 ) < 0 );
  6148. end
  6149. else begin
  6150. if ( zSign <> 0 ) then begin
  6151. increment := ord( roundingMode = float_round_down ) and zSig1;
  6152. end
  6153. else begin
  6154. increment := ord( roundingMode = float_round_up ) and zSig1;
  6155. end;
  6156. end;
  6157. if ( increment <> 0 ) then begin
  6158. inc(zSig0);
  6159. zSig0 :=
  6160. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6161. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6162. end;
  6163. result:=packFloatx80( zSign, zExp, zSig0 );
  6164. exit;
  6165. end;
  6166. end;
  6167. if ( zSig1 <> 0 ) then set_inexact_flag;
  6168. if ( increment <> 0 ) then begin
  6169. inc(zSig0);
  6170. if ( zSig0 = 0 ) then begin
  6171. inc(zExp);
  6172. zSig0 := bits64( $8000000000000000 );
  6173. end
  6174. else begin
  6175. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6176. end;
  6177. end
  6178. else begin
  6179. if ( zSig0 = 0 ) then zExp := 0;
  6180. end;
  6181. result:=packFloatx80( zSign, zExp, zSig0 );
  6182. end;
  6183. {*----------------------------------------------------------------------------
  6184. | Takes an abstract floating-point value having sign `zSign', exponent
  6185. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6186. | and returns the proper extended double-precision floating-point value
  6187. | corresponding to the abstract input. This routine is just like
  6188. | `roundAndPackFloatx80' except that the input significand does not have to be
  6189. | normalized.
  6190. *----------------------------------------------------------------------------*}
  6191. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6192. var
  6193. shiftCount: int8;
  6194. begin
  6195. if ( zSig0 = 0 ) then begin
  6196. zSig0 := zSig1;
  6197. zSig1 := 0;
  6198. dec( zExp, 64 );
  6199. end;
  6200. shiftCount := countLeadingZeros64( zSig0 );
  6201. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6202. zExp := zExp - shiftCount;
  6203. result :=
  6204. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6205. end;
  6206. {*----------------------------------------------------------------------------
  6207. | Returns the result of converting the extended double-precision floating-
  6208. | point value `a' to the 32-bit two's complement integer format. The
  6209. | conversion is performed according to the IEC/IEEE Standard for Binary
  6210. | Floating-Point Arithmetic---which means in particular that the conversion
  6211. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6212. | largest positive integer is returned. Otherwise, if the conversion
  6213. | overflows, the largest integer with the same sign as `a' is returned.
  6214. *----------------------------------------------------------------------------*}
  6215. function floatx80_to_int32(a: floatx80): int32;
  6216. var
  6217. aSign: flag;
  6218. aExp, shiftCount: int32;
  6219. aSig: bits64;
  6220. begin
  6221. aSig := extractFloatx80Frac( a );
  6222. aExp := extractFloatx80Exp( a );
  6223. aSign := extractFloatx80Sign( a );
  6224. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6225. shiftCount := $4037 - aExp;
  6226. if ( shiftCount <= 0 ) then shiftCount := 1;
  6227. shift64RightJamming( aSig, shiftCount, aSig );
  6228. result := roundAndPackInt32( aSign, aSig );
  6229. end;
  6230. {*----------------------------------------------------------------------------
  6231. | Returns the result of converting the extended double-precision floating-
  6232. | point value `a' to the 32-bit two's complement integer format. The
  6233. | conversion is performed according to the IEC/IEEE Standard for Binary
  6234. | Floating-Point Arithmetic, except that the conversion is always rounded
  6235. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6236. | Otherwise, if the conversion overflows, the largest integer with the same
  6237. | sign as `a' is returned.
  6238. *----------------------------------------------------------------------------*}
  6239. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6240. var
  6241. aSign: flag;
  6242. aExp, shiftCount: int32;
  6243. aSig, savedASig: bits64;
  6244. z: int32;
  6245. label
  6246. invalid;
  6247. begin
  6248. aSig := extractFloatx80Frac( a );
  6249. aExp := extractFloatx80Exp( a );
  6250. aSign := extractFloatx80Sign( a );
  6251. if ( $401E < aExp ) then begin
  6252. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6253. goto invalid;
  6254. end
  6255. else if ( aExp < $3FFF ) then begin
  6256. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6257. result := 0;
  6258. exit;
  6259. end;
  6260. shiftCount := $403E - aExp;
  6261. savedASig := aSig;
  6262. aSig := aSig shr shiftCount;
  6263. z := aSig;
  6264. if ( aSign <> 0 ) then z := - z;
  6265. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6266. invalid:
  6267. float_raise( float_flag_invalid );
  6268. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6269. exit;
  6270. end;
  6271. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6272. set_inexact_flag;
  6273. end;
  6274. result := z;
  6275. end;
  6276. {*----------------------------------------------------------------------------
  6277. | Returns the result of converting the extended double-precision floating-
  6278. | point value `a' to the 64-bit two's complement integer format. The
  6279. | conversion is performed according to the IEC/IEEE Standard for Binary
  6280. | Floating-Point Arithmetic---which means in particular that the conversion
  6281. | is rounded according to the current rounding mode. If `a' is a NaN,
  6282. | the largest positive integer is returned. Otherwise, if the conversion
  6283. | overflows, the largest integer with the same sign as `a' is returned.
  6284. *----------------------------------------------------------------------------*}
  6285. function floatx80_to_int64(a: floatx80): int64;
  6286. var
  6287. aSign: flag;
  6288. aExp, shiftCount: int32;
  6289. aSig, aSigExtra: bits64;
  6290. begin
  6291. aSig := extractFloatx80Frac( a );
  6292. aExp := extractFloatx80Exp( a );
  6293. aSign := extractFloatx80Sign( a );
  6294. shiftCount := $403E - aExp;
  6295. if ( shiftCount <= 0 ) then begin
  6296. if ( shiftCount <> 0 ) then begin
  6297. float_raise( float_flag_invalid );
  6298. if ( ( aSign = 0 )
  6299. or ( ( aExp = $7FFF )
  6300. and ( aSig <> bits64( $8000000000000000 ) ) )
  6301. ) then begin
  6302. result := $7FFFFFFFFFFFFFFF;
  6303. exit;
  6304. end;
  6305. result := $8000000000000000;
  6306. exit;
  6307. end;
  6308. aSigExtra := 0;
  6309. end
  6310. else begin
  6311. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6312. end;
  6313. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6314. end;
  6315. {*----------------------------------------------------------------------------
  6316. | Returns the result of converting the extended double-precision floating-
  6317. | point value `a' to the 64-bit two's complement integer format. The
  6318. | conversion is performed according to the IEC/IEEE Standard for Binary
  6319. | Floating-Point Arithmetic, except that the conversion is always rounded
  6320. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6321. | Otherwise, if the conversion overflows, the largest integer with the same
  6322. | sign as `a' is returned.
  6323. *----------------------------------------------------------------------------*}
  6324. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6325. var
  6326. aSign: flag;
  6327. aExp, shiftCount: int32;
  6328. aSig: bits64;
  6329. z: int64;
  6330. begin
  6331. aSig := extractFloatx80Frac( a );
  6332. aExp := extractFloatx80Exp( a );
  6333. aSign := extractFloatx80Sign( a );
  6334. shiftCount := aExp - $403E;
  6335. if ( 0 <= shiftCount ) then begin
  6336. aSig := $7FFFFFFFFFFFFFFF;
  6337. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6338. float_raise( float_flag_invalid );
  6339. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6340. result := $7FFFFFFFFFFFFFFF;
  6341. exit;
  6342. end;
  6343. end;
  6344. result := $8000000000000000;
  6345. exit;
  6346. end
  6347. else if ( aExp < $3FFF ) then begin
  6348. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6349. result := 0;
  6350. exit;
  6351. end;
  6352. z := aSig shr ( - shiftCount );
  6353. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6354. set_inexact_flag;
  6355. end;
  6356. if ( aSign <> 0 ) then z := - z;
  6357. result := z;
  6358. end;
  6359. {*----------------------------------------------------------------------------
  6360. | The pattern for a default generated extended double-precision NaN. The
  6361. | `high' and `low' values hold the most- and least-significant bits,
  6362. | respectively.
  6363. *----------------------------------------------------------------------------*}
  6364. const
  6365. floatx80_default_nan_high = $FFFF;
  6366. floatx80_default_nan_low = bits64( $C000000000000000 );
  6367. FLOATX80_EXP_BIAS = $3FFF;
  6368. floatx80_default_nan : floatx80 =
  6369. (low: floatx80_default_nan_low;
  6370. high: floatx80_default_nan_high);
  6371. {*----------------------------------------------------------------------------
  6372. | Returns 1 if the extended double-precision floating-point value `a' is a
  6373. | signaling NaN; otherwise returns 0.
  6374. *----------------------------------------------------------------------------*}
  6375. function floatx80_is_signaling_nan(a : floatx80): flag;
  6376. var
  6377. aLow: bits64;
  6378. begin
  6379. aLow := a.low and not $4000000000000000;
  6380. result := ord(
  6381. ( a.high and $7FFF = $7FFF )
  6382. and ( bits64( aLow shl 1 ) <> 0 )
  6383. and ( a.low = aLow ) );
  6384. end;
  6385. {----------------------------------------------------------------------------
  6386. | Returns 1 if the extended double-precision floating-point value `a' is an
  6387. | unsupported; otherwise returns 0.
  6388. *----------------------------------------------------------------------------}
  6389. function floatx80_is_unsupported(a: floatx80): Integer;
  6390. begin
  6391. Result := ord(((a.high and $7FFF)<>0) and not((a.low and QWord($8000000000000000))<>0));
  6392. end;
  6393. {*----------------------------------------------------------------------------
  6394. | Returns the result of converting the extended double-precision floating-
  6395. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6396. | invalid exception is raised.
  6397. *----------------------------------------------------------------------------*}
  6398. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6399. var
  6400. z: commonNaNT;
  6401. begin
  6402. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6403. z.sign := a.high shr 15;
  6404. z.low := 0;
  6405. z.high := a.low shl 1;
  6406. result := z;
  6407. end;
  6408. {*----------------------------------------------------------------------------
  6409. | Returns 1 if the extended double-precision floating-point value `a' is a
  6410. | NaN; otherwise returns 0.
  6411. *----------------------------------------------------------------------------*}
  6412. function floatx80_is_nan(a : floatx80 ): flag;
  6413. begin
  6414. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6415. end;
  6416. {*----------------------------------------------------------------------------
  6417. | Takes two extended double-precision floating-point values `a' and `b', one
  6418. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6419. | `b' is a signaling NaN, the invalid exception is raised.
  6420. *----------------------------------------------------------------------------*}
  6421. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6422. var
  6423. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6424. label
  6425. returnLargerSignificand;
  6426. begin
  6427. aIsNaN := floatx80_is_nan( a );
  6428. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6429. bIsNaN := floatx80_is_nan( b );
  6430. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6431. a.low := a.low or $C000000000000000;
  6432. b.low := b.low or $C000000000000000;
  6433. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6434. if aIsSignalingNaN <> 0 then begin
  6435. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6436. if bIsNaN <> 0 then result := b else result := a;
  6437. exit;
  6438. end
  6439. else if aIsNaN <>0 then begin
  6440. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6441. result := a;
  6442. exit;
  6443. end;
  6444. returnLargerSignificand:
  6445. if ( a.low < b.low ) then begin
  6446. result := b;
  6447. exit;
  6448. end;
  6449. if ( b.low < a.low ) then begin
  6450. result := a;
  6451. exit;
  6452. end;
  6453. if a.high < b.high then result := a else result := b;
  6454. exit;
  6455. end
  6456. else
  6457. result := b;
  6458. end;
  6459. {*----------------------------------------------------------------------------
  6460. | Returns the result of converting the extended double-precision floating-
  6461. | point value `a' to the single-precision floating-point format. The
  6462. | conversion is performed according to the IEC/IEEE Standard for Binary
  6463. | Floating-Point Arithmetic.
  6464. *----------------------------------------------------------------------------*}
  6465. function floatx80_to_float32(a: floatx80): float32;
  6466. var
  6467. aSign: flag;
  6468. aExp: int32;
  6469. aSig: bits64;
  6470. begin
  6471. aSig := extractFloatx80Frac( a );
  6472. aExp := extractFloatx80Exp( a );
  6473. aSign := extractFloatx80Sign( a );
  6474. if ( aExp = $7FFF ) then begin
  6475. if bits64( aSig shl 1 ) <> 0 then begin
  6476. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6477. exit;
  6478. end;
  6479. result := packFloat32( aSign, $FF, 0 );
  6480. exit;
  6481. end;
  6482. shift64RightJamming( aSig, 33, aSig );
  6483. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6484. result := roundAndPackFloat32( aSign, aExp, aSig );
  6485. end;
  6486. {*----------------------------------------------------------------------------
  6487. | Returns the result of converting the extended double-precision floating-
  6488. | point value `a' to the double-precision floating-point format. The
  6489. | conversion is performed according to the IEC/IEEE Standard for Binary
  6490. | Floating-Point Arithmetic.
  6491. *----------------------------------------------------------------------------*}
  6492. function floatx80_to_float64(a: floatx80): float64;
  6493. var
  6494. aSign: flag;
  6495. aExp: int32;
  6496. aSig, zSig: bits64;
  6497. begin
  6498. aSig := extractFloatx80Frac( a );
  6499. aExp := extractFloatx80Exp( a );
  6500. aSign := extractFloatx80Sign( a );
  6501. if ( aExp = $7FFF ) then begin
  6502. if bits64( aSig shl 1 ) <> 0 then begin
  6503. result:=commonNaNToFloat64(floatx80ToCommonNaN(a));
  6504. exit;
  6505. end;
  6506. result := packFloat64( aSign, $7FF, 0 );
  6507. exit;
  6508. end;
  6509. shift64RightJamming( aSig, 1, zSig );
  6510. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6511. result := roundAndPackFloat64( aSign, aExp, zSig );
  6512. end;
  6513. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6514. {*----------------------------------------------------------------------------
  6515. | Returns the result of converting the extended double-precision floating-
  6516. | point value `a' to the quadruple-precision floating-point format. The
  6517. | conversion is performed according to the IEC/IEEE Standard for Binary
  6518. | Floating-Point Arithmetic.
  6519. *----------------------------------------------------------------------------*}
  6520. function floatx80_to_float128(a: floatx80): float128;
  6521. var
  6522. aSign: flag;
  6523. aExp: int16;
  6524. aSig, zSig0, zSig1: bits64;
  6525. begin
  6526. aSig := extractFloatx80Frac( a );
  6527. aExp := extractFloatx80Exp( a );
  6528. aSign := extractFloatx80Sign( a );
  6529. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6530. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6531. exit;
  6532. end;
  6533. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6534. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6535. end;
  6536. {$endif FPC_SOFTFLOAT_FLOAT128}
  6537. {*----------------------------------------------------------------------------
  6538. | Rounds the extended double-precision floating-point value `a' to an integer,
  6539. | and Returns the result as an extended quadruple-precision floating-point
  6540. | value. The operation is performed according to the IEC/IEEE Standard for
  6541. | Binary Floating-Point Arithmetic.
  6542. *----------------------------------------------------------------------------*}
  6543. function floatx80_round_to_int(a: floatx80): floatx80;
  6544. var
  6545. aSign: flag;
  6546. aExp: int32;
  6547. lastBitMask, roundBitsMask: bits64;
  6548. roundingMode: TFPURoundingMode;
  6549. z: floatx80;
  6550. begin
  6551. aExp := extractFloatx80Exp( a );
  6552. if ( $403E <= aExp ) then begin
  6553. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6554. result := propagateFloatx80NaN( a, a );
  6555. exit;
  6556. end;
  6557. result := a;
  6558. exit;
  6559. end;
  6560. if ( aExp < $3FFF ) then begin
  6561. if ( ( aExp = 0 )
  6562. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6563. result := a;
  6564. exit;
  6565. end;
  6566. set_inexact_flag;
  6567. aSign := extractFloatx80Sign( a );
  6568. case softfloat_rounding_mode of
  6569. float_round_nearest_even:
  6570. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6571. ) then begin
  6572. result :=
  6573. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6574. exit;
  6575. end;
  6576. float_round_down: begin
  6577. if aSign <> 0 then
  6578. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6579. else
  6580. result := packFloatx80( 0, 0, 0 );
  6581. exit;
  6582. end;
  6583. float_round_up: begin
  6584. if aSign <> 0 then
  6585. result := packFloatx80( 1, 0, 0 )
  6586. else
  6587. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6588. exit;
  6589. end;
  6590. end;
  6591. result := packFloatx80( aSign, 0, 0 );
  6592. exit;
  6593. end;
  6594. lastBitMask := 1;
  6595. lastBitMask := lastBitMask shl ( $403E - aExp );
  6596. roundBitsMask := lastBitMask - 1;
  6597. z := a;
  6598. roundingMode := softfloat_rounding_mode;
  6599. if ( roundingMode = float_round_nearest_even ) then begin
  6600. inc( z.low, lastBitMask shr 1 );
  6601. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6602. end
  6603. else if ( roundingMode <> float_round_to_zero ) then begin
  6604. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6605. inc( z.low, roundBitsMask );
  6606. end;
  6607. end;
  6608. z.low := z.low and not roundBitsMask;
  6609. if ( z.low = 0 ) then begin
  6610. inc(z.high);
  6611. z.low := bits64( $8000000000000000 );
  6612. end;
  6613. if ( z.low <> a.low ) then set_inexact_flag;
  6614. result := z;
  6615. end;
  6616. {*----------------------------------------------------------------------------
  6617. | Returns the result of adding the absolute values of the extended double-
  6618. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6619. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6620. | The addition is performed according to the IEC/IEEE Standard for Binary
  6621. | Floating-Point Arithmetic.
  6622. *----------------------------------------------------------------------------*}
  6623. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6624. var
  6625. aExp, bExp, zExp: int32;
  6626. aSig, bSig, zSig0, zSig1: bits64;
  6627. expDiff: int32;
  6628. label
  6629. shiftRight1, roundAndPack;
  6630. begin
  6631. aSig := extractFloatx80Frac( a );
  6632. aExp := extractFloatx80Exp( a );
  6633. bSig := extractFloatx80Frac( b );
  6634. bExp := extractFloatx80Exp( b );
  6635. expDiff := aExp - bExp;
  6636. if ( 0 < expDiff ) then begin
  6637. if ( aExp = $7FFF ) then begin
  6638. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6639. result := propagateFloatx80NaN( a, b );
  6640. exit;
  6641. end;
  6642. result := a;
  6643. exit;
  6644. end;
  6645. if ( bExp = 0 ) then dec(expDiff);
  6646. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6647. zExp := aExp;
  6648. end
  6649. else if ( expDiff < 0 ) then begin
  6650. if ( bExp = $7FFF ) then begin
  6651. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6652. result := propagateFloatx80NaN( a, b );
  6653. exit;
  6654. end;
  6655. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6656. exit;
  6657. end;
  6658. if ( aExp = 0 ) then inc(expDiff);
  6659. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6660. zExp := bExp;
  6661. end
  6662. else begin
  6663. if ( aExp = $7FFF ) then begin
  6664. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6665. result := propagateFloatx80NaN( a, b );
  6666. exit;
  6667. end;
  6668. result := a;
  6669. exit;
  6670. end;
  6671. zSig1 := 0;
  6672. zSig0 := aSig + bSig;
  6673. if ( aExp = 0 ) then begin
  6674. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6675. goto roundAndPack;
  6676. end;
  6677. zExp := aExp;
  6678. goto shiftRight1;
  6679. end;
  6680. zSig0 := aSig + bSig;
  6681. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6682. shiftRight1:
  6683. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6684. zSig0 := zSig0 or $8000000000000000;
  6685. inc(zExp);
  6686. roundAndPack:
  6687. result :=
  6688. roundAndPackFloatx80(
  6689. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6690. end;
  6691. {*----------------------------------------------------------------------------
  6692. | Returns the result of subtracting the absolute values of the extended
  6693. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6694. | difference is negated before being returned. `zSign' is ignored if the
  6695. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6696. | Standard for Binary Floating-Point Arithmetic.
  6697. *----------------------------------------------------------------------------*}
  6698. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6699. var
  6700. aExp, bExp, zExp: int32;
  6701. aSig, bSig, zSig0, zSig1: bits64;
  6702. expDiff: int32;
  6703. z: floatx80;
  6704. label
  6705. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6706. begin
  6707. aSig := extractFloatx80Frac( a );
  6708. aExp := extractFloatx80Exp( a );
  6709. bSig := extractFloatx80Frac( b );
  6710. bExp := extractFloatx80Exp( b );
  6711. expDiff := aExp - bExp;
  6712. if ( 0 < expDiff ) then goto aExpBigger;
  6713. if ( expDiff < 0 ) then goto bExpBigger;
  6714. if ( aExp = $7FFF ) then begin
  6715. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6716. result := propagateFloatx80NaN( a, b );
  6717. exit;
  6718. end;
  6719. float_raise( float_flag_invalid );
  6720. z.low := floatx80_default_nan_low;
  6721. z.high := floatx80_default_nan_high;
  6722. result := z;
  6723. exit;
  6724. end;
  6725. if ( aExp = 0 ) then begin
  6726. aExp := 1;
  6727. bExp := 1;
  6728. end;
  6729. zSig1 := 0;
  6730. if ( bSig < aSig ) then goto aBigger;
  6731. if ( aSig < bSig ) then goto bBigger;
  6732. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6733. exit;
  6734. bExpBigger:
  6735. if ( bExp = $7FFF ) then begin
  6736. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6737. result := propagateFloatx80NaN( a, b );
  6738. exit;
  6739. end;
  6740. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6741. exit;
  6742. end;
  6743. if ( aExp = 0 ) then inc(expDiff);
  6744. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6745. bBigger:
  6746. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6747. zExp := bExp;
  6748. zSign := zSign xor 1;
  6749. goto normalizeRoundAndPack;
  6750. aExpBigger:
  6751. if ( aExp = $7FFF ) then begin
  6752. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6753. result := propagateFloatx80NaN( a, b );
  6754. exit;
  6755. end;
  6756. result := a;
  6757. exit;
  6758. end;
  6759. if ( bExp = 0 ) then dec(expDiff);
  6760. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6761. aBigger:
  6762. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6763. zExp := aExp;
  6764. normalizeRoundAndPack:
  6765. result :=
  6766. normalizeRoundAndPackFloatx80(
  6767. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6768. end;
  6769. {*----------------------------------------------------------------------------
  6770. | Returns the result of adding the extended double-precision floating-point
  6771. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6772. | Standard for Binary Floating-Point Arithmetic.
  6773. *----------------------------------------------------------------------------*}
  6774. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6775. var
  6776. aSign, bSign: flag;
  6777. begin
  6778. aSign := extractFloatx80Sign( a );
  6779. bSign := extractFloatx80Sign( b );
  6780. if ( aSign = bSign ) then begin
  6781. result := addFloatx80Sigs( a, b, aSign );
  6782. end
  6783. else begin
  6784. result := subFloatx80Sigs( a, b, aSign );
  6785. end;
  6786. end;
  6787. {*----------------------------------------------------------------------------
  6788. | Returns the result of subtracting the extended double-precision floating-
  6789. | point values `a' and `b'. The operation is performed according to the
  6790. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6791. *----------------------------------------------------------------------------*}
  6792. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6793. var
  6794. aSign, bSign: flag;
  6795. begin
  6796. aSign := extractFloatx80Sign( a );
  6797. bSign := extractFloatx80Sign( b );
  6798. if ( aSign = bSign ) then begin
  6799. result := subFloatx80Sigs( a, b, aSign );
  6800. end
  6801. else begin
  6802. result := addFloatx80Sigs( a, b, aSign );
  6803. end;
  6804. end;
  6805. {*----------------------------------------------------------------------------
  6806. | Returns the result of multiplying the extended double-precision floating-
  6807. | point values `a' and `b'. The operation is performed according to the
  6808. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6809. *----------------------------------------------------------------------------*}
  6810. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6811. var
  6812. aSign, bSign, zSign: flag;
  6813. aExp, bExp, zExp: int32;
  6814. aSig, bSig, zSig0, zSig1: bits64;
  6815. z: floatx80;
  6816. label
  6817. invalid;
  6818. begin
  6819. aSig := extractFloatx80Frac( a );
  6820. aExp := extractFloatx80Exp( a );
  6821. aSign := extractFloatx80Sign( a );
  6822. bSig := extractFloatx80Frac( b );
  6823. bExp := extractFloatx80Exp( b );
  6824. bSign := extractFloatx80Sign( b );
  6825. zSign := aSign xor bSign;
  6826. if ( aExp = $7FFF ) then begin
  6827. if ( bits64( aSig shl 1 ) <> 0 )
  6828. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6829. result := propagateFloatx80NaN( a, b );
  6830. exit;
  6831. end;
  6832. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6833. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6834. exit;
  6835. end;
  6836. if ( bExp = $7FFF ) then begin
  6837. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6838. result := propagateFloatx80NaN( a, b );
  6839. exit;
  6840. end;
  6841. if ( ( aExp or aSig ) = 0 ) then begin
  6842. invalid:
  6843. float_raise( float_flag_invalid );
  6844. z.low := floatx80_default_nan_low;
  6845. z.high := floatx80_default_nan_high;
  6846. result := z;
  6847. exit;
  6848. end;
  6849. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6850. exit;
  6851. end;
  6852. if ( aExp = 0 ) then begin
  6853. if ( aSig = 0 ) then begin
  6854. result := packFloatx80( zSign, 0, 0 );
  6855. exit;
  6856. end;
  6857. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6858. end;
  6859. if ( bExp = 0 ) then begin
  6860. if ( bSig = 0 ) then begin
  6861. result := packFloatx80( zSign, 0, 0 );
  6862. exit;
  6863. end;
  6864. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6865. end;
  6866. zExp := aExp + bExp - $3FFE;
  6867. mul64To128( aSig, bSig, zSig0, zSig1 );
  6868. if 0 < sbits64( zSig0 ) then begin
  6869. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6870. dec(zExp);
  6871. end;
  6872. result :=
  6873. roundAndPackFloatx80(
  6874. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6875. end;
  6876. {*----------------------------------------------------------------------------
  6877. | Returns the result of dividing the extended double-precision floating-point
  6878. | value `a' by the corresponding value `b'. The operation is performed
  6879. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6880. *----------------------------------------------------------------------------*}
  6881. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6882. var
  6883. aSign, bSign, zSign: flag;
  6884. aExp, bExp, zExp: int32;
  6885. aSig, bSig, zSig0, zSig1: bits64;
  6886. rem0, rem1, rem2, term0, term1, term2: bits64;
  6887. z: floatx80;
  6888. label
  6889. invalid;
  6890. begin
  6891. aSig := extractFloatx80Frac( a );
  6892. aExp := extractFloatx80Exp( a );
  6893. aSign := extractFloatx80Sign( a );
  6894. bSig := extractFloatx80Frac( b );
  6895. bExp := extractFloatx80Exp( b );
  6896. bSign := extractFloatx80Sign( b );
  6897. zSign := aSign xor bSign;
  6898. if ( aExp = $7FFF ) then begin
  6899. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6900. result := propagateFloatx80NaN( a, b );
  6901. exit;
  6902. end;
  6903. if ( bExp = $7FFF ) then begin
  6904. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6905. result := propagateFloatx80NaN( a, b );
  6906. exit;
  6907. end;
  6908. goto invalid;
  6909. end;
  6910. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6911. exit;
  6912. end;
  6913. if ( bExp = $7FFF ) then begin
  6914. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6915. result := propagateFloatx80NaN( a, b );
  6916. exit;
  6917. end;
  6918. result := packFloatx80( zSign, 0, 0 );
  6919. exit;
  6920. end;
  6921. if ( bExp = 0 ) then begin
  6922. if ( bSig = 0 ) then begin
  6923. if ( ( aExp or aSig ) = 0 ) then begin
  6924. invalid:
  6925. float_raise( float_flag_invalid );
  6926. z.low := floatx80_default_nan_low;
  6927. z.high := floatx80_default_nan_high;
  6928. result := z;
  6929. exit;
  6930. end;
  6931. float_raise( float_flag_divbyzero );
  6932. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6933. exit;
  6934. end;
  6935. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6936. end;
  6937. if ( aExp = 0 ) then begin
  6938. if ( aSig = 0 ) then begin
  6939. result := packFloatx80( zSign, 0, 0 );
  6940. exit;
  6941. end;
  6942. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6943. end;
  6944. zExp := aExp - bExp + $3FFE;
  6945. rem1 := 0;
  6946. if ( bSig <= aSig ) then begin
  6947. shift128Right( aSig, 0, 1, aSig, rem1 );
  6948. inc(zExp);
  6949. end;
  6950. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6951. mul64To128( bSig, zSig0, term0, term1 );
  6952. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6953. while ( sbits64( rem0 ) < 0 ) do begin
  6954. dec(zSig0);
  6955. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6956. end;
  6957. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6958. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6959. mul64To128( bSig, zSig1, term1, term2 );
  6960. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6961. while ( sbits64( rem1 ) < 0 ) do begin
  6962. dec(zSig1);
  6963. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6964. end;
  6965. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6966. end;
  6967. result :=
  6968. roundAndPackFloatx80(
  6969. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6970. end;
  6971. {*----------------------------------------------------------------------------
  6972. | Returns the remainder of the extended double-precision floating-point value
  6973. | `a' with respect to the corresponding value `b'. The operation is performed
  6974. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6975. *----------------------------------------------------------------------------*}
  6976. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6977. var
  6978. aSign, zSign: flag;
  6979. aExp, bExp, expDiff: int32;
  6980. aSig0, aSig1, bSig: bits64;
  6981. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6982. z: floatx80;
  6983. label
  6984. invalid;
  6985. begin
  6986. aSig0 := extractFloatx80Frac( a );
  6987. aExp := extractFloatx80Exp( a );
  6988. aSign := extractFloatx80Sign( a );
  6989. bSig := extractFloatx80Frac( b );
  6990. bExp := extractFloatx80Exp( b );
  6991. if ( aExp = $7FFF ) then begin
  6992. if ( bits64( aSig0 shl 1 ) <> 0 )
  6993. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6994. result := propagateFloatx80NaN( a, b );
  6995. exit;
  6996. end;
  6997. goto invalid;
  6998. end;
  6999. if ( bExp = $7FFF ) then begin
  7000. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  7001. result := propagateFloatx80NaN( a, b );
  7002. exit;
  7003. end;
  7004. result := a;
  7005. exit;
  7006. end;
  7007. if ( bExp = 0 ) then begin
  7008. if ( bSig = 0 ) then begin
  7009. invalid:
  7010. float_raise( float_flag_invalid );
  7011. z.low := floatx80_default_nan_low;
  7012. z.high := floatx80_default_nan_high;
  7013. result := z;
  7014. exit;
  7015. end;
  7016. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  7017. end;
  7018. if ( aExp = 0 ) then begin
  7019. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  7020. result := a;
  7021. exit;
  7022. end;
  7023. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7024. end;
  7025. bSig := bSig or $8000000000000000;
  7026. zSign := aSign;
  7027. expDiff := aExp - bExp;
  7028. aSig1 := 0;
  7029. if ( expDiff < 0 ) then begin
  7030. if ( expDiff < -1 ) then begin
  7031. result := a;
  7032. exit;
  7033. end;
  7034. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  7035. expDiff := 0;
  7036. end;
  7037. q := ord( bSig <= aSig0 );
  7038. if ( q <> 0 ) then dec( aSig0, bSig );
  7039. dec( expDiff, 64 );
  7040. while ( 0 < expDiff ) do begin
  7041. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7042. if ( 2 < q ) then q := q - 2 else q := 0;
  7043. mul64To128( bSig, q, term0, term1 );
  7044. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7045. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  7046. dec( expDiff, 62 );
  7047. end;
  7048. inc( expDiff, 64 );
  7049. if ( 0 < expDiff ) then begin
  7050. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7051. if ( 2 < q ) then q:= q - 2 else q := 0;
  7052. q := q shr ( 64 - expDiff );
  7053. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  7054. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7055. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  7056. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  7057. inc(q);
  7058. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7059. end;
  7060. end
  7061. else begin
  7062. term1 := 0;
  7063. term0 := bSig;
  7064. end;
  7065. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  7066. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7067. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7068. and ( q and 1 <> 0 ) )
  7069. then begin
  7070. aSig0 := alternateASig0;
  7071. aSig1 := alternateASig1;
  7072. zSign := ord( zSign = 0 );
  7073. end;
  7074. result :=
  7075. normalizeRoundAndPackFloatx80(
  7076. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  7077. end;
  7078. {*----------------------------------------------------------------------------
  7079. | Returns the square root of the extended double-precision floating-point
  7080. | value `a'. The operation is performed according to the IEC/IEEE Standard
  7081. | for Binary Floating-Point Arithmetic.
  7082. *----------------------------------------------------------------------------*}
  7083. function floatx80_sqrt(a: floatx80): floatx80;
  7084. var
  7085. aSign: flag;
  7086. aExp, zExp: int32;
  7087. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7088. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7089. z: floatx80;
  7090. label
  7091. invalid;
  7092. begin
  7093. aSig0 := extractFloatx80Frac( a );
  7094. aExp := extractFloatx80Exp( a );
  7095. aSign := extractFloatx80Sign( a );
  7096. if ( aExp = $7FFF ) then begin
  7097. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7098. result := propagateFloatx80NaN( a, a );
  7099. exit;
  7100. end;
  7101. if ( aSign = 0 ) then begin
  7102. result := a;
  7103. exit;
  7104. end;
  7105. goto invalid;
  7106. end;
  7107. if ( aSign <> 0 ) then begin
  7108. if ( ( aExp or aSig0 ) = 0 ) then begin
  7109. result := a;
  7110. exit;
  7111. end;
  7112. invalid:
  7113. float_raise( float_flag_invalid );
  7114. z.low := floatx80_default_nan_low;
  7115. z.high := floatx80_default_nan_high;
  7116. result := z;
  7117. exit;
  7118. end;
  7119. if ( aExp = 0 ) then begin
  7120. if ( aSig0 = 0 ) then begin
  7121. result := packFloatx80( 0, 0, 0 );
  7122. exit;
  7123. end;
  7124. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7125. end;
  7126. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  7127. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  7128. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7129. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7130. doubleZSig0 := zSig0 shl 1;
  7131. mul64To128( zSig0, zSig0, term0, term1 );
  7132. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7133. while ( sbits64( rem0 ) < 0 ) do begin
  7134. dec(zSig0);
  7135. dec( doubleZSig0, 2 );
  7136. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7137. end;
  7138. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7139. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7140. if ( zSig1 = 0 ) then zSig1 := 1;
  7141. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7142. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7143. mul64To128( zSig1, zSig1, term2, term3 );
  7144. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7145. while ( sbits64( rem1 ) < 0 ) do begin
  7146. dec(zSig1);
  7147. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7148. term3 := term3 or 1;
  7149. term2 := term2 or doubleZSig0;
  7150. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7151. end;
  7152. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7153. end;
  7154. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7155. zSig0 := zSig0 or doubleZSig0;
  7156. result :=
  7157. roundAndPackFloatx80(
  7158. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7159. end;
  7160. {*----------------------------------------------------------------------------
  7161. | Returns 1 if the extended double-precision floating-point value `a' is
  7162. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7163. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7164. | Arithmetic.
  7165. *----------------------------------------------------------------------------*}
  7166. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7167. begin
  7168. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7169. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7170. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7171. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7172. ) then begin
  7173. if ( floatx80_is_signaling_nan( a )
  7174. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7175. float_raise( float_flag_invalid );
  7176. end;
  7177. result := 0;
  7178. exit;
  7179. end;
  7180. result := ord(
  7181. ( a.low = b.low )
  7182. and ( ( a.high = b.high )
  7183. or ( ( a.low = 0 )
  7184. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7185. ) );
  7186. end;
  7187. {*----------------------------------------------------------------------------
  7188. | Returns 1 if the extended double-precision floating-point value `a' is
  7189. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7190. | comparison is performed according to the IEC/IEEE Standard for Binary
  7191. | Floating-Point Arithmetic.
  7192. *----------------------------------------------------------------------------*}
  7193. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7194. var
  7195. aSign, bSign: flag;
  7196. begin
  7197. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7198. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7199. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7200. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7201. then begin
  7202. float_raise( float_flag_invalid );
  7203. result := 0;
  7204. exit;
  7205. end;
  7206. aSign := extractFloatx80Sign( a );
  7207. bSign := extractFloatx80Sign( b );
  7208. if ( aSign <> bSign ) then begin
  7209. result := ord(
  7210. ( aSign <> 0 )
  7211. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7212. exit;
  7213. end;
  7214. if aSign<>0 then
  7215. result := le128( b.high, b.low, a.high, a.low )
  7216. else
  7217. result := le128( a.high, a.low, b.high, b.low );
  7218. end;
  7219. {*----------------------------------------------------------------------------
  7220. | Returns 1 if the extended double-precision floating-point value `a' is
  7221. | less than the corresponding value `b', and 0 otherwise. The comparison
  7222. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7223. | Arithmetic.
  7224. *----------------------------------------------------------------------------*}
  7225. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7226. var
  7227. aSign, bSign: flag;
  7228. begin
  7229. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7230. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7231. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7232. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7233. then begin
  7234. float_raise( float_flag_invalid );
  7235. result := 0;
  7236. exit;
  7237. end;
  7238. aSign := extractFloatx80Sign( a );
  7239. bSign := extractFloatx80Sign( b );
  7240. if ( aSign <> bSign ) then begin
  7241. result := ord(
  7242. ( aSign <> 0 )
  7243. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7244. exit;
  7245. end;
  7246. if aSign <> 0 then
  7247. result := lt128( b.high, b.low, a.high, a.low )
  7248. else
  7249. result := lt128( a.high, a.low, b.high, b.low );
  7250. end;
  7251. {*----------------------------------------------------------------------------
  7252. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7253. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7254. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7255. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7256. *----------------------------------------------------------------------------*}
  7257. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7258. begin
  7259. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7260. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7261. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7262. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7263. then begin
  7264. float_raise( float_flag_invalid );
  7265. result := 0;
  7266. exit;
  7267. end;
  7268. result := ord(
  7269. ( a.low = b.low )
  7270. and ( ( a.high = b.high )
  7271. or ( ( a.low = 0 )
  7272. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7273. ) );
  7274. end;
  7275. {*----------------------------------------------------------------------------
  7276. | Returns 1 if the extended double-precision floating-point value `a' is less
  7277. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7278. | do not cause an exception. Otherwise, the comparison is performed according
  7279. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7280. *----------------------------------------------------------------------------*}
  7281. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7282. var
  7283. aSign, bSign: flag;
  7284. begin
  7285. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7286. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7287. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7288. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7289. then begin
  7290. if ( floatx80_is_signaling_nan( a )
  7291. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7292. float_raise( float_flag_invalid );
  7293. end;
  7294. result := 0;
  7295. exit;
  7296. end;
  7297. aSign := extractFloatx80Sign( a );
  7298. bSign := extractFloatx80Sign( b );
  7299. if ( aSign <> bSign ) then begin
  7300. result := ord(
  7301. ( aSign <> 0 )
  7302. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7303. exit;
  7304. end;
  7305. if aSign <> 0 then
  7306. result := le128( b.high, b.low, a.high, a.low )
  7307. else
  7308. result := le128( a.high, a.low, b.high, b.low );
  7309. end;
  7310. {*----------------------------------------------------------------------------
  7311. | Returns 1 if the extended double-precision floating-point value `a' is less
  7312. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7313. | an exception. Otherwise, the comparison is performed according to the
  7314. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7315. *----------------------------------------------------------------------------*}
  7316. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7317. var
  7318. aSign, bSign: flag;
  7319. begin
  7320. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7321. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7322. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7323. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7324. then begin
  7325. if ( floatx80_is_signaling_nan( a )
  7326. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7327. float_raise( float_flag_invalid );
  7328. end;
  7329. result := 0;
  7330. exit;
  7331. end;
  7332. aSign := extractFloatx80Sign( a );
  7333. bSign := extractFloatx80Sign( b );
  7334. if ( aSign <> bSign ) then begin
  7335. result := ord(
  7336. ( aSign <> 0 )
  7337. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7338. exit;
  7339. end;
  7340. if aSign <> 0 then
  7341. result := lt128( b.high, b.low, a.high, a.low )
  7342. else
  7343. result := lt128( a.high, a.low, b.high, b.low );
  7344. end;
  7345. {$endif FPC_SOFTFLOAT_FLOATX80}
  7346. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7347. {*----------------------------------------------------------------------------
  7348. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7349. | floating-point value `a'.
  7350. *----------------------------------------------------------------------------*}
  7351. function extractFloat128Frac1(a : float128): bits64;
  7352. begin
  7353. result:=a.low;
  7354. end;
  7355. {*----------------------------------------------------------------------------
  7356. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7357. | floating-point value `a'.
  7358. *----------------------------------------------------------------------------*}
  7359. function extractFloat128Frac0(a : float128): bits64;
  7360. begin
  7361. result:=a.high and int64($0000FFFFFFFFFFFF);
  7362. end;
  7363. {*----------------------------------------------------------------------------
  7364. | Returns the exponent bits of the quadruple-precision floating-point value
  7365. | `a'.
  7366. *----------------------------------------------------------------------------*}
  7367. function extractFloat128Exp(a : float128): int32;
  7368. begin
  7369. result:=( a.high shr 48 ) and $7FFF;
  7370. end;
  7371. {*----------------------------------------------------------------------------
  7372. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7373. *----------------------------------------------------------------------------*}
  7374. function extractFloat128Sign(a : float128): flag;
  7375. begin
  7376. result:=a.high shr 63;
  7377. end;
  7378. {*----------------------------------------------------------------------------
  7379. | Normalizes the subnormal quadruple-precision floating-point value
  7380. | represented by the denormalized significand formed by the concatenation of
  7381. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7382. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7383. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7384. | least significant 64 bits of the normalized significand are stored at the
  7385. | location pointed to by `zSig1Ptr'.
  7386. *----------------------------------------------------------------------------*}
  7387. procedure normalizeFloat128Subnormal(
  7388. aSig0: bits64;
  7389. aSig1: bits64;
  7390. var zExpPtr: int32;
  7391. var zSig0Ptr: bits64;
  7392. var zSig1Ptr: bits64);
  7393. var
  7394. shiftCount: int8;
  7395. begin
  7396. if ( aSig0 = 0 ) then
  7397. begin
  7398. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7399. if ( shiftCount < 0 ) then
  7400. begin
  7401. zSig0Ptr := aSig1 shr ( - shiftCount );
  7402. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7403. end
  7404. else begin
  7405. zSig0Ptr := aSig1 shl shiftCount;
  7406. zSig1Ptr := 0;
  7407. end;
  7408. zExpPtr := - shiftCount - 63;
  7409. end
  7410. else begin
  7411. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7412. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7413. zExpPtr := 1 - shiftCount;
  7414. end;
  7415. end;
  7416. {*----------------------------------------------------------------------------
  7417. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7418. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7419. | floating-point value, returning the result. After being shifted into the
  7420. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7421. | added together to form the most significant 32 bits of the result. This
  7422. | means that any integer portion of `zSig0' will be added into the exponent.
  7423. | Since a properly normalized significand will have an integer portion equal
  7424. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7425. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7426. | significand.
  7427. *----------------------------------------------------------------------------*}
  7428. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7429. var
  7430. z: float128;
  7431. begin
  7432. z.low := zSig1;
  7433. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7434. result:=z;
  7435. end;
  7436. {*----------------------------------------------------------------------------
  7437. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7438. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7439. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7440. | corresponding to the abstract input. Ordinarily, the abstract value is
  7441. | simply rounded and packed into the quadruple-precision format, with the
  7442. | inexact exception raised if the abstract input cannot be represented
  7443. | exactly. However, if the abstract value is too large, the overflow and
  7444. | inexact exceptions are raised and an infinity or maximal finite value is
  7445. | returned. If the abstract value is too small, the input value is rounded to
  7446. | a subnormal number, and the underflow and inexact exceptions are raised if
  7447. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7448. | precision floating-point number.
  7449. | The input significand must be normalized or smaller. If the input
  7450. | significand is not normalized, `zExp' must be 0; in that case, the result
  7451. | returned is a subnormal number, and it must not require rounding. In the
  7452. | usual case that the input significand is normalized, `zExp' must be 1 less
  7453. | than the ``true'' floating-point exponent. The handling of underflow and
  7454. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7455. *----------------------------------------------------------------------------*}
  7456. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7457. var
  7458. roundingMode: TFPURoundingMode;
  7459. roundNearestEven, increment, isTiny: flag;
  7460. begin
  7461. roundingMode := softfloat_rounding_mode;
  7462. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7463. increment := ord( sbits64(zSig2) < 0 );
  7464. if ( roundNearestEven=0 ) then
  7465. begin
  7466. if ( roundingMode = float_round_to_zero ) then
  7467. begin
  7468. increment := 0;
  7469. end
  7470. else begin
  7471. if ( zSign<>0 ) then
  7472. begin
  7473. increment := ord( roundingMode = float_round_down ) and zSig2;
  7474. end
  7475. else begin
  7476. increment := ord( roundingMode = float_round_up ) and zSig2;
  7477. end;
  7478. end;
  7479. end;
  7480. if ( $7FFD <= bits32(zExp) ) then
  7481. begin
  7482. if ( ord( $7FFD < zExp )
  7483. or ( ord( zExp = $7FFD )
  7484. and eq128(
  7485. int64( $0001FFFFFFFFFFFF ),
  7486. bits64( $FFFFFFFFFFFFFFFF ),
  7487. zSig0,
  7488. zSig1
  7489. )
  7490. and increment
  7491. )
  7492. )<>0 then
  7493. begin
  7494. float_raise( [float_flag_overflow,float_flag_inexact] );
  7495. if ( ord( roundingMode = float_round_to_zero )
  7496. or ( zSign and ord( roundingMode = float_round_up ) )
  7497. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7498. )<>0 then
  7499. begin
  7500. result :=
  7501. packFloat128(
  7502. zSign,
  7503. $7FFE,
  7504. int64( $0000FFFFFFFFFFFF ),
  7505. bits64( $FFFFFFFFFFFFFFFF )
  7506. );
  7507. exit;
  7508. end;
  7509. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7510. exit;
  7511. end;
  7512. if ( zExp < 0 ) then
  7513. begin
  7514. isTiny :=
  7515. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7516. or ( zExp < -1 )
  7517. or not( increment<>0 )
  7518. or boolean(lt128(
  7519. zSig0,
  7520. zSig1,
  7521. int64( $0001FFFFFFFFFFFF ),
  7522. bits64( $FFFFFFFFFFFFFFFF )
  7523. )));
  7524. shift128ExtraRightJamming(
  7525. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7526. zExp := 0;
  7527. if ( isTiny and zSig2 )<>0 then
  7528. float_raise( float_flag_underflow );
  7529. if ( roundNearestEven<>0 ) then
  7530. begin
  7531. increment := ord( sbits64(zSig2) < 0 );
  7532. end
  7533. else begin
  7534. if ( zSign<>0 ) then
  7535. begin
  7536. increment := ord( roundingMode = float_round_down ) and zSig2;
  7537. end
  7538. else begin
  7539. increment := ord( roundingMode = float_round_up ) and zSig2;
  7540. end;
  7541. end;
  7542. end;
  7543. end;
  7544. if ( zSig2<>0 ) then
  7545. set_inexact_flag;
  7546. if ( increment<>0 ) then
  7547. begin
  7548. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7549. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7550. end
  7551. else begin
  7552. if ( ( zSig0 or zSig1 ) = 0 ) then
  7553. zExp := 0;
  7554. end;
  7555. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7556. end;
  7557. {*----------------------------------------------------------------------------
  7558. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7559. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7560. | returns the proper quadruple-precision floating-point value corresponding
  7561. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7562. | except that the input significand has fewer bits and does not have to be
  7563. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7564. | point exponent.
  7565. *----------------------------------------------------------------------------*}
  7566. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7567. var
  7568. shiftCount: int8;
  7569. zSig2: bits64;
  7570. begin
  7571. if ( zSig0 = 0 ) then
  7572. begin
  7573. zSig0 := zSig1;
  7574. zSig1 := 0;
  7575. dec(zExp, 64);
  7576. end;
  7577. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7578. if ( 0 <= shiftCount ) then
  7579. begin
  7580. zSig2 := 0;
  7581. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7582. end
  7583. else begin
  7584. shift128ExtraRightJamming(
  7585. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7586. end;
  7587. dec(zExp, shiftCount);
  7588. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7589. end;
  7590. {*----------------------------------------------------------------------------
  7591. | Returns the result of converting the quadruple-precision floating-point
  7592. | value `a' to the 32-bit two's complement integer format. The conversion
  7593. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7594. | Arithmetic---which means in particular that the conversion is rounded
  7595. | according to the current rounding mode. If `a' is a NaN, the largest
  7596. | positive integer is returned. Otherwise, if the conversion overflows, the
  7597. | largest integer with the same sign as `a' is returned.
  7598. *----------------------------------------------------------------------------*}
  7599. function float128_to_int32(a: float128): int32;
  7600. var
  7601. aSign: flag;
  7602. aExp, shiftCount: int32;
  7603. aSig0, aSig1: bits64;
  7604. begin
  7605. aSig1 := extractFloat128Frac1( a );
  7606. aSig0 := extractFloat128Frac0( a );
  7607. aExp := extractFloat128Exp( a );
  7608. aSign := extractFloat128Sign( a );
  7609. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7610. aSign := 0;
  7611. if ( aExp<>0 ) then
  7612. aSig0 := aSig0 or int64( $0001000000000000 );
  7613. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7614. shiftCount := $4028 - aExp;
  7615. if ( 0 < shiftCount ) then
  7616. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7617. result := roundAndPackInt32( aSign, aSig0 );
  7618. end;
  7619. {*----------------------------------------------------------------------------
  7620. | Returns the result of converting the quadruple-precision floating-point
  7621. | value `a' to the 32-bit two's complement integer format. The conversion
  7622. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7623. | Arithmetic, except that the conversion is always rounded toward zero. If
  7624. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7625. | conversion overflows, the largest integer with the same sign as `a' is
  7626. | returned.
  7627. *----------------------------------------------------------------------------*}
  7628. function float128_to_int32_round_to_zero(a: float128): int32;
  7629. var
  7630. aSign: flag;
  7631. aExp, shiftCount: int32;
  7632. aSig0, aSig1, savedASig: bits64;
  7633. z: int32;
  7634. label
  7635. invalid;
  7636. begin
  7637. aSig1 := extractFloat128Frac1( a );
  7638. aSig0 := extractFloat128Frac0( a );
  7639. aExp := extractFloat128Exp( a );
  7640. aSign := extractFloat128Sign( a );
  7641. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7642. if ( $401E < aExp ) then
  7643. begin
  7644. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7645. aSign := 0;
  7646. goto invalid;
  7647. end
  7648. else if ( aExp < $3FFF ) then
  7649. begin
  7650. if ( aExp or aSig0 )<>0 then
  7651. set_inexact_flag;
  7652. result := 0;
  7653. exit;
  7654. end;
  7655. aSig0 := aSig0 or int64( $0001000000000000 );
  7656. shiftCount := $402F - aExp;
  7657. savedASig := aSig0;
  7658. aSig0 := aSig0 shr shiftCount;
  7659. z := aSig0;
  7660. if ( aSign )<>0 then
  7661. z := - z;
  7662. if ( ord( z < 0 ) xor aSign )<>0 then
  7663. begin
  7664. invalid:
  7665. float_raise( float_flag_invalid );
  7666. if aSign<>0 then
  7667. result:= int32( $80000000 )
  7668. else
  7669. result:=$7FFFFFFF;
  7670. exit;
  7671. end;
  7672. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7673. begin
  7674. set_inexact_flag;
  7675. end;
  7676. result := z;
  7677. end;
  7678. {*----------------------------------------------------------------------------
  7679. | Returns the result of converting the quadruple-precision floating-point
  7680. | value `a' to the 64-bit two's complement integer format. The conversion
  7681. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7682. | Arithmetic---which means in particular that the conversion is rounded
  7683. | according to the current rounding mode. If `a' is a NaN, the largest
  7684. | positive integer is returned. Otherwise, if the conversion overflows, the
  7685. | largest integer with the same sign as `a' is returned.
  7686. *----------------------------------------------------------------------------*}
  7687. function float128_to_int64(a: float128): int64;
  7688. var
  7689. aSign: flag;
  7690. aExp, shiftCount: int32;
  7691. aSig0, aSig1: bits64;
  7692. begin
  7693. aSig1 := extractFloat128Frac1( a );
  7694. aSig0 := extractFloat128Frac0( a );
  7695. aExp := extractFloat128Exp( a );
  7696. aSign := extractFloat128Sign( a );
  7697. if ( aExp<>0 ) then
  7698. aSig0 := aSig0 or int64( $0001000000000000 );
  7699. shiftCount := $402F - aExp;
  7700. if ( shiftCount <= 0 ) then
  7701. begin
  7702. if ( $403E < aExp ) then
  7703. begin
  7704. float_raise( float_flag_invalid );
  7705. if ( (aSign=0)
  7706. or ( ( aExp = $7FFF )
  7707. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7708. )
  7709. ) then
  7710. begin
  7711. result := int64( $7FFFFFFFFFFFFFFF );
  7712. exit;
  7713. end;
  7714. result := int64( $8000000000000000 );
  7715. exit;
  7716. end;
  7717. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7718. end
  7719. else begin
  7720. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7721. end;
  7722. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7723. end;
  7724. {*----------------------------------------------------------------------------
  7725. | Returns the result of converting the quadruple-precision floating-point
  7726. | value `a' to the 64-bit two's complement integer format. The conversion
  7727. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7728. | Arithmetic, except that the conversion is always rounded toward zero.
  7729. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7730. | the conversion overflows, the largest integer with the same sign as `a' is
  7731. | returned.
  7732. *----------------------------------------------------------------------------*}
  7733. function float128_to_int64_round_to_zero(a: float128): int64;
  7734. var
  7735. aSign: flag;
  7736. aExp, shiftCount: int32;
  7737. aSig0, aSig1: bits64;
  7738. z: int64;
  7739. begin
  7740. aSig1 := extractFloat128Frac1( a );
  7741. aSig0 := extractFloat128Frac0( a );
  7742. aExp := extractFloat128Exp( a );
  7743. aSign := extractFloat128Sign( a );
  7744. if ( aExp<>0 ) then
  7745. aSig0 := aSig0 or int64( $0001000000000000 );
  7746. shiftCount := aExp - $402F;
  7747. if ( 0 < shiftCount ) then
  7748. begin
  7749. if ( $403E <= aExp ) then
  7750. begin
  7751. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7752. if ( ( a.high = bits64( $C03E000000000000 ) )
  7753. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7754. begin
  7755. if ( aSig1<>0 ) then
  7756. set_inexact_flag;
  7757. end
  7758. else begin
  7759. float_raise( float_flag_invalid );
  7760. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7761. begin
  7762. result := int64( $7FFFFFFFFFFFFFFF );
  7763. exit;
  7764. end;
  7765. end;
  7766. result := int64( $8000000000000000 );
  7767. exit;
  7768. end;
  7769. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7770. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7771. begin
  7772. set_inexact_flag;
  7773. end;
  7774. end
  7775. else begin
  7776. if ( aExp < $3FFF ) then
  7777. begin
  7778. if ( aExp or aSig0 or aSig1 )<>0 then
  7779. begin
  7780. set_inexact_flag;
  7781. end;
  7782. result := 0;
  7783. exit;
  7784. end;
  7785. z := aSig0 shr ( - shiftCount );
  7786. if ( (aSig1<>0)
  7787. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7788. begin
  7789. set_inexact_flag;
  7790. end;
  7791. end;
  7792. if ( aSign<>0 ) then
  7793. z := - z;
  7794. result := z;
  7795. end;
  7796. {*----------------------------------------------------------------------------
  7797. | Returns the result of converting the quadruple-precision floating-point
  7798. | value `a' to the single-precision floating-point format. The conversion
  7799. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7800. | Arithmetic.
  7801. *----------------------------------------------------------------------------*}
  7802. function float128_to_float32(a: float128): float32;
  7803. var
  7804. aSign: flag;
  7805. aExp: int32;
  7806. aSig0, aSig1: bits64;
  7807. zSig: bits32;
  7808. begin
  7809. aSig1 := extractFloat128Frac1( a );
  7810. aSig0 := extractFloat128Frac0( a );
  7811. aExp := extractFloat128Exp( a );
  7812. aSign := extractFloat128Sign( a );
  7813. if ( aExp = $7FFF ) then
  7814. begin
  7815. if ( aSig0 or aSig1 )<>0 then
  7816. begin
  7817. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7818. exit;
  7819. end;
  7820. result := packFloat32( aSign, $FF, 0 );
  7821. exit;
  7822. end;
  7823. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7824. shift64RightJamming( aSig0, 18, aSig0 );
  7825. zSig := aSig0;
  7826. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7827. begin
  7828. zSig := zSig or $40000000;
  7829. dec(aExp,$3F81);
  7830. end;
  7831. result := roundAndPackFloat32( aSign, aExp, zSig );
  7832. end;
  7833. {*----------------------------------------------------------------------------
  7834. | Returns the result of converting the quadruple-precision floating-point
  7835. | value `a' to the double-precision floating-point format. The conversion
  7836. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7837. | Arithmetic.
  7838. *----------------------------------------------------------------------------*}
  7839. function float128_to_float64(a: float128): float64;
  7840. var
  7841. aSign: flag;
  7842. aExp: int32;
  7843. aSig0, aSig1: bits64;
  7844. begin
  7845. aSig1 := extractFloat128Frac1( a );
  7846. aSig0 := extractFloat128Frac0( a );
  7847. aExp := extractFloat128Exp( a );
  7848. aSign := extractFloat128Sign( a );
  7849. if ( aExp = $7FFF ) then
  7850. begin
  7851. if ( aSig0 or aSig1 )<>0 then
  7852. begin
  7853. result:=commonNaNToFloat64(float128ToCommonNaN(a));
  7854. exit;
  7855. end;
  7856. result:=packFloat64( aSign, $7FF, 0);
  7857. exit;
  7858. end;
  7859. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7860. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7861. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7862. begin
  7863. aSig0 := aSig0 or int64( $4000000000000000 );
  7864. dec(aExp,$3C01);
  7865. end;
  7866. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7867. end;
  7868. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7869. {*----------------------------------------------------------------------------
  7870. | Returns the result of converting the quadruple-precision floating-point
  7871. | value `a' to the extended double-precision floating-point format. The
  7872. | conversion is performed according to the IEC/IEEE Standard for Binary
  7873. | Floating-Point Arithmetic.
  7874. *----------------------------------------------------------------------------*}
  7875. function float128_to_floatx80(a: float128): floatx80;
  7876. var
  7877. aSign: flag;
  7878. aExp: int32;
  7879. aSig0, aSig1: bits64;
  7880. begin
  7881. aSig1 := extractFloat128Frac1( a );
  7882. aSig0 := extractFloat128Frac0( a );
  7883. aExp := extractFloat128Exp( a );
  7884. aSign := extractFloat128Sign( a );
  7885. if ( aExp = $7FFF ) then begin
  7886. if ( aSig0 or aSig1 <> 0 ) then begin
  7887. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7888. exit;
  7889. end;
  7890. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7891. exit;
  7892. end;
  7893. if ( aExp = 0 ) then begin
  7894. if ( ( aSig0 or aSig1 ) = 0 ) then
  7895. begin
  7896. result := packFloatx80( aSign, 0, 0 );
  7897. exit;
  7898. end;
  7899. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7900. end
  7901. else begin
  7902. aSig0 := aSig0 or int64( $0001000000000000 );
  7903. end;
  7904. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7905. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7906. end;
  7907. {$endif FPC_SOFTFLOAT_FLOATX80}
  7908. {*----------------------------------------------------------------------------
  7909. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7910. | Returns the result as a quadruple-precision floating-point value. The
  7911. | operation is performed according to the IEC/IEEE Standard for Binary
  7912. | Floating-Point Arithmetic.
  7913. *----------------------------------------------------------------------------*}
  7914. function float128_round_to_int(a: float128): float128;
  7915. var
  7916. aSign: flag;
  7917. aExp: int32;
  7918. lastBitMask, roundBitsMask: bits64;
  7919. roundingMode: TFPURoundingMode;
  7920. z: float128;
  7921. begin
  7922. aExp := extractFloat128Exp( a );
  7923. if ( $402F <= aExp ) then
  7924. begin
  7925. if ( $406F <= aExp ) then
  7926. begin
  7927. if ( ( aExp = $7FFF )
  7928. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7929. ) then
  7930. begin
  7931. result := propagateFloat128NaN( a, a );
  7932. exit;
  7933. end;
  7934. result := a;
  7935. exit;
  7936. end;
  7937. lastBitMask := 1;
  7938. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7939. roundBitsMask := lastBitMask - 1;
  7940. z := a;
  7941. roundingMode := softfloat_rounding_mode;
  7942. if ( roundingMode = float_round_nearest_even ) then
  7943. begin
  7944. if ( lastBitMask )<>0 then
  7945. begin
  7946. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7947. if ( ( z.low and roundBitsMask ) = 0 ) then
  7948. z.low := z.low and not(lastBitMask);
  7949. end
  7950. else begin
  7951. if ( sbits64(z.low) < 0 ) then
  7952. begin
  7953. inc(z.high);
  7954. if ( bits64( z.low shl 1 ) = 0 ) then
  7955. z.high := z.high and not bits64( 1 );
  7956. end;
  7957. end;
  7958. end
  7959. else if ( roundingMode <> float_round_to_zero ) then
  7960. begin
  7961. if ( extractFloat128Sign( z )
  7962. xor ord( roundingMode = float_round_up ) )<>0 then
  7963. begin
  7964. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7965. end;
  7966. end;
  7967. z.low := z.low and not(roundBitsMask);
  7968. end
  7969. else begin
  7970. if ( aExp < $3FFF ) then
  7971. begin
  7972. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7973. begin
  7974. result := a;
  7975. exit;
  7976. end;
  7977. set_inexact_flag;
  7978. aSign := extractFloat128Sign( a );
  7979. case softfloat_rounding_mode of
  7980. float_round_nearest_even:
  7981. if ( ( aExp = $3FFE )
  7982. and ( (extractFloat128Frac0( a )<>0)
  7983. or (extractFloat128Frac1( a )<>0) )
  7984. ) then begin
  7985. begin
  7986. result := packFloat128( aSign, $3FFF, 0, 0 );
  7987. exit;
  7988. end;
  7989. end;
  7990. float_round_down:
  7991. begin
  7992. if aSign<>0 then
  7993. result:=packFloat128( 1, $3FFF, 0, 0 )
  7994. else
  7995. result:=packFloat128( 0, 0, 0, 0 );
  7996. exit;
  7997. end;
  7998. float_round_up:
  7999. begin
  8000. if aSign<>0 then
  8001. result := packFloat128( 1, 0, 0, 0 )
  8002. else
  8003. result:=packFloat128( 0, $3FFF, 0, 0 );
  8004. exit;
  8005. end;
  8006. end;
  8007. result := packFloat128( aSign, 0, 0, 0 );
  8008. exit;
  8009. end;
  8010. lastBitMask := 1;
  8011. lastBitMask := lastBitMask shl ($402F - aExp);
  8012. roundBitsMask := lastBitMask - 1;
  8013. z.low := 0;
  8014. z.high := a.high;
  8015. roundingMode := softfloat_rounding_mode;
  8016. if ( roundingMode = float_round_nearest_even ) then begin
  8017. inc(z.high,lastBitMask shr 1);
  8018. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  8019. z.high := z.high and not(lastBitMask);
  8020. end;
  8021. end
  8022. else if ( roundingMode <> float_round_to_zero ) then begin
  8023. if ( (extractFloat128Sign( z )<>0)
  8024. xor ( roundingMode = float_round_up ) ) then begin
  8025. z.high := z.high or ord( a.low <> 0 );
  8026. z.high := z.high+roundBitsMask;
  8027. end;
  8028. end;
  8029. z.high := z.high and not(roundBitsMask);
  8030. end;
  8031. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  8032. set_inexact_flag;
  8033. end;
  8034. result := z;
  8035. end;
  8036. {*----------------------------------------------------------------------------
  8037. | Returns the result of adding the absolute values of the quadruple-precision
  8038. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  8039. | before being returned. `zSign' is ignored if the result is a NaN.
  8040. | The addition is performed according to the IEC/IEEE Standard for Binary
  8041. | Floating-Point Arithmetic.
  8042. *----------------------------------------------------------------------------*}
  8043. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  8044. var
  8045. aExp, bExp, zExp: int32;
  8046. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8047. expDiff: int32;
  8048. label
  8049. shiftRight1,roundAndPack;
  8050. begin
  8051. aSig1 := extractFloat128Frac1( a );
  8052. aSig0 := extractFloat128Frac0( a );
  8053. aExp := extractFloat128Exp( a );
  8054. bSig1 := extractFloat128Frac1( b );
  8055. bSig0 := extractFloat128Frac0( b );
  8056. bExp := extractFloat128Exp( b );
  8057. expDiff := aExp - bExp;
  8058. if ( 0 < expDiff ) then begin
  8059. if ( aExp = $7FFF ) then begin
  8060. if ( aSig0 or aSig1 )<>0 then
  8061. begin
  8062. result := propagateFloat128NaN( a, b );
  8063. exit;
  8064. end;
  8065. result := a;
  8066. exit;
  8067. end;
  8068. if ( bExp = 0 ) then begin
  8069. dec(expDiff);
  8070. end
  8071. else begin
  8072. bSig0 := bSig0 or int64( $0001000000000000 );
  8073. end;
  8074. shift128ExtraRightJamming(
  8075. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  8076. zExp := aExp;
  8077. end
  8078. else if ( expDiff < 0 ) then begin
  8079. if ( bExp = $7FFF ) then begin
  8080. if ( bSig0 or bSig1 )<>0 then
  8081. begin
  8082. result := propagateFloat128NaN( a, b );
  8083. exit;
  8084. end;
  8085. result := packFloat128( zSign, $7FFF, 0, 0 );
  8086. exit;
  8087. end;
  8088. if ( aExp = 0 ) then begin
  8089. inc(expDiff);
  8090. end
  8091. else begin
  8092. aSig0 := aSig0 or int64( $0001000000000000 );
  8093. end;
  8094. shift128ExtraRightJamming(
  8095. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8096. zExp := bExp;
  8097. end
  8098. else begin
  8099. if ( aExp = $7FFF ) then begin
  8100. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8101. result := propagateFloat128NaN( a, b );
  8102. exit;
  8103. end;
  8104. result := a;
  8105. exit;
  8106. end;
  8107. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8108. if ( aExp = 0 ) then
  8109. begin
  8110. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8111. exit;
  8112. end;
  8113. zSig2 := 0;
  8114. zSig0 := zSig0 or int64( $0002000000000000 );
  8115. zExp := aExp;
  8116. goto shiftRight1;
  8117. end;
  8118. aSig0 := aSig0 or int64( $0001000000000000 );
  8119. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8120. dec(zExp);
  8121. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8122. inc(zExp);
  8123. shiftRight1:
  8124. shift128ExtraRightJamming(
  8125. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8126. roundAndPack:
  8127. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8128. end;
  8129. {*----------------------------------------------------------------------------
  8130. | Returns the result of subtracting the absolute values of the quadruple-
  8131. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8132. | difference is negated before being returned. `zSign' is ignored if the
  8133. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8134. | Standard for Binary Floating-Point Arithmetic.
  8135. *----------------------------------------------------------------------------*}
  8136. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8137. var
  8138. aExp, bExp, zExp: int32;
  8139. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8140. expDiff: int32;
  8141. z: float128;
  8142. label
  8143. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8144. begin
  8145. aSig1 := extractFloat128Frac1( a );
  8146. aSig0 := extractFloat128Frac0( a );
  8147. aExp := extractFloat128Exp( a );
  8148. bSig1 := extractFloat128Frac1( b );
  8149. bSig0 := extractFloat128Frac0( b );
  8150. bExp := extractFloat128Exp( b );
  8151. expDiff := aExp - bExp;
  8152. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8153. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8154. if ( 0 < expDiff ) then goto aExpBigger;
  8155. if ( expDiff < 0 ) then goto bExpBigger;
  8156. if ( aExp = $7FFF ) then begin
  8157. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8158. result := propagateFloat128NaN( a, b );
  8159. exit;
  8160. end;
  8161. float_raise( float_flag_invalid );
  8162. z.low := float128_default_nan_low;
  8163. z.high := float128_default_nan_high;
  8164. result := z;
  8165. exit;
  8166. end;
  8167. if ( aExp = 0 ) then begin
  8168. aExp := 1;
  8169. bExp := 1;
  8170. end;
  8171. if ( bSig0 < aSig0 ) then goto aBigger;
  8172. if ( aSig0 < bSig0 ) then goto bBigger;
  8173. if ( bSig1 < aSig1 ) then goto aBigger;
  8174. if ( aSig1 < bSig1 ) then goto bBigger;
  8175. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8176. exit;
  8177. bExpBigger:
  8178. if ( bExp = $7FFF ) then begin
  8179. if ( bSig0 or bSig1 )<>0 then
  8180. begin
  8181. result := propagateFloat128NaN( a, b );
  8182. exit;
  8183. end;
  8184. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8185. exit;
  8186. end;
  8187. if ( aExp = 0 ) then begin
  8188. inc(expDiff);
  8189. end
  8190. else begin
  8191. aSig0 := aSig0 or int64( $4000000000000000 );
  8192. end;
  8193. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8194. bSig0 := bSig0 or int64( $4000000000000000 );
  8195. bBigger:
  8196. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8197. zExp := bExp;
  8198. zSign := zSign xor 1;
  8199. goto normalizeRoundAndPack;
  8200. aExpBigger:
  8201. if ( aExp = $7FFF ) then begin
  8202. if ( aSig0 or aSig1 )<>0 then
  8203. begin
  8204. result := propagateFloat128NaN( a, b );
  8205. exit;
  8206. end;
  8207. result := a;
  8208. exit;
  8209. end;
  8210. if ( bExp = 0 ) then begin
  8211. dec(expDiff);
  8212. end
  8213. else begin
  8214. bSig0 := bSig0 or int64( $4000000000000000 );
  8215. end;
  8216. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8217. aSig0 := aSig0 or int64( $4000000000000000 );
  8218. aBigger:
  8219. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8220. zExp := aExp;
  8221. normalizeRoundAndPack:
  8222. dec(zExp);
  8223. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8224. end;
  8225. {*----------------------------------------------------------------------------
  8226. | Returns the result of adding the quadruple-precision floating-point values
  8227. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8228. | for Binary Floating-Point Arithmetic.
  8229. *----------------------------------------------------------------------------*}
  8230. function float128_add(a: float128; b: float128): float128;
  8231. var
  8232. aSign, bSign: flag;
  8233. begin
  8234. aSign := extractFloat128Sign( a );
  8235. bSign := extractFloat128Sign( b );
  8236. if ( aSign = bSign ) then begin
  8237. result := addFloat128Sigs( a, b, aSign );
  8238. end
  8239. else begin
  8240. result := subFloat128Sigs( a, b, aSign );
  8241. end;
  8242. end;
  8243. {*----------------------------------------------------------------------------
  8244. | Returns the result of subtracting the quadruple-precision floating-point
  8245. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8246. | Standard for Binary Floating-Point Arithmetic.
  8247. *----------------------------------------------------------------------------*}
  8248. function float128_sub(a: float128; b: float128): float128;
  8249. var
  8250. aSign, bSign: flag;
  8251. begin
  8252. aSign := extractFloat128Sign( a );
  8253. bSign := extractFloat128Sign( b );
  8254. if ( aSign = bSign ) then begin
  8255. result := subFloat128Sigs( a, b, aSign );
  8256. end
  8257. else begin
  8258. result := addFloat128Sigs( a, b, aSign );
  8259. end;
  8260. end;
  8261. {*----------------------------------------------------------------------------
  8262. | Returns the result of multiplying the quadruple-precision floating-point
  8263. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8264. | Standard for Binary Floating-Point Arithmetic.
  8265. *----------------------------------------------------------------------------*}
  8266. function float128_mul(a: float128; b: float128): float128;
  8267. var
  8268. aSign, bSign, zSign: flag;
  8269. aExp, bExp, zExp: int32;
  8270. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8271. z: float128;
  8272. label
  8273. invalid;
  8274. begin
  8275. aSig1 := extractFloat128Frac1( a );
  8276. aSig0 := extractFloat128Frac0( a );
  8277. aExp := extractFloat128Exp( a );
  8278. aSign := extractFloat128Sign( a );
  8279. bSig1 := extractFloat128Frac1( b );
  8280. bSig0 := extractFloat128Frac0( b );
  8281. bExp := extractFloat128Exp( b );
  8282. bSign := extractFloat128Sign( b );
  8283. zSign := aSign xor bSign;
  8284. if ( aExp = $7FFF ) then begin
  8285. if ( (( aSig0 or aSig1 )<>0)
  8286. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8287. result := propagateFloat128NaN( a, b );
  8288. exit;
  8289. end;
  8290. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8291. result := packFloat128( zSign, $7FFF, 0, 0 );
  8292. exit;
  8293. end;
  8294. if ( bExp = $7FFF ) then begin
  8295. if ( bSig0 or bSig1 )<>0 then
  8296. begin
  8297. result := propagateFloat128NaN( a, b );
  8298. exit;
  8299. end;
  8300. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8301. invalid:
  8302. float_raise( float_flag_invalid );
  8303. z.low := float128_default_nan_low;
  8304. z.high := float128_default_nan_high;
  8305. result := z;
  8306. exit;
  8307. end;
  8308. result := packFloat128( zSign, $7FFF, 0, 0 );
  8309. exit;
  8310. end;
  8311. if ( aExp = 0 ) then begin
  8312. if ( ( aSig0 or aSig1 ) = 0 ) then
  8313. begin
  8314. result := packFloat128( zSign, 0, 0, 0 );
  8315. exit;
  8316. end;
  8317. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8318. end;
  8319. if ( bExp = 0 ) then begin
  8320. if ( ( bSig0 or bSig1 ) = 0 ) then
  8321. begin
  8322. result := packFloat128( zSign, 0, 0, 0 );
  8323. exit;
  8324. end;
  8325. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8326. end;
  8327. zExp := aExp + bExp - $4000;
  8328. aSig0 := aSig0 or int64( $0001000000000000 );
  8329. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8330. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8331. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8332. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8333. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8334. shift128ExtraRightJamming(
  8335. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8336. inc(zExp);
  8337. end;
  8338. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8339. end;
  8340. {*----------------------------------------------------------------------------
  8341. | Returns the result of dividing the quadruple-precision floating-point value
  8342. | `a' by the corresponding value `b'. The operation is performed according to
  8343. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8344. *----------------------------------------------------------------------------*}
  8345. function float128_div(a: float128; b: float128): float128;
  8346. var
  8347. aSign, bSign, zSign: flag;
  8348. aExp, bExp, zExp: int32;
  8349. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8350. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8351. z: float128;
  8352. label
  8353. invalid;
  8354. begin
  8355. aSig1 := extractFloat128Frac1( a );
  8356. aSig0 := extractFloat128Frac0( a );
  8357. aExp := extractFloat128Exp( a );
  8358. aSign := extractFloat128Sign( a );
  8359. bSig1 := extractFloat128Frac1( b );
  8360. bSig0 := extractFloat128Frac0( b );
  8361. bExp := extractFloat128Exp( b );
  8362. bSign := extractFloat128Sign( b );
  8363. zSign := aSign xor bSign;
  8364. if ( aExp = $7FFF ) then begin
  8365. if ( aSig0 or aSig1 )<>0 then
  8366. begin
  8367. result := propagateFloat128NaN( a, b );
  8368. exit;
  8369. end;
  8370. if ( bExp = $7FFF ) then begin
  8371. if ( bSig0 or bSig1 )<>0 then
  8372. begin
  8373. result := propagateFloat128NaN( a, b );
  8374. exit;
  8375. end;
  8376. goto invalid;
  8377. end;
  8378. result := packFloat128( zSign, $7FFF, 0, 0 );
  8379. exit;
  8380. end;
  8381. if ( bExp = $7FFF ) then begin
  8382. if ( bSig0 or bSig1 )<>0 then
  8383. begin
  8384. result := propagateFloat128NaN( a, b );
  8385. exit;
  8386. end;
  8387. result := packFloat128( zSign, 0, 0, 0 );
  8388. exit;
  8389. end;
  8390. if ( bExp = 0 ) then begin
  8391. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8392. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8393. invalid:
  8394. float_raise( float_flag_invalid );
  8395. z.low := float128_default_nan_low;
  8396. z.high := float128_default_nan_high;
  8397. result := z;
  8398. exit;
  8399. end;
  8400. float_raise( float_flag_divbyzero );
  8401. result := packFloat128( zSign, $7FFF, 0, 0 );
  8402. exit;
  8403. end;
  8404. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8405. end;
  8406. if ( aExp = 0 ) then begin
  8407. if ( ( aSig0 or aSig1 ) = 0 ) then
  8408. begin
  8409. result := packFloat128( zSign, 0, 0, 0 );
  8410. exit;
  8411. end;
  8412. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8413. end;
  8414. zExp := aExp - bExp + $3FFD;
  8415. shortShift128Left(
  8416. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8417. shortShift128Left(
  8418. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8419. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8420. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8421. inc(zExp);
  8422. end;
  8423. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8424. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8425. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8426. while ( sbits64(rem0) < 0 ) do begin
  8427. dec(zSig0);
  8428. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8429. end;
  8430. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8431. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8432. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8433. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8434. while ( sbits64(rem1) < 0 ) do begin
  8435. dec(zSig1);
  8436. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8437. end;
  8438. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8439. end;
  8440. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8441. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8442. end;
  8443. {*----------------------------------------------------------------------------
  8444. | Returns the remainder of the quadruple-precision floating-point value `a'
  8445. | with respect to the corresponding value `b'. The operation is performed
  8446. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8447. *----------------------------------------------------------------------------*}
  8448. function float128_rem(a: float128; b: float128): float128;
  8449. var
  8450. aSign, zSign: flag;
  8451. aExp, bExp, expDiff: int32;
  8452. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8453. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8454. sigMean0: sbits64;
  8455. z: float128;
  8456. label
  8457. invalid;
  8458. begin
  8459. aSig1 := extractFloat128Frac1( a );
  8460. aSig0 := extractFloat128Frac0( a );
  8461. aExp := extractFloat128Exp( a );
  8462. aSign := extractFloat128Sign( a );
  8463. bSig1 := extractFloat128Frac1( b );
  8464. bSig0 := extractFloat128Frac0( b );
  8465. bExp := extractFloat128Exp( b );
  8466. if ( aExp = $7FFF ) then begin
  8467. if ( (( aSig0 or aSig1 )<>0)
  8468. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8469. result := propagateFloat128NaN( a, b );
  8470. exit;
  8471. end;
  8472. goto invalid;
  8473. end;
  8474. if ( bExp = $7FFF ) then begin
  8475. if ( bSig0 or bSig1 )<>0 then
  8476. begin
  8477. result := propagateFloat128NaN( a, b );
  8478. exit;
  8479. end;
  8480. result := a;
  8481. exit;
  8482. end;
  8483. if ( bExp = 0 ) then begin
  8484. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8485. invalid:
  8486. float_raise( float_flag_invalid );
  8487. z.low := float128_default_nan_low;
  8488. z.high := float128_default_nan_high;
  8489. result := z;
  8490. exit;
  8491. end;
  8492. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8493. end;
  8494. if ( aExp = 0 ) then begin
  8495. if ( ( aSig0 or aSig1 ) = 0 ) then
  8496. begin
  8497. result := a;
  8498. exit;
  8499. end;
  8500. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8501. end;
  8502. expDiff := aExp - bExp;
  8503. if ( expDiff < -1 ) then
  8504. begin
  8505. result := a;
  8506. exit;
  8507. end;
  8508. shortShift128Left(
  8509. aSig0 or int64( $0001000000000000 ),
  8510. aSig1,
  8511. 15 - ord( expDiff < 0 ),
  8512. aSig0,
  8513. aSig1
  8514. );
  8515. shortShift128Left(
  8516. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8517. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8518. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8519. dec(expDiff,64);
  8520. while ( 0 < expDiff ) do begin
  8521. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8522. if ( 4 < q ) then
  8523. q := q - 4
  8524. else
  8525. q := 0;
  8526. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8527. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8528. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8529. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8530. dec(expDiff,61);
  8531. end;
  8532. if ( -64 < expDiff ) then begin
  8533. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8534. if ( 4 < q ) then
  8535. q := q - 4
  8536. else
  8537. q := 0;
  8538. q := q shr (- expDiff);
  8539. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8540. inc(expDiff,52);
  8541. if ( expDiff < 0 ) then begin
  8542. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8543. end
  8544. else begin
  8545. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8546. end;
  8547. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8548. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8549. end
  8550. else begin
  8551. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8552. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8553. end;
  8554. repeat
  8555. alternateASig0 := aSig0;
  8556. alternateASig1 := aSig1;
  8557. inc(q);
  8558. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8559. until not( 0 <= sbits64(aSig0) );
  8560. add128(
  8561. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8562. if ( ( sigMean0 < 0 )
  8563. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8564. aSig0 := alternateASig0;
  8565. aSig1 := alternateASig1;
  8566. end;
  8567. zSign := ord( sbits64(aSig0) < 0 );
  8568. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8569. result :=
  8570. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8571. end;
  8572. {*----------------------------------------------------------------------------
  8573. | Returns the square root of the quadruple-precision floating-point value `a'.
  8574. | The operation is performed according to the IEC/IEEE Standard for Binary
  8575. | Floating-Point Arithmetic.
  8576. *----------------------------------------------------------------------------*}
  8577. function float128_sqrt(a: float128): float128;
  8578. var
  8579. aSign: flag;
  8580. aExp, zExp: int32;
  8581. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8582. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8583. z: float128;
  8584. label
  8585. invalid;
  8586. begin
  8587. aSig1 := extractFloat128Frac1( a );
  8588. aSig0 := extractFloat128Frac0( a );
  8589. aExp := extractFloat128Exp( a );
  8590. aSign := extractFloat128Sign( a );
  8591. if ( aExp = $7FFF ) then begin
  8592. if ( aSig0 or aSig1 )<>0 then
  8593. begin
  8594. result := propagateFloat128NaN( a, a );
  8595. exit;
  8596. end;
  8597. if ( aSign=0 ) then
  8598. begin
  8599. result := a;
  8600. exit;
  8601. end;
  8602. goto invalid;
  8603. end;
  8604. if ( aSign<>0 ) then begin
  8605. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8606. begin
  8607. result := a;
  8608. exit;
  8609. end;
  8610. invalid:
  8611. float_raise( float_flag_invalid );
  8612. z.low := float128_default_nan_low;
  8613. z.high := float128_default_nan_high;
  8614. result := z;
  8615. exit;
  8616. end;
  8617. if ( aExp = 0 ) then begin
  8618. if ( ( aSig0 or aSig1 ) = 0 ) then
  8619. begin
  8620. result := packFloat128( 0, 0, 0, 0 );
  8621. exit;
  8622. end;
  8623. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8624. end;
  8625. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8626. aSig0 := aSig0 or int64( $0001000000000000 );
  8627. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8628. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8629. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8630. doubleZSig0 := zSig0 shl 1;
  8631. mul64To128( zSig0, zSig0, term0, term1 );
  8632. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8633. while ( sbits64(rem0) < 0 ) do begin
  8634. dec(zSig0);
  8635. dec(doubleZSig0,2);
  8636. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8637. end;
  8638. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8639. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8640. if ( zSig1 = 0 ) then zSig1 := 1;
  8641. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8642. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8643. mul64To128( zSig1, zSig1, term2, term3 );
  8644. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8645. while ( sbits64(rem1) < 0 ) do begin
  8646. dec(zSig1);
  8647. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8648. term3 := term3 or 1;
  8649. term2 := term2 or doubleZSig0;
  8650. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8651. end;
  8652. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8653. end;
  8654. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8655. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8656. end;
  8657. {*----------------------------------------------------------------------------
  8658. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8659. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8660. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8661. *----------------------------------------------------------------------------*}
  8662. function float128_eq(a: float128; b: float128): flag;
  8663. begin
  8664. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8665. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8666. or ( ( extractFloat128Exp( b ) = $7FFF )
  8667. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8668. ) then begin
  8669. if ( (float128_is_signaling_nan( a )<>0)
  8670. or (float128_is_signaling_nan( b )<>0) ) then begin
  8671. float_raise( float_flag_invalid );
  8672. end;
  8673. result := 0;
  8674. exit;
  8675. end;
  8676. result := ord(
  8677. ( a.low = b.low )
  8678. and ( ( a.high = b.high )
  8679. or ( ( a.low = 0 )
  8680. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8681. ));
  8682. end;
  8683. {*----------------------------------------------------------------------------
  8684. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8685. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8686. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8687. | Arithmetic.
  8688. *----------------------------------------------------------------------------*}
  8689. function float128_le(a: float128; b: float128): flag;
  8690. var
  8691. aSign, bSign: flag;
  8692. begin
  8693. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8694. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8695. or ( ( extractFloat128Exp( b ) = $7FFF )
  8696. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8697. ) then begin
  8698. float_raise( float_flag_invalid );
  8699. result := 0;
  8700. exit;
  8701. end;
  8702. aSign := extractFloat128Sign( a );
  8703. bSign := extractFloat128Sign( b );
  8704. if ( aSign <> bSign ) then begin
  8705. result := ord(
  8706. (aSign<>0)
  8707. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8708. = 0 ));
  8709. exit;
  8710. end;
  8711. if aSign<>0 then
  8712. result := le128( b.high, b.low, a.high, a.low )
  8713. else
  8714. result := le128( a.high, a.low, b.high, b.low );
  8715. end;
  8716. {*----------------------------------------------------------------------------
  8717. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8718. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8719. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8720. *----------------------------------------------------------------------------*}
  8721. function float128_lt(a: float128; b: float128): flag;
  8722. var
  8723. aSign, bSign: flag;
  8724. begin
  8725. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8726. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8727. or ( ( extractFloat128Exp( b ) = $7FFF )
  8728. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8729. ) then begin
  8730. float_raise( float_flag_invalid );
  8731. result := 0;
  8732. exit;
  8733. end;
  8734. aSign := extractFloat128Sign( a );
  8735. bSign := extractFloat128Sign( b );
  8736. if ( aSign <> bSign ) then begin
  8737. result := ord(
  8738. (aSign<>0)
  8739. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8740. <> 0 ));
  8741. exit;
  8742. end;
  8743. if aSign<>0 then
  8744. result := lt128( b.high, b.low, a.high, a.low )
  8745. else
  8746. result := lt128( a.high, a.low, b.high, b.low );
  8747. end;
  8748. {*----------------------------------------------------------------------------
  8749. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8750. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8751. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8752. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8753. *----------------------------------------------------------------------------*}
  8754. function float128_eq_signaling(a: float128; b: float128): flag;
  8755. begin
  8756. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8757. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8758. or ( ( extractFloat128Exp( b ) = $7FFF )
  8759. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8760. ) then begin
  8761. float_raise( float_flag_invalid );
  8762. result := 0;
  8763. exit;
  8764. end;
  8765. result := ord(
  8766. ( a.low = b.low )
  8767. and ( ( a.high = b.high )
  8768. or ( ( a.low = 0 )
  8769. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8770. ));
  8771. end;
  8772. {*----------------------------------------------------------------------------
  8773. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8774. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8775. | cause an exception. Otherwise, the comparison is performed according to the
  8776. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8777. *----------------------------------------------------------------------------*}
  8778. function float128_le_quiet(a: float128; b: float128): flag;
  8779. var
  8780. aSign, bSign: flag;
  8781. begin
  8782. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8783. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8784. or ( ( extractFloat128Exp( b ) = $7FFF )
  8785. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8786. ) then begin
  8787. if ( (float128_is_signaling_nan( a )<>0)
  8788. or (float128_is_signaling_nan( b )<>0) ) then begin
  8789. float_raise( float_flag_invalid );
  8790. end;
  8791. result := 0;
  8792. exit;
  8793. end;
  8794. aSign := extractFloat128Sign( a );
  8795. bSign := extractFloat128Sign( b );
  8796. if ( aSign <> bSign ) then begin
  8797. result := ord(
  8798. (aSign<>0)
  8799. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8800. = 0 ));
  8801. exit;
  8802. end;
  8803. if aSign<>0 then
  8804. result := le128( b.high, b.low, a.high, a.low )
  8805. else
  8806. result := le128( a.high, a.low, b.high, b.low );
  8807. end;
  8808. {*----------------------------------------------------------------------------
  8809. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8810. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8811. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8812. | Standard for Binary Floating-Point Arithmetic.
  8813. *----------------------------------------------------------------------------*}
  8814. function float128_lt_quiet(a: float128; b: float128): flag;
  8815. var
  8816. aSign, bSign: flag;
  8817. begin
  8818. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8819. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8820. or ( ( extractFloat128Exp( b ) = $7FFF )
  8821. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8822. ) then begin
  8823. if ( (float128_is_signaling_nan( a )<>0)
  8824. or (float128_is_signaling_nan( b )<>0) ) then begin
  8825. float_raise( float_flag_invalid );
  8826. end;
  8827. result := 0;
  8828. exit;
  8829. end;
  8830. aSign := extractFloat128Sign( a );
  8831. bSign := extractFloat128Sign( b );
  8832. if ( aSign <> bSign ) then begin
  8833. result := ord(
  8834. (aSign<>0)
  8835. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8836. <> 0 ));
  8837. exit;
  8838. end;
  8839. if aSign<>0 then
  8840. result:=lt128( b.high, b.low, a.high, a.low )
  8841. else
  8842. result:=lt128( a.high, a.low, b.high, b.low );
  8843. end;
  8844. {----------------------------------------------------------------------------
  8845. | Returns the result of converting the double-precision floating-point value
  8846. | `a' to the quadruple-precision floating-point format. The conversion is
  8847. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8848. | Arithmetic.
  8849. *----------------------------------------------------------------------------}
  8850. function float64_to_float128( a : float64) : float128;
  8851. var
  8852. aSign : flag;
  8853. aExp : int16;
  8854. aSig, zSig0, zSig1 : bits64;
  8855. begin
  8856. aSig := extractFloat64Frac( a );
  8857. aExp := extractFloat64Exp( a );
  8858. aSign := extractFloat64Sign( a );
  8859. if ( aExp = $7FF ) then begin
  8860. if ( aSig<>0 ) then begin
  8861. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8862. exit;
  8863. end;
  8864. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8865. exit;
  8866. end;
  8867. if ( aExp = 0 ) then begin
  8868. if ( aSig = 0 ) then
  8869. begin
  8870. result:=packFloat128( aSign, 0, 0, 0 );
  8871. exit;
  8872. end;
  8873. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8874. dec(aExp);
  8875. end;
  8876. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8877. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8878. end;
  8879. {$endif FPC_SOFTFLOAT_FLOAT128}
  8880. {$ifdef FPC_SOFTFLOAT_FLOATX80_FUNCS}
  8881. {$I f80sincos.inc}
  8882. {$endif FPC_SOFTFLOAT_FLOATX80_FUNCS}
  8883. {$endif not(defined(fpc_softfpu_interface))}
  8884. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8885. end.
  8886. {$ifdef FPC}
  8887. { restore context modified at implmentation start
  8888. to possibly re-enable range and overflow checking explicitly}
  8889. {$pop}
  8890. {$endif FPC}
  8891. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}