123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993699469956996699769986999700070017002700370047005700670077008700970107011701270137014701570167017701870197020702170227023702470257026702770287029703070317032703370347035703670377038703970407041704270437044704570467047704870497050705170527053705470557056705770587059706070617062706370647065706670677068706970707071707270737074707570767077707870797080708170827083708470857086708770887089709070917092709370947095709670977098709971007101710271037104710571067107710871097110711171127113711471157116711771187119712071217122712371247125712671277128712971307131713271337134713571367137713871397140714171427143714471457146714771487149715071517152715371547155715671577158715971607161716271637164716571667167716871697170717171727173717471757176717771787179718071817182718371847185718671877188718971907191719271937194719571967197719871997200720172027203720472057206720772087209721072117212721372147215721672177218721972207221722272237224722572267227722872297230723172327233723472357236723772387239724072417242724372447245724672477248724972507251725272537254725572567257725872597260726172627263726472657266726772687269727072717272727372747275727672777278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768277827882798280828182828283828482858286828782888289829082918292829382948295829682978298829983008301830283038304830583068307830883098310831183128313831483158316831783188319832083218322832383248325832683278328832983308331833283338334833583368337833883398340834183428343834483458346834783488349835083518352835383548355835683578358835983608361836283638364836583668367836883698370837183728373837483758376837783788379838083818382838383848385838683878388838983908391839283938394839583968397839883998400840184028403840484058406840784088409841084118412841384148415841684178418841984208421842284238424842584268427842884298430843184328433843484358436843784388439844084418442844384448445844684478448844984508451845284538454845584568457845884598460846184628463846484658466846784688469847084718472847384748475847684778478847984808481848284838484848584868487848884898490849184928493849484958496849784988499850085018502850385048505850685078508850985108511851285138514851585168517851885198520852185228523852485258526852785288529853085318532853385348535853685378538853985408541854285438544854585468547854885498550855185528553855485558556855785588559856085618562856385648565856685678568856985708571857285738574857585768577857885798580858185828583858485858586858785888589859085918592859385948595859685978598859986008601860286038604860586068607860886098610861186128613861486158616861786188619862086218622862386248625862686278628862986308631863286338634863586368637863886398640864186428643864486458646864786488649865086518652865386548655865686578658865986608661866286638664866586668667866886698670867186728673867486758676867786788679868086818682868386848685868686878688868986908691869286938694869586968697869886998700870187028703870487058706870787088709871087118712871387148715871687178718871987208721872287238724872587268727872887298730873187328733873487358736873787388739874087418742874387448745874687478748874987508751875287538754875587568757875887598760876187628763876487658766876787688769877087718772877387748775877687778778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927792789279928092819282928392849285928692879288928992909291929292939294929592969297929892999300930193029303930493059306930793089309931093119312931393149315931693179318931993209321932293239324932593269327932893299330933193329333933493359336933793389339934093419342934393449345934693479348934993509351935293539354935593569357935893599360936193629363936493659366936793689369937093719372937393749375937693779378937993809381938293839384938593869387 |
- {*
- ===============================================================================
- The original notice of the softfloat package is shown below. The conversion
- to pascal was done by Carl Eric Codere in 2002 ([email protected]).
- ===============================================================================
- This C source file is part of the SoftFloat IEC/IEEE Floating-Point
- Arithmetic Package, Release 2a.
- Written by John R. Hauser. This work was made possible in part by the
- International Computer Science Institute, located at Suite 600, 1947 Center
- Street, Berkeley, California 94704. Funding was partially provided by the
- National Science Foundation under grant MIP-9311980. The original version
- of this code was written as part of a project to build a fixed-point vector
- processor in collaboration with the University of California at Berkeley,
- overseen by Profs. Nelson Morgan and John Wawrzynek. More information
- is available through the Web page
- `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
- THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
- has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
- TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
- PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
- AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
- Derivative works are acceptable, even for commercial purposes, so long as
- (1) they include prominent notice that the work is derivative, and (2) they
- include prominent notice akin to these four paragraphs for those parts of
- this code that are retained.
- ===============================================================================
- The float80 and float128 part is translated from the softfloat package
- by Florian Klaempfl and contained the following copyright notice
- The code might contain some duplicate stuff because the floatx80/float128 port was
- done based on the 64 bit enabled softfloat code.
- ===============================================================================
- This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
- Package, Release 2b.
- Written by John R. Hauser. This work was made possible in part by the
- International Computer Science Institute, located at Suite 600, 1947 Center
- Street, Berkeley, California 94704. Funding was partially provided by the
- National Science Foundation under grant MIP-9311980. The original version
- of this code was written as part of a project to build a fixed-point vector
- processor in collaboration with the University of California at Berkeley,
- overseen by Profs. Nelson Morgan and John Wawrzynek. More information
- is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
- arithmetic/SoftFloat.html'.
- THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
- been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
- RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
- AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
- COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
- EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
- INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
- OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
- Derivative works are acceptable, even for commercial purposes, so long as
- (1) the source code for the derivative work includes prominent notice that
- the work is derivative, and (2) the source code includes prominent notice with
- these four paragraphs for those parts of this code that are retained.
- ===============================================================================
- *}
- { $define FPC_SOFTFLOAT_FLOATX80}
- { $define FPC_SOFTFLOAT_FLOAT128}
- { the softfpu unit can be also embedded directly into the system unit }
- {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
- {$mode objfpc}
- unit softfpu;
- { Overflow checking must be disabled,
- since some operations expect overflow!
- }
- {$Q-}
- {$goto on}
- {$macro on}
- {$define compilerproc:=stdcall }
- interface
- {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
- {$if not(defined(fpc_softfpu_implementation))}
- {
- -------------------------------------------------------------------------------
- Software IEC/IEEE floating-point types.
- -------------------------------------------------------------------------------
- }
- TYPE
- float32 = longword;
- {$define FPC_SYSTEM_HAS_float32}
- { we use here a record in the function header because
- the record allows bitwise conversion to single }
- float32rec = record
- float32 : float32;
- end;
- flag = byte;
- bits8 = byte;
- sbits8 = shortint;
- bits16 = word;
- sbits16 = smallint;
- sbits32 = longint;
- bits32 = longword;
- {$ifndef fpc}
- qword = int64;
- {$endif}
- { now part of the system unit
- uint64 = qword;
- }
- bits64 = qword;
- sbits64 = int64;
- {$ifdef ENDIAN_LITTLE}
- float64 = record
- case byte of
- 1: (low,high : bits32);
- // force the record to be aligned like a double
- // else *_to_double will fail for cpus like sparc
- // and avoid expensive unpacking/packing operations
- 2: (dummy : double);
- end;
- floatx80 = record
- case byte of
- 1: (low : qword;high : word);
- // force the record to be aligned like a double
- // else *_to_double will fail for cpus like sparc
- // and avoid expensive unpacking/packing operations
- 2: (dummy : extended);
- end;
- float128 = record
- case byte of
- 1: (low,high : qword);
- // force the record to be aligned like a double
- // else *_to_double will fail for cpus like sparc
- // and avoid expensive unpacking/packing operations
- 2: (dummy : qword);
- end;
- {$else}
- float64 = record
- case byte of
- 1: (high,low : bits32);
- // force the record to be aligned like a double
- // else *_to_double will fail for cpus like sparc
- 2: (dummy : double);
- end;
- floatx80 = record
- case byte of
- 1: (high : word;low : qword);
- // force the record to be aligned like a double
- // else *_to_double will fail for cpus like sparc
- // and avoid expensive unpacking/packing operations
- 2: (dummy : qword);
- end;
- float128 = record
- case byte of
- 1: (high : qword;low : qword);
- // force the record to be aligned like a double
- // else *_to_double will fail for cpus like sparc
- // and avoid expensive unpacking/packing operations
- 2: (dummy : qword);
- end;
- {$endif}
- {$define FPC_SYSTEM_HAS_float64}
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_lt(a: float64;b: float64): flag; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- or equal to the corresponding value `b', and 0 otherwise. The comparison
- is performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_le(a: float64;b: float64): flag; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_eq(a: float64;b: float64): flag; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the square root of the double-precision floating-point value `a'.
- The operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- function float64_sqrt( a: float64 ): float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the remainder of the double-precision floating-point value `a'
- with respect to the corresponding value `b'. The operation is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_rem(a: float64; b : float64) : float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of dividing the double-precision floating-point value `a'
- by the corresponding value `b'. The operation is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_div(a: float64; b : float64) : float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of multiplying the double-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_mul( a: float64; b:float64) : float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the double-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_sub(a: float64; b : float64) : float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the double-precision floating-point values `a'
- and `b'. The operation is performed according to the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_add( a: float64; b : float64) : float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Rounds the double-precision floating-point value `a' to an integer,
- and returns the result as a double-precision floating-point value. The
- operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_round_to_int(a: float64) : float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the single-precision floating-point format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_float32(a: float64) : float32rec; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic, except that the conversion is always rounded toward zero.
- If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- the conversion overflows, the largest integer with the same sign as `a' is
- returned.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic---which means in particular that the conversion is rounded
- according to the current rounding mode. If `a' is a NaN, the largest
- positive integer is returned. Otherwise, if the conversion overflows, the
- largest integer with the same sign as `a' is returned.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_int32(a: float64): int32; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- or equal to the corresponding value `b', and 0 otherwise. The comparison
- is performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the square root of the single-precision floating-point value `a'.
- The operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the remainder of the single-precision floating-point value `a'
- with respect to the corresponding value `b'. The operation is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of dividing the single-precision floating-point value `a'
- by the corresponding value `b'. The operation is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of multiplying the single-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the single-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the single-precision floating-point values `a'
- and `b'. The operation is performed according to the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Rounds the single-precision floating-point value `a' to an integer,
- and returns the result as a single-precision floating-point value. The
- operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the double-precision floating-point format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_float64( a : float32rec) : Float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic, except that the conversion is always rounded toward zero.
- If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- the conversion overflows, the largest integer with the same sign as `a' is
- returned.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic---which means in particular that the conversion is rounded
- according to the current rounding mode. If `a' is a NaN, the largest
- positive integer is returned. Otherwise, if the conversion overflows, the
- largest integer with the same sign as `a' is returned.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_int32( a : float32rec) : int32; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the 32-bit two's complement integer `a' to
- the double-precision floating-point format. The conversion is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function int32_to_float64( a: int32) : float64; compilerproc;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the 32-bit two's complement integer `a' to
- the single-precision floating-point format. The conversion is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function int32_to_float32( a: int32): float32rec; compilerproc;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the double-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- Function int64_to_float64( a: int64 ): float64; compilerproc;
- Function qword_to_float64( a: qword ): float64; compilerproc;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the single-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- Function int64_to_float32( a: int64 ): float32rec; compilerproc;
- Function qword_to_float32( a: qword ): float32rec; compilerproc;
- // +++
- function float32_to_int64( a: float32 ): int64;
- function float32_to_int64_round_to_zero( a: float32 ): int64;
- function float32_eq_signaling( a: float32; b: float32) : flag;
- function float32_le_quiet( a: float32 ; b : float32 ): flag;
- function float32_lt_quiet( a: float32 ; b: float32 ): flag;
- function float32_is_signaling_nan( a : float32 ): flag;
- function float32_is_nan( a : float32 ): flag;
- function float64_to_int64( a: float64 ): int64;
- function float64_to_int64_round_to_zero( a: float64 ): int64;
- function float64_eq_signaling( a: float64; b: float64): flag;
- function float64_le_quiet(a: float64 ; b: float64 ): flag;
- function float64_lt_quiet(a: float64; b: float64 ): Flag;
- function float64_is_signaling_nan( a : float64 ): flag;
- function float64_is_nan( a : float64 ): flag;
- // ===
- {$ifdef FPC_SOFTFLOAT_FLOATX80}
- {*----------------------------------------------------------------------------
- | Extended double-precision rounding precision
- *----------------------------------------------------------------------------*}
- var // threadvar!?
- floatx80_rounding_precision : int8 = 80;
- function int32_to_floatx80( a: int32 ): floatx80;
- function int64_to_floatx80( a: int64 ): floatx80;
- function qword_to_floatx80( a: qword ): floatx80;
- function float32_to_floatx80( a: float32 ): floatx80;
- function float64_to_floatx80( a: float64 ): floatx80;
- function floatx80_to_int32( a: floatx80 ): int32;
- function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
- function floatx80_to_int64( a: floatx80 ): int64;
- function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
- function floatx80_to_float32( a: floatx80 ): float32;
- function floatx80_to_float64( a: floatx80 ): float64;
- {$ifdef FPC_SOFTFLOAT_FLOAT128}
- function floatx80_to_float128( a: floatx80 ): float128;
- {$endif FPC_SOFTFLOAT_FLOAT128}
- function floatx80_round_to_int( a: floatx80 ): floatx80;
- function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
- function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
- function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
- function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
- function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
- function floatx80_sqrt( a: floatx80 ): floatx80;
- function floatx80_eq( a: floatx80; b: floatx80 ): flag;
- function floatx80_le( a: floatx80; b: floatx80 ): flag;
- function floatx80_lt( a: floatx80; b: floatx80 ): flag;
- function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
- function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
- function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
- function floatx80_is_signaling_nan( a: floatx80 ): flag;
- function floatx80_is_nan(a : floatx80 ): flag;
- {$endif FPC_SOFTFLOAT_FLOATX80}
- {$ifdef FPC_SOFTFLOAT_FLOAT128}
- function int32_to_float128( a: int32 ): float128;
- function int64_to_float128( a: int64 ): float128;
- function qword_to_float128( a: qword ): float128;
- function float32_to_float128( a: float32 ): float128;
- function float128_is_nan( a : float128): flag;
- function float128_is_signaling_nan( a : float128): flag;
- function float128_to_int32(a: float128): int32;
- function float128_to_int32_round_to_zero(a: float128): int32;
- function float128_to_int64(a: float128): int64;
- function float128_to_int64_round_to_zero(a: float128): int64;
- function float128_to_float32(a: float128): float32;
- function float128_to_float64(a: float128): float64;
- function float64_to_float128( a : float64) : float128;
- {$ifdef FPC_SOFTFLOAT_FLOATX80}
- function float128_to_floatx80(a: float128): floatx80;
- {$endif FPC_SOFTFLOAT_FLOATX80}
- function float128_round_to_int(a: float128): float128;
- function float128_add(a: float128; b: float128): float128;
- function float128_sub(a: float128; b: float128): float128;
- function float128_mul(a: float128; b: float128): float128;
- function float128_div(a: float128; b: float128): float128;
- function float128_rem(a: float128; b: float128): float128;
- function float128_sqrt(a: float128): float128;
- function float128_eq(a: float128; b: float128): flag;
- function float128_le(a: float128; b: float128): flag;
- function float128_lt(a: float128; b: float128): flag;
- function float128_eq_signaling(a: float128; b: float128): flag;
- function float128_le_quiet(a: float128; b: float128): flag;
- function float128_lt_quiet(a: float128; b: float128): flag;
- {$endif FPC_SOFTFLOAT_FLOAT128}
- CONST
- {-------------------------------------------------------------------------------
- Software IEC/IEEE floating-point underflow tininess-detection mode.
- -------------------------------------------------------------------------------
- *}
- float_tininess_after_rounding = 0;
- float_tininess_before_rounding = 1;
- {*
- -------------------------------------------------------------------------------
- Underflow tininess-detection mode, statically initialized to default value.
- (The declaration in `softfloat.h' must match the `int8' type here.)
- -------------------------------------------------------------------------------
- *}
- var // threadvar!?
- softfloat_detect_tininess: int8 = float_tininess_after_rounding;
- {$endif not(defined(fpc_softfpu_implementation))}
- {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
- implementation
- {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
- {$if not(defined(fpc_softfpu_interface))}
- {$ifdef FPC}
- { disable range and overflow checking explicitly }
- { This might be more essential for x80 and 128-bit
- floating point types and could, maybe be
- restricted to code handle floatx80 and float128 }
- {$push}
- {$R-}
- {$Q-}
- {$endif FPC}
- (*****************************************************************************)
- (*----------------------------------------------------------------------------*)
- (* Primitive arithmetic functions, including multi-word arithmetic, and *)
- (* division and square root approximations. (Can be specialized to target if *)
- (* desired.) *)
- (* ---------------------------------------------------------------------------*)
- (*****************************************************************************)
- { This procedure serves as a single access point to softfloat_exception_flags.
- It also helps to reduce code size a bit because softfloat_exception_flags is
- a threadvar. }
- procedure set_inexact_flag;
- begin
- include(softfloat_exception_flags,float_flag_inexact);
- end;
- {*----------------------------------------------------------------------------
- | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
- | and 7, and returns the properly rounded 32-bit integer corresponding to the
- | input. If `zSign' is 1, the input is negated before being converted to an
- | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
- | is simply rounded to an integer, with the inexact exception raised if the
- | input cannot be represented exactly as an integer. However, if the fixed-
- | point input is too large, the invalid exception is raised and the largest
- | positive or negative integer is returned.
- *----------------------------------------------------------------------------*}
- function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
- var
- roundingMode: TFPURoundingMode;
- roundNearestEven: boolean;
- roundIncrement, roundBits: int8;
- z: int32;
- begin
- roundingMode := softfloat_rounding_mode;
- roundNearestEven := (roundingMode = float_round_nearest_even);
- roundIncrement := $40;
- if not roundNearestEven then
- begin
- if ( roundingMode = float_round_to_zero ) then
- begin
- roundIncrement := 0;
- end
- else begin
- roundIncrement := $7F;
- if ( zSign<>0 ) then
- begin
- if ( roundingMode = float_round_up ) then
- roundIncrement := 0;
- end
- else begin
- if ( roundingMode = float_round_down ) then
- roundIncrement := 0;
- end;
- end;
- end;
- roundBits := lo(absZ) and $7F;
- absZ := ( absZ + roundIncrement ) shr 7;
- absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
- z := absZ;
- if ( zSign<>0 ) then
- z := - z;
- if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
- begin
- float_raise( float_flag_invalid );
- if zSign<>0 then
- result:=sbits32($80000000)
- else
- result:=$7FFFFFFF;
- exit;
- end;
- if ( roundBits<>0 ) then
- set_inexact_flag;
- result:=z;
- end;
- {*----------------------------------------------------------------------------
- | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
- | `absZ1', with binary point between bits 63 and 64 (between the input words),
- | and returns the properly rounded 64-bit integer corresponding to the input.
- | If `zSign' is 1, the input is negated before being converted to an integer.
- | Ordinarily, the fixed-point input is simply rounded to an integer, with
- | the inexact exception raised if the input cannot be represented exactly as
- | an integer. However, if the fixed-point input is too large, the invalid
- | exception is raised and the largest positive or negative integer is
- | returned.
- *----------------------------------------------------------------------------*}
- function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
- var
- roundingMode: TFPURoundingMode;
- roundNearestEven, increment: flag;
- z: int64;
- label
- overflow;
- begin
- roundingMode := softfloat_rounding_mode;
- roundNearestEven := ord( roundingMode = float_round_nearest_even );
- increment := ord( sbits64(absZ1) < 0 );
- if ( roundNearestEven=0 ) then
- begin
- if ( roundingMode = float_round_to_zero ) then
- begin
- increment := 0;
- end
- else begin
- if ( zSign<>0 ) then
- begin
- increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
- end
- else begin
- increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
- end;
- end;
- end;
- if ( increment<>0 ) then
- begin
- inc(absZ0);
- if ( absZ0 = 0 ) then
- goto overflow;
- absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
- end;
- z := absZ0;
- if ( zSign<>0 ) then
- z := - z;
- if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
- begin
- overflow:
- float_raise( float_flag_invalid );
- if zSign<>0 then
- result:=int64($8000000000000000)
- else
- result:=int64($7FFFFFFFFFFFFFFF);
- exit;
- end;
- if ( absZ1<>0 ) then
- set_inexact_flag;
- result:=z;
- end;
- {*
- -------------------------------------------------------------------------------
- Shifts `a' right by the number of bits given in `count'. If any nonzero
- bits are shifted off, they are ``jammed'' into the least significant bit of
- the result by setting the least significant bit to 1. The value of `count'
- can be arbitrarily large; in particular, if `count' is greater than 32, the
- result will be either 0 or 1, depending on whether `a' is zero or nonzero.
- The result is stored in the location pointed to by `zPtr'.
- -------------------------------------------------------------------------------
- *}
- Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
- var
- z: Bits32;
- Begin
- if ( count = 0 ) then
- z := a
- else
- if ( count < 32 ) then
- Begin
- z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
- End
- else
- Begin
- z := bits32( a <> 0 );
- End;
- zPtr := z;
- End;
- {*----------------------------------------------------------------------------
- | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
- | number of bits given in `count'. Any bits shifted off are lost. The value
- | of `count' can be arbitrarily large; in particular, if `count' is greater
- | than 128, the result will be 0. The result is broken into two 64-bit pieces
- | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- *----------------------------------------------------------------------------*}
- procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
- var
- z0, z1: bits64;
- negCount: int8;
- begin
- negCount := ( - count ) and 63;
- if ( count = 0 ) then
- begin
- z1 := a1;
- z0 := a0;
- end
- else if ( count < 64 ) then
- begin
- z1 := ( a0 shl negCount ) or ( a1 shr count );
- z0 := a0 shr count;
- end
- else
- begin
- if ( count < 128 ) then
- z1 := a0 shr ( count and 63 )
- else
- z1 := 0;
- z0 := 0;
- end;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*----------------------------------------------------------------------------
- | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
- | number of bits given in `count'. If any nonzero bits are shifted off, they
- | are ``jammed'' into the least significant bit of the result by setting the
- | least significant bit to 1. The value of `count' can be arbitrarily large;
- | in particular, if `count' is greater than 128, the result will be either
- | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
- | nonzero. The result is broken into two 64-bit pieces which are stored at
- | the locations pointed to by `z0Ptr' and `z1Ptr'.
- *----------------------------------------------------------------------------*}
- procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
- var
- z0,z1 : bits64;
- negCount : int8;
- begin
- negCount := ( - count ) and 63;
- if ( count = 0 ) then begin
- z1 := a1;
- z0 := a0;
- end
- else if ( count < 64 ) then begin
- z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
- z0 := a0 shr count;
- end
- else begin
- if ( count = 64 ) then begin
- z1 := a0 or ord( a1 <> 0 );
- end
- else if ( count < 128 ) then begin
- z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
- end
- else begin
- z1 := ord( ( a0 or a1 ) <> 0 );
- end;
- z0 := 0;
- end;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*
- -------------------------------------------------------------------------------
- Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
- number of bits given in `count'. Any bits shifted off are lost. The value
- of `count' can be arbitrarily large; in particular, if `count' is greater
- than 64, the result will be 0. The result is broken into two 32-bit pieces
- which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- shift64Right(
- a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
- Var
- z0, z1: bits32;
- negCount : int8;
- Begin
- negCount := ( - count ) AND 31;
- if ( count = 0 ) then
- Begin
- z1 := a1;
- z0 := a0;
- End
- else if ( count < 32 ) then
- Begin
- z1 := ( a0 shl negCount ) OR ( a1 shr count );
- z0 := a0 shr count;
- End
- else
- Begin
- if (count < 64) then
- z1 := ( a0 shr ( count AND 31 ) )
- else
- z1 := 0;
- z0 := 0;
- End;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
- number of bits given in `count'. If any nonzero bits are shifted off, they
- are ``jammed'' into the least significant bit of the result by setting the
- least significant bit to 1. The value of `count' can be arbitrarily large;
- in particular, if `count' is greater than 64, the result will be either 0
- or 1, depending on whether the concatenation of `a0' and `a1' is zero or
- nonzero. The result is broken into two 32-bit pieces which are stored at
- the locations pointed to by `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- shift64RightJamming(
- a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
- VAR
- z0, z1 : bits32;
- negCount : int8;
- Begin
- negCount := ( - count ) AND 31;
- if ( count = 0 ) then
- Begin
- z1 := a1;
- z0 := a0;
- End
- else
- if ( count < 32 ) then
- Begin
- z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
- z0 := a0 shr count;
- End
- else
- Begin
- if ( count = 32 ) then
- Begin
- z1 := a0 OR bits32( a1 <> 0 );
- End
- else
- if ( count < 64 ) Then
- Begin
- z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
- End
- else
- Begin
- z1 := bits32( ( a0 OR a1 ) <> 0 );
- End;
- z0 := 0;
- End;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*----------------------------------------------------------------------------
- | Shifts `a' right by the number of bits given in `count'. If any nonzero
- | bits are shifted off, they are ``jammed'' into the least significant bit of
- | the result by setting the least significant bit to 1. The value of `count'
- | can be arbitrarily large; in particular, if `count' is greater than 64, the
- | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
- | The result is stored in the location pointed to by `zPtr'.
- *----------------------------------------------------------------------------*}
- procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
- var
- z: bits64;
- begin
- if ( count = 0 ) then
- begin
- z := a;
- end
- else if ( count < 64 ) then
- begin
- z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
- end
- else
- begin
- z := ord( a <> 0 );
- end;
- zPtr := z;
- end;
- {$if not defined(shift64ExtraRightJamming)}
- procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
- overload;
- forward;
- {$endif}
- {*
- -------------------------------------------------------------------------------
- Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
- by 32 _plus_ the number of bits given in `count'. The shifted result is
- at most 64 nonzero bits; these are broken into two 32-bit pieces which are
- stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
- off form a third 32-bit result as follows: The _last_ bit shifted off is
- the most-significant bit of the extra result, and the other 31 bits of the
- extra result are all zero if and only if _all_but_the_last_ bits shifted off
- were all zero. This extra result is stored in the location pointed to by
- `z2Ptr'. The value of `count' can be arbitrarily large.
- (This routine makes more sense if `a0', `a1', and `a2' are considered
- to form a fixed-point value with binary point between `a1' and `a2'. This
- fixed-point value is shifted right by the number of bits given in `count',
- and the integer part of the result is returned at the locations pointed to
- by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
- corrupted as described above, and is returned at the location pointed to by
- `z2Ptr'.)
- -------------------------------------------------------------------------------
- }
- Procedure
- shift64ExtraRightJamming(
- a0: bits32;
- a1: bits32;
- a2: bits32;
- count: int16;
- VAR z0Ptr: bits32;
- VAR z1Ptr: bits32;
- VAR z2Ptr: bits32
- ); overload;
- Var
- z0, z1, z2: bits32;
- negCount : int8;
- Begin
- negCount := ( - count ) AND 31;
- if ( count = 0 ) then
- Begin
- z2 := a2;
- z1 := a1;
- z0 := a0;
- End
- else
- Begin
- if ( count < 32 ) Then
- Begin
- z2 := a1 shl negCount;
- z1 := ( a0 shl negCount ) OR ( a1 shr count );
- z0 := a0 shr count;
- End
- else
- Begin
- if ( count = 32 ) then
- Begin
- z2 := a1;
- z1 := a0;
- End
- else
- Begin
- a2 := a2 or a1;
- if ( count < 64 ) then
- Begin
- z2 := a0 shl negCount;
- z1 := a0 shr ( count AND 31 );
- End
- else
- Begin
- if count = 64 then
- z2 := a0
- else
- z2 := bits32(a0 <> 0);
- z1 := 0;
- End;
- End;
- z0 := 0;
- End;
- z2 := z2 or bits32( a2 <> 0 );
- End;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
- number of bits given in `count'. Any bits shifted off are lost. The value
- of `count' must be less than 32. The result is broken into two 32-bit
- pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- shortShift64Left(
- a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
- Begin
- z1Ptr := a1 shl count;
- if count = 0 then
- z0Ptr := a0
- else
- z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
- by the number of bits given in `count'. Any bits shifted off are lost.
- The value of `count' must be less than 32. The result is broken into three
- 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
- `z1Ptr', and `z2Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- shortShift96Left(
- a0: bits32;
- a1: bits32;
- a2: bits32;
- count: int16;
- VAR z0Ptr: bits32;
- VAR z1Ptr: bits32;
- VAR z2Ptr: bits32
- );
- Var
- z0, z1, z2: bits32;
- negCount: int8;
- Begin
- z2 := a2 shl count;
- z1 := a1 shl count;
- z0 := a0 shl count;
- if ( 0 < count ) then
- Begin
- negCount := ( ( - count ) AND 31 );
- z1 := z1 or (a2 shr negCount);
- z0 := z0 or (a1 shr negCount);
- End;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*----------------------------------------------------------------------------
- | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
- | number of bits given in `count'. Any bits shifted off are lost. The value
- | of `count' must be less than 64. The result is broken into two 64-bit
- | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- *----------------------------------------------------------------------------*}
- procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
- begin
- z1Ptr := a1 shl count;
- if count=0 then
- z0Ptr:=a0
- else
- z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
- end;
- {*
- -------------------------------------------------------------------------------
- Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
- value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
- any carry out is lost. The result is broken into two 32-bit pieces which
- are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- add64(
- a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
- Var
- z1: bits32;
- Begin
- z1 := a1 + b1;
- z1Ptr := z1;
- z0Ptr := a0 + b0 + bits32( z1 < a1 );
- End;
- {*
- -------------------------------------------------------------------------------
- Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
- 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
- modulo 2^96, so any carry out is lost. The result is broken into three
- 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
- `z1Ptr', and `z2Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- add96(
- a0: bits32;
- a1: bits32;
- a2: bits32;
- b0: bits32;
- b1: bits32;
- b2: bits32;
- VAR z0Ptr: bits32;
- VAR z1Ptr: bits32;
- VAR z2Ptr: bits32
- );
- var
- z0, z1, z2: bits32;
- carry0, carry1: int8;
- Begin
- z2 := a2 + b2;
- carry1 := int8( z2 < a2 );
- z1 := a1 + b1;
- carry0 := int8( z1 < a1 );
- z0 := a0 + b0;
- z1 := z1 + carry1;
- z0 := z0 + bits32( z1 < carry1 );
- z0 := z0 + carry0;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*----------------------------------------------------------------------------
- | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
- | by the number of bits given in `count'. Any bits shifted off are lost.
- | The value of `count' must be less than 64. The result is broken into three
- | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
- | `z1Ptr', and `z2Ptr'.
- *----------------------------------------------------------------------------*}
- procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
- var
- z0, z1, z2 : bits64;
- negCount : int8;
- begin
- z2 := a2 shl count;
- z1 := a1 shl count;
- z0 := a0 shl count;
- if ( 0 < count ) then
- begin
- negCount := ( ( - count ) and 63 );
- z1 := z1 or (a2 shr negCount);
- z0 := z0 or (a1 shr negCount);
- end;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*----------------------------------------------------------------------------
- | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
- | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
- | any carry out is lost. The result is broken into two 64-bit pieces which
- | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- *----------------------------------------------------------------------------*}
- procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
- var
- z1 : bits64;
- begin
- z1 := a1 + b1;
- z1Ptr := z1;
- z0Ptr := a0 + b0 + ord( z1 < a1 );
- end;
- {*----------------------------------------------------------------------------
- | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
- | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
- | modulo 2^192, so any carry out is lost. The result is broken into three
- | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
- | `z1Ptr', and `z2Ptr'.
- *----------------------------------------------------------------------------*}
- procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
- var
- z0, z1, z2 : bits64;
- carry0, carry1 : int8;
- begin
- z2 := a2 + b2;
- carry1 := ord( z2 < a2 );
- z1 := a1 + b1;
- carry0 := ord( z1 < a1 );
- z0 := a0 + b0;
- inc(z1, carry1);
- inc(z0, ord( z1 < carry1 ));
- inc(z0, carry0);
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*
- -------------------------------------------------------------------------------
- Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
- 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
- 2^64, so any borrow out (carry out) is lost. The result is broken into two
- 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
- `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- sub64(
- a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
- Begin
- z1Ptr := a1 - b1;
- z0Ptr := a0 - b0 - bits32( a1 < b1 );
- End;
- {*
- -------------------------------------------------------------------------------
- Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
- the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
- is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
- into three 32-bit pieces which are stored at the locations pointed to by
- `z0Ptr', `z1Ptr', and `z2Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- sub96(
- a0:bits32;
- a1:bits32;
- a2:bits32;
- b0:bits32;
- b1:bits32;
- b2:bits32;
- VAR z0Ptr:bits32;
- VAR z1Ptr:bits32;
- VAR z2Ptr:bits32
- );
- Var
- z0, z1, z2: bits32;
- borrow0, borrow1: int8;
- Begin
- z2 := a2 - b2;
- borrow1 := int8( a2 < b2 );
- z1 := a1 - b1;
- borrow0 := int8( a1 < b1 );
- z0 := a0 - b0;
- z0 := z0 - bits32( z1 < borrow1 );
- z1 := z1 - borrow1;
- z0 := z0 -borrow0;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*----------------------------------------------------------------------------
- | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
- | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
- | 2^128, so any borrow out (carry out) is lost. The result is broken into two
- | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
- | `z1Ptr'.
- *----------------------------------------------------------------------------*}
- procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
- begin
- z1Ptr := a1 - b1;
- z0Ptr := a0 - b0 - ord( a1 < b1 );
- end;
- {*----------------------------------------------------------------------------
- | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
- | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
- | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
- | result is broken into three 64-bit pieces which are stored at the locations
- | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
- *----------------------------------------------------------------------------*}
- procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
- var
- z0, z1, z2 : bits64;
- borrow0, borrow1 : int8;
- begin
- z2 := a2 - b2;
- borrow1 := ord( a2 < b2 );
- z1 := a1 - b1;
- borrow0 := ord( a1 < b1 );
- z0 := a0 - b0;
- dec(z0, ord( z1 < borrow1 ));
- dec(z1, borrow1);
- dec(z0, borrow0);
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*
- -------------------------------------------------------------------------------
- Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
- into two 32-bit pieces which are stored at the locations pointed to by
- `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- {$IFDEF SOFTFPU_COMPILER_MUL32TO64}
- Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
- var
- tmp: qword;
- begin
- tmp:=qword(a) * b;
- z0ptr:=hi(tmp);
- z1ptr:=lo(tmp);
- end;
- {$ELSE}
- Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
- :bits32 );
- Var
- aHigh, aLow, bHigh, bLow: bits16;
- z0, zMiddleA, zMiddleB, z1: bits32;
- Begin
- aLow := bits16(a);
- aHigh := a shr 16;
- bLow := bits16(b);
- bHigh := b shr 16;
- z1 := ( bits32( aLow) ) * bLow;
- zMiddleA := ( bits32 (aLow) ) * bHigh;
- zMiddleB := ( bits32 (aHigh) ) * bLow;
- z0 := ( bits32 (aHigh) ) * bHigh;
- zMiddleA := zMiddleA + zMiddleB;
- z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
- zMiddleA := zmiddleA shl 16;
- z1 := z1 + zMiddleA;
- z0 := z0 + bits32( z1 < zMiddleA );
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {$ENDIF}
- {*
- -------------------------------------------------------------------------------
- Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
- to obtain a 96-bit product. The product is broken into three 32-bit pieces
- which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
- `z2Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- mul64By32To96(
- a0:bits32;
- a1:bits32;
- b:bits32;
- VAR z0Ptr:bits32;
- VAR z1Ptr:bits32;
- VAR z2Ptr:bits32
- );
- Var
- z0, z1, z2, more1: bits32;
- Begin
- mul32To64( a1, b, z1, z2 );
- mul32To64( a0, b, z0, more1 );
- add64( z0, more1, 0, z1, z0, z1 );
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
- 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
- product. The product is broken into four 32-bit pieces which are stored at
- the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- mul64To128(
- a0:bits32;
- a1:bits32;
- b0:bits32;
- b1:bits32;
- VAR z0Ptr:bits32;
- VAR z1Ptr:bits32;
- VAR z2Ptr:bits32;
- VAR z3Ptr:bits32
- );
- Var
- z0, z1, z2, z3: bits32;
- more1, more2: bits32;
- Begin
- mul32To64( a1, b1, z2, z3 );
- mul32To64( a1, b0, z1, more2 );
- add64( z1, more2, 0, z2, z1, z2 );
- mul32To64( a0, b0, z0, more1 );
- add64( z0, more1, 0, z1, z0, z1 );
- mul32To64( a0, b1, more1, more2 );
- add64( more1, more2, 0, z2, more1, z2 );
- add64( z0, z1, 0, more1, z0, z1 );
- z3Ptr := z3;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*----------------------------------------------------------------------------
- | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
- | into two 64-bit pieces which are stored at the locations pointed to by
- | `z0Ptr' and `z1Ptr'.
- *----------------------------------------------------------------------------*}
- procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
- var
- aHigh, aLow, bHigh, bLow : bits32;
- z0, zMiddleA, zMiddleB, z1 : bits64;
- begin
- aLow := a;
- aHigh := a shr 32;
- bLow := b;
- bHigh := b shr 32;
- z1 := ( bits64(aLow) ) * bLow;
- zMiddleA := ( bits64( aLow )) * bHigh;
- zMiddleB := ( bits64( aHigh )) * bLow;
- z0 := ( bits64(aHigh) ) * bHigh;
- inc(zMiddleA, zMiddleB);
- inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
- zMiddleA := zMiddleA shl 32;
- inc(z1, zMiddleA);
- inc(z0, ord( z1 < zMiddleA ));
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*----------------------------------------------------------------------------
- | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
- | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
- | product. The product is broken into four 64-bit pieces which are stored at
- | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
- *----------------------------------------------------------------------------*}
- procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
- var
- z0,z1,z2,z3,more1,more2 : bits64;
- begin
- mul64To128( a1, b1, z2, z3 );
- mul64To128( a1, b0, z1, more2 );
- add128( z1, more2, 0, z2, z1, z2 );
- mul64To128( a0, b0, z0, more1 );
- add128( z0, more1, 0, z1, z0, z1 );
- mul64To128( a0, b1, more1, more2 );
- add128( more1, more2, 0, z2, more1, z2 );
- add128( z0, z1, 0, more1, z0, z1 );
- z3Ptr := z3;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*----------------------------------------------------------------------------
- | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
- | `b' to obtain a 192-bit product. The product is broken into three 64-bit
- | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
- | `z2Ptr'.
- *----------------------------------------------------------------------------*}
- procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
- var
- z0, z1, z2, more1 : bits64;
- begin
- mul64To128( a1, b, z1, z2 );
- mul64To128( a0, b, z0, more1 );
- add128( z0, more1, 0, z1, z0, z1 );
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*----------------------------------------------------------------------------
- | Returns an approximation to the 64-bit integer quotient obtained by dividing
- | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
- | divisor `b' must be at least 2^63. If q is the exact quotient truncated
- | toward zero, the approximation returned lies between q and q + 2 inclusive.
- | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
- | unsigned integer is returned.
- *----------------------------------------------------------------------------*}
- Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
- var
- b0, b1, rem0, rem1, term0, term1, z : bits64;
- begin
- if ( b <= a0 ) then
- begin
- result:=qword( $FFFFFFFFFFFFFFFF );
- exit;
- end;
- b0 := b shr 32;
- if ( b0 shl 32 <= a0 ) then
- z:=qword( $FFFFFFFF00000000 )
- else
- z:=( a0 div b0 ) shl 32;
- mul64To128( b, z, term0, term1 );
- sub128( a0, a1, term0, term1, rem0, rem1 );
- while ( ( sbits64(rem0) ) < 0 ) do begin
- dec(z,qword( $100000000 ));
- b1 := b shl 32;
- add128( rem0, rem1, b0, b1, rem0, rem1 );
- end;
- rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
- if ( b0 shl 32 <= rem0 ) then
- z:=z or $FFFFFFFF
- else
- z:=z or rem0 div b0;
- result:=z;
- end;
- {*
- -------------------------------------------------------------------------------
- Returns an approximation to the 32-bit integer quotient obtained by dividing
- `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
- divisor `b' must be at least 2^31. If q is the exact quotient truncated
- toward zero, the approximation returned lies between q and q + 2 inclusive.
- If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
- unsigned integer is returned.
- -------------------------------------------------------------------------------
- *}
- Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
- Var
- b0, b1: bits32;
- rem0, rem1, term0, term1: bits32;
- z: bits32;
- Begin
- if ( b <= a0 ) then
- Begin
- estimateDiv64To32 := $FFFFFFFF;
- exit;
- End;
- b0 := b shr 16;
- if ( b0 shl 16 <= a0 ) then
- z:= $FFFF0000
- else
- z:= ( a0 div b0 ) shl 16;
- mul32To64( b, z, term0, term1 );
- sub64( a0, a1, term0, term1, rem0, rem1 );
- while ( ( sbits32 (rem0) ) < 0 ) do
- Begin
- z := z - $10000;
- b1 := b shl 16;
- add64( rem0, rem1, b0, b1, rem0, rem1 );
- End;
- rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
- if ( b0 shl 16 <= rem0 ) then
- z := z or $FFFF
- else
- z := z or (rem0 div b0);
- estimateDiv64To32 := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns an approximation to the square root of the 32-bit significand given
- by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
- `aExp' (the least significant bit) is 1, the integer returned approximates
- 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
- is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
- case, the approximation returned lies strictly within +/-2 of the exact
- value.
- -------------------------------------------------------------------------------
- *}
- Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
- const sqrtOddAdjustments: array[0..15] of bits16 = (
- $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
- $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
- );
- const sqrtEvenAdjustments: array[0..15] of bits16 = (
- $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
- $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
- );
- Var
- index: int8;
- z: bits32;
- Begin
- index := ( a shr 27 ) AND 15;
- if ( aExp AND 1 ) <> 0 then
- Begin
- z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
- z := ( ( a div z ) shl 14 ) + ( z shl 15 );
- a := a shr 1;
- End
- else
- Begin
- z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
- z := a div z + z;
- if ( $20000 <= z ) then
- z := $FFFF8000
- else
- z := ( z shl 15 );
- if ( z <= a ) then
- Begin
- estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
- exit;
- End;
- End;
- estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the number of leading 0 bits before the most-significant 1 bit of
- `a'. If `a' is zero, 32 is returned.
- -------------------------------------------------------------------------------
- *}
- Function countLeadingZeros32( a:bits32 ): int8;
- const countLeadingZerosHigh:array[0..255] of int8 = (
- 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- );
- Var
- shiftCount: int8;
- Begin
- shiftCount := 0;
- if ( a < $10000 ) then
- Begin
- shiftCount := shiftcount + 16;
- a := a shl 16;
- End;
- if ( a < $1000000 ) then
- Begin
- shiftCount := shiftcount + 8;
- a := a shl 8;
- end;
- shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
- countLeadingZeros32:= shiftCount;
- End;
- {*----------------------------------------------------------------------------
- | Returns the number of leading 0 bits before the most-significant 1 bit of
- | `a'. If `a' is zero, 64 is returned.
- *----------------------------------------------------------------------------*}
- function countLeadingZeros64( a : bits64): int8;
- var
- shiftcount : int8;
- Begin
- shiftCount := 0;
- if ( a < bits64(bits64(1) shl 32 )) then
- shiftCount := shiftcount + 32
- else
- a := a shr 32;
- shiftCount := shiftCount + countLeadingZeros32( a );
- countLeadingZeros64:= shiftCount;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
- than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
- Otherwise, returns 0.
- -------------------------------------------------------------------------------
- *}
- Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
- Begin
- le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
- than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
- returns 0.
- -------------------------------------------------------------------------------
- *}
- Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
- Begin
- lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
- End;
- const
- float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
- float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
- (*****************************************************************************)
- (* End Low-Level arithmetic *)
- (*****************************************************************************)
- {*----------------------------------------------------------------------------
- | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
- | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
- | returns 0.
- *----------------------------------------------------------------------------*}
- function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
- begin
- result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
- end;
- {*
- -------------------------------------------------------------------------------
- Functions and definitions to determine: (1) whether tininess for underflow
- is detected before or after rounding by default, (2) what (if anything)
- happens when exceptions are raised, (3) how signaling NaNs are distinguished
- from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
- are propagated from function inputs to output. These details are ENDIAN
- specific
- -------------------------------------------------------------------------------
- *}
- {$IFDEF ENDIAN_LITTLE}
- {*
- -------------------------------------------------------------------------------
- Internal canonical NaN format.
- -------------------------------------------------------------------------------
- *}
- TYPE
- commonNaNT = record
- high, low : bits32;
- sign: flag;
- end;
- {*
- -------------------------------------------------------------------------------
- The pattern for a default generated single-precision NaN.
- -------------------------------------------------------------------------------
- *}
- const float32_default_nan = $FFC00000;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is a NaN;
- otherwise returns 0.
- -------------------------------------------------------------------------------
- *}
- Function float32_is_nan( a : float32 ): flag;
- Begin
- float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is a signaling
- NaN; otherwise returns 0.
- -------------------------------------------------------------------------------
- *}
- Function float32_is_signaling_nan( a : float32 ): flag;
- Begin
- float32_is_signaling_nan := flag
- (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point NaN
- `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- exception is raised.
- -------------------------------------------------------------------------------
- *}
- function float32ToCommonNaN(a: float32) : commonNaNT;
- var
- z : commonNaNT ;
- Begin
- if ( float32_is_signaling_nan( a ) <> 0) then
- float_raise( float_flag_invalid );
- z.sign := a shr 31;
- z.low := 0;
- z.high := a shl 9;
- result := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the canonical NaN `a' to the single-
- precision floating-point format.
- -------------------------------------------------------------------------------
- *}
- Function commonNaNToFloat32( a : commonNaNT ): float32;
- Begin
- commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
- End;
- {*
- -------------------------------------------------------------------------------
- Takes two single-precision floating-point values `a' and `b', one of which
- is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
- signaling NaN, the invalid exception is raised.
- -------------------------------------------------------------------------------
- *}
- Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
- Var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- label returnLargerSignificand;
- Begin
- aIsNaN := float32_is_nan( a );
- aIsSignalingNaN := float32_is_signaling_nan( a );
- bIsNaN := float32_is_nan( b );
- bIsSignalingNaN := float32_is_signaling_nan( b );
- a := a or $00400000;
- b := b or $00400000;
- if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
- float_raise( float_flag_invalid );
- if ( aIsSignalingNaN )<> 0 then
- Begin
- if ( bIsSignalingNaN ) <> 0 then
- goto returnLargerSignificand;
- if bIsNan <> 0 then
- propagateFloat32NaN := b
- else
- propagateFloat32NaN := a;
- exit;
- End
- else if ( aIsNaN <> 0) then
- Begin
- if ( bIsSignalingNaN or not bIsNaN )<> 0 then
- Begin
- propagateFloat32NaN := a;
- exit;
- End;
- returnLargerSignificand:
- if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
- Begin
- propagateFloat32NaN := b;
- exit;
- End;
- if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
- Begin
- propagateFloat32NaN := a;
- End;
- if a < b then
- propagateFloat32NaN := a
- else
- propagateFloat32NaN := b;
- exit;
- End
- else
- Begin
- propagateFloat32NaN := b;
- exit;
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- The pattern for a default generated double-precision NaN. The `high' and
- `low' values hold the most- and least-significant bits, respectively.
- -------------------------------------------------------------------------------
- *}
- const
- float64_default_nan_high = $FFF80000;
- float64_default_nan_low = $00000000;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is a NaN;
- otherwise returns 0.
- -------------------------------------------------------------------------------
- *}
- Function float64_is_nan( a : float64 ) : flag;
- Begin
- float64_is_nan :=
- flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
- and (( a.low or ( a.high and $000FFFFF ) )<>0));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is a signaling
- NaN; otherwise returns 0.
- -------------------------------------------------------------------------------
- *}
- Function float64_is_signaling_nan( a : float64 ): flag;
- Begin
- float64_is_signaling_nan :=
- flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
- and ( a.low or ( a.high and $0007FFFF ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point NaN
- `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- exception is raised.
- -------------------------------------------------------------------------------
- *}
- function float64ToCommonNaN( a : float64 ) : commonNaNT;
- Var
- z : commonNaNT;
- Begin
- if ( float64_is_signaling_nan( a )<>0 ) then
- float_raise( float_flag_invalid );
- z.sign := a.high shr 31;
- shortShift64Left( a.high, a.low, 12, z.high, z.low );
- result := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the canonical NaN `a' to the double-
- precision floating-point format.
- -------------------------------------------------------------------------------
- *}
- function commonNaNToFloat64( a : commonNaNT) : float64;
- Var
- z: float64;
- Begin
- shift64Right( a.high, a.low, 12, z.high, z.low );
- z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
- result := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Takes two double-precision floating-point values `a' and `b', one of which
- is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
- signaling NaN, the invalid exception is raised.
- -------------------------------------------------------------------------------
- *}
- Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
- Var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- label returnLargerSignificand;
- Begin
- aIsNaN := float64_is_nan( a );
- aIsSignalingNaN := float64_is_signaling_nan( a );
- bIsNaN := float64_is_nan( b );
- bIsSignalingNaN := float64_is_signaling_nan( b );
- a.high := a.high or $00080000;
- b.high := b.high or $00080000;
- if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
- float_raise( float_flag_invalid );
- if ( aIsSignalingNaN )<>0 then
- Begin
- if ( bIsSignalingNaN )<>0 then
- goto returnLargerSignificand;
- if bIsNan <> 0 then
- c := b
- else
- c := a;
- exit;
- End
- else if ( aIsNaN )<> 0 then
- Begin
- if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
- Begin
- c := a;
- exit;
- End;
- returnLargerSignificand:
- if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
- Begin
- c := b;
- exit;
- End;
- if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
- Begin
- c := a;
- exit;
- End;
- if a.high < b.high then
- c := a
- else
- c := b;
- exit;
- End
- else
- Begin
- c := b;
- exit;
- End;
- End;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
- | otherwise returns 0.
- *----------------------------------------------------------------------------*}
- function float128_is_nan( a : float128): flag;
- begin
- result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
- and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is a
- | signaling NaN; otherwise returns 0.
- *----------------------------------------------------------------------------*}
- function float128_is_signaling_nan( a : float128): flag;
- begin
- result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
- ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point NaN
- | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- | exception is raised.
- *----------------------------------------------------------------------------*}
- function float128ToCommonNaN( a : float128): commonNaNT;
- var
- z: commonNaNT;
- qhigh,qlow : qword;
- begin
- if ( float128_is_signaling_nan( a )<>0) then
- float_raise( float_flag_invalid );
- z.sign := a.high shr 63;
- shortShift128Left( a.high, a.low, 16, qhigh, qlow );
- z.high:=qhigh shr 32;
- z.low:=qhigh and $ffffffff;
- result:=z;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the canonical NaN `a' to the quadruple-
- | precision floating-point format.
- *----------------------------------------------------------------------------*}
- function commonNaNToFloat128( a : commonNaNT): float128;
- var
- z: float128;
- begin
- shift128Right( a.high, a.low, 16, z.high, z.low );
- z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
- result:=z;
- end;
- {*----------------------------------------------------------------------------
- | Takes two quadruple-precision floating-point values `a' and `b', one of
- | which is a NaN, and returns the appropriate NaN result. If either `a' or
- | `b' is a signaling NaN, the invalid exception is raised.
- *----------------------------------------------------------------------------*}
- function propagateFloat128NaN( a: float128; b : float128): float128;
- var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- label
- returnLargerSignificand;
- begin
- aIsNaN := float128_is_nan( a );
- aIsSignalingNaN := float128_is_signaling_nan( a );
- bIsNaN := float128_is_nan( b );
- bIsSignalingNaN := float128_is_signaling_nan( b );
- a.high := a.high or int64( $0000800000000000 );
- b.high := b.high or int64( $0000800000000000 );
- if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
- float_raise( float_flag_invalid );
- if ( aIsSignalingNaN )<>0 then
- begin
- if ( bIsSignalingNaN )<>0 then
- goto returnLargerSignificand;
- if bIsNaN<>0 then
- result := b
- else
- result := a;
- exit;
- end
- else if ( aIsNaN )<>0 then
- begin
- if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
- begin
- result := a;
- exit;
- end;
- returnLargerSignificand:
- if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
- begin
- result := b;
- exit;
- end;
- if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
- begin
- result := a;
- exit
- end;
- if ( a.high < b.high ) then
- result := a
- else
- result := b;
- exit;
- end
- else
- result:=b;
- end;
- {$ELSE}
- { Big endian code }
- (*----------------------------------------------------------------------------
- | Internal canonical NaN format.
- *----------------------------------------------------------------------------*)
- type
- commonNANT = record
- high, low : bits32;
- sign : flag;
- end;
- (*----------------------------------------------------------------------------
- | The pattern for a default generated single-precision NaN.
- *----------------------------------------------------------------------------*)
- const float32_default_nan = $7FFFFFFF;
- (*----------------------------------------------------------------------------
- | Returns 1 if the single-precision floating-point value `a' is a NaN;
- | otherwise returns 0.
- *----------------------------------------------------------------------------*)
- function float32_is_nan(a: float32): flag;
- begin
- float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
- end;
- (*----------------------------------------------------------------------------
- | Returns 1 if the single-precision floating-point value `a' is a signaling
- | NaN; otherwise returns 0.
- *----------------------------------------------------------------------------*)
- function float32_is_signaling_nan(a: float32):flag;
- begin
- float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
- end;
- (*----------------------------------------------------------------------------
- | Returns the result of converting the single-precision floating-point NaN
- | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- | exception is raised.
- *----------------------------------------------------------------------------*)
- function float32ToCommonNaN( a: float32) : commonNaNT;
- var
- z: commonNANT;
- begin
- if float32_is_signaling_nan(a)<>0 then
- float_raise(float_flag_invalid);
- z.sign := a shr 31;
- z.low := 0;
- z.high := a shl 9;
- result:=z;
- end;
- (*----------------------------------------------------------------------------
- | Returns the result of converting the canonical NaN `a' to the single-
- | precision floating-point format.
- *----------------------------------------------------------------------------*)
- function CommonNanToFloat32(a : CommonNaNT): float32;
- begin
- CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
- end;
- (*----------------------------------------------------------------------------
- | Takes two single-precision floating-point values `a' and `b', one of which
- | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
- | signaling NaN, the invalid exception is raised.
- *----------------------------------------------------------------------------*)
- function propagateFloat32NaN( a: float32 ; b: float32): float32;
- var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- begin
- aIsNaN := float32_is_nan( a );
- aIsSignalingNaN := float32_is_signaling_nan( a );
- bIsNaN := float32_is_nan( b );
- bIsSignalingNaN := float32_is_signaling_nan( b );
- a := a or $00400000;
- b := b or $00400000;
- if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
- float_raise( float_flag_invalid );
- if bIsSignalingNaN<>0 then
- propagateFloat32Nan := b
- else if aIsSignalingNan<>0 then
- propagateFloat32Nan := a
- else if bIsNan<>0 then
- propagateFloat32Nan := b
- else
- propagateFloat32Nan := a;
- end;
- (*----------------------------------------------------------------------------
- | The pattern for a default generated double-precision NaN. The `high' and
- | `low' values hold the most- and least-significant bits, respectively.
- *----------------------------------------------------------------------------*)
- const
- float64_default_nan_high = $7FFFFFFF;
- float64_default_nan_low = $FFFFFFFF;
- (*----------------------------------------------------------------------------
- | Returns 1 if the double-precision floating-point value `a' is a NaN;
- | otherwise returns 0.
- *----------------------------------------------------------------------------*)
- function float64_is_nan(a: float64): flag;
- begin
- float64_is_nan := flag (
- ( $FFE00000 <= bits32 ( a.high shl 1 ) )
- and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
- end;
- (*----------------------------------------------------------------------------
- | Returns 1 if the double-precision floating-point value `a' is a signaling
- | NaN; otherwise returns 0.
- *----------------------------------------------------------------------------*)
- function float64_is_signaling_nan( a:float64): flag;
- begin
- float64_is_signaling_nan := flag(
- ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
- and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
- end;
- (*----------------------------------------------------------------------------
- | Returns the result of converting the double-precision floating-point NaN
- | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- | exception is raised.
- *----------------------------------------------------------------------------*)
- function float64ToCommonNaN( a : float64) : commonNaNT;
- var
- z : commonNaNT;
- begin
- if ( float64_is_signaling_nan( a )<>0 ) then
- float_raise( float_flag_invalid );
- z.sign := a.high shr 31;
- shortShift64Left( a.high, a.low, 12, z.high, z.low );
- result:=z;
- end;
- (*----------------------------------------------------------------------------
- | Returns the result of converting the canonical NaN `a' to the double-
- | precision floating-point format.
- *----------------------------------------------------------------------------*)
- function commonNaNToFloat64( a : commonNaNT): float64;
- var
- z: float64;
- begin
- shift64Right( a.high, a.low, 12, z.high, z.low );
- z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
- result:=z;
- end;
- (*----------------------------------------------------------------------------
- | Takes two double-precision floating-point values `a' and `b', one of which
- | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
- | signaling NaN, the invalid exception is raised.
- *----------------------------------------------------------------------------*)
- Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
- var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
- begin
- aIsNaN := float64_is_nan( a );
- aIsSignalingNaN := float64_is_signaling_nan( a );
- bIsNaN := float64_is_nan( b );
- bIsSignalingNaN := float64_is_signaling_nan( b );
- a.high := a.high or $00080000;
- b.high := b.high or $00080000;
- if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
- float_raise( float_flag_invalid );
- if bIsSignalingNaN<>0 then
- c := b
- else if aIsSignalingNan<>0 then
- c := a
- else if bIsNan<>0 then
- c := b
- else
- c := a;
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
- | otherwise returns 0.
- *----------------------------------------------------------------------------*}
- function float128_is_nan( a : float128): flag;
- begin
- result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
- and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is a
- | signaling NaN; otherwise returns 0.
- *----------------------------------------------------------------------------*}
- function float128_is_signaling_nan( a : float128): flag;
- begin
- result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
- ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point NaN
- | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- | exception is raised.
- *----------------------------------------------------------------------------*}
- function float128ToCommonNaN( a : float128): commonNaNT;
- var
- z: commonNaNT;
- qhigh,qlow : qword;
- begin
- if ( float128_is_signaling_nan( a )<>0) then
- float_raise( float_flag_invalid );
- z.sign := a.high shr 63;
- shortShift128Left( a.high, a.low, 16, qhigh, qlow );
- z.high:=qhigh shr 32;
- z.low:=qhigh and $ffffffff;
- result:=z;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the canonical NaN `a' to the quadruple-
- | precision floating-point format.
- *----------------------------------------------------------------------------*}
- function commonNaNToFloat128( a : commonNaNT): float128;
- var
- z: float128;
- begin
- shift128Right( a.high, a.low, 16, z.high, z.low );
- z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
- result:=z;
- end;
- {*----------------------------------------------------------------------------
- | Takes two quadruple-precision floating-point values `a' and `b', one of
- | which is a NaN, and returns the appropriate NaN result. If either `a' or
- | `b' is a signaling NaN, the invalid exception is raised.
- *----------------------------------------------------------------------------*}
- function propagateFloat128NaN( a: float128; b : float128): float128;
- var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- label
- returnLargerSignificand;
- begin
- aIsNaN := float128_is_nan( a );
- aIsSignalingNaN := float128_is_signaling_nan( a );
- bIsNaN := float128_is_nan( b );
- bIsSignalingNaN := float128_is_signaling_nan( b );
- a.high := a.high or int64( $0000800000000000 );
- b.high := b.high or int64( $0000800000000000 );
- if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
- float_raise( float_flag_invalid );
- if ( aIsSignalingNaN )<>0 then
- begin
- if ( bIsSignalingNaN )<>0 then
- goto returnLargerSignificand;
- if bIsNaN<>0 then
- result := b
- else
- result := a;
- exit;
- end
- else if ( aIsNaN )<>0 then
- begin
- if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
- begin
- result := a;
- exit;
- end;
- returnLargerSignificand:
- if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
- begin
- result := b;
- exit;
- end;
- if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
- begin
- result := a;
- exit
- end;
- if ( a.high < b.high ) then
- result := a
- else
- result := b;
- exit;
- end
- else
- result:=b;
- end;
- {$ENDIF}
- (****************************************************************************)
- (* END ENDIAN SPECIFIC CODE *)
- (****************************************************************************)
- {*
- -------------------------------------------------------------------------------
- Returns the fraction bits of the single-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
- Begin
- ExtractFloat32Frac := A AND $007FFFFF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the exponent bits of the single-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat32Exp( a: float32 ): Int16; inline;
- Begin
- extractFloat32Exp := (a shr 23) AND $FF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the sign bit of the single-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat32Sign( a: float32 ): Flag; inline;
- Begin
- extractFloat32Sign := a shr 31;
- End;
- {*
- -------------------------------------------------------------------------------
- Normalizes the subnormal single-precision floating-point value represented
- by the denormalized significand `aSig'. The normalized exponent and
- significand are stored at the locations pointed to by `zExpPtr' and
- `zSigPtr', respectively.
- -------------------------------------------------------------------------------
- *}
- Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
- Var
- ShiftCount : BYTE;
- Begin
- shiftCount := countLeadingZeros32( aSig ) - 8;
- zSigPtr := aSig shl shiftCount;
- zExpPtr := 1 - shiftCount;
- End;
- {*
- -------------------------------------------------------------------------------
- Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
- single-precision floating-point value, returning the result. After being
- shifted into the proper positions, the three fields are simply added
- together to form the result. This means that any integer portion of `zSig'
- will be added into the exponent. Since a properly normalized significand
- will have an integer portion equal to 1, the `zExp' input should be 1 less
- than the desired result exponent whenever `zSig' is a complete, normalized
- significand.
- -------------------------------------------------------------------------------
- *}
- Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
- Begin
- packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
- + zSig;
- End;
- {*
- -------------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and significand `zSig', and returns the proper single-precision floating-
- point value corresponding to the abstract input. Ordinarily, the abstract
- value is simply rounded and packed into the single-precision format, with
- the inexact exception raised if the abstract input cannot be represented
- exactly. However, if the abstract value is too large, the overflow and
- inexact exceptions are raised and an infinity or maximal finite value is
- returned. If the abstract value is too small, the input value is rounded to
- a subnormal number, and the underflow and inexact exceptions are raised if
- the abstract input cannot be represented exactly as a subnormal single-
- precision floating-point number.
- The input significand `zSig' has its binary point between bits 30
- and 29, which is 7 bits to the left of the usual location. This shifted
- significand must be normalized or smaller. If `zSig' is not normalized,
- `zExp' must be 0; in that case, the result returned is a subnormal number,
- and it must not require rounding. In the usual case that `zSig' is
- normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
- The handling of underflow and overflow follows the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
- Var
- roundingMode : TFPURoundingMode;
- roundNearestEven : boolean;
- roundIncrement, roundBits : BYTE;
- IsTiny : boolean;
- Begin
- roundingMode := softfloat_rounding_mode;
- roundNearestEven := (roundingMode = float_round_nearest_even);
- roundIncrement := $40;
- if not roundNearestEven then
- Begin
- if ( roundingMode = float_round_to_zero ) Then
- Begin
- roundIncrement := 0;
- End
- else
- Begin
- roundIncrement := $7F;
- if ( zSign <> 0 ) then
- Begin
- if roundingMode = float_round_up then roundIncrement := 0;
- End
- else
- Begin
- if roundingMode = float_round_down then roundIncrement := 0;
- End;
- End
- End;
- roundBits := zSig AND $7F;
- if ($FD <= bits16 (zExp) ) then
- Begin
- if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
- Begin
- float_raise( [float_flag_overflow,float_flag_inexact] );
- roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
- exit;
- End;
- if ( zExp < 0 ) then
- Begin
- isTiny :=
- ( softfloat_detect_tininess = float_tininess_before_rounding )
- OR ( zExp < -1 )
- OR ( (zSig + roundIncrement) < $80000000 );
- shift32RightJamming( zSig, - zExp, zSig );
- zExp := 0;
- roundBits := zSig AND $7F;
- if ( isTiny and (roundBits<>0) ) then
- float_raise( float_flag_underflow );
- End;
- End;
- if ( roundBits )<> 0 then
- set_inexact_flag;
- zSig := ( zSig + roundIncrement ) shr 7;
- zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
- if ( zSig = 0 ) then zExp := 0;
- roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and significand `zSig', and returns the proper single-precision floating-
- point value corresponding to the abstract input. This routine is just like
- `roundAndPackFloat32' except that `zSig' does not have to be normalized.
- Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
- floating-point exponent.
- -------------------------------------------------------------------------------
- *}
- Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
- Var
- ShiftCount : int8;
- Begin
- shiftCount := countLeadingZeros32( zSig ) - 1;
- normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the most-significant 20 fraction bits of the double-precision
- floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Frac0(a: float64): bits32; inline;
- Begin
- extractFloat64Frac0 := a.high and $000FFFFF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the least-significant 32 fraction bits of the double-precision
- floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Frac1(a: float64): bits32; inline;
- Begin
- extractFloat64Frac1 := a.low;
- End;
- {$define FPC_SYSTEM_HAS_extractFloat64Frac}
- Function extractFloat64Frac(a: float64): bits64; inline;
- Begin
- extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the exponent bits of the double-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Exp(a: float64): int16; inline;
- Begin
- extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the sign bit of the double-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Sign(a: float64) : flag; inline;
- Begin
- extractFloat64Sign := a.high shr 31;
- End;
- {*
- -------------------------------------------------------------------------------
- Normalizes the subnormal double-precision floating-point value represented
- by the denormalized significand formed by the concatenation of `aSig0' and
- `aSig1'. The normalized exponent is stored at the location pointed to by
- `zExpPtr'. The most significant 21 bits of the normalized significand are
- stored at the location pointed to by `zSig0Ptr', and the least significant
- 32 bits of the normalized significand are stored at the location pointed to
- by `zSig1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure normalizeFloat64Subnormal(
- aSig0: bits32;
- aSig1: bits32;
- VAR zExpPtr : Int16;
- VAR zSig0Ptr : Bits32;
- VAR zSig1Ptr : Bits32
- );
- Var
- ShiftCount : Int8;
- Begin
- if ( aSig0 = 0 ) then
- Begin
- shiftCount := countLeadingZeros32( aSig1 ) - 11;
- if ( shiftCount < 0 ) then
- Begin
- zSig0Ptr := aSig1 shr ( - shiftCount );
- zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
- End
- else
- Begin
- zSig0Ptr := aSig1 shl shiftCount;
- zSig1Ptr := 0;
- End;
- zExpPtr := - shiftCount - 31;
- End
- else
- Begin
- shiftCount := countLeadingZeros32( aSig0 ) - 11;
- shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
- zExpPtr := 1 - shiftCount;
- End;
- End;
- procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
- var
- shiftCount : int8;
- begin
- shiftCount := countLeadingZeros64( aSig ) - 11;
- zSigPtr := aSig shl shiftCount;
- zExpPtr := 1 - shiftCount;
- end;
- {*
- -------------------------------------------------------------------------------
- Packs the sign `zSign', the exponent `zExp', and the significand formed by
- the concatenation of `zSig0' and `zSig1' into a double-precision floating-
- point value, returning the result. After being shifted into the proper
- positions, the three fields `zSign', `zExp', and `zSig0' are simply added
- together to form the most significant 32 bits of the result. This means
- that any integer portion of `zSig0' will be added into the exponent. Since
- a properly normalized significand will have an integer portion equal to 1,
- the `zExp' input should be 1 less than the desired result exponent whenever
- `zSig0' and `zSig1' concatenated form a complete, normalized significand.
- -------------------------------------------------------------------------------
- *}
- Procedure
- packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
- var
- z: Float64;
- Begin
- z.low := zSig1;
- z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
- c := z;
- End;
- {*----------------------------------------------------------------------------
- | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
- | double-precision floating-point value, returning the result. After being
- | shifted into the proper positions, the three fields are simply added
- | together to form the result. This means that any integer portion of `zSig'
- | will be added into the exponent. Since a properly normalized significand
- | will have an integer portion equal to 1, the `zExp' input should be 1 less
- | than the desired result exponent whenever `zSig' is a complete, normalized
- | significand.
- *----------------------------------------------------------------------------*}
- function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
- begin
- result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
- end;
- {*
- -------------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and extended significand formed by the concatenation of `zSig0', `zSig1',
- and `zSig2', and returns the proper double-precision floating-point value
- corresponding to the abstract input. Ordinarily, the abstract value is
- simply rounded and packed into the double-precision format, with the inexact
- exception raised if the abstract input cannot be represented exactly.
- However, if the abstract value is too large, the overflow and inexact
- exceptions are raised and an infinity or maximal finite value is returned.
- If the abstract value is too small, the input value is rounded to a
- subnormal number, and the underflow and inexact exceptions are raised if the
- abstract input cannot be represented exactly as a subnormal double-precision
- floating-point number.
- The input significand must be normalized or smaller. If the input
- significand is not normalized, `zExp' must be 0; in that case, the result
- returned is a subnormal number, and it must not require rounding. In the
- usual case that the input significand is normalized, `zExp' must be 1 less
- than the ``true'' floating-point exponent. The handling of underflow and
- overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure
- roundAndPackFloat64(
- zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
- Var
- roundingMode : TFPURoundingMode;
- roundNearestEven, increment, isTiny : Flag;
- Begin
- roundingMode := softfloat_rounding_mode;
- roundNearestEven := flag( roundingMode = float_round_nearest_even );
- increment := flag( sbits32 (zSig2) < 0 );
- if ( roundNearestEven = flag(FALSE) ) then
- Begin
- if ( roundingMode = float_round_to_zero ) then
- increment := 0
- else
- Begin
- if ( zSign )<> 0 then
- Begin
- increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
- End
- else
- Begin
- increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
- End
- End
- End;
- if ( $7FD <= bits16 (zExp) ) then
- Begin
- if (( $7FD < zExp )
- or (( zExp = $7FD )
- and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
- and (increment<>0)
- )
- ) then
- Begin
- float_raise( [float_flag_overflow,float_flag_inexact] );
- if (( roundingMode = float_round_to_zero )
- or ( (zSign<>0) and ( roundingMode = float_round_up ) )
- or ( (zSign = 0) and ( roundingMode = float_round_down ) )
- ) then
- Begin
- packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
- exit;
- End;
- packFloat64( zSign, $7FF, 0, 0, c );
- exit;
- End;
- if ( zExp < 0 ) then
- Begin
- isTiny :=
- flag( softfloat_detect_tininess = float_tininess_before_rounding )
- or flag( zExp < -1 )
- or flag(increment = 0)
- or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
- shift64ExtraRightJamming(
- zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
- zExp := 0;
- if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
- if ( roundNearestEven )<>0 then
- Begin
- increment := flag( sbits32 (zSig2) < 0 );
- End
- else
- Begin
- if ( zSign )<>0 then
- Begin
- increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
- End
- else
- Begin
- increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
- End
- End;
- End;
- End;
- if ( zSig2 )<>0 then
- set_inexact_flag;
- if ( increment )<>0 then
- Begin
- add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
- zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
- End
- else
- Begin
- if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
- End;
- packFloat64( zSign, zExp, zSig0, zSig1, c );
- End;
- {*----------------------------------------------------------------------------
- | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- | and significand `zSig', and returns the proper double-precision floating-
- | point value corresponding to the abstract input. Ordinarily, the abstract
- | value is simply rounded and packed into the double-precision format, with
- | the inexact exception raised if the abstract input cannot be represented
- | exactly. However, if the abstract value is too large, the overflow and
- | inexact exceptions are raised and an infinity or maximal finite value is
- | returned. If the abstract value is too small, the input value is rounded
- | to a subnormal number, and the underflow and inexact exceptions are raised
- | if the abstract input cannot be represented exactly as a subnormal double-
- | precision floating-point number.
- | The input significand `zSig' has its binary point between bits 62
- | and 61, which is 10 bits to the left of the usual location. This shifted
- | significand must be normalized or smaller. If `zSig' is not normalized,
- | `zExp' must be 0; in that case, the result returned is a subnormal number,
- | and it must not require rounding. In the usual case that `zSig' is
- | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
- | The handling of underflow and overflow follows the IEC/IEEE Standard for
- | Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
- var
- roundingMode: TFPURoundingMode;
- roundNearestEven: flag;
- roundIncrement, roundBits: int16;
- isTiny: flag;
- begin
- roundingMode := softfloat_rounding_mode;
- roundNearestEven := ord( roundingMode = float_round_nearest_even );
- roundIncrement := $200;
- if ( roundNearestEven=0 ) then
- begin
- if ( roundingMode = float_round_to_zero ) then
- begin
- roundIncrement := 0;
- end
- else begin
- roundIncrement := $3FF;
- if ( zSign<>0 ) then
- begin
- if ( roundingMode = float_round_up ) then
- roundIncrement := 0;
- end
- else begin
- if ( roundingMode = float_round_down ) then
- roundIncrement := 0;
- end
- end
- end;
- roundBits := zSig and $3FF;
- if ( $7FD <= bits16(zExp) ) then
- begin
- if ( ( $7FD < zExp )
- or ( ( zExp = $7FD )
- and ( sbits64( zSig + roundIncrement ) < 0 ) )
- ) then
- begin
- float_raise( [float_flag_overflow,float_flag_inexact] );
- result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
- exit;
- end;
- if ( zExp < 0 ) then
- begin
- isTiny := ord(
- ( softfloat_detect_tininess = float_tininess_before_rounding )
- or ( zExp < -1 )
- or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
- shift64RightJamming( zSig, - zExp, zSig );
- zExp := 0;
- roundBits := zSig and $3FF;
- if ( isTiny and roundBits )<>0 then
- float_raise( float_flag_underflow );
- end
- end;
- if ( roundBits<>0 ) then
- set_inexact_flag;
- zSig := ( zSig + roundIncrement ) shr 10;
- zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
- if ( zSig = 0 ) then
- zExp := 0;
- result:=packFloat64( zSign, zExp, zSig );
- end;
- {*
- -------------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and significand formed by the concatenation of `zSig0' and `zSig1', and
- returns the proper double-precision floating-point value corresponding
- to the abstract input. This routine is just like `roundAndPackFloat64'
- except that the input significand has fewer bits and does not have to be
- normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
- point exponent.
- -------------------------------------------------------------------------------
- *}
- Procedure
- normalizeRoundAndPackFloat64(
- zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
- Var
- shiftCount : int8;
- zSig2 : bits32;
- Begin
- if ( zSig0 = 0 ) then
- Begin
- zSig0 := zSig1;
- zSig1 := 0;
- zExp := zExp -32;
- End;
- shiftCount := countLeadingZeros32( zSig0 ) - 11;
- if ( 0 <= shiftCount ) then
- Begin
- zSig2 := 0;
- shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
- End
- else
- Begin
- shift64ExtraRightJamming
- (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
- End;
- zExp := zExp - shiftCount;
- roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
- End;
- {*
- ----------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and significand `zSig', and returns the proper double-precision floating-
- point value corresponding to the abstract input. This routine is just like
- `roundAndPackFloat64' except that `zSig' does not have to be normalized.
- Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
- floating-point exponent.
- ----------------------------------------------------------------------------
- *}
- function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
- var
- shiftCount: int8;
- begin
- shiftCount := countLeadingZeros64( zSig ) - 1;
- result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
- end;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the 32-bit two's complement integer `a' to
- the single-precision floating-point format. The conversion is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function int32_to_float32( a: int32): float32rec; compilerproc;
- Var
- zSign : Flag;
- Begin
- if ( a = 0 ) then
- Begin
- int32_to_float32.float32 := 0;
- exit;
- End;
- if ( a = sbits32 ($80000000) ) then
- Begin
- int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
- exit;
- end;
- zSign := flag( a < 0 );
- If zSign<>0 then
- a := -a;
- int32_to_float32.float32:=
- normalizeRoundAndPackFloat32( zSign, $9C, a );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the 32-bit two's complement integer `a' to
- the double-precision floating-point format. The conversion is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
- var
- zSign : flag;
- absA : bits32;
- shiftCount : int8;
- zSig0, zSig1 : bits32;
- Begin
- if ( a = 0 ) then
- Begin
- packFloat64( 0, 0, 0, 0, result );
- exit;
- end;
- zSign := flag( a < 0 );
- if ZSign<>0 then
- AbsA := -a
- else
- AbsA := a;
- shiftCount := countLeadingZeros32( absA ) - 11;
- if ( 0 <= shiftCount ) then
- Begin
- zSig0 := absA shl shiftCount;
- zSig1 := 0;
- End
- else
- Begin
- shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
- End;
- packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
- End;
- {$ifdef FPC_SOFTFLOAT_FLOATX80}
- {$if not defined(packFloatx80)}
- function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
- forward;
- {$endif}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 32-bit two's complement integer `a'
- | to the extended double-precision floating-point format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function int32_to_floatx80( a: int32 ): floatx80;
- var
- zSign: flag;
- absA: uint32;
- shiftCount: int8;
- zSig: bits64;
- begin
- if ( a = 0 ) then begin
- result := packFloatx80( 0, 0, 0 );
- exit;
- end;
- zSign := ord( a < 0 );
- if zSign <> 0 then absA := - a else absA := a;
- shiftCount := countLeadingZeros32( absA ) + 32;
- zSig := absA;
- result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
- end;
- {$endif FPC_SOFTFLOAT_FLOATX80}
- {$ifdef FPC_SOFTFLOAT_FLOAT128}
- {$if not defined(packFloat128)}
- function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
- forward;
- {$endif}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 32-bit two's complement integer `a' to
- | the quadruple-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function int32_to_float128( a: int32 ): float128;
- var
- zSign: flag;
- absA: uint32;
- shiftCount: int8;
- zSig0: bits64;
- begin
- if ( a = 0 ) then begin
- result := packFloat128( 0, 0, 0, 0 );
- exit;
- end;
- zSign := ord( a < 0 );
- if zSign <> 0 then absA := - a else absA := a;
- shiftCount := countLeadingZeros32( absA ) + 17;
- zSig0 := absA;
- result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
- end;
- {$endif FPC_SOFTFLOAT_FLOAT128}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic---which means in particular that the conversion is rounded
- according to the current rounding mode. If `a' is a NaN, the largest
- positive integer is returned. Otherwise, if the conversion overflows, the
- largest integer with the same sign as `a' is returned.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_int32( a : float32rec) : int32;compilerproc;
- Var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig, aSigExtra: bits32;
- z: int32;
- roundingMode: TFPURoundingMode;
- Begin
- aSig := extractFloat32Frac( a.float32 );
- aExp := extractFloat32Exp( a.float32 );
- aSign := extractFloat32Sign( a.float32 );
- shiftCount := aExp - $96;
- if ( 0 <= shiftCount ) then
- Begin
- if ( $9E <= aExp ) then
- Begin
- if ( a.float32 <> $CF000000 ) then
- Begin
- float_raise( float_flag_invalid );
- if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
- Begin
- float32_to_int32 := $7FFFFFFF;
- exit;
- End;
- End;
- float32_to_int32 := sbits32 ($80000000);
- exit;
- End;
- z := ( aSig or $00800000 ) shl shiftCount;
- if ( aSign<>0 ) then z := - z;
- End
- else
- Begin
- if ( aExp < $7E ) then
- Begin
- aSigExtra := aExp OR aSig;
- z := 0;
- End
- else
- Begin
- aSig := aSig OR $00800000;
- aSigExtra := aSig shl ( shiftCount and 31 );
- z := aSig shr ( - shiftCount );
- End;
- if ( aSigExtra<>0 ) then
- set_inexact_flag;
- roundingMode := softfloat_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- if ( sbits32 (aSigExtra) < 0 ) then
- Begin
- Inc(z);
- if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
- z := z and not 1;
- End;
- if ( aSign<>0 ) then
- z := - z;
- End
- else
- Begin
- aSigExtra := flag( aSigExtra <> 0 );
- if ( aSign<>0 ) then
- Begin
- z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
- z := - z;
- End
- else
- Begin
- z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
- End
- End;
- End;
- float32_to_int32 := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic, except that the conversion is always rounded toward zero.
- If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- the conversion overflows, the largest integer with the same sign as `a' is
- returned.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
- Var
- aSign : flag;
- aExp, shiftCount : int16;
- aSig : bits32;
- z : int32;
- Begin
- aSig := extractFloat32Frac( a.float32 );
- aExp := extractFloat32Exp( a.float32 );
- aSign := extractFloat32Sign( a.float32 );
- shiftCount := aExp - $9E;
- if ( 0 <= shiftCount ) then
- Begin
- if ( a.float32 <> $CF000000 ) then
- Begin
- float_raise( float_flag_invalid );
- if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
- Begin
- float32_to_int32_round_to_zero := $7FFFFFFF;
- exit;
- end;
- End;
- float32_to_int32_round_to_zero:= sbits32 ($80000000);
- exit;
- End
- else
- if ( aExp <= $7E ) then
- Begin
- if ( aExp or aSig )<>0 then
- set_inexact_flag;
- float32_to_int32_round_to_zero := 0;
- exit;
- End;
- aSig := ( aSig or $00800000 ) shl 8;
- z := aSig shr ( - shiftCount );
- if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
- Begin
- set_inexact_flag;
- End;
- if ( aSign<>0 ) then z := - z;
- float32_to_int32_round_to_zero := z;
- End;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the single-precision floating-point value
- | `a' to the 64-bit two's complement integer format. The conversion is
- | performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic---which means in particular that the conversion is rounded
- | according to the current rounding mode. If `a' is a NaN, the largest
- | positive integer is returned. Otherwise, if the conversion overflows, the
- | largest integer with the same sign as `a' is returned.
- *----------------------------------------------------------------------------*}
- function float32_to_int64( a: float32 ): int64;
- var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig: bits32;
- aSig64, aSigExtra: bits64;
- begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- shiftCount := $BE - aExp;
- if ( shiftCount < 0 ) then begin
- float_raise( float_flag_invalid );
- if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
- result := $7FFFFFFFFFFFFFFF;
- exit;
- end;
- result := $8000000000000000;
- exit;
- end;
- if ( aExp <> 0 ) then aSig := aSig or $00800000;
- aSig64 := aSig;
- aSig64 := aSig64 shl 40;
- shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
- result := roundAndPackInt64( aSign, aSig64, aSigExtra );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the single-precision floating-point value
- | `a' to the 64-bit two's complement integer format. The conversion is
- | performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic, except that the conversion is always rounded toward zero. If
- | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
- | conversion overflows, the largest integer with the same sign as `a' is
- | returned.
- *----------------------------------------------------------------------------*}
- function float32_to_int64_round_to_zero( a: float32 ): int64;
- var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig: bits32;
- aSig64: bits64;
- z: int64;
- begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- shiftCount := aExp - $BE;
- if ( 0 <= shiftCount ) then begin
- if ( a <> $DF000000 ) then begin
- float_raise( float_flag_invalid );
- if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
- result := $7FFFFFFFFFFFFFFF;
- exit;
- end;
- end;
- result := $8000000000000000;
- exit;
- end
- else if ( aExp <= $7E ) then begin
- if ( aExp or aSig <> 0 ) then set_inexact_flag;
- result := 0;
- exit;
- end;
- aSig64 := aSig or $00800000;
- aSig64 := aSig64 shl 40;
- z := aSig64 shr ( - shiftCount );
- if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
- set_inexact_flag;
- if ( aSign <> 0 ) then z := - z;
- result := z;
- end;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the double-precision floating-point format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_float64( a : float32rec) : Float64;compilerproc;
- Var
- aSign : flag;
- aExp : int16;
- aSig, zSig0, zSig1: bits32;
- tmp : CommonNanT;
- Begin
- aSig := extractFloat32Frac( a.float32 );
- aExp := extractFloat32Exp( a.float32 );
- aSign := extractFloat32Sign( a.float32 );
- if ( aExp = $FF ) then
- Begin
- if ( aSig<>0 ) then
- Begin
- tmp:=float32ToCommonNaN(a.float32);
- result:=commonNaNToFloat64(tmp);
- exit;
- End;
- packFloat64( aSign, $7FF, 0, 0, result);
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( aSig = 0 ) then
- Begin
- packFloat64( aSign, 0, 0, 0, result );
- exit;
- end;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- Dec(aExp);
- End;
- shift64Right( aSig, 0, 3, zSig0, zSig1 );
- packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
- End;
- {$ifdef FPC_SOFTFLOAT_FLOATX80}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the canonical NaN `a' to the extended
- | double-precision floating-point format.
- *----------------------------------------------------------------------------*}
- function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
- var
- z : floatx80;
- begin
- z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
- z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
- result := z;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the single-precision floating-point value
- | `a' to the extended double-precision floating-point format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function float32_to_floatx80( a: float32 ): floatx80;
- var
- aSign: flag;
- aExp: int16;
- aSig: bits32;
- tmp: commonNaNT;
- begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- if ( aExp = $FF ) then begin
- if ( aSig <> 0 ) then begin
- tmp:=float32ToCommonNaN(a);
- result := commonNaNToFloatx80( tmp );
- exit;
- end;
- result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( aSig = 0 ) then begin
- result := packFloatx80( aSign, 0, 0 );
- exit;
- end;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- end;
- aSig := aSig or $00800000;
- result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
- end;
- {$endif FPC_SOFTFLOAT_FLOATX80}
- {$ifdef FPC_SOFTFLOAT_FLOAT128}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the single-precision floating-point value
- | `a' to the double-precision floating-point format. The conversion is
- | performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function float32_to_float128( a: float32 ): float128;
- var
- aSign: flag;
- aExp: int16;
- aSig: bits32;
- tmp: commonNaNT;
- begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- if ( aExp = $FF ) then begin
- if ( aSig <> 0 ) then begin
- tmp:=float32ToCommonNaN(a);
- result := commonNaNToFloat128( tmp );
- exit;
- end;
- result := packFloat128( aSign, $7FFF, 0, 0 );
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( aSig = 0 ) then begin
- result := packFloat128( aSign, 0, 0, 0 );
- exit;
- end;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- dec( aExp );
- end;
- result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
- end;
- {$endif FPC_SOFTFLOAT_FLOAT128}
- {*
- -------------------------------------------------------------------------------
- Rounds the single-precision floating-point value `a' to an integer,
- and returns the result as a single-precision floating-point value. The
- operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
- Var
- aSign: flag;
- aExp: int16;
- lastBitMask, roundBitsMask: bits32;
- roundingMode: TFPURoundingMode;
- z: float32;
- Begin
- aExp := extractFloat32Exp( a.float32 );
- if ( $96 <= aExp ) then
- Begin
- if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
- Begin
- float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
- exit;
- End;
- float32_round_to_int:=a;
- exit;
- End;
- if ( aExp <= $7E ) then
- Begin
- if ( bits32 ( a.float32 shl 1 ) = 0 ) then
- Begin
- float32_round_to_int:=a;
- exit;
- end;
- set_inexact_flag;
- aSign := extractFloat32Sign( a.float32 );
- case ( softfloat_rounding_mode ) of
- float_round_nearest_even:
- Begin
- if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
- Begin
- float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
- exit;
- End;
- End;
- float_round_down:
- Begin
- if aSign <> 0 then
- float32_round_to_int.float32 := $BF800000
- else
- float32_round_to_int.float32 := 0;
- exit;
- End;
- float_round_up:
- Begin
- if aSign <> 0 then
- float32_round_to_int.float32 := $80000000
- else
- float32_round_to_int.float32 := $3F800000;
- exit;
- End;
- end;
- float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
- exit;
- End;
- lastBitMask := 1;
- {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
- lastBitMask := lastBitMask shl ($96 - aExp);
- roundBitsMask := lastBitMask - 1;
- z := a.float32;
- roundingMode := softfloat_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- z := z + (lastBitMask shr 1);
- if ( ( z and roundBitsMask ) = 0 ) then
- z := z and not lastBitMask;
- End
- else if ( roundingMode <> float_round_to_zero ) then
- Begin
- if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
- Begin
- z := z + roundBitsMask;
- End;
- End;
- z := z and not roundBitsMask;
- if ( z <> a.float32 ) then
- set_inexact_flag;
- float32_round_to_int.float32 := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the absolute values of the single-precision
- floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
- before being returned. `zSign' is ignored if the result is a NaN.
- The addition is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
- Var
- aExp, bExp, zExp: int16;
- aSig, bSig, zSig: bits32;
- expDiff: int16;
- label roundAndPack;
- Begin
- aSig:=extractFloat32Frac( a );
- aExp:=extractFloat32Exp( a );
- bSig:=extractFloat32Frac( b );
- bExp := extractFloat32Exp( b );
- expDiff := aExp - bExp;
- aSig := aSig shl 6;
- bSig := bSig shl 6;
- if ( 0 < expDiff ) then
- Begin
- if ( aExp = $FF ) then
- Begin
- if ( aSig <> 0) then
- Begin
- addFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- End;
- addFloat32Sigs := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- Dec(expDiff);
- End
- else
- Begin
- bSig := bSig or $20000000;
- End;
- shift32RightJamming( bSig, expDiff, bSig );
- zExp := aExp;
- End
- else
- If ( expDiff < 0 ) then
- Begin
- if ( bExp = $FF ) then
- Begin
- if ( bSig<>0 ) then
- Begin
- addFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- end;
- addFloat32Sigs := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- Inc(expDiff);
- End
- else
- Begin
- aSig := aSig OR $20000000;
- End;
- shift32RightJamming( aSig, - expDiff, aSig );
- zExp := bExp;
- End
- else
- Begin
- if ( aExp = $FF ) then
- Begin
- if ( aSig OR bSig )<> 0 then
- Begin
- addFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- end;
- addFloat32Sigs := a;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
- exit;
- end;
- zSig := $40000000 + aSig + bSig;
- zExp := aExp;
- goto roundAndPack;
- End;
- aSig := aSig OR $20000000;
- zSig := ( aSig + bSig ) shl 1;
- Dec(zExp);
- if ( sbits32 (zSig) < 0 ) then
- Begin
- zSig := aSig + bSig;
- Inc(zExp);
- End;
- roundAndPack:
- addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the absolute values of the single-
- precision floating-point values `a' and `b'. If `zSign' is 1, the
- difference is negated before being returned. `zSign' is ignored if the
- result is a NaN. The subtraction is performed according to the IEC/IEEE
- Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
- Var
- aExp, bExp, zExp: int16;
- aSig, bSig, zSig: bits32;
- expDiff : int16;
- label aExpBigger;
- label bExpBigger;
- label aBigger;
- label bBigger;
- label normalizeRoundAndPack;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- bSig := extractFloat32Frac( b );
- bExp := extractFloat32Exp( b );
- expDiff := aExp - bExp;
- aSig := aSig shl 7;
- bSig := bSig shl 7;
- if ( 0 < expDiff ) then goto aExpBigger;
- if ( expDiff < 0 ) then goto bExpBigger;
- if ( aExp = $FF ) then
- Begin
- if ( aSig OR bSig )<> 0 then
- Begin
- subFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- End;
- float_raise( float_flag_invalid );
- subFloat32Sigs := float32_default_nan;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- aExp := 1;
- bExp := 1;
- End;
- if ( bSig < aSig ) Then goto aBigger;
- if ( aSig < bSig ) Then goto bBigger;
- subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
- exit;
- bExpBigger:
- if ( bExp = $FF ) then
- Begin
- if ( bSig<>0 ) then
- Begin
- subFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- End;
- subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- Inc(expDiff);
- End
- else
- Begin
- aSig := aSig OR $40000000;
- End;
- shift32RightJamming( aSig, - expDiff, aSig );
- bSig := bSig OR $40000000;
- bBigger:
- zSig := bSig - aSig;
- zExp := bExp;
- zSign := zSign xor 1;
- goto normalizeRoundAndPack;
- aExpBigger:
- if ( aExp = $FF ) then
- Begin
- if ( aSig <> 0) then
- Begin
- subFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- End;
- subFloat32Sigs := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- Dec(expDiff);
- End
- else
- Begin
- bSig := bSig OR $40000000;
- End;
- shift32RightJamming( bSig, expDiff, bSig );
- aSig := aSig OR $40000000;
- aBigger:
- zSig := aSig - bSig;
- zExp := aExp;
- normalizeRoundAndPack:
- Dec(zExp);
- subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the single-precision floating-point values `a'
- and `b'. The operation is performed according to the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
- Var
- aSign, bSign: Flag;
- Begin
- aSign := extractFloat32Sign( a.float32 );
- bSign := extractFloat32Sign( b.float32 );
- if ( aSign = bSign ) then
- Begin
- float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
- End
- else
- Begin
- float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the single-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
- Var
- aSign, bSign: flag;
- Begin
- aSign := extractFloat32Sign( a.float32 );
- bSign := extractFloat32Sign( b.float32 );
- if ( aSign = bSign ) then
- Begin
- float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
- End
- else
- Begin
- float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of multiplying the single-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp : int16;
- aSig, bSig, zSig0, zSig1: bits32;
- Begin
- aSig := extractFloat32Frac( a.float32 );
- aExp := extractFloat32Exp( a.float32 );
- aSign := extractFloat32Sign( a.float32 );
- bSig := extractFloat32Frac( b.float32 );
- bExp := extractFloat32Exp( b.float32 );
- bSign := extractFloat32Sign( b.float32 );
- zSign := aSign xor bSign;
- if ( aExp = $FF ) then
- Begin
- if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
- Begin
- float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
- exit;
- End;
- if ( ( bits32(bExp) OR bSig ) = 0 ) then
- Begin
- float_raise( float_flag_invalid );
- float32_mul.float32 := float32_default_nan;
- exit;
- End;
- float32_mul.float32 := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- if ( bExp = $FF ) then
- Begin
- if ( bSig <> 0 ) then
- Begin
- float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
- exit;
- End;
- if ( ( bits32(aExp) OR aSig ) = 0 ) then
- Begin
- float_raise( float_flag_invalid );
- float32_mul.float32 := float32_default_nan;
- exit;
- End;
- float32_mul.float32 := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( aSig = 0 ) then
- Begin
- float32_mul.float32 := packFloat32( zSign, 0, 0 );
- exit;
- End;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- End;
- if ( bExp = 0 ) then
- Begin
- if ( bSig = 0 ) then
- Begin
- float32_mul.float32 := packFloat32( zSign, 0, 0 );
- exit;
- End;
- normalizeFloat32Subnormal( bSig, bExp, bSig );
- End;
- zExp := aExp + bExp - $7F;
- aSig := ( aSig OR $00800000 ) shl 7;
- bSig := ( bSig OR $00800000 ) shl 8;
- mul32To64( aSig, bSig, zSig0, zSig1 );
- zSig0 := zSig0 OR bits32( zSig1 <> 0 );
- if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
- Begin
- zSig0 := zSig0 shl 1;
- Dec(zExp);
- End;
- float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of dividing the single-precision floating-point value `a'
- by the corresponding value `b'. The operation is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int16;
- aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
- Begin
- aSig := extractFloat32Frac( a.float32 );
- aExp := extractFloat32Exp( a.float32 );
- aSign := extractFloat32Sign( a.float32 );
- bSig := extractFloat32Frac( b.float32 );
- bExp := extractFloat32Exp( b.float32 );
- bSign := extractFloat32Sign( b.float32 );
- zSign := aSign xor bSign;
- if ( aExp = $FF ) then
- Begin
- if ( aSig <> 0 ) then
- Begin
- float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
- exit;
- End;
- if ( bExp = $FF ) then
- Begin
- if ( bSig <> 0) then
- Begin
- float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
- exit;
- End;
- float_raise( float_flag_invalid );
- float32_div.float32 := float32_default_nan;
- exit;
- End;
- float32_div.float32 := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- if ( bExp = $FF ) then
- Begin
- if ( bSig <> 0) then
- Begin
- float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
- exit;
- End;
- float32_div.float32 := packFloat32( zSign, 0, 0 );
- exit;
- End;
- if ( bExp = 0 ) Then
- Begin
- if ( bSig = 0 ) Then
- Begin
- if ( ( bits32(aExp) OR aSig ) = 0 ) then
- Begin
- float_raise( float_flag_invalid );
- float32_div.float32 := float32_default_nan;
- exit;
- End;
- float_raise( float_flag_divbyzero );
- float32_div.float32 := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- normalizeFloat32Subnormal( bSig, bExp, bSig );
- End;
- if ( aExp = 0 ) Then
- Begin
- if ( aSig = 0 ) Then
- Begin
- float32_div.float32 := packFloat32( zSign, 0, 0 );
- exit;
- End;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- End;
- zExp := aExp - bExp + $7D;
- aSig := ( aSig OR $00800000 ) shl 7;
- bSig := ( bSig OR $00800000 ) shl 8;
- if ( bSig <= ( aSig + aSig ) ) then
- Begin
- aSig := aSig shr 1;
- Inc(zExp);
- End;
- zSig := estimateDiv64To32( aSig, 0, bSig );
- if ( ( zSig and $3F ) <= 2 ) then
- Begin
- mul32To64( bSig, zSig, term0, term1 );
- sub64( aSig, 0, term0, term1, rem0, rem1 );
- while ( sbits32 (rem0) < 0 ) do
- Begin
- Dec(zSig);
- add64( rem0, rem1, 0, bSig, rem0, rem1 );
- End;
- zSig := zSig or bits32( rem1 <> 0 );
- End;
- float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the remainder of the single-precision floating-point value `a'
- with respect to the corresponding value `b'. The operation is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
- Var
- aSign, zSign: flag;
- aExp, bExp, expDiff: int16;
- aSig, bSig, q, alternateASig: bits32;
- sigMean: sbits32;
- Begin
- aSig := extractFloat32Frac( a.float32 );
- aExp := extractFloat32Exp( a.float32 );
- aSign := extractFloat32Sign( a.float32 );
- bSig := extractFloat32Frac( b.float32 );
- bExp := extractFloat32Exp( b.float32 );
- if ( aExp = $FF ) then
- Begin
- if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
- Begin
- float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
- exit;
- End;
- float_raise( float_flag_invalid );
- float32_rem.float32 := float32_default_nan;
- exit;
- End;
- if ( bExp = $FF ) then
- Begin
- if ( bSig <> 0 ) then
- Begin
- float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
- exit;
- End;
- float32_rem := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- if ( bSig = 0 ) then
- Begin
- float_raise( float_flag_invalid );
- float32_rem.float32 := float32_default_nan;
- exit;
- End;
- normalizeFloat32Subnormal( bSig, bExp, bSig );
- End;
- if ( aExp = 0 ) then
- Begin
- if ( aSig = 0 ) then
- Begin
- float32_rem := a;
- exit;
- End;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- End;
- expDiff := aExp - bExp;
- aSig := ( aSig OR $00800000 ) shl 8;
- bSig := ( bSig OR $00800000 ) shl 8;
- if ( expDiff < 0 ) then
- Begin
- if ( expDiff < -1 ) then
- Begin
- float32_rem := a;
- exit;
- End;
- aSig := aSig shr 1;
- End;
- q := bits32( bSig <= aSig );
- if ( q <> 0) then
- aSig := aSig - bSig;
- expDiff := expDiff - 32;
- while ( 0 < expDiff ) do
- Begin
- q := estimateDiv64To32( aSig, 0, bSig );
- if (2 < q) then
- q := q - 2
- else
- q := 0;
- aSig := - ( ( bSig shr 2 ) * q );
- expDiff := expDiff - 30;
- End;
- expDiff := expDiff + 32;
- if ( 0 < expDiff ) then
- Begin
- q := estimateDiv64To32( aSig, 0, bSig );
- if (2 < q) then
- q := q - 2
- else
- q := 0;
- q := q shr (32 - expDiff);
- bSig := bSig shr 2;
- aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
- End
- else
- Begin
- aSig := aSig shr 2;
- bSig := bSig shr 2;
- End;
- Repeat
- alternateASig := aSig;
- Inc(q);
- aSig := aSig - bSig;
- Until not ( 0 <= sbits32 (aSig) );
- sigMean := aSig + alternateASig;
- if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
- Begin
- aSig := alternateASig;
- End;
- zSign := flag( sbits32 (aSig) < 0 );
- if ( zSign<>0 ) then
- aSig := - aSig;
- float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the square root of the single-precision floating-point value `a'.
- The operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
- Var
- aSign : flag;
- aExp, zExp : int16;
- aSig, zSig, rem0, rem1, term0, term1: bits32;
- label roundAndPack;
- Begin
- aSig := extractFloat32Frac( a.float32 );
- aExp := extractFloat32Exp( a.float32 );
- aSign := extractFloat32Sign( a.float32 );
- if ( aExp = $FF ) then
- Begin
- if ( aSig <> 0) then
- Begin
- float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
- exit;
- End;
- if ( aSign = 0) then
- Begin
- float32_sqrt := a;
- exit;
- End;
- float_raise( float_flag_invalid );
- float32_sqrt.float32 := float32_default_nan;
- exit;
- End;
- if ( aSign <> 0) then
- Begin
- if ( ( bits32(aExp) OR aSig ) = 0 ) then
- Begin
- float32_sqrt := a;
- exit;
- End;
- float_raise( float_flag_invalid );
- float32_sqrt.float32 := float32_default_nan;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( aSig = 0 ) then
- Begin
- float32_sqrt.float32 := 0;
- exit;
- End;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- End;
- zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
- aSig := ( aSig OR $00800000 ) shl 8;
- zSig := estimateSqrt32( aExp, aSig ) + 2;
- if ( ( zSig and $7F ) <= 5 ) then
- Begin
- if ( zSig < 2 ) then
- Begin
- zSig := $7FFFFFFF;
- goto roundAndPack;
- End
- else
- Begin
- aSig := aSig shr (aExp and 1);
- mul32To64( zSig, zSig, term0, term1 );
- sub64( aSig, 0, term0, term1, rem0, rem1 );
- while ( sbits32 (rem0) < 0 ) do
- Begin
- Dec(zSig);
- shortShift64Left( 0, zSig, 1, term0, term1 );
- term1 := term1 or 1;
- add64( rem0, rem1, term0, term1, rem0, rem1 );
- End;
- zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
- End;
- End;
- shift32RightJamming( zSig, 1, zSig );
- roundAndPack:
- float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
- Begin
- if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
- OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
- ) then
- Begin
- if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
- Begin
- float_raise( float_flag_invalid );
- End;
- float32_eq := 0;
- exit;
- End;
- float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- or equal to the corresponding value `b', and 0 otherwise. The comparison
- is performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
- var
- aSign, bSign: flag;
- Begin
- if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
- OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float32_le := 0;
- exit;
- End;
- aSign := extractFloat32Sign( a.float32 );
- bSign := extractFloat32Sign( b.float32 );
- if ( aSign <> bSign ) then
- Begin
- float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
- exit;
- End;
- float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
- var
- aSign, bSign: flag;
- Begin
- if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
- OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float32_lt :=0;
- exit;
- End;
- aSign := extractFloat32Sign( a.float32 );
- bSign := extractFloat32Sign( b.float32 );
- if ( aSign <> bSign ) then
- Begin
- float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
- exit;
- End;
- float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The invalid exception is
- raised if either operand is a NaN. Otherwise, the comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_eq_signaling( a: float32; b: float32) : flag;
- Begin
- if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
- ) then
- Begin
- float_raise( float_flag_invalid );
- float32_eq_signaling := 0;
- exit;
- End;
- float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than or
- equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
- cause an exception. Otherwise, the comparison is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_le_quiet( a: float32 ; b : float32 ): flag;
- Var
- aSign, bSign: flag;
- Begin
- if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
- ) then
- Begin
- if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
- Begin
- float_raise( float_flag_invalid );
- End;
- float32_le_quiet := 0;
- exit;
- End;
- aSign := extractFloat32Sign( a );
- bSign := extractFloat32Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
- exit;
- End;
- float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
- exception. Otherwise, the comparison is performed according to the IEC/IEEE
- Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
- Var
- aSign, bSign: flag;
- Begin
- if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
- ) then
- Begin
- if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
- Begin
- float_raise( float_flag_invalid );
- End;
- float32_lt_quiet := 0;
- exit;
- End;
- aSign := extractFloat32Sign( a );
- bSign := extractFloat32Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
- exit;
- End;
- float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic---which means in particular that the conversion is rounded
- according to the current rounding mode. If `a' is a NaN, the largest
- positive integer is returned. Otherwise, if the conversion overflows, the
- largest integer with the same sign as `a' is returned.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
- var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig0, aSig1, absZ, aSigExtra: bits32;
- z: int32;
- roundingMode: TFPURoundingMode;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- shiftCount := aExp - $413;
- if ( 0 <= shiftCount ) then
- Begin
- if ( $41E < aExp ) then
- Begin
- if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
- aSign := 0;
- goto invalid;
- End;
- shortShift64Left(
- aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
- if ( $80000000 < absZ ) then
- goto invalid;
- End
- else
- Begin
- aSig1 := flag( aSig1 <> 0 );
- if ( aExp < $3FE ) then
- Begin
- aSigExtra := aExp OR aSig0 OR aSig1;
- absZ := 0;
- End
- else
- Begin
- aSig0 := aSig0 OR $00100000;
- aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
- absZ := aSig0 shr ( - shiftCount );
- End;
- End;
- roundingMode := softfloat_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- if ( sbits32(aSigExtra) < 0 ) then
- Begin
- Inc(absZ);
- if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
- absZ := absZ and not 1;
- End;
- if aSign <> 0 then
- z := - absZ
- else
- z := absZ;
- End
- else
- Begin
- aSigExtra := bits32( aSigExtra <> 0 );
- if ( aSign <> 0) then
- Begin
- z := - ( absZ
- + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
- End
- else
- Begin
- z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
- End
- End;
- if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- if (aSign <> 0 ) then
- float64_to_int32 := sbits32 ($80000000)
- else
- float64_to_int32 := $7FFFFFFF;
- exit;
- End;
- if ( aSigExtra <> 0) then
- set_inexact_flag;
- float64_to_int32 := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic, except that the conversion is always rounded toward zero.
- If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- the conversion overflows, the largest integer with the same sign as `a' is
- returned.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_int32_round_to_zero(a: float64 ): int32;
- {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
- Var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig0, aSig1, absZ, aSigExtra: bits32;
- z: int32;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- shiftCount := aExp - $413;
- if ( 0 <= shiftCount ) then
- Begin
- if ( $41E < aExp ) then
- Begin
- if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
- aSign := 0;
- goto invalid;
- End;
- shortShift64Left(
- aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
- End
- else
- Begin
- if ( aExp < $3FF ) then
- Begin
- if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
- Begin
- set_inexact_flag;
- End;
- float64_to_int32_round_to_zero := 0;
- exit;
- End;
- aSig0 := aSig0 or $00100000;
- aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
- absZ := aSig0 shr ( - shiftCount );
- End;
- if aSign <> 0 then
- z := - absZ
- else
- z := absZ;
- if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- if (aSign <> 0) then
- float64_to_int32_round_to_zero := sbits32 ($80000000)
- else
- float64_to_int32_round_to_zero := $7FFFFFFF;
- exit;
- End;
- if ( aSigExtra <> 0) then
- set_inexact_flag;
- float64_to_int32_round_to_zero := z;
- End;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the double-precision floating-point value
- | `a' to the 64-bit two's complement integer format. The conversion is
- | performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic---which means in particular that the conversion is rounded
- | according to the current rounding mode. If `a' is a NaN, the largest
- | positive integer is returned. Otherwise, if the conversion overflows, the
- | largest integer with the same sign as `a' is returned.
- *----------------------------------------------------------------------------*}
- function float64_to_int64( a: float64 ): int64;
- var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig, aSigExtra: bits64;
- begin
- aSig := extractFloat64Frac( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
- shiftCount := $433 - aExp;
- if ( shiftCount <= 0 ) then begin
- if ( $43E < aExp ) then begin
- float_raise( float_flag_invalid );
- if ( ( aSign = 0 )
- or ( ( aExp = $7FF )
- and ( aSig <> $0010000000000000 ) )
- ) then begin
- result := $7FFFFFFFFFFFFFFF;
- exit;
- end;
- result := $8000000000000000;
- exit;
- end;
- aSigExtra := 0;
- aSig := aSig shl ( - shiftCount );
- end
- else
- shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
- result := roundAndPackInt64( aSign, aSig, aSigExtra );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the double-precision floating-point value
- | `a' to the 64-bit two's complement integer format. The conversion is
- | performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic, except that the conversion is always rounded toward zero.
- | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- | the conversion overflows, the largest integer with the same sign as `a' is
- | returned.
- *----------------------------------------------------------------------------*}
- {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
- function float64_to_int64_round_to_zero( a: float64 ): int64;
- var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig: bits64;
- z: int64;
- begin
- aSig := extractFloat64Frac( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
- shiftCount := aExp - $433;
- if ( 0 <= shiftCount ) then begin
- if ( $43E <= aExp ) then begin
- if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
- float_raise( float_flag_invalid );
- if ( ( aSign = 0 )
- or ( ( aExp = $7FF )
- and ( aSig <> $0010000000000000 ) )
- ) then begin
- result := $7FFFFFFFFFFFFFFF;
- exit;
- end;
- end;
- result := $8000000000000000;
- exit;
- end;
- z := aSig shl shiftCount;
- end
- else begin
- if ( aExp < $3FE ) then begin
- if ( aExp or aSig <> 0 ) then set_inexact_flag;
- result := 0;
- exit;
- end;
- z := aSig shr ( - shiftCount );
- if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
- set_inexact_flag;
- end;
- if ( aSign <> 0 ) then z := - z;
- result := z;
- end;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the single-precision floating-point format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_float32(a: float64 ): float32rec;compilerproc;
- Var
- aSign: flag;
- aExp: int16;
- aSig0, aSig1, zSig: bits32;
- allZero: bits32;
- tmp : CommonNanT;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 ) <> 0 then
- Begin
- tmp:=float64ToCommonNaN(a);
- float64_to_float32.float32 := commonNaNToFloat32( tmp );
- exit;
- End;
- float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
- exit;
- End;
- shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
- if ( aExp <> 0) then
- zSig := zSig OR $40000000;
- float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
- End;
- {$ifdef FPC_SOFTFLOAT_FLOATX80}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the double-precision floating-point value
- | `a' to the extended double-precision floating-point format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function float64_to_floatx80( a: float64 ): floatx80;
- var
- aSign: flag;
- aExp: int16;
- aSig: bits64;
- begin
- aSig := extractFloat64Frac( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- if ( aExp = $7FF ) then begin
- if ( aSig <> 0 ) then begin
- result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
- exit;
- end;
- result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( aSig = 0 ) then begin
- result := packFloatx80( aSign, 0, 0 );
- exit;
- end;
- normalizeFloat64Subnormal( aSig, aExp, aSig );
- end;
- result :=
- packFloatx80(
- aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
- end;
- {$endif FPC_SOFTFLOAT_FLOATX80}
- {*
- -------------------------------------------------------------------------------
- Rounds the double-precision floating-point value `a' to an integer,
- and returns the result as a double-precision floating-point value. The
- operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
- Var
- aSign: flag;
- aExp: int16;
- lastBitMask, roundBitsMask: bits32;
- roundingMode: TFPURoundingMode;
- z: float64;
- Begin
- aExp := extractFloat64Exp( a );
- if ( $413 <= aExp ) then
- Begin
- if ( $433 <= aExp ) then
- Begin
- if ( ( aExp = $7FF )
- AND
- (
- ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
- ) <>0)
- ) then
- Begin
- propagateFloat64NaN( a, a, result );
- exit;
- End;
- result := a;
- exit;
- End;
- lastBitMask := 1;
- lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
- roundBitsMask := lastBitMask - 1;
- z := a;
- roundingMode := softfloat_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- if ( lastBitMask <> 0) then
- Begin
- add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
- if ( ( z.low and roundBitsMask ) = 0 ) then
- z.low := z.low and not lastBitMask;
- End
- else
- Begin
- if ( sbits32 (z.low) < 0 ) then
- Begin
- Inc(z.high);
- if ( bits32 ( z.low shl 1 ) = 0 ) then
- z.high := z.high and not 1;
- End;
- End;
- End
- else if ( roundingMode <> float_round_to_zero ) then
- Begin
- if ( extractFloat64Sign( z )
- xor flag( roundingMode = float_round_up ) )<> 0 then
- Begin
- add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
- End;
- End;
- z.low := z.low and not roundBitsMask;
- End
- else
- Begin
- if ( aExp <= $3FE ) then
- Begin
- if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
- Begin
- result := a;
- exit;
- End;
- set_inexact_flag;
- aSign := extractFloat64Sign( a );
- case ( softfloat_rounding_mode ) of
- float_round_nearest_even:
- Begin
- if ( ( aExp = $3FE )
- AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
- ) then
- Begin
- packFloat64( aSign, $3FF, 0, 0, result );
- exit;
- End;
- End;
- float_round_down:
- Begin
- if aSign<>0 then
- packFloat64( 1, $3FF, 0, 0, result )
- else
- packFloat64( 0, 0, 0, 0, result );
- exit;
- End;
- float_round_up:
- Begin
- if aSign <> 0 then
- packFloat64( 1, 0, 0, 0, result )
- else
- packFloat64( 0, $3FF, 0, 0, result );
- exit;
- End;
- end;
- packFloat64( aSign, 0, 0, 0, result );
- exit;
- End;
- lastBitMask := 1;
- lastBitMask := lastBitMask shl ($413 - aExp);
- roundBitsMask := lastBitMask - 1;
- z.low := 0;
- z.high := a.high;
- roundingMode := softfloat_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- z.high := z.high + lastBitMask shr 1;
- if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
- Begin
- z.high := z.high and not lastBitMask;
- End;
- End
- else if ( roundingMode <> float_round_to_zero ) then
- Begin
- if ( extractFloat64Sign( z )
- xor flag( roundingMode = float_round_up ) )<> 0 then
- Begin
- z.high := z.high or bits32( a.low <> 0 );
- z.high := z.high + roundBitsMask;
- End;
- End;
- z.high := z.high and not roundBitsMask;
- End;
- if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
- Begin
- set_inexact_flag;
- End;
- result := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the absolute values of the double-precision
- floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
- before being returned. `zSign' is ignored if the result is a NaN.
- The addition is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
- Var
- aExp, bExp, zExp: int16;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
- expDiff: int16;
- label shiftRight1;
- label roundAndPack;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- expDiff := aExp - bExp;
- if ( 0 < expDiff ) then
- Begin
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- end;
- out := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- Dec(expDiff);
- End
- else
- Begin
- bSig0 := bSig0 or $00100000;
- End;
- shift64ExtraRightJamming(
- bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
- zExp := aExp;
- End
- else if ( expDiff < 0 ) then
- Begin
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- packFloat64( zSign, $7FF, 0, 0, out );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- Inc(expDiff);
- End
- else
- Begin
- aSig0 := aSig0 or $00100000;
- End;
- shift64ExtraRightJamming(
- aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
- zExp := bExp;
- End
- else
- Begin
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- out := a;
- exit;
- End;
- add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- if ( aExp = 0 ) then
- Begin
- packFloat64( zSign, 0, zSig0, zSig1, out );
- exit;
- End;
- zSig2 := 0;
- zSig0 := zSig0 or $00200000;
- zExp := aExp;
- goto shiftRight1;
- End;
- aSig0 := aSig0 or $00100000;
- add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- Dec(zExp);
- if ( zSig0 < $00200000 ) then
- goto roundAndPack;
- Inc(zExp);
- shiftRight1:
- shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
- roundAndPack:
- roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the absolute values of the double-
- precision floating-point values `a' and `b'. If `zSign' is 1, the
- difference is negated before being returned. `zSign' is ignored if the
- result is a NaN. The subtraction is performed according to the IEC/IEEE
- Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
- Var
- aExp, bExp, zExp: int16;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
- expDiff: int16;
- z: float64;
- label aExpBigger;
- label bExpBigger;
- label aBigger;
- label bBigger;
- label normalizeRoundAndPack;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- expDiff := aExp - bExp;
- shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
- shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
- if ( 0 < expDiff ) then goto aExpBigger;
- if ( expDiff < 0 ) then goto bExpBigger;
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- out := z;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- aExp := 1;
- bExp := 1;
- End;
- if ( bSig0 < aSig0 ) then goto aBigger;
- if ( aSig0 < bSig0 ) then goto bBigger;
- if ( bSig1 < aSig1 ) then goto aBigger;
- if ( aSig1 < bSig1 ) then goto bBigger;
- packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
- exit;
- bExpBigger:
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- packFloat64( zSign xor 1, $7FF, 0, 0, out );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- Inc(expDiff);
- End
- else
- Begin
- aSig0 := aSig0 or $40000000;
- End;
- shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
- bSig0 := bSig0 or $40000000;
- bBigger:
- sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
- zExp := bExp;
- zSign := zSign xor 1;
- goto normalizeRoundAndPack;
- aExpBigger:
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- out := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- Dec(expDiff);
- End
- else
- Begin
- bSig0 := bSig0 or $40000000;
- End;
- shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
- aSig0 := aSig0 or $40000000;
- aBigger:
- sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- zExp := aExp;
- normalizeRoundAndPack:
- Dec(zExp);
- normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the double-precision floating-point values `a'
- and `b'. The operation is performed according to the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_add( a: float64; b : float64) : Float64;
- {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
- Var
- aSign, bSign: flag;
- Begin
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign = bSign ) then
- Begin
- addFloat64Sigs( a, b, aSign, result );
- End
- else
- Begin
- subFloat64Sigs( a, b, aSign, result );
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the double-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_sub(a: float64; b : float64) : Float64;
- {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
- Var
- aSign, bSign: flag;
- Begin
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign = bSign ) then
- Begin
- subFloat64Sigs( a, b, aSign, result );
- End
- else
- Begin
- addFloat64Sigs( a, b, aSign, result );
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of multiplying the double-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_mul( a: float64; b:float64) : Float64;
- {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int16;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
- z: float64;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- bSign := extractFloat64Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $7FF ) then
- Begin
- if ( (( aSig0 OR aSig1 ) <>0)
- OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
- Begin
- propagateFloat64NaN( a, b, result );
- exit;
- End;
- if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
- packFloat64( zSign, $7FF, 0, 0, result );
- exit;
- End;
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 )<> 0 then
- Begin
- propagateFloat64NaN( a, b, result );
- exit;
- End;
- if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- result := z;
- exit;
- End;
- packFloat64( zSign, $7FF, 0, 0, result );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( ( aSig0 OR aSig1 ) = 0 ) then
- Begin
- packFloat64( zSign, 0, 0, 0, result );
- exit;
- End;
- normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- End;
- if ( bExp = 0 ) then
- Begin
- if ( ( bSig0 OR bSig1 ) = 0 ) then
- Begin
- packFloat64( zSign, 0, 0, 0, result );
- exit;
- End;
- normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- End;
- zExp := aExp + bExp - $400;
- aSig0 := aSig0 or $00100000;
- shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
- mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
- add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
- zSig2 := zSig2 or flag( zSig3 <> 0 );
- if ( $00200000 <= zSig0 ) then
- Begin
- shift64ExtraRightJamming(
- zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
- Inc(zExp);
- End;
- roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of dividing the double-precision floating-point value `a'
- by the corresponding value `b'. The operation is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_div(a: float64; b : float64) : Float64;
- {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int16;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
- rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
- z: float64;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- bSign := extractFloat64Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 )<> 0 then
- Begin
- propagateFloat64NaN( a, b, result );
- exit;
- end;
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 )<>0 then
- Begin
- propagateFloat64NaN( a, b, result );
- exit;
- End;
- goto invalid;
- End;
- packFloat64( zSign, $7FF, 0, 0, result );
- exit;
- End;
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 )<> 0 then
- Begin
- propagateFloat64NaN( a, b, result );
- exit;
- End;
- packFloat64( zSign, 0, 0, 0, result );
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- if ( ( bSig0 OR bSig1 ) = 0 ) then
- Begin
- if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- result := z;
- exit;
- End;
- float_raise( float_flag_divbyzero );
- packFloat64( zSign, $7FF, 0, 0, result );
- exit;
- End;
- normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- End;
- if ( aExp = 0 ) then
- Begin
- if ( ( aSig0 OR aSig1 ) = 0 ) then
- Begin
- packFloat64( zSign, 0, 0, 0, result );
- exit;
- End;
- normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- End;
- zExp := aExp - bExp + $3FD;
- shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
- shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
- if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
- Begin
- shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
- Inc(zExp);
- End;
- zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
- mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
- sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
- while ( sbits32 (rem0) < 0 ) do
- Begin
- Dec(zSig0);
- add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
- End;
- zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
- if ( ( zSig1 and $3FF ) <= 4 ) then
- Begin
- mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
- sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
- while ( sbits32 (rem1) < 0 ) do
- Begin
- Dec(zSig1);
- add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
- End;
- zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
- End;
- shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
- roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the remainder of the double-precision floating-point value `a'
- with respect to the corresponding value `b'. The operation is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_rem(a: float64; b : float64) : float64;
- {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
- Var
- aSign, zSign: flag;
- aExp, bExp, expDiff: int16;
- aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
- allZero, alternateASig0, alternateASig1, sigMean1: bits32;
- sigMean0: sbits32;
- z: float64;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- if ( aExp = $7FF ) then
- Begin
- if ((( aSig0 OR aSig1 )<>0)
- OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
- Begin
- propagateFloat64NaN( a, b, result );
- exit;
- End;
- goto invalid;
- End;
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, result );
- exit;
- End;
- result := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- if ( ( bSig0 OR bSig1 ) = 0 ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- result := z;
- exit;
- End;
- normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- End;
- if ( aExp = 0 ) then
- Begin
- if ( ( aSig0 OR aSig1 ) = 0 ) then
- Begin
- result := a;
- exit;
- End;
- normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- End;
- expDiff := aExp - bExp;
- if ( expDiff < -1 ) then
- Begin
- result := a;
- exit;
- End;
- shortShift64Left(
- aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
- shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
- q := le64( bSig0, bSig1, aSig0, aSig1 );
- if ( q )<>0 then
- sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
- expDiff := expDiff - 32;
- while ( 0 < expDiff ) do
- Begin
- q := estimateDiv64To32( aSig0, aSig1, bSig0 );
- if 4 < q then
- q:= q - 4
- else
- q := 0;
- mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
- shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
- shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
- sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
- expDiff := expDiff - 29;
- End;
- if ( -32 < expDiff ) then
- Begin
- q := estimateDiv64To32( aSig0, aSig1, bSig0 );
- if 4 < q then
- q := q - 4
- else
- q := 0;
- q := q shr (- expDiff);
- shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
- expDiff := expDiff + 24;
- if ( expDiff < 0 ) then
- Begin
- shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
- End
- else
- Begin
- shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
- End;
- mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
- sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
- End
- else
- Begin
- shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
- shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
- End;
- Repeat
- alternateASig0 := aSig0;
- alternateASig1 := aSig1;
- Inc(q);
- sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
- Until not ( 0 <= sbits32 (aSig0) );
- add64(
- aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
- if ( ( sigMean0 < 0 )
- OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
- Begin
- aSig0 := alternateASig0;
- aSig1 := alternateASig1;
- End;
- zSign := flag( sbits32 (aSig0) < 0 );
- if ( zSign <> 0 ) then
- sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
- normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the square root of the double-precision floating-point value `a'.
- The operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- function float64_sqrt( a: float64 ): float64;
- {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
- Var
- aSign: flag;
- aExp, zExp: int16;
- aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
- rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, a, result );
- exit;
- End;
- if ( aSign = 0) then
- Begin
- result := a;
- exit;
- End;
- goto invalid;
- End;
- if ( aSign <> 0 ) then
- Begin
- if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
- Begin
- result := a;
- exit;
- End;
- invalid:
- float_raise( float_flag_invalid );
- result.low := float64_default_nan_low;
- result.high := float64_default_nan_high;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( ( aSig0 OR aSig1 ) = 0 ) then
- Begin
- packFloat64( 0, 0, 0, 0, result );
- exit;
- End;
- normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- End;
- zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
- aSig0 := aSig0 or $00100000;
- shortShift64Left( aSig0, aSig1, 11, term0, term1 );
- zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
- if ( zSig0 = 0 ) then
- zSig0 := $7FFFFFFF;
- doubleZSig0 := zSig0 + zSig0;
- shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
- mul32To64( zSig0, zSig0, term0, term1 );
- sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
- while ( sbits32 (rem0) < 0 ) do
- Begin
- Dec(zSig0);
- doubleZSig0 := doubleZSig0 - 2;
- add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
- End;
- zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
- if ( ( zSig1 and $1FF ) <= 5 ) then
- Begin
- if ( zSig1 = 0 ) then
- zSig1 := 1;
- mul32To64( doubleZSig0, zSig1, term1, term2 );
- sub64( rem1, 0, term1, term2, rem1, rem2 );
- mul32To64( zSig1, zSig1, term2, term3 );
- sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
- while ( sbits32 (rem1) < 0 ) do
- Begin
- Dec(zSig1);
- shortShift64Left( 0, zSig1, 1, term2, term3 );
- term3 := term3 or 1;
- term2 := term2 or doubleZSig0;
- add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
- End;
- zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
- End;
- shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
- roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_eq(a: float64; b: float64): flag;
- {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
- float_raise( float_flag_invalid );
- float64_eq := 0;
- exit;
- End;
- float64_eq := flag(
- ( a.low = b.low )
- AND ( ( a.high = b.high )
- OR ( ( a.low = 0 )
- AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
- ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- or equal to the corresponding value `b', and 0 otherwise. The comparison
- is performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_le(a: float64;b: float64): flag;
- {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
- Var
- aSign, bSign: flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float64_le := 0;
- exit;
- End;
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float64_le := flag(
- (aSign <> 0)
- OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
- = 0 ));
- exit;
- End;
- if aSign <> 0 then
- float64_le := le64( b.high, b.low, a.high, a.low )
- else
- float64_le := le64( a.high, a.low, b.high, b.low );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_lt(a: float64;b: float64): flag;
- {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
- Var
- aSign, bSign: flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float64_lt := 0;
- exit;
- End;
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float64_lt := flag(
- (aSign <> 0)
- AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
- <> 0 ));
- exit;
- End;
- if aSign <> 0 then
- float64_lt := lt64( b.high, b.low, a.high, a.low )
- else
- float64_lt := lt64( a.high, a.low, b.high, b.low );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The invalid exception is
- raised if either operand is a NaN. Otherwise, the comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_eq_signaling( a: float64; b: float64): flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float64_eq_signaling := 0;
- exit;
- End;
- float64_eq_signaling := flag(
- ( a.low = b.low )
- AND ( ( a.high = b.high )
- OR ( ( a.low = 0 )
- AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
- ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than or
- equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
- cause an exception. Otherwise, the comparison is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_le_quiet(a: float64 ; b: float64 ): flag;
- Var
- aSign, bSign : flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
- float_raise( float_flag_invalid );
- float64_le_quiet := 0;
- exit;
- End;
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float64_le_quiet := flag
- ((aSign <> 0)
- OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
- = 0 ));
- exit;
- End;
- if aSign <> 0 then
- float64_le_quiet := le64( b.high, b.low, a.high, a.low )
- else
- float64_le_quiet := le64( a.high, a.low, b.high, b.low );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
- exception. Otherwise, the comparison is performed according to the IEC/IEEE
- Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_lt_quiet(a: float64; b: float64 ): Flag;
- Var
- aSign, bSign: flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
- float_raise( float_flag_invalid );
- float64_lt_quiet := 0;
- exit;
- End;
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float64_lt_quiet := flag(
- (aSign<>0)
- AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
- <> 0 ));
- exit;
- End;
- If aSign <> 0 then
- float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
- else
- float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
- End;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the single-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function int64_to_float32( a: int64 ): float32rec; compilerproc;
- var
- zSign : flag;
- absA : uint64;
- shiftCount: int8;
- Begin
- if ( a = 0 ) then
- begin
- int64_to_float32.float32 := 0;
- exit;
- end;
- if a < 0 then
- zSign := flag(TRUE)
- else
- zSign := flag(FALSE);
- if zSign<>0 then
- absA := -a
- else
- absA := a;
- shiftCount := countLeadingZeros64( absA ) - 40;
- if ( 0 <= shiftCount ) then
- begin
- int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
- end
- else
- begin
- shiftCount := shiftCount + 7;
- if ( shiftCount < 0 ) then
- shift64RightJamming( absA, - shiftCount, absA )
- else
- absA := absA shl shiftCount;
- int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
- end;
- End;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the single-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- | Unisgned version.
- *----------------------------------------------------------------------------*}
- function qword_to_float32( a: qword ): float32rec; compilerproc;
- var
- absA : uint64;
- shiftCount: int8;
- Begin
- if ( a = 0 ) then
- begin
- qword_to_float32.float32 := 0;
- exit;
- end;
- absA := a;
- shiftCount := countLeadingZeros64( absA ) - 40;
- if ( 0 <= shiftCount ) then
- begin
- qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
- end
- else
- begin
- shiftCount := shiftCount + 7;
- if ( shiftCount < 0 ) then
- shift64RightJamming( absA, - shiftCount, absA )
- else
- absA := absA shl shiftCount;
- qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
- end;
- End;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the double-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function qword_to_float64( a: qword ): float64;
- {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
- var
- shiftCount: int8;
- Begin
- if ( a = 0 ) then
- result := packFloat64( 0, 0, 0 )
- else
- begin
- shiftCount := countLeadingZeros64(a) - 1;
- { numbers with <= 53 significant bits are converted exactly }
- if (shiftCount > 9) then
- result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
- else if (shiftCount>=0) then
- result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
- else
- begin
- { the only possible negative value is -1, in case bit 63 of 'a' is set }
- shift64RightJamming(a, 1, a);
- result := roundAndPackFloat64(0, $43d, a);
- end;
- end;
- End;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the double-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function int64_to_float64( a: int64 ): float64;
- {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
- Begin
- if ( a = 0 ) then
- result := packFloat64( 0, 0, 0 )
- else if (a = int64($8000000000000000)) then
- result := packFloat64( 1, $43e, 0 )
- else if (a < 0) then
- result := normalizeRoundAndPackFloat64( 1, $43c, -a )
- else
- result := normalizeRoundAndPackFloat64( 0, $43c, a );
- End;
- {$ifdef FPC_SOFTFLOAT_FLOATX80}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the extended double-precision floating-point format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function int64_to_floatx80( a: int64 ): floatx80;
- var
- zSign: flag;
- absA: uint64;
- shiftCount: int8;
- begin
- if ( a = 0 ) then begin
- result := packFloatx80( 0, 0, 0 );
- exit;
- end;
- zSign := ord( a < 0 );
- if zSign <> 0 then absA := - a else absA := a;
- shiftCount := countLeadingZeros64( absA );
- result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the extended double-precision floating-point format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- | Unsigned version.
- *----------------------------------------------------------------------------*}
- function qword_to_floatx80( a: qword ): floatx80;
- var
- absA: bits64;
- shiftCount: int8;
- begin
- if ( a = 0 ) then begin
- result := packFloatx80( 0, 0, 0 );
- exit;
- end;
- absA := a;
- shiftCount := countLeadingZeros64( absA );
- result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
- end;
- {$endif FPC_SOFTFLOAT_FLOATX80}
- {$ifdef FPC_SOFTFLOAT_FLOAT128}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a' to
- | the quadruple-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function int64_to_float128( a: int64 ): float128;
- var
- zSign: flag;
- absA: uint64;
- shiftCount: int8;
- zExp: int32;
- zSig0, zSig1: bits64;
- begin
- if ( a = 0 ) then begin
- result := packFloat128( 0, 0, 0, 0 );
- exit;
- end;
- zSign := ord( a < 0 );
- if zSign <> 0 then absA := - a else absA := a;
- shiftCount := countLeadingZeros64( absA ) + 49;
- zExp := $406E - shiftCount;
- if ( 64 <= shiftCount ) then begin
- zSig1 := 0;
- zSig0 := absA;
- dec( shiftCount, 64 );
- end
- else begin
- zSig1 := absA;
- zSig0 := 0;
- end;
- shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
- result := packFloat128( zSign, zExp, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a' to
- | the quadruple-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- | Unsigned version.
- *----------------------------------------------------------------------------*}
- function qword_to_float128( a: qword ): float128;
- var
- absA: bits64;
- shiftCount: int8;
- zExp: int32;
- zSig0, zSig1: bits64;
- begin
- if ( a = 0 ) then begin
- result := packFloat128( 0, 0, 0, 0 );
- exit;
- end;
- absA := a;
- shiftCount := countLeadingZeros64( absA ) + 49;
- zExp := $406E - shiftCount;
- if ( 64 <= shiftCount ) then begin
- zSig1 := 0;
- zSig0 := absA;
- dec( shiftCount, 64 );
- end
- else begin
- zSig1 := absA;
- zSig0 := 0;
- end;
- shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
- result := packFloat128( 0, zExp, zSig0, zSig1 );
- end;
- {$endif FPC_SOFTFLOAT_FLOAT128}
- {*----------------------------------------------------------------------------
- | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
- | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
- | Otherwise, returns 0.
- *----------------------------------------------------------------------------*}
- function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
- begin
- result := ord(( a0 = b0 ) and ( a1 = b1 ));
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
- | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
- | Otherwise, returns 0.
- *----------------------------------------------------------------------------*}
- function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
- begin
- result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
- end;
- {*----------------------------------------------------------------------------
- | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
- | by 64 _plus_ the number of bits given in `count'. The shifted result is
- | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
- | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
- | off form a third 64-bit result as follows: The _last_ bit shifted off is
- | the most-significant bit of the extra result, and the other 63 bits of the
- | extra result are all zero if and only if _all_but_the_last_ bits shifted off
- | were all zero. This extra result is stored in the location pointed to by
- | `z2Ptr'. The value of `count' can be arbitrarily large.
- | (This routine makes more sense if `a0', `a1', and `a2' are considered
- | to form a fixed-point value with binary point between `a1' and `a2'. This
- | fixed-point value is shifted right by the number of bits given in `count',
- | and the integer part of the result is returned at the locations pointed to
- | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
- | corrupted as described above, and is returned at the location pointed to by
- | `z2Ptr'.)
- *----------------------------------------------------------------------------*}
- procedure shift128ExtraRightJamming(
- a0: bits64;
- a1: bits64;
- a2: bits64;
- count: int16;
- var z0Ptr: bits64;
- var z1Ptr: bits64;
- var z2Ptr: bits64);
- var
- z0, z1, z2: bits64;
- negCount: int8;
- begin
- negCount := ( - count ) and 63;
- if ( count = 0 ) then
- begin
- z2 := a2;
- z1 := a1;
- z0 := a0;
- end
- else begin
- if ( count < 64 ) then
- begin
- z2 := a1 shl negCount;
- z1 := ( a0 shl negCount ) or ( a1 shr count );
- z0 := a0 shr count;
- end
- else begin
- if ( count = 64 ) then
- begin
- z2 := a1;
- z1 := a0;
- end
- else begin
- a2 := a2 or a1;
- if ( count < 128 ) then
- begin
- z2 := a0 shl negCount;
- z1 := a0 shr ( count and 63 );
- end
- else begin
- if ( count = 128 ) then
- z2 := a0
- else
- z2 := ord( a0 <> 0 );
- z1 := 0;
- end;
- end;
- z0 := 0;
- end;
- z2 := z2 or ord( a2 <> 0 );
- end;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {*----------------------------------------------------------------------------
- | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
- | _plus_ the number of bits given in `count'. The shifted result is at most
- | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
- | bits shifted off form a second 64-bit result as follows: The _last_ bit
- | shifted off is the most-significant bit of the extra result, and the other
- | 63 bits of the extra result are all zero if and only if _all_but_the_last_
- | bits shifted off were all zero. This extra result is stored in the location
- | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
- | (This routine makes more sense if `a0' and `a1' are considered to form
- | a fixed-point value with binary point between `a0' and `a1'. This fixed-
- | point value is shifted right by the number of bits given in `count', and
- | the integer part of the result is returned at the location pointed to by
- | `z0Ptr'. The fractional part of the result may be slightly corrupted as
- | described above, and is returned at the location pointed to by `z1Ptr'.)
- *----------------------------------------------------------------------------*}
- procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
- var
- z0, z1: bits64;
- negCount: int8;
- begin
- negCount := ( - count ) and 63;
- if ( count = 0 ) then
- begin
- z1 := a1;
- z0 := a0;
- end
- else if ( count < 64 ) then
- begin
- z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
- z0 := a0 shr count;
- end
- else begin
- if ( count = 64 ) then
- begin
- z1 := a0 or ord( a1 <> 0 );
- end
- else begin
- z1 := ord( ( a0 or a1 ) <> 0 );
- end;
- z0 := 0;
- end;
- z1Ptr := z1;
- z0Ptr := z0;
- end;
- {$ifdef FPC_SOFTFLOAT_FLOATX80}
- {*----------------------------------------------------------------------------
- | Returns the fraction bits of the extended double-precision floating-point
- | value `a'.
- *----------------------------------------------------------------------------*}
- function extractFloatx80Frac(a : floatx80): bits64;inline;
- begin
- result:=a.low;
- end;
- {*----------------------------------------------------------------------------
- | Returns the exponent bits of the extended double-precision floating-point
- | value `a'.
- *----------------------------------------------------------------------------*}
- function extractFloatx80Exp(a : floatx80): int32;inline;
- begin
- result:=a.high and $7FFF;
- end;
- {*----------------------------------------------------------------------------
- | Returns the sign bit of the extended double-precision floating-point value
- | `a'.
- *----------------------------------------------------------------------------*}
- function extractFloatx80Sign(a : floatx80): flag;inline;
- begin
- result:=a.high shr 15;
- end;
- {*----------------------------------------------------------------------------
- | Normalizes the subnormal extended double-precision floating-point value
- | represented by the denormalized significand `aSig'. The normalized exponent
- | and significand are stored at the locations pointed to by `zExpPtr' and
- | `zSigPtr', respectively.
- *----------------------------------------------------------------------------*}
- procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
- var
- shiftCount: int8;
- begin
- shiftCount := countLeadingZeros64( aSig );
- zSigPtr := aSig shl shiftCount;
- zExpPtr := 1 - shiftCount;
- end;
- {*----------------------------------------------------------------------------
- | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
- | extended double-precision floating-point value, returning the result.
- *----------------------------------------------------------------------------*}
- function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
- var
- z: floatx80;
- begin
- z.low := zSig;
- z.high := ( bits16(zSign) shl 15 ) + zExp;
- result:=z;
- end;
- {*----------------------------------------------------------------------------
- | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- | and extended significand formed by the concatenation of `zSig0' and `zSig1',
- | and returns the proper extended double-precision floating-point value
- | corresponding to the abstract input. Ordinarily, the abstract value is
- | rounded and packed into the extended double-precision format, with the
- | inexact exception raised if the abstract input cannot be represented
- | exactly. However, if the abstract value is too large, the overflow and
- | inexact exceptions are raised and an infinity or maximal finite value is
- | returned. If the abstract value is too small, the input value is rounded to
- | a subnormal number, and the underflow and inexact exceptions are raised if
- | the abstract input cannot be represented exactly as a subnormal extended
- | double-precision floating-point number.
- | If `roundingPrecision' is 32 or 64, the result is rounded to the same
- | number of bits as single or double precision, respectively. Otherwise, the
- | result is rounded to the full precision of the extended double-precision
- | format.
- | The input significand must be normalized or smaller. If the input
- | significand is not normalized, `zExp' must be 0; in that case, the result
- | returned is a subnormal number, and it must not require rounding. The
- | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
- var
- roundingMode: TFPURoundingMode;
- roundNearestEven, increment, isTiny: flag;
- roundIncrement, roundMask, roundBits: int64;
- label
- precision80, overflow;
- begin
- roundingMode := softfloat_rounding_mode;
- roundNearestEven := flag( roundingMode = float_round_nearest_even );
- if ( roundingPrecision = 80 ) then
- goto precision80;
- if ( roundingPrecision = 64 ) then
- begin
- roundIncrement := int64( $0000000000000400 );
- roundMask := int64( $00000000000007FF );
- end
- else if ( roundingPrecision = 32 ) then
- begin
- roundIncrement := int64( $0000008000000000 );
- roundMask := int64( $000000FFFFFFFFFF );
- end
- else begin
- goto precision80;
- end;
- zSig0 := zSig0 or ord( zSig1 <> 0 );
- if ( not (roundNearestEven<>0) ) then
- begin
- if ( roundingMode = float_round_to_zero ) then
- begin
- roundIncrement := 0;
- end
- else begin
- roundIncrement := roundMask;
- if ( zSign<>0 ) then
- begin
- if ( roundingMode = float_round_up ) then
- roundIncrement := 0;
- end
- else begin
- if ( roundingMode = float_round_down ) then
- roundIncrement := 0;
- end;
- end;
- end;
- roundBits := zSig0 and roundMask;
- if ( $7FFD <= bits32( zExp - 1 ) ) then begin
- if ( ( $7FFE < zExp )
- or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
- ) then begin
- goto overflow;
- end;
- if ( zExp <= 0 ) then begin
- isTiny := ord (
- ( softfloat_detect_tininess = float_tininess_before_rounding )
- or ( zExp < 0 )
- or ( zSig0 <= zSig0 + roundIncrement ) );
- shift64RightJamming( zSig0, 1 - zExp, zSig0 );
- zExp := 0;
- roundBits := zSig0 and roundMask;
- if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
- if ( roundBits <> 0 ) then set_inexact_flag;
- inc( zSig0, roundIncrement );
- if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
- roundIncrement := roundMask + 1;
- if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
- roundMask := roundMask or roundIncrement;
- end;
- zSig0 := zSig0 and not roundMask;
- result:=packFloatx80( zSign, zExp, zSig0 );
- exit;
- end;
- end;
- if ( roundBits <> 0 ) then set_inexact_flag;
- inc( zSig0, roundIncrement );
- if ( zSig0 < roundIncrement ) then begin
- inc(zExp);
- zSig0 := bits64( $8000000000000000 );
- end;
- roundIncrement := roundMask + 1;
- if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
- roundMask := roundMask or roundIncrement;
- end;
- zSig0 := zSig0 and not roundMask;
- if ( zSig0 = 0 ) then zExp := 0;
- result:=packFloatx80( zSign, zExp, zSig0 );
- exit;
- precision80:
- increment := ord ( sbits64( zSig1 ) < 0 );
- if ( roundNearestEven = 0 ) then begin
- if ( roundingMode = float_round_to_zero ) then begin
- increment := 0;
- end
- else begin
- if ( zSign <> 0 ) then begin
- increment := ord ( roundingMode = float_round_down ) and zSig1;
- end
- else begin
- increment := ord ( roundingMode = float_round_up ) and zSig1;
- end;
- end;
- end;
- if ( $7FFD <= bits32( zExp - 1 ) ) then begin
- if ( ( $7FFE < zExp )
- or ( ( zExp = $7FFE )
- and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
- and ( increment <> 0 )
- )
- ) then begin
- roundMask := 0;
- overflow:
- float_raise( [float_flag_overflow,float_flag_inexact] );
- if ( ( roundingMode = float_round_to_zero )
- or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
- or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
- ) then begin
- result:=packFloatx80( zSign, $7FFE, not roundMask );
- exit;
- end;
- result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( zExp <= 0 ) then begin
- isTiny := ord(
- ( softfloat_detect_tininess = float_tininess_before_rounding )
- or ( zExp < 0 )
- or ( increment = 0 )
- or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
- shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
- zExp := 0;
- if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
- if ( zSig1 <> 0 ) then set_inexact_flag;
- if ( roundNearestEven <> 0 ) then begin
- increment := ord( sbits64( zSig1 ) < 0 );
- end
- else begin
- if ( zSign <> 0 ) then begin
- increment := ord( roundingMode = float_round_down ) and zSig1;
- end
- else begin
- increment := ord( roundingMode = float_round_up ) and zSig1;
- end;
- end;
- if ( increment <> 0 ) then begin
- inc(zSig0);
- zSig0 :=
- not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
- if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
- end;
- result:=packFloatx80( zSign, zExp, zSig0 );
- exit;
- end;
- end;
- if ( zSig1 <> 0 ) then set_inexact_flag;
- if ( increment <> 0 ) then begin
- inc(zSig0);
- if ( zSig0 = 0 ) then begin
- inc(zExp);
- zSig0 := bits64( $8000000000000000 );
- end
- else begin
- zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
- end;
- end
- else begin
- if ( zSig0 = 0 ) then zExp := 0;
- end;
- result:=packFloatx80( zSign, zExp, zSig0 );
- end;
- {*----------------------------------------------------------------------------
- | Takes an abstract floating-point value having sign `zSign', exponent
- | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
- | and returns the proper extended double-precision floating-point value
- | corresponding to the abstract input. This routine is just like
- | `roundAndPackFloatx80' except that the input significand does not have to be
- | normalized.
- *----------------------------------------------------------------------------*}
- function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
- var
- shiftCount: int8;
- begin
- if ( zSig0 = 0 ) then begin
- zSig0 := zSig1;
- zSig1 := 0;
- dec( zExp, 64 );
- end;
- shiftCount := countLeadingZeros64( zSig0 );
- shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
- zExp := zExp - shiftCount;
- result :=
- roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the extended double-precision floating-
- | point value `a' to the 32-bit two's complement integer format. The
- | conversion is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic---which means in particular that the conversion
- | is rounded according to the current rounding mode. If `a' is a NaN, the
- | largest positive integer is returned. Otherwise, if the conversion
- | overflows, the largest integer with the same sign as `a' is returned.
- *----------------------------------------------------------------------------*}
- function floatx80_to_int32(a: floatx80): int32;
- var
- aSign: flag;
- aExp, shiftCount: int32;
- aSig: bits64;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
- shiftCount := $4037 - aExp;
- if ( shiftCount <= 0 ) then shiftCount := 1;
- shift64RightJamming( aSig, shiftCount, aSig );
- result := roundAndPackInt32( aSign, aSig );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the extended double-precision floating-
- | point value `a' to the 32-bit two's complement integer format. The
- | conversion is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic, except that the conversion is always rounded
- | toward zero. If `a' is a NaN, the largest positive integer is returned.
- | Otherwise, if the conversion overflows, the largest integer with the same
- | sign as `a' is returned.
- *----------------------------------------------------------------------------*}
- function floatx80_to_int32_round_to_zero(a: floatx80): int32;
- var
- aSign: flag;
- aExp, shiftCount: int32;
- aSig, savedASig: bits64;
- z: int32;
- label
- invalid;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- if ( $401E < aExp ) then begin
- if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
- goto invalid;
- end
- else if ( aExp < $3FFF ) then begin
- if ( aExp or aSig <> 0 ) then set_inexact_flag;
- result := 0;
- exit;
- end;
- shiftCount := $403E - aExp;
- savedASig := aSig;
- aSig := aSig shr shiftCount;
- z := aSig;
- if ( aSign <> 0 ) then z := - z;
- if ( ord( z < 0 ) xor aSign ) <> 0 then begin
- invalid:
- float_raise( float_flag_invalid );
- if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
- exit;
- end;
- if ( ( aSig shl shiftCount ) <> savedASig ) then begin
- set_inexact_flag;
- end;
- result := z;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the extended double-precision floating-
- | point value `a' to the 64-bit two's complement integer format. The
- | conversion is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic---which means in particular that the conversion
- | is rounded according to the current rounding mode. If `a' is a NaN,
- | the largest positive integer is returned. Otherwise, if the conversion
- | overflows, the largest integer with the same sign as `a' is returned.
- *----------------------------------------------------------------------------*}
- function floatx80_to_int64(a: floatx80): int64;
- var
- aSign: flag;
- aExp, shiftCount: int32;
- aSig, aSigExtra: bits64;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- shiftCount := $403E - aExp;
- if ( shiftCount <= 0 ) then begin
- if ( shiftCount <> 0 ) then begin
- float_raise( float_flag_invalid );
- if ( ( aSign = 0 )
- or ( ( aExp = $7FFF )
- and ( aSig <> bits64( $8000000000000000 ) ) )
- ) then begin
- result := $7FFFFFFFFFFFFFFF;
- exit;
- end;
- result := $8000000000000000;
- exit;
- end;
- aSigExtra := 0;
- end
- else begin
- shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
- end;
- result := roundAndPackInt64( aSign, aSig, aSigExtra );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the extended double-precision floating-
- | point value `a' to the 64-bit two's complement integer format. The
- | conversion is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic, except that the conversion is always rounded
- | toward zero. If `a' is a NaN, the largest positive integer is returned.
- | Otherwise, if the conversion overflows, the largest integer with the same
- | sign as `a' is returned.
- *----------------------------------------------------------------------------*}
- function floatx80_to_int64_round_to_zero(a: floatx80): int64;
- var
- aSign: flag;
- aExp, shiftCount: int32;
- aSig: bits64;
- z: int64;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- shiftCount := aExp - $403E;
- if ( 0 <= shiftCount ) then begin
- aSig := $7FFFFFFFFFFFFFFF;
- if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
- float_raise( float_flag_invalid );
- if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
- result := $7FFFFFFFFFFFFFFF;
- exit;
- end;
- end;
- result := $8000000000000000;
- exit;
- end
- else if ( aExp < $3FFF ) then begin
- if ( aExp or aSig <> 0 ) then set_inexact_flag;
- result := 0;
- exit;
- end;
- z := aSig shr ( - shiftCount );
- if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
- set_inexact_flag;
- end;
- if ( aSign <> 0 ) then z := - z;
- result := z;
- end;
- {*----------------------------------------------------------------------------
- | The pattern for a default generated extended double-precision NaN. The
- | `high' and `low' values hold the most- and least-significant bits,
- | respectively.
- *----------------------------------------------------------------------------*}
- const
- floatx80_default_nan_high = $FFFF;
- floatx80_default_nan_low = bits64( $C000000000000000 );
- {*----------------------------------------------------------------------------
- | Returns 1 if the extended double-precision floating-point value `a' is a
- | signaling NaN; otherwise returns 0.
- *----------------------------------------------------------------------------*}
- function floatx80_is_signaling_nan(a : floatx80): flag;
- var
- aLow: bits64;
- begin
- aLow := a.low and not $4000000000000000;
- result := ord(
- ( a.high and $7FFF = $7FFF )
- and ( bits64( aLow shl 1 ) <> 0 )
- and ( a.low = aLow ) );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the extended double-precision floating-
- | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
- | invalid exception is raised.
- *----------------------------------------------------------------------------*}
- function floatx80ToCommonNaN(a : floatx80): commonNaNT;
- var
- z: commonNaNT;
- begin
- if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
- z.sign := a.high shr 15;
- z.low := 0;
- z.high := a.low shl 1;
- result := z;
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the extended double-precision floating-point value `a' is a
- | NaN; otherwise returns 0.
- *----------------------------------------------------------------------------*}
- function floatx80_is_nan(a : floatx80 ): flag;
- begin
- result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
- end;
- {*----------------------------------------------------------------------------
- | Takes two extended double-precision floating-point values `a' and `b', one
- | of which is a NaN, and returns the appropriate NaN result. If either `a' or
- | `b' is a signaling NaN, the invalid exception is raised.
- *----------------------------------------------------------------------------*}
- function propagateFloatx80NaN(a, b: floatx80): floatx80;
- var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- label
- returnLargerSignificand;
- begin
- aIsNaN := floatx80_is_nan( a );
- aIsSignalingNaN := floatx80_is_signaling_nan( a );
- bIsNaN := floatx80_is_nan( b );
- bIsSignalingNaN := floatx80_is_signaling_nan( b );
- a.low := a.low or $C000000000000000;
- b.low := b.low or $C000000000000000;
- if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
- if aIsSignalingNaN <> 0 then begin
- if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
- if bIsNaN <> 0 then result := b else result := a;
- exit;
- end
- else if aIsNaN <>0 then begin
- if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
- result := a;
- exit;
- end;
- returnLargerSignificand:
- if ( a.low < b.low ) then begin
- result := b;
- exit;
- end;
- if ( b.low < a.low ) then begin
- result := a;
- exit;
- end;
- if a.high < b.high then result := a else result := b;
- exit;
- end
- else
- result := b;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the extended double-precision floating-
- | point value `a' to the single-precision floating-point format. The
- | conversion is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_to_float32(a: floatx80): float32;
- var
- aSign: flag;
- aExp: int32;
- aSig: bits64;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- if ( aExp = $7FFF ) then begin
- if bits64( aSig shl 1 ) <> 0 then begin
- result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
- exit;
- end;
- result := packFloat32( aSign, $FF, 0 );
- exit;
- end;
- shift64RightJamming( aSig, 33, aSig );
- if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
- result := roundAndPackFloat32( aSign, aExp, aSig );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the extended double-precision floating-
- | point value `a' to the double-precision floating-point format. The
- | conversion is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_to_float64(a: floatx80): float64;
- var
- aSign: flag;
- aExp: int32;
- aSig, zSig: bits64;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- if ( aExp = $7FFF ) then begin
- if bits64( aSig shl 1 ) <> 0 then begin
- result:=commonNaNToFloat64(floatx80ToCommonNaN(a));
- exit;
- end;
- result := packFloat64( aSign, $7FF, 0 );
- exit;
- end;
- shift64RightJamming( aSig, 1, zSig );
- if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
- result := roundAndPackFloat64( aSign, aExp, zSig );
- end;
- {$ifdef FPC_SOFTFLOAT_FLOAT128}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the extended double-precision floating-
- | point value `a' to the quadruple-precision floating-point format. The
- | conversion is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_to_float128(a: floatx80): float128;
- var
- aSign: flag;
- aExp: int16;
- aSig, zSig0, zSig1: bits64;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
- result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
- exit;
- end;
- shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
- result := packFloat128( aSign, aExp, zSig0, zSig1 );
- end;
- {$endif FPC_SOFTFLOAT_FLOAT128}
- {*----------------------------------------------------------------------------
- | Rounds the extended double-precision floating-point value `a' to an integer,
- | and Returns the result as an extended quadruple-precision floating-point
- | value. The operation is performed according to the IEC/IEEE Standard for
- | Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_round_to_int(a: floatx80): floatx80;
- var
- aSign: flag;
- aExp: int32;
- lastBitMask, roundBitsMask: bits64;
- roundingMode: TFPURoundingMode;
- z: floatx80;
- begin
- aExp := extractFloatx80Exp( a );
- if ( $403E <= aExp ) then begin
- if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, a );
- exit;
- end;
- result := a;
- exit;
- end;
- if ( aExp < $3FFF ) then begin
- if ( ( aExp = 0 )
- and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
- result := a;
- exit;
- end;
- set_inexact_flag;
- aSign := extractFloatx80Sign( a );
- case softfloat_rounding_mode of
- float_round_nearest_even:
- if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
- ) then begin
- result :=
- packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- float_round_down: begin
- if aSign <> 0 then
- result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
- else
- result := packFloatx80( 0, 0, 0 );
- exit;
- end;
- float_round_up: begin
- if aSign <> 0 then
- result := packFloatx80( 1, 0, 0 )
- else
- result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- end;
- result := packFloatx80( aSign, 0, 0 );
- exit;
- end;
- lastBitMask := 1;
- lastBitMask := lastBitMask shl ( $403E - aExp );
- roundBitsMask := lastBitMask - 1;
- z := a;
- roundingMode := softfloat_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then begin
- inc( z.low, lastBitMask shr 1 );
- if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
- end
- else if ( roundingMode <> float_round_to_zero ) then begin
- if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
- inc( z.low, roundBitsMask );
- end;
- end;
- z.low := z.low and not roundBitsMask;
- if ( z.low = 0 ) then begin
- inc(z.high);
- z.low := bits64( $8000000000000000 );
- end;
- if ( z.low <> a.low ) then set_inexact_flag;
- result := z;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of adding the absolute values of the extended double-
- | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
- | negated before being returned. `zSign' is ignored if the result is a NaN.
- | The addition is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
- var
- aExp, bExp, zExp: int32;
- aSig, bSig, zSig0, zSig1: bits64;
- expDiff: int32;
- label
- shiftRight1, roundAndPack;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- bSig := extractFloatx80Frac( b );
- bExp := extractFloatx80Exp( b );
- expDiff := aExp - bExp;
- if ( 0 < expDiff ) then begin
- if ( aExp = $7FFF ) then begin
- if ( bits64( aSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- result := a;
- exit;
- end;
- if ( bExp = 0 ) then dec(expDiff);
- shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
- zExp := aExp;
- end
- else if ( expDiff < 0 ) then begin
- if ( bExp = $7FFF ) then begin
- if ( bits64( bSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( aExp = 0 ) then inc(expDiff);
- shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
- zExp := bExp;
- end
- else begin
- if ( aExp = $7FFF ) then begin
- if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- result := a;
- exit;
- end;
- zSig1 := 0;
- zSig0 := aSig + bSig;
- if ( aExp = 0 ) then begin
- normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
- goto roundAndPack;
- end;
- zExp := aExp;
- goto shiftRight1;
- end;
- zSig0 := aSig + bSig;
- if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
- shiftRight1:
- shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
- zSig0 := zSig0 or $8000000000000000;
- inc(zExp);
- roundAndPack:
- result :=
- roundAndPackFloatx80(
- floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of subtracting the absolute values of the extended
- | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
- | difference is negated before being returned. `zSign' is ignored if the
- | result is a NaN. The subtraction is performed according to the IEC/IEEE
- | Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
- var
- aExp, bExp, zExp: int32;
- aSig, bSig, zSig0, zSig1: bits64;
- expDiff: int32;
- z: floatx80;
- label
- bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- bSig := extractFloatx80Frac( b );
- bExp := extractFloatx80Exp( b );
- expDiff := aExp - bExp;
- if ( 0 < expDiff ) then goto aExpBigger;
- if ( expDiff < 0 ) then goto bExpBigger;
- if ( aExp = $7FFF ) then begin
- if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- float_raise( float_flag_invalid );
- z.low := floatx80_default_nan_low;
- z.high := floatx80_default_nan_high;
- result := z;
- exit;
- end;
- if ( aExp = 0 ) then begin
- aExp := 1;
- bExp := 1;
- end;
- zSig1 := 0;
- if ( bSig < aSig ) then goto aBigger;
- if ( aSig < bSig ) then goto bBigger;
- result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
- exit;
- bExpBigger:
- if ( bExp = $7FFF ) then begin
- if ( bits64( bSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( aExp = 0 ) then inc(expDiff);
- shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
- bBigger:
- sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
- zExp := bExp;
- zSign := zSign xor 1;
- goto normalizeRoundAndPack;
- aExpBigger:
- if ( aExp = $7FFF ) then begin
- if ( bits64( aSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- result := a;
- exit;
- end;
- if ( bExp = 0 ) then dec(expDiff);
- shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
- aBigger:
- sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
- zExp := aExp;
- normalizeRoundAndPack:
- result :=
- normalizeRoundAndPackFloatx80(
- floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of adding the extended double-precision floating-point
- | values `a' and `b'. The operation is performed according to the IEC/IEEE
- | Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_add(a: floatx80; b: floatx80): floatx80;
- var
- aSign, bSign: flag;
- begin
- aSign := extractFloatx80Sign( a );
- bSign := extractFloatx80Sign( b );
- if ( aSign = bSign ) then begin
- result := addFloatx80Sigs( a, b, aSign );
- end
- else begin
- result := subFloatx80Sigs( a, b, aSign );
- end;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of subtracting the extended double-precision floating-
- | point values `a' and `b'. The operation is performed according to the
- | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
- var
- aSign, bSign: flag;
- begin
- aSign := extractFloatx80Sign( a );
- bSign := extractFloatx80Sign( b );
- if ( aSign = bSign ) then begin
- result := subFloatx80Sigs( a, b, aSign );
- end
- else begin
- result := addFloatx80Sigs( a, b, aSign );
- end;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of multiplying the extended double-precision floating-
- | point values `a' and `b'. The operation is performed according to the
- | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_mul(a: floatx80; b: floatx80): floatx80;
- var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int32;
- aSig, bSig, zSig0, zSig1: bits64;
- z: floatx80;
- label
- invalid;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- bSig := extractFloatx80Frac( b );
- bExp := extractFloatx80Exp( b );
- bSign := extractFloatx80Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $7FFF ) then begin
- if ( bits64( aSig shl 1 ) <> 0 )
- or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- if ( ( bExp or bSig ) = 0 ) then goto invalid;
- result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( bExp = $7FFF ) then begin
- if ( bits64( bSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- if ( ( aExp or aSig ) = 0 ) then begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := floatx80_default_nan_low;
- z.high := floatx80_default_nan_high;
- result := z;
- exit;
- end;
- result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( aSig = 0 ) then begin
- result := packFloatx80( zSign, 0, 0 );
- exit;
- end;
- normalizeFloatx80Subnormal( aSig, aExp, aSig );
- end;
- if ( bExp = 0 ) then begin
- if ( bSig = 0 ) then begin
- result := packFloatx80( zSign, 0, 0 );
- exit;
- end;
- normalizeFloatx80Subnormal( bSig, bExp, bSig );
- end;
- zExp := aExp + bExp - $3FFE;
- mul64To128( aSig, bSig, zSig0, zSig1 );
- if 0 < sbits64( zSig0 ) then begin
- shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
- dec(zExp);
- end;
- result :=
- roundAndPackFloatx80(
- floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of dividing the extended double-precision floating-point
- | value `a' by the corresponding value `b'. The operation is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
- var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int32;
- aSig, bSig, zSig0, zSig1: bits64;
- rem0, rem1, rem2, term0, term1, term2: bits64;
- z: floatx80;
- label
- invalid;
- begin
- aSig := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- bSig := extractFloatx80Frac( b );
- bExp := extractFloatx80Exp( b );
- bSign := extractFloatx80Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $7FFF ) then begin
- if ( bits64( aSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- if ( bExp = $7FFF ) then begin
- if ( bits64( bSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- goto invalid;
- end;
- result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( bExp = $7FFF ) then begin
- if ( bits64( bSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- result := packFloatx80( zSign, 0, 0 );
- exit;
- end;
- if ( bExp = 0 ) then begin
- if ( bSig = 0 ) then begin
- if ( ( aExp or aSig ) = 0 ) then begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := floatx80_default_nan_low;
- z.high := floatx80_default_nan_high;
- result := z;
- exit;
- end;
- float_raise( float_flag_divbyzero );
- result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- normalizeFloatx80Subnormal( bSig, bExp, bSig );
- end;
- if ( aExp = 0 ) then begin
- if ( aSig = 0 ) then begin
- result := packFloatx80( zSign, 0, 0 );
- exit;
- end;
- normalizeFloatx80Subnormal( aSig, aExp, aSig );
- end;
- zExp := aExp - bExp + $3FFE;
- rem1 := 0;
- if ( bSig <= aSig ) then begin
- shift128Right( aSig, 0, 1, aSig, rem1 );
- inc(zExp);
- end;
- zSig0 := estimateDiv128To64( aSig, rem1, bSig );
- mul64To128( bSig, zSig0, term0, term1 );
- sub128( aSig, rem1, term0, term1, rem0, rem1 );
- while ( sbits64( rem0 ) < 0 ) do begin
- dec(zSig0);
- add128( rem0, rem1, 0, bSig, rem0, rem1 );
- end;
- zSig1 := estimateDiv128To64( rem1, 0, bSig );
- if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
- mul64To128( bSig, zSig1, term1, term2 );
- sub128( rem1, 0, term1, term2, rem1, rem2 );
- while ( sbits64( rem1 ) < 0 ) do begin
- dec(zSig1);
- add128( rem1, rem2, 0, bSig, rem1, rem2 );
- end;
- zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
- end;
- result :=
- roundAndPackFloatx80(
- floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the remainder of the extended double-precision floating-point value
- | `a' with respect to the corresponding value `b'. The operation is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
- var
- aSign, zSign: flag;
- aExp, bExp, expDiff: int32;
- aSig0, aSig1, bSig: bits64;
- q, term0, term1, alternateASig0, alternateASig1: bits64;
- z: floatx80;
- label
- invalid;
- begin
- aSig0 := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- bSig := extractFloatx80Frac( b );
- bExp := extractFloatx80Exp( b );
- if ( aExp = $7FFF ) then begin
- if ( bits64( aSig0 shl 1 ) <> 0 )
- or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- goto invalid;
- end;
- if ( bExp = $7FFF ) then begin
- if ( bits64( bSig shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, b );
- exit;
- end;
- result := a;
- exit;
- end;
- if ( bExp = 0 ) then begin
- if ( bSig = 0 ) then begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := floatx80_default_nan_low;
- z.high := floatx80_default_nan_high;
- result := z;
- exit;
- end;
- normalizeFloatx80Subnormal( bSig, bExp, bSig );
- end;
- if ( aExp = 0 ) then begin
- if ( bits64( aSig0 shl 1 ) = 0 ) then begin
- result := a;
- exit;
- end;
- normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
- end;
- bSig := bSig or $8000000000000000;
- zSign := aSign;
- expDiff := aExp - bExp;
- aSig1 := 0;
- if ( expDiff < 0 ) then begin
- if ( expDiff < -1 ) then begin
- result := a;
- exit;
- end;
- shift128Right( aSig0, 0, 1, aSig0, aSig1 );
- expDiff := 0;
- end;
- q := ord( bSig <= aSig0 );
- if ( q <> 0 ) then dec( aSig0, bSig );
- dec( expDiff, 64 );
- while ( 0 < expDiff ) do begin
- q := estimateDiv128To64( aSig0, aSig1, bSig );
- if ( 2 < q ) then q := q - 2 else q := 0;
- mul64To128( bSig, q, term0, term1 );
- sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
- shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
- dec( expDiff, 62 );
- end;
- inc( expDiff, 64 );
- if ( 0 < expDiff ) then begin
- q := estimateDiv128To64( aSig0, aSig1, bSig );
- if ( 2 < q ) then q:= q - 2 else q := 0;
- q := q shr ( 64 - expDiff );
- mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
- sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
- shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
- while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
- inc(q);
- sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
- end;
- end
- else begin
- term1 := 0;
- term0 := bSig;
- end;
- sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
- if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
- or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
- and ( q and 1 <> 0 ) )
- then begin
- aSig0 := alternateASig0;
- aSig1 := alternateASig1;
- zSign := ord( zSign = 0 );
- end;
- result :=
- normalizeRoundAndPackFloatx80(
- 80, zSign, bExp + expDiff, aSig0, aSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the square root of the extended double-precision floating-point
- | value `a'. The operation is performed according to the IEC/IEEE Standard
- | for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_sqrt(a: floatx80): floatx80;
- var
- aSign: flag;
- aExp, zExp: int32;
- aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
- rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
- z: floatx80;
- label
- invalid;
- begin
- aSig0 := extractFloatx80Frac( a );
- aExp := extractFloatx80Exp( a );
- aSign := extractFloatx80Sign( a );
- if ( aExp = $7FFF ) then begin
- if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
- result := propagateFloatx80NaN( a, a );
- exit;
- end;
- if ( aSign = 0 ) then begin
- result := a;
- exit;
- end;
- goto invalid;
- end;
- if ( aSign <> 0 ) then begin
- if ( ( aExp or aSig0 ) = 0 ) then begin
- result := a;
- exit;
- end;
- invalid:
- float_raise( float_flag_invalid );
- z.low := floatx80_default_nan_low;
- z.high := floatx80_default_nan_high;
- result := z;
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( aSig0 = 0 ) then begin
- result := packFloatx80( 0, 0, 0 );
- exit;
- end;
- normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
- end;
- zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
- zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
- shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
- zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
- doubleZSig0 := zSig0 shl 1;
- mul64To128( zSig0, zSig0, term0, term1 );
- sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
- while ( sbits64( rem0 ) < 0 ) do begin
- dec(zSig0);
- dec( doubleZSig0, 2 );
- add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
- end;
- zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
- if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
- if ( zSig1 = 0 ) then zSig1 := 1;
- mul64To128( doubleZSig0, zSig1, term1, term2 );
- sub128( rem1, 0, term1, term2, rem1, rem2 );
- mul64To128( zSig1, zSig1, term2, term3 );
- sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
- while ( sbits64( rem1 ) < 0 ) do begin
- dec(zSig1);
- shortShift128Left( 0, zSig1, 1, term2, term3 );
- term3 := term3 or 1;
- term2 := term2 or doubleZSig0;
- add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
- end;
- zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
- end;
- shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
- zSig0 := zSig0 or doubleZSig0;
- result :=
- roundAndPackFloatx80(
- floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the extended double-precision floating-point value `a' is
- | equal to the corresponding value `b', and 0 otherwise. The comparison is
- | performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_eq(a: floatx80; b: floatx80 ): flag;
- begin
- if ( ( extractFloatx80Exp( a ) = $7FFF )
- and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
- ) or ( ( extractFloatx80Exp( b ) = $7FFF )
- and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
- ) then begin
- if ( floatx80_is_signaling_nan( a )
- or floatx80_is_signaling_nan( b ) <> 0 ) then begin
- float_raise( float_flag_invalid );
- end;
- result := 0;
- exit;
- end;
- result := ord(
- ( a.low = b.low )
- and ( ( a.high = b.high )
- or ( ( a.low = 0 )
- and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
- ) );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the extended double-precision floating-point value `a' is
- | less than or equal to the corresponding value `b', and 0 otherwise. The
- | comparison is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_le(a: floatx80; b: floatx80 ): flag;
- var
- aSign, bSign: flag;
- begin
- if ( ( extractFloatx80Exp( a ) = $7FFF )
- and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
- or ( ( extractFloatx80Exp( b ) = $7FFF )
- and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
- then begin
- float_raise( float_flag_invalid );
- result := 0;
- exit;
- end;
- aSign := extractFloatx80Sign( a );
- bSign := extractFloatx80Sign( b );
- if ( aSign <> bSign ) then begin
- result := ord(
- ( aSign <> 0 )
- or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
- exit;
- end;
- if aSign<>0 then
- result := le128( b.high, b.low, a.high, a.low )
- else
- result := le128( a.high, a.low, b.high, b.low );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the extended double-precision floating-point value `a' is
- | less than the corresponding value `b', and 0 otherwise. The comparison
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_lt(a: floatx80; b: floatx80 ): flag;
- var
- aSign, bSign: flag;
- begin
- if ( ( extractFloatx80Exp( a ) = $7FFF )
- and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
- or ( ( extractFloatx80Exp( b ) = $7FFF )
- and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
- then begin
- float_raise( float_flag_invalid );
- result := 0;
- exit;
- end;
- aSign := extractFloatx80Sign( a );
- bSign := extractFloatx80Sign( b );
- if ( aSign <> bSign ) then begin
- result := ord(
- ( aSign <> 0 )
- and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
- exit;
- end;
- if aSign <> 0 then
- result := lt128( b.high, b.low, a.high, a.low )
- else
- result := lt128( a.high, a.low, b.high, b.low );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the extended double-precision floating-point value `a' is equal
- | to the corresponding value `b', and 0 otherwise. The invalid exception is
- | raised if either operand is a NaN. Otherwise, the comparison is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
- begin
- if ( ( extractFloatx80Exp( a ) = $7FFF )
- and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
- or ( ( extractFloatx80Exp( b ) = $7FFF )
- and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
- then begin
- float_raise( float_flag_invalid );
- result := 0;
- exit;
- end;
- result := ord(
- ( a.low = b.low )
- and ( ( a.high = b.high )
- or ( ( a.low = 0 )
- and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
- ) );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the extended double-precision floating-point value `a' is less
- | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
- | do not cause an exception. Otherwise, the comparison is performed according
- | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
- var
- aSign, bSign: flag;
- begin
- if ( ( extractFloatx80Exp( a ) = $7FFF )
- and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
- or ( ( extractFloatx80Exp( b ) = $7FFF )
- and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
- then begin
- if ( floatx80_is_signaling_nan( a )
- or floatx80_is_signaling_nan( b ) <> 0 ) then begin
- float_raise( float_flag_invalid );
- end;
- result := 0;
- exit;
- end;
- aSign := extractFloatx80Sign( a );
- bSign := extractFloatx80Sign( b );
- if ( aSign <> bSign ) then begin
- result := ord(
- ( aSign <> 0 )
- or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
- exit;
- end;
- if aSign <> 0 then
- result := le128( b.high, b.low, a.high, a.low )
- else
- result := le128( a.high, a.low, b.high, b.low );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the extended double-precision floating-point value `a' is less
- | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
- | an exception. Otherwise, the comparison is performed according to the
- | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
- var
- aSign, bSign: flag;
- begin
- if ( ( extractFloatx80Exp( a ) = $7FFF )
- and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
- or ( ( extractFloatx80Exp( b ) = $7FFF )
- and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
- then begin
- if ( floatx80_is_signaling_nan( a )
- or floatx80_is_signaling_nan( b ) <> 0 ) then begin
- float_raise( float_flag_invalid );
- end;
- result := 0;
- exit;
- end;
- aSign := extractFloatx80Sign( a );
- bSign := extractFloatx80Sign( b );
- if ( aSign <> bSign ) then begin
- result := ord(
- ( aSign <> 0 )
- and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
- exit;
- end;
- if aSign <> 0 then
- result := lt128( b.high, b.low, a.high, a.low )
- else
- result := lt128( a.high, a.low, b.high, b.low );
- end;
- {$endif FPC_SOFTFLOAT_FLOATX80}
- {$ifdef FPC_SOFTFLOAT_FLOAT128}
- {*----------------------------------------------------------------------------
- | Returns the least-significant 64 fraction bits of the quadruple-precision
- | floating-point value `a'.
- *----------------------------------------------------------------------------*}
- function extractFloat128Frac1(a : float128): bits64;
- begin
- result:=a.low;
- end;
- {*----------------------------------------------------------------------------
- | Returns the most-significant 48 fraction bits of the quadruple-precision
- | floating-point value `a'.
- *----------------------------------------------------------------------------*}
- function extractFloat128Frac0(a : float128): bits64;
- begin
- result:=a.high and int64($0000FFFFFFFFFFFF);
- end;
- {*----------------------------------------------------------------------------
- | Returns the exponent bits of the quadruple-precision floating-point value
- | `a'.
- *----------------------------------------------------------------------------*}
- function extractFloat128Exp(a : float128): int32;
- begin
- result:=( a.high shr 48 ) and $7FFF;
- end;
- {*----------------------------------------------------------------------------
- | Returns the sign bit of the quadruple-precision floating-point value `a'.
- *----------------------------------------------------------------------------*}
- function extractFloat128Sign(a : float128): flag;
- begin
- result:=a.high shr 63;
- end;
- {*----------------------------------------------------------------------------
- | Normalizes the subnormal quadruple-precision floating-point value
- | represented by the denormalized significand formed by the concatenation of
- | `aSig0' and `aSig1'. The normalized exponent is stored at the location
- | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
- | significand are stored at the location pointed to by `zSig0Ptr', and the
- | least significant 64 bits of the normalized significand are stored at the
- | location pointed to by `zSig1Ptr'.
- *----------------------------------------------------------------------------*}
- procedure normalizeFloat128Subnormal(
- aSig0: bits64;
- aSig1: bits64;
- var zExpPtr: int32;
- var zSig0Ptr: bits64;
- var zSig1Ptr: bits64);
- var
- shiftCount: int8;
- begin
- if ( aSig0 = 0 ) then
- begin
- shiftCount := countLeadingZeros64( aSig1 ) - 15;
- if ( shiftCount < 0 ) then
- begin
- zSig0Ptr := aSig1 shr ( - shiftCount );
- zSig1Ptr := aSig1 shl ( shiftCount and 63 );
- end
- else begin
- zSig0Ptr := aSig1 shl shiftCount;
- zSig1Ptr := 0;
- end;
- zExpPtr := - shiftCount - 63;
- end
- else begin
- shiftCount := countLeadingZeros64( aSig0 ) - 15;
- shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
- zExpPtr := 1 - shiftCount;
- end;
- end;
- {*----------------------------------------------------------------------------
- | Packs the sign `zSign', the exponent `zExp', and the significand formed
- | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
- | floating-point value, returning the result. After being shifted into the
- | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
- | added together to form the most significant 32 bits of the result. This
- | means that any integer portion of `zSig0' will be added into the exponent.
- | Since a properly normalized significand will have an integer portion equal
- | to 1, the `zExp' input should be 1 less than the desired result exponent
- | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
- | significand.
- *----------------------------------------------------------------------------*}
- function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
- var
- z: float128;
- begin
- z.low := zSig1;
- z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
- result:=z;
- end;
- {*----------------------------------------------------------------------------
- | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- | and extended significand formed by the concatenation of `zSig0', `zSig1',
- | and `zSig2', and returns the proper quadruple-precision floating-point value
- | corresponding to the abstract input. Ordinarily, the abstract value is
- | simply rounded and packed into the quadruple-precision format, with the
- | inexact exception raised if the abstract input cannot be represented
- | exactly. However, if the abstract value is too large, the overflow and
- | inexact exceptions are raised and an infinity or maximal finite value is
- | returned. If the abstract value is too small, the input value is rounded to
- | a subnormal number, and the underflow and inexact exceptions are raised if
- | the abstract input cannot be represented exactly as a subnormal quadruple-
- | precision floating-point number.
- | The input significand must be normalized or smaller. If the input
- | significand is not normalized, `zExp' must be 0; in that case, the result
- | returned is a subnormal number, and it must not require rounding. In the
- | usual case that the input significand is normalized, `zExp' must be 1 less
- | than the ``true'' floating-point exponent. The handling of underflow and
- | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
- var
- roundingMode: TFPURoundingMode;
- roundNearestEven, increment, isTiny: flag;
- begin
- roundingMode := softfloat_rounding_mode;
- roundNearestEven := ord( roundingMode = float_round_nearest_even );
- increment := ord( sbits64(zSig2) < 0 );
- if ( roundNearestEven=0 ) then
- begin
- if ( roundingMode = float_round_to_zero ) then
- begin
- increment := 0;
- end
- else begin
- if ( zSign<>0 ) then
- begin
- increment := ord( roundingMode = float_round_down ) and zSig2;
- end
- else begin
- increment := ord( roundingMode = float_round_up ) and zSig2;
- end;
- end;
- end;
- if ( $7FFD <= bits32(zExp) ) then
- begin
- if ( ord( $7FFD < zExp )
- or ( ord( zExp = $7FFD )
- and eq128(
- int64( $0001FFFFFFFFFFFF ),
- bits64( $FFFFFFFFFFFFFFFF ),
- zSig0,
- zSig1
- )
- and increment
- )
- )<>0 then
- begin
- float_raise( [float_flag_overflow,float_flag_inexact] );
- if ( ord( roundingMode = float_round_to_zero )
- or ( zSign and ord( roundingMode = float_round_up ) )
- or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
- )<>0 then
- begin
- result :=
- packFloat128(
- zSign,
- $7FFE,
- int64( $0000FFFFFFFFFFFF ),
- bits64( $FFFFFFFFFFFFFFFF )
- );
- exit;
- end;
- result:=packFloat128( zSign, $7FFF, 0, 0 );
- exit;
- end;
- if ( zExp < 0 ) then
- begin
- isTiny :=
- ord(( softfloat_detect_tininess = float_tininess_before_rounding )
- or ( zExp < -1 )
- or not( increment<>0 )
- or boolean(lt128(
- zSig0,
- zSig1,
- int64( $0001FFFFFFFFFFFF ),
- bits64( $FFFFFFFFFFFFFFFF )
- )));
- shift128ExtraRightJamming(
- zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
- zExp := 0;
- if ( isTiny and zSig2 )<>0 then
- float_raise( float_flag_underflow );
- if ( roundNearestEven<>0 ) then
- begin
- increment := ord( sbits64(zSig2) < 0 );
- end
- else begin
- if ( zSign<>0 ) then
- begin
- increment := ord( roundingMode = float_round_down ) and zSig2;
- end
- else begin
- increment := ord( roundingMode = float_round_up ) and zSig2;
- end;
- end;
- end;
- end;
- if ( zSig2<>0 ) then
- set_inexact_flag;
- if ( increment<>0 ) then
- begin
- add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
- zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
- end
- else begin
- if ( ( zSig0 or zSig1 ) = 0 ) then
- zExp := 0;
- end;
- result:=packFloat128( zSign, zExp, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- | and significand formed by the concatenation of `zSig0' and `zSig1', and
- | returns the proper quadruple-precision floating-point value corresponding
- | to the abstract input. This routine is just like `roundAndPackFloat128'
- | except that the input significand has fewer bits and does not have to be
- | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
- | point exponent.
- *----------------------------------------------------------------------------*}
- function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
- var
- shiftCount: int8;
- zSig2: bits64;
- begin
- if ( zSig0 = 0 ) then
- begin
- zSig0 := zSig1;
- zSig1 := 0;
- dec(zExp, 64);
- end;
- shiftCount := countLeadingZeros64( zSig0 ) - 15;
- if ( 0 <= shiftCount ) then
- begin
- zSig2 := 0;
- shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
- end
- else begin
- shift128ExtraRightJamming(
- zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
- end;
- dec(zExp, shiftCount);
- result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point
- | value `a' to the 32-bit two's complement integer format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic---which means in particular that the conversion is rounded
- | according to the current rounding mode. If `a' is a NaN, the largest
- | positive integer is returned. Otherwise, if the conversion overflows, the
- | largest integer with the same sign as `a' is returned.
- *----------------------------------------------------------------------------*}
- function float128_to_int32(a: float128): int32;
- var
- aSign: flag;
- aExp, shiftCount: int32;
- aSig0, aSig1: bits64;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
- aSign := 0;
- if ( aExp<>0 ) then
- aSig0 := aSig0 or int64( $0001000000000000 );
- aSig0 := aSig0 or ord( aSig1 <> 0 );
- shiftCount := $4028 - aExp;
- if ( 0 < shiftCount ) then
- shift64RightJamming( aSig0, shiftCount, aSig0 );
- result := roundAndPackInt32( aSign, aSig0 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point
- | value `a' to the 32-bit two's complement integer format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic, except that the conversion is always rounded toward zero. If
- | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
- | conversion overflows, the largest integer with the same sign as `a' is
- | returned.
- *----------------------------------------------------------------------------*}
- function float128_to_int32_round_to_zero(a: float128): int32;
- var
- aSign: flag;
- aExp, shiftCount: int32;
- aSig0, aSig1, savedASig: bits64;
- z: int32;
- label
- invalid;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- aSig0 := aSig0 or ord( aSig1 <> 0 );
- if ( $401E < aExp ) then
- begin
- if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
- aSign := 0;
- goto invalid;
- end
- else if ( aExp < $3FFF ) then
- begin
- if ( aExp or aSig0 )<>0 then
- set_inexact_flag;
- result := 0;
- exit;
- end;
- aSig0 := aSig0 or int64( $0001000000000000 );
- shiftCount := $402F - aExp;
- savedASig := aSig0;
- aSig0 := aSig0 shr shiftCount;
- z := aSig0;
- if ( aSign )<>0 then
- z := - z;
- if ( ord( z < 0 ) xor aSign )<>0 then
- begin
- invalid:
- float_raise( float_flag_invalid );
- if aSign<>0 then
- result:= int32( $80000000 )
- else
- result:=$7FFFFFFF;
- exit;
- end;
- if ( ( aSig0 shl shiftCount ) <> savedASig ) then
- begin
- set_inexact_flag;
- end;
- result := z;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point
- | value `a' to the 64-bit two's complement integer format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic---which means in particular that the conversion is rounded
- | according to the current rounding mode. If `a' is a NaN, the largest
- | positive integer is returned. Otherwise, if the conversion overflows, the
- | largest integer with the same sign as `a' is returned.
- *----------------------------------------------------------------------------*}
- function float128_to_int64(a: float128): int64;
- var
- aSign: flag;
- aExp, shiftCount: int32;
- aSig0, aSig1: bits64;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- if ( aExp<>0 ) then
- aSig0 := aSig0 or int64( $0001000000000000 );
- shiftCount := $402F - aExp;
- if ( shiftCount <= 0 ) then
- begin
- if ( $403E < aExp ) then
- begin
- float_raise( float_flag_invalid );
- if ( (aSign=0)
- or ( ( aExp = $7FFF )
- and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
- )
- ) then
- begin
- result := int64( $7FFFFFFFFFFFFFFF );
- exit;
- end;
- result := int64( $8000000000000000 );
- exit;
- end;
- shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
- end
- else begin
- shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
- end;
- result := roundAndPackInt64( aSign, aSig0, aSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point
- | value `a' to the 64-bit two's complement integer format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic, except that the conversion is always rounded toward zero.
- | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- | the conversion overflows, the largest integer with the same sign as `a' is
- | returned.
- *----------------------------------------------------------------------------*}
- function float128_to_int64_round_to_zero(a: float128): int64;
- var
- aSign: flag;
- aExp, shiftCount: int32;
- aSig0, aSig1: bits64;
- z: int64;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- if ( aExp<>0 ) then
- aSig0 := aSig0 or int64( $0001000000000000 );
- shiftCount := aExp - $402F;
- if ( 0 < shiftCount ) then
- begin
- if ( $403E <= aExp ) then
- begin
- aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
- if ( ( a.high = bits64( $C03E000000000000 ) )
- and ( aSig1 < int64( $0002000000000000 ) ) ) then
- begin
- if ( aSig1<>0 ) then
- set_inexact_flag;
- end
- else begin
- float_raise( float_flag_invalid );
- if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
- begin
- result := int64( $7FFFFFFFFFFFFFFF );
- exit;
- end;
- end;
- result := int64( $8000000000000000 );
- exit;
- end;
- z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
- if ( int64( aSig1 shl shiftCount )<>0 ) then
- begin
- set_inexact_flag;
- end;
- end
- else begin
- if ( aExp < $3FFF ) then
- begin
- if ( aExp or aSig0 or aSig1 )<>0 then
- begin
- set_inexact_flag;
- end;
- result := 0;
- exit;
- end;
- z := aSig0 shr ( - shiftCount );
- if ( (aSig1<>0)
- or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
- begin
- set_inexact_flag;
- end;
- end;
- if ( aSign<>0 ) then
- z := - z;
- result := z;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point
- | value `a' to the single-precision floating-point format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_to_float32(a: float128): float32;
- var
- aSign: flag;
- aExp: int32;
- aSig0, aSig1: bits64;
- zSig: bits32;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- if ( aExp = $7FFF ) then
- begin
- if ( aSig0 or aSig1 )<>0 then
- begin
- result := commonNaNToFloat32( float128ToCommonNaN( a ) );
- exit;
- end;
- result := packFloat32( aSign, $FF, 0 );
- exit;
- end;
- aSig0 := aSig0 or ord( aSig1 <> 0 );
- shift64RightJamming( aSig0, 18, aSig0 );
- zSig := aSig0;
- if ( aExp<>0 ) or (aSig0 <> 0 ) then
- begin
- zSig := zSig or $40000000;
- dec(aExp,$3F81);
- end;
- result := roundAndPackFloat32( aSign, aExp, zSig );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point
- | value `a' to the double-precision floating-point format. The conversion
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_to_float64(a: float128): float64;
- var
- aSign: flag;
- aExp: int32;
- aSig0, aSig1: bits64;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- if ( aExp = $7FFF ) then
- begin
- if ( aSig0 or aSig1 )<>0 then
- begin
- result:=commonNaNToFloat64(float128ToCommonNaN(a));
- exit;
- end;
- result:=packFloat64( aSign, $7FF, 0);
- exit;
- end;
- shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
- aSig0 := aSig0 or ord( aSig1 <> 0 );
- if ( aExp<>0 ) or (aSig0 <> 0 ) then
- begin
- aSig0 := aSig0 or int64( $4000000000000000 );
- dec(aExp,$3C01);
- end;
- result := roundAndPackFloat64( aSign, aExp, aSig0 );
- end;
- {$ifdef FPC_SOFTFLOAT_FLOATX80}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the quadruple-precision floating-point
- | value `a' to the extended double-precision floating-point format. The
- | conversion is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_to_floatx80(a: float128): floatx80;
- var
- aSign: flag;
- aExp: int32;
- aSig0, aSig1: bits64;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- if ( aExp = $7FFF ) then begin
- if ( aSig0 or aSig1 <> 0 ) then begin
- result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
- exit;
- end;
- result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( ( aSig0 or aSig1 ) = 0 ) then
- begin
- result := packFloatx80( aSign, 0, 0 );
- exit;
- end;
- normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- end
- else begin
- aSig0 := aSig0 or int64( $0001000000000000 );
- end;
- shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
- result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
- end;
- {$endif FPC_SOFTFLOAT_FLOATX80}
- {*----------------------------------------------------------------------------
- | Rounds the quadruple-precision floating-point value `a' to an integer, and
- | Returns the result as a quadruple-precision floating-point value. The
- | operation is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_round_to_int(a: float128): float128;
- var
- aSign: flag;
- aExp: int32;
- lastBitMask, roundBitsMask: bits64;
- roundingMode: TFPURoundingMode;
- z: float128;
- begin
- aExp := extractFloat128Exp( a );
- if ( $402F <= aExp ) then
- begin
- if ( $406F <= aExp ) then
- begin
- if ( ( aExp = $7FFF )
- and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
- ) then
- begin
- result := propagateFloat128NaN( a, a );
- exit;
- end;
- result := a;
- exit;
- end;
- lastBitMask := 1;
- lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
- roundBitsMask := lastBitMask - 1;
- z := a;
- roundingMode := softfloat_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- begin
- if ( lastBitMask )<>0 then
- begin
- add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
- if ( ( z.low and roundBitsMask ) = 0 ) then
- z.low := z.low and not(lastBitMask);
- end
- else begin
- if ( sbits64(z.low) < 0 ) then
- begin
- inc(z.high);
- if ( bits64( z.low shl 1 ) = 0 ) then
- z.high := z.high and not bits64( 1 );
- end;
- end;
- end
- else if ( roundingMode <> float_round_to_zero ) then
- begin
- if ( extractFloat128Sign( z )
- xor ord( roundingMode = float_round_up ) )<>0 then
- begin
- add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
- end;
- end;
- z.low := z.low and not(roundBitsMask);
- end
- else begin
- if ( aExp < $3FFF ) then
- begin
- if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
- begin
- result := a;
- exit;
- end;
- set_inexact_flag;
- aSign := extractFloat128Sign( a );
- case softfloat_rounding_mode of
- float_round_nearest_even:
- if ( ( aExp = $3FFE )
- and ( (extractFloat128Frac0( a )<>0)
- or (extractFloat128Frac1( a )<>0) )
- ) then begin
- begin
- result := packFloat128( aSign, $3FFF, 0, 0 );
- exit;
- end;
- end;
- float_round_down:
- begin
- if aSign<>0 then
- result:=packFloat128( 1, $3FFF, 0, 0 )
- else
- result:=packFloat128( 0, 0, 0, 0 );
- exit;
- end;
- float_round_up:
- begin
- if aSign<>0 then
- result := packFloat128( 1, 0, 0, 0 )
- else
- result:=packFloat128( 0, $3FFF, 0, 0 );
- exit;
- end;
- end;
- result := packFloat128( aSign, 0, 0, 0 );
- exit;
- end;
- lastBitMask := 1;
- lastBitMask := lastBitMask shl ($402F - aExp);
- roundBitsMask := lastBitMask - 1;
- z.low := 0;
- z.high := a.high;
- roundingMode := softfloat_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then begin
- inc(z.high,lastBitMask shr 1);
- if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
- z.high := z.high and not(lastBitMask);
- end;
- end
- else if ( roundingMode <> float_round_to_zero ) then begin
- if ( (extractFloat128Sign( z )<>0)
- xor ( roundingMode = float_round_up ) ) then begin
- z.high := z.high or ord( a.low <> 0 );
- z.high := z.high+roundBitsMask;
- end;
- end;
- z.high := z.high and not(roundBitsMask);
- end;
- if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
- set_inexact_flag;
- end;
- result := z;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of adding the absolute values of the quadruple-precision
- | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
- | before being returned. `zSign' is ignored if the result is a NaN.
- | The addition is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
- var
- aExp, bExp, zExp: int32;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
- expDiff: int32;
- label
- shiftRight1,roundAndPack;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- bSig1 := extractFloat128Frac1( b );
- bSig0 := extractFloat128Frac0( b );
- bExp := extractFloat128Exp( b );
- expDiff := aExp - bExp;
- if ( 0 < expDiff ) then begin
- if ( aExp = $7FFF ) then begin
- if ( aSig0 or aSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- result := a;
- exit;
- end;
- if ( bExp = 0 ) then begin
- dec(expDiff);
- end
- else begin
- bSig0 := bSig0 or int64( $0001000000000000 );
- end;
- shift128ExtraRightJamming(
- bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
- zExp := aExp;
- end
- else if ( expDiff < 0 ) then begin
- if ( bExp = $7FFF ) then begin
- if ( bSig0 or bSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- result := packFloat128( zSign, $7FFF, 0, 0 );
- exit;
- end;
- if ( aExp = 0 ) then begin
- inc(expDiff);
- end
- else begin
- aSig0 := aSig0 or int64( $0001000000000000 );
- end;
- shift128ExtraRightJamming(
- aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
- zExp := bExp;
- end
- else begin
- if ( aExp = $7FFF ) then begin
- if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- result := a;
- exit;
- end;
- add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- if ( aExp = 0 ) then
- begin
- result := packFloat128( zSign, 0, zSig0, zSig1 );
- exit;
- end;
- zSig2 := 0;
- zSig0 := zSig0 or int64( $0002000000000000 );
- zExp := aExp;
- goto shiftRight1;
- end;
- aSig0 := aSig0 or int64( $0001000000000000 );
- add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- dec(zExp);
- if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
- inc(zExp);
- shiftRight1:
- shift128ExtraRightJamming(
- zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
- roundAndPack:
- result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of subtracting the absolute values of the quadruple-
- | precision floating-point values `a' and `b'. If `zSign' is 1, the
- | difference is negated before being returned. `zSign' is ignored if the
- | result is a NaN. The subtraction is performed according to the IEC/IEEE
- | Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function subFloat128Sigs( a, b : float128; zSign : flag): float128;
- var
- aExp, bExp, zExp: int32;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
- expDiff: int32;
- z: float128;
- label
- aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- bSig1 := extractFloat128Frac1( b );
- bSig0 := extractFloat128Frac0( b );
- bExp := extractFloat128Exp( b );
- expDiff := aExp - bExp;
- shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
- shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
- if ( 0 < expDiff ) then goto aExpBigger;
- if ( expDiff < 0 ) then goto bExpBigger;
- if ( aExp = $7FFF ) then begin
- if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- float_raise( float_flag_invalid );
- z.low := float128_default_nan_low;
- z.high := float128_default_nan_high;
- result := z;
- exit;
- end;
- if ( aExp = 0 ) then begin
- aExp := 1;
- bExp := 1;
- end;
- if ( bSig0 < aSig0 ) then goto aBigger;
- if ( aSig0 < bSig0 ) then goto bBigger;
- if ( bSig1 < aSig1 ) then goto aBigger;
- if ( aSig1 < bSig1 ) then goto bBigger;
- result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
- exit;
- bExpBigger:
- if ( bExp = $7FFF ) then begin
- if ( bSig0 or bSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
- exit;
- end;
- if ( aExp = 0 ) then begin
- inc(expDiff);
- end
- else begin
- aSig0 := aSig0 or int64( $4000000000000000 );
- end;
- shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
- bSig0 := bSig0 or int64( $4000000000000000 );
- bBigger:
- sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
- zExp := bExp;
- zSign := zSign xor 1;
- goto normalizeRoundAndPack;
- aExpBigger:
- if ( aExp = $7FFF ) then begin
- if ( aSig0 or aSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- result := a;
- exit;
- end;
- if ( bExp = 0 ) then begin
- dec(expDiff);
- end
- else begin
- bSig0 := bSig0 or int64( $4000000000000000 );
- end;
- shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
- aSig0 := aSig0 or int64( $4000000000000000 );
- aBigger:
- sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- zExp := aExp;
- normalizeRoundAndPack:
- dec(zExp);
- result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of adding the quadruple-precision floating-point values
- | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- | for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_add(a: float128; b: float128): float128;
- var
- aSign, bSign: flag;
- begin
- aSign := extractFloat128Sign( a );
- bSign := extractFloat128Sign( b );
- if ( aSign = bSign ) then begin
- result := addFloat128Sigs( a, b, aSign );
- end
- else begin
- result := subFloat128Sigs( a, b, aSign );
- end;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of subtracting the quadruple-precision floating-point
- | values `a' and `b'. The operation is performed according to the IEC/IEEE
- | Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_sub(a: float128; b: float128): float128;
- var
- aSign, bSign: flag;
- begin
- aSign := extractFloat128Sign( a );
- bSign := extractFloat128Sign( b );
- if ( aSign = bSign ) then begin
- result := subFloat128Sigs( a, b, aSign );
- end
- else begin
- result := addFloat128Sigs( a, b, aSign );
- end;
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of multiplying the quadruple-precision floating-point
- | values `a' and `b'. The operation is performed according to the IEC/IEEE
- | Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_mul(a: float128; b: float128): float128;
- var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int32;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
- z: float128;
- label
- invalid;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- bSig1 := extractFloat128Frac1( b );
- bSig0 := extractFloat128Frac0( b );
- bExp := extractFloat128Exp( b );
- bSign := extractFloat128Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $7FFF ) then begin
- if ( (( aSig0 or aSig1 )<>0)
- or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
- result := packFloat128( zSign, $7FFF, 0, 0 );
- exit;
- end;
- if ( bExp = $7FFF ) then begin
- if ( bSig0 or bSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float128_default_nan_low;
- z.high := float128_default_nan_high;
- result := z;
- exit;
- end;
- result := packFloat128( zSign, $7FFF, 0, 0 );
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( ( aSig0 or aSig1 ) = 0 ) then
- begin
- result := packFloat128( zSign, 0, 0, 0 );
- exit;
- end;
- normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- end;
- if ( bExp = 0 ) then begin
- if ( ( bSig0 or bSig1 ) = 0 ) then
- begin
- result := packFloat128( zSign, 0, 0, 0 );
- exit;
- end;
- normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- end;
- zExp := aExp + bExp - $4000;
- aSig0 := aSig0 or int64( $0001000000000000 );
- shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
- mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
- add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
- zSig2 := zSig2 or ord( zSig3 <> 0 );
- if ( int64( $0002000000000000 ) <= zSig0 ) then begin
- shift128ExtraRightJamming(
- zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
- inc(zExp);
- end;
- result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the result of dividing the quadruple-precision floating-point value
- | `a' by the corresponding value `b'. The operation is performed according to
- | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_div(a: float128; b: float128): float128;
- var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int32;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
- rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
- z: float128;
- label
- invalid;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- bSig1 := extractFloat128Frac1( b );
- bSig0 := extractFloat128Frac0( b );
- bExp := extractFloat128Exp( b );
- bSign := extractFloat128Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $7FFF ) then begin
- if ( aSig0 or aSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- if ( bExp = $7FFF ) then begin
- if ( bSig0 or bSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- goto invalid;
- end;
- result := packFloat128( zSign, $7FFF, 0, 0 );
- exit;
- end;
- if ( bExp = $7FFF ) then begin
- if ( bSig0 or bSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- result := packFloat128( zSign, 0, 0, 0 );
- exit;
- end;
- if ( bExp = 0 ) then begin
- if ( ( bSig0 or bSig1 ) = 0 ) then begin
- if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float128_default_nan_low;
- z.high := float128_default_nan_high;
- result := z;
- exit;
- end;
- float_raise( float_flag_divbyzero );
- result := packFloat128( zSign, $7FFF, 0, 0 );
- exit;
- end;
- normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- end;
- if ( aExp = 0 ) then begin
- if ( ( aSig0 or aSig1 ) = 0 ) then
- begin
- result := packFloat128( zSign, 0, 0, 0 );
- exit;
- end;
- normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- end;
- zExp := aExp - bExp + $3FFD;
- shortShift128Left(
- aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
- shortShift128Left(
- bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
- if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
- shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
- inc(zExp);
- end;
- zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
- mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
- sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
- while ( sbits64(rem0) < 0 ) do begin
- dec(zSig0);
- add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
- end;
- zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
- if ( ( zSig1 and $3FFF ) <= 4 ) then begin
- mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
- sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
- while ( sbits64(rem1) < 0 ) do begin
- dec(zSig1);
- add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
- end;
- zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
- end;
- shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
- result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the remainder of the quadruple-precision floating-point value `a'
- | with respect to the corresponding value `b'. The operation is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_rem(a: float128; b: float128): float128;
- var
- aSign, zSign: flag;
- aExp, bExp, expDiff: int32;
- aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
- allZero, alternateASig0, alternateASig1, sigMean1: bits64;
- sigMean0: sbits64;
- z: float128;
- label
- invalid;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- bSig1 := extractFloat128Frac1( b );
- bSig0 := extractFloat128Frac0( b );
- bExp := extractFloat128Exp( b );
- if ( aExp = $7FFF ) then begin
- if ( (( aSig0 or aSig1 )<>0)
- or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- goto invalid;
- end;
- if ( bExp = $7FFF ) then begin
- if ( bSig0 or bSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, b );
- exit;
- end;
- result := a;
- exit;
- end;
- if ( bExp = 0 ) then begin
- if ( ( bSig0 or bSig1 ) = 0 ) then begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float128_default_nan_low;
- z.high := float128_default_nan_high;
- result := z;
- exit;
- end;
- normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- end;
- if ( aExp = 0 ) then begin
- if ( ( aSig0 or aSig1 ) = 0 ) then
- begin
- result := a;
- exit;
- end;
- normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- end;
- expDiff := aExp - bExp;
- if ( expDiff < -1 ) then
- begin
- result := a;
- exit;
- end;
- shortShift128Left(
- aSig0 or int64( $0001000000000000 ),
- aSig1,
- 15 - ord( expDiff < 0 ),
- aSig0,
- aSig1
- );
- shortShift128Left(
- bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
- q := le128( bSig0, bSig1, aSig0, aSig1 );
- if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
- dec(expDiff,64);
- while ( 0 < expDiff ) do begin
- q := estimateDiv128To64( aSig0, aSig1, bSig0 );
- if ( 4 < q ) then
- q := q - 4
- else
- q := 0;
- mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
- shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
- shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
- sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
- dec(expDiff,61);
- end;
- if ( -64 < expDiff ) then begin
- q := estimateDiv128To64( aSig0, aSig1, bSig0 );
- if ( 4 < q ) then
- q := q - 4
- else
- q := 0;
- q := q shr (- expDiff);
- shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
- inc(expDiff,52);
- if ( expDiff < 0 ) then begin
- shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
- end
- else begin
- shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
- end;
- mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
- sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
- end
- else begin
- shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
- shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
- end;
- repeat
- alternateASig0 := aSig0;
- alternateASig1 := aSig1;
- inc(q);
- sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
- until not( 0 <= sbits64(aSig0) );
- add128(
- aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
- if ( ( sigMean0 < 0 )
- or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
- aSig0 := alternateASig0;
- aSig1 := alternateASig1;
- end;
- zSign := ord( sbits64(aSig0) < 0 );
- if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
- result :=
- normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
- end;
- {*----------------------------------------------------------------------------
- | Returns the square root of the quadruple-precision floating-point value `a'.
- | The operation is performed according to the IEC/IEEE Standard for Binary
- | Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_sqrt(a: float128): float128;
- var
- aSign: flag;
- aExp, zExp: int32;
- aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
- rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
- z: float128;
- label
- invalid;
- begin
- aSig1 := extractFloat128Frac1( a );
- aSig0 := extractFloat128Frac0( a );
- aExp := extractFloat128Exp( a );
- aSign := extractFloat128Sign( a );
- if ( aExp = $7FFF ) then begin
- if ( aSig0 or aSig1 )<>0 then
- begin
- result := propagateFloat128NaN( a, a );
- exit;
- end;
- if ( aSign=0 ) then
- begin
- result := a;
- exit;
- end;
- goto invalid;
- end;
- if ( aSign<>0 ) then begin
- if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
- begin
- result := a;
- exit;
- end;
- invalid:
- float_raise( float_flag_invalid );
- z.low := float128_default_nan_low;
- z.high := float128_default_nan_high;
- result := z;
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( ( aSig0 or aSig1 ) = 0 ) then
- begin
- result := packFloat128( 0, 0, 0, 0 );
- exit;
- end;
- normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- end;
- zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
- aSig0 := aSig0 or int64( $0001000000000000 );
- zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
- shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
- zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
- doubleZSig0 := zSig0 shl 1;
- mul64To128( zSig0, zSig0, term0, term1 );
- sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
- while ( sbits64(rem0) < 0 ) do begin
- dec(zSig0);
- dec(doubleZSig0,2);
- add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
- end;
- zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
- if ( ( zSig1 and $1FFF ) <= 5 ) then begin
- if ( zSig1 = 0 ) then zSig1 := 1;
- mul64To128( doubleZSig0, zSig1, term1, term2 );
- sub128( rem1, 0, term1, term2, rem1, rem2 );
- mul64To128( zSig1, zSig1, term2, term3 );
- sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
- while ( sbits64(rem1) < 0 ) do begin
- dec(zSig1);
- shortShift128Left( 0, zSig1, 1, term2, term3 );
- term3 := term3 or 1;
- term2 := term2 or doubleZSig0;
- add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
- end;
- zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
- end;
- shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
- result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is equal to
- | the corresponding value `b', and 0 otherwise. The comparison is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_eq(a: float128; b: float128): flag;
- begin
- if ( ( ( extractFloat128Exp( a ) = $7FFF )
- and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
- or ( ( extractFloat128Exp( b ) = $7FFF )
- and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
- ) then begin
- if ( (float128_is_signaling_nan( a )<>0)
- or (float128_is_signaling_nan( b )<>0) ) then begin
- float_raise( float_flag_invalid );
- end;
- result := 0;
- exit;
- end;
- result := ord(
- ( a.low = b.low )
- and ( ( a.high = b.high )
- or ( ( a.low = 0 )
- and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
- ));
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is less than
- | or equal to the corresponding value `b', and 0 otherwise. The comparison
- | is performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_le(a: float128; b: float128): flag;
- var
- aSign, bSign: flag;
- begin
- if ( ( ( extractFloat128Exp( a ) = $7FFF )
- and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
- or ( ( extractFloat128Exp( b ) = $7FFF )
- and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
- ) then begin
- float_raise( float_flag_invalid );
- result := 0;
- exit;
- end;
- aSign := extractFloat128Sign( a );
- bSign := extractFloat128Sign( b );
- if ( aSign <> bSign ) then begin
- result := ord(
- (aSign<>0)
- or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
- = 0 ));
- exit;
- end;
- if aSign<>0 then
- result := le128( b.high, b.low, a.high, a.low )
- else
- result := le128( a.high, a.low, b.high, b.low );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is less than
- | the corresponding value `b', and 0 otherwise. The comparison is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_lt(a: float128; b: float128): flag;
- var
- aSign, bSign: flag;
- begin
- if ( ( ( extractFloat128Exp( a ) = $7FFF )
- and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
- or ( ( extractFloat128Exp( b ) = $7FFF )
- and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
- ) then begin
- float_raise( float_flag_invalid );
- result := 0;
- exit;
- end;
- aSign := extractFloat128Sign( a );
- bSign := extractFloat128Sign( b );
- if ( aSign <> bSign ) then begin
- result := ord(
- (aSign<>0)
- and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
- <> 0 ));
- exit;
- end;
- if aSign<>0 then
- result := lt128( b.high, b.low, a.high, a.low )
- else
- result := lt128( a.high, a.low, b.high, b.low );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is equal to
- | the corresponding value `b', and 0 otherwise. The invalid exception is
- | raised if either operand is a NaN. Otherwise, the comparison is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_eq_signaling(a: float128; b: float128): flag;
- begin
- if ( ( ( extractFloat128Exp( a ) = $7FFF )
- and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
- or ( ( extractFloat128Exp( b ) = $7FFF )
- and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
- ) then begin
- float_raise( float_flag_invalid );
- result := 0;
- exit;
- end;
- result := ord(
- ( a.low = b.low )
- and ( ( a.high = b.high )
- or ( ( a.low = 0 )
- and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
- ));
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is less than
- | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
- | cause an exception. Otherwise, the comparison is performed according to the
- | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_le_quiet(a: float128; b: float128): flag;
- var
- aSign, bSign: flag;
- begin
- if ( ( ( extractFloat128Exp( a ) = $7FFF )
- and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
- or ( ( extractFloat128Exp( b ) = $7FFF )
- and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
- ) then begin
- if ( (float128_is_signaling_nan( a )<>0)
- or (float128_is_signaling_nan( b )<>0) ) then begin
- float_raise( float_flag_invalid );
- end;
- result := 0;
- exit;
- end;
- aSign := extractFloat128Sign( a );
- bSign := extractFloat128Sign( b );
- if ( aSign <> bSign ) then begin
- result := ord(
- (aSign<>0)
- or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
- = 0 ));
- exit;
- end;
- if aSign<>0 then
- result := le128( b.high, b.low, a.high, a.low )
- else
- result := le128( a.high, a.low, b.high, b.low );
- end;
- {*----------------------------------------------------------------------------
- | Returns 1 if the quadruple-precision floating-point value `a' is less than
- | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
- | exception. Otherwise, the comparison is performed according to the IEC/IEEE
- | Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function float128_lt_quiet(a: float128; b: float128): flag;
- var
- aSign, bSign: flag;
- begin
- if ( ( ( extractFloat128Exp( a ) = $7FFF )
- and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
- or ( ( extractFloat128Exp( b ) = $7FFF )
- and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
- ) then begin
- if ( (float128_is_signaling_nan( a )<>0)
- or (float128_is_signaling_nan( b )<>0) ) then begin
- float_raise( float_flag_invalid );
- end;
- result := 0;
- exit;
- end;
- aSign := extractFloat128Sign( a );
- bSign := extractFloat128Sign( b );
- if ( aSign <> bSign ) then begin
- result := ord(
- (aSign<>0)
- and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
- <> 0 ));
- exit;
- end;
- if aSign<>0 then
- result:=lt128( b.high, b.low, a.high, a.low )
- else
- result:=lt128( a.high, a.low, b.high, b.low );
- end;
- {----------------------------------------------------------------------------
- | Returns the result of converting the double-precision floating-point value
- | `a' to the quadruple-precision floating-point format. The conversion is
- | performed according to the IEC/IEEE Standard for Binary Floating-Point
- | Arithmetic.
- *----------------------------------------------------------------------------}
- function float64_to_float128( a : float64) : float128;
- var
- aSign : flag;
- aExp : int16;
- aSig, zSig0, zSig1 : bits64;
- begin
- aSig := extractFloat64Frac( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- if ( aExp = $7FF ) then begin
- if ( aSig<>0 ) then begin
- result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
- exit;
- end;
- result:=packFloat128( aSign, $7FFF, 0, 0 );
- exit;
- end;
- if ( aExp = 0 ) then begin
- if ( aSig = 0 ) then
- begin
- result:=packFloat128( aSign, 0, 0, 0 );
- exit;
- end;
- normalizeFloat64Subnormal( aSig, aExp, aSig );
- dec(aExp);
- end;
- shift128Right( aSig, 0, 4, zSig0, zSig1 );
- result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
- end;
- {$endif FPC_SOFTFLOAT_FLOAT128}
- {$endif not(defined(fpc_softfpu_interface))}
- {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
- end.
- {$ifdef FPC}
- { restore context modified at implmentation start
- to possibly re-enable range and overflow checking explicitly}
- {$pop}
- {$endif FPC}
- {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
|