softfpu.pp 321 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474847584768477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522852385248525852685278528852985308531853285338534853585368537853885398540854185428543854485458546854785488549855085518552855385548555855685578558855985608561856285638564856585668567856885698570857185728573857485758576857785788579858085818582858385848585858685878588858985908591859285938594859585968597859885998600860186028603860486058606860786088609861086118612861386148615861686178618861986208621862286238624862586268627862886298630863186328633863486358636863786388639864086418642864386448645864686478648864986508651865286538654865586568657865886598660866186628663866486658666866786688669867086718672867386748675867686778678867986808681868286838684868586868687868886898690869186928693869486958696869786988699870087018702870387048705870687078708870987108711871287138714871587168717871887198720872187228723872487258726872787288729873087318732873387348735873687378738873987408741874287438744874587468747874887498750875187528753875487558756875787588759876087618762876387648765876687678768876987708771877287738774877587768777877887798780878187828783878487858786878787888789879087918792879387948795879687978798879988008801880288038804880588068807880888098810881188128813881488158816881788188819882088218822882388248825882688278828882988308831883288338834883588368837883888398840884188428843884488458846884788488849885088518852885388548855885688578858885988608861886288638864886588668867886888698870887188728873887488758876887788788879888088818882888388848885888688878888888988908891889288938894889588968897889888998900890189028903890489058906890789088909891089118912891389148915891689178918891989208921892289238924892589268927892889298930893189328933893489358936893789388939894089418942894389448945894689478948894989508951895289538954895589568957895889598960896189628963896489658966896789688969897089718972897389748975897689778978897989808981898289838984898589868987898889898990899189928993899489958996899789988999900090019002900390049005900690079008900990109011901290139014901590169017901890199020902190229023902490259026902790289029903090319032903390349035903690379038903990409041904290439044904590469047904890499050905190529053905490559056905790589059906090619062906390649065906690679068906990709071907290739074907590769077907890799080908190829083908490859086908790889089909090919092909390949095909690979098909991009101910291039104910591069107910891099110911191129113911491159116911791189119912091219122912391249125912691279128912991309131913291339134913591369137913891399140914191429143914491459146914791489149915091519152915391549155915691579158915991609161916291639164916591669167916891699170917191729173917491759176917791789179918091819182918391849185918691879188918991909191919291939194919591969197919891999200920192029203920492059206920792089209921092119212921392149215921692179218921992209221922292239224922592269227922892299230923192329233923492359236923792389239924092419242924392449245924692479248924992509251925292539254925592569257925892599260926192629263926492659266926792689269
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. bits8 = byte;
  88. sbits8 = shortint;
  89. bits16 = word;
  90. sbits16 = smallint;
  91. sbits32 = longint;
  92. bits32 = longword;
  93. {$ifndef fpc}
  94. qword = int64;
  95. {$endif}
  96. { now part of the system unit
  97. uint64 = qword;
  98. }
  99. bits64 = qword;
  100. sbits64 = int64;
  101. {$ifdef ENDIAN_LITTLE}
  102. float64 = record
  103. case byte of
  104. 1: (low,high : bits32);
  105. // force the record to be aligned like a double
  106. // else *_to_double will fail for cpus like sparc
  107. // and avoid expensive unpacking/packing operations
  108. 2: (dummy : double);
  109. end;
  110. floatx80 = record
  111. case byte of
  112. 1: (low : qword;high : word);
  113. // force the record to be aligned like a double
  114. // else *_to_double will fail for cpus like sparc
  115. // and avoid expensive unpacking/packing operations
  116. 2: (dummy : extended);
  117. end;
  118. float128 = record
  119. case byte of
  120. 1: (low,high : qword);
  121. // force the record to be aligned like a double
  122. // else *_to_double will fail for cpus like sparc
  123. // and avoid expensive unpacking/packing operations
  124. 2: (dummy : qword);
  125. end;
  126. {$else}
  127. float64 = record
  128. case byte of
  129. 1: (high,low : bits32);
  130. // force the record to be aligned like a double
  131. // else *_to_double will fail for cpus like sparc
  132. 2: (dummy : double);
  133. end;
  134. floatx80 = record
  135. case byte of
  136. 1: (high : word;low : qword);
  137. // force the record to be aligned like a double
  138. // else *_to_double will fail for cpus like sparc
  139. // and avoid expensive unpacking/packing operations
  140. 2: (dummy : qword);
  141. end;
  142. float128 = record
  143. case byte of
  144. 1: (high : qword;low : qword);
  145. // force the record to be aligned like a double
  146. // else *_to_double will fail for cpus like sparc
  147. // and avoid expensive unpacking/packing operations
  148. 2: (dummy : qword);
  149. end;
  150. {$endif}
  151. {$define FPC_SYSTEM_HAS_float64}
  152. {*
  153. -------------------------------------------------------------------------------
  154. Returns 1 if the double-precision floating-point value `a' is less than
  155. the corresponding value `b', and 0 otherwise. The comparison is performed
  156. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  157. -------------------------------------------------------------------------------
  158. *}
  159. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  160. {*
  161. -------------------------------------------------------------------------------
  162. Returns 1 if the double-precision floating-point value `a' is less than
  163. or equal to the corresponding value `b', and 0 otherwise. The comparison
  164. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  165. Arithmetic.
  166. -------------------------------------------------------------------------------
  167. *}
  168. Function float64_le(a: float64;b: float64): flag; compilerproc;
  169. {*
  170. -------------------------------------------------------------------------------
  171. Returns 1 if the double-precision floating-point value `a' is equal to
  172. the corresponding value `b', and 0 otherwise. The comparison is performed
  173. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  174. -------------------------------------------------------------------------------
  175. *}
  176. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  177. {*
  178. -------------------------------------------------------------------------------
  179. Returns the square root of the double-precision floating-point value `a'.
  180. The operation is performed according to the IEC/IEEE Standard for Binary
  181. Floating-Point Arithmetic.
  182. -------------------------------------------------------------------------------
  183. *}
  184. function float64_sqrt( a: float64 ): float64; compilerproc;
  185. {*
  186. -------------------------------------------------------------------------------
  187. Returns the remainder of the double-precision floating-point value `a'
  188. with respect to the corresponding value `b'. The operation is performed
  189. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  190. -------------------------------------------------------------------------------
  191. *}
  192. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  193. {*
  194. -------------------------------------------------------------------------------
  195. Returns the result of dividing the double-precision floating-point value `a'
  196. by the corresponding value `b'. The operation is performed according to the
  197. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  198. -------------------------------------------------------------------------------
  199. *}
  200. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  201. {*
  202. -------------------------------------------------------------------------------
  203. Returns the result of multiplying the double-precision floating-point values
  204. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  205. for Binary Floating-Point Arithmetic.
  206. -------------------------------------------------------------------------------
  207. *}
  208. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  209. {*
  210. -------------------------------------------------------------------------------
  211. Returns the result of subtracting the double-precision floating-point values
  212. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  213. for Binary Floating-Point Arithmetic.
  214. -------------------------------------------------------------------------------
  215. *}
  216. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  217. {*
  218. -------------------------------------------------------------------------------
  219. Returns the result of adding the double-precision floating-point values `a'
  220. and `b'. The operation is performed according to the IEC/IEEE Standard for
  221. Binary Floating-Point Arithmetic.
  222. -------------------------------------------------------------------------------
  223. *}
  224. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  225. {*
  226. -------------------------------------------------------------------------------
  227. Rounds the double-precision floating-point value `a' to an integer,
  228. and returns the result as a double-precision floating-point value. The
  229. operation is performed according to the IEC/IEEE Standard for Binary
  230. Floating-Point Arithmetic.
  231. -------------------------------------------------------------------------------
  232. *}
  233. Function float64_round_to_int(a: float64) : float64; compilerproc;
  234. {*
  235. -------------------------------------------------------------------------------
  236. Returns the result of converting the double-precision floating-point value
  237. `a' to the single-precision floating-point format. The conversion is
  238. performed according to the IEC/IEEE Standard for Binary Floating-Point
  239. Arithmetic.
  240. -------------------------------------------------------------------------------
  241. *}
  242. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  243. {*
  244. -------------------------------------------------------------------------------
  245. Returns the result of converting the double-precision floating-point value
  246. `a' to the 32-bit two's complement integer format. The conversion is
  247. performed according to the IEC/IEEE Standard for Binary Floating-Point
  248. Arithmetic, except that the conversion is always rounded toward zero.
  249. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  250. the conversion overflows, the largest integer with the same sign as `a' is
  251. returned.
  252. -------------------------------------------------------------------------------
  253. *}
  254. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  255. {*
  256. -------------------------------------------------------------------------------
  257. Returns the result of converting the double-precision floating-point value
  258. `a' to the 32-bit two's complement integer format. The conversion is
  259. performed according to the IEC/IEEE Standard for Binary Floating-Point
  260. Arithmetic---which means in particular that the conversion is rounded
  261. according to the current rounding mode. If `a' is a NaN, the largest
  262. positive integer is returned. Otherwise, if the conversion overflows, the
  263. largest integer with the same sign as `a' is returned.
  264. -------------------------------------------------------------------------------
  265. *}
  266. Function float64_to_int32(a: float64): int32; compilerproc;
  267. {*
  268. -------------------------------------------------------------------------------
  269. Returns 1 if the single-precision floating-point value `a' is less than
  270. the corresponding value `b', and 0 otherwise. The comparison is performed
  271. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  272. -------------------------------------------------------------------------------
  273. *}
  274. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  275. {*
  276. -------------------------------------------------------------------------------
  277. Returns 1 if the single-precision floating-point value `a' is less than
  278. or equal to the corresponding value `b', and 0 otherwise. The comparison
  279. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  280. Arithmetic.
  281. -------------------------------------------------------------------------------
  282. *}
  283. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  284. {*
  285. -------------------------------------------------------------------------------
  286. Returns 1 if the single-precision floating-point value `a' is equal to
  287. the corresponding value `b', and 0 otherwise. The comparison is performed
  288. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  289. -------------------------------------------------------------------------------
  290. *}
  291. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  292. {*
  293. -------------------------------------------------------------------------------
  294. Returns the square root of the single-precision floating-point value `a'.
  295. The operation is performed according to the IEC/IEEE Standard for Binary
  296. Floating-Point Arithmetic.
  297. -------------------------------------------------------------------------------
  298. *}
  299. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  300. {*
  301. -------------------------------------------------------------------------------
  302. Returns the remainder of the single-precision floating-point value `a'
  303. with respect to the corresponding value `b'. The operation is performed
  304. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  305. -------------------------------------------------------------------------------
  306. *}
  307. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  308. {*
  309. -------------------------------------------------------------------------------
  310. Returns the result of dividing the single-precision floating-point value `a'
  311. by the corresponding value `b'. The operation is performed according to the
  312. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  313. -------------------------------------------------------------------------------
  314. *}
  315. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  316. {*
  317. -------------------------------------------------------------------------------
  318. Returns the result of multiplying the single-precision floating-point values
  319. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  320. for Binary Floating-Point Arithmetic.
  321. -------------------------------------------------------------------------------
  322. *}
  323. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  324. {*
  325. -------------------------------------------------------------------------------
  326. Returns the result of subtracting the single-precision floating-point values
  327. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  328. for Binary Floating-Point Arithmetic.
  329. -------------------------------------------------------------------------------
  330. *}
  331. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  332. {*
  333. -------------------------------------------------------------------------------
  334. Returns the result of adding the single-precision floating-point values `a'
  335. and `b'. The operation is performed according to the IEC/IEEE Standard for
  336. Binary Floating-Point Arithmetic.
  337. -------------------------------------------------------------------------------
  338. *}
  339. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  340. {*
  341. -------------------------------------------------------------------------------
  342. Rounds the single-precision floating-point value `a' to an integer,
  343. and returns the result as a single-precision floating-point value. The
  344. operation is performed according to the IEC/IEEE Standard for Binary
  345. Floating-Point Arithmetic.
  346. -------------------------------------------------------------------------------
  347. *}
  348. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  349. {*
  350. -------------------------------------------------------------------------------
  351. Returns the result of converting the single-precision floating-point value
  352. `a' to the double-precision floating-point format. The conversion is
  353. performed according to the IEC/IEEE Standard for Binary Floating-Point
  354. Arithmetic.
  355. -------------------------------------------------------------------------------
  356. *}
  357. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  358. {*
  359. -------------------------------------------------------------------------------
  360. Returns the result of converting the single-precision floating-point value
  361. `a' to the 32-bit two's complement integer format. The conversion is
  362. performed according to the IEC/IEEE Standard for Binary Floating-Point
  363. Arithmetic, except that the conversion is always rounded toward zero.
  364. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  365. the conversion overflows, the largest integer with the same sign as `a' is
  366. returned.
  367. -------------------------------------------------------------------------------
  368. *}
  369. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  370. {*
  371. -------------------------------------------------------------------------------
  372. Returns the result of converting the single-precision floating-point value
  373. `a' to the 32-bit two's complement integer format. The conversion is
  374. performed according to the IEC/IEEE Standard for Binary Floating-Point
  375. Arithmetic---which means in particular that the conversion is rounded
  376. according to the current rounding mode. If `a' is a NaN, the largest
  377. positive integer is returned. Otherwise, if the conversion overflows, the
  378. largest integer with the same sign as `a' is returned.
  379. -------------------------------------------------------------------------------
  380. *}
  381. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  382. {*
  383. -------------------------------------------------------------------------------
  384. Returns the result of converting the 32-bit two's complement integer `a' to
  385. the double-precision floating-point format. The conversion is performed
  386. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. -------------------------------------------------------------------------------
  388. *}
  389. Function int32_to_float64( a: int32) : float64; compilerproc;
  390. {*
  391. -------------------------------------------------------------------------------
  392. Returns the result of converting the 32-bit two's complement integer `a' to
  393. the single-precision floating-point format. The conversion is performed
  394. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  395. -------------------------------------------------------------------------------
  396. *}
  397. Function int32_to_float32( a: int32): float32rec; compilerproc;
  398. {*----------------------------------------------------------------------------
  399. | Returns the result of converting the 64-bit two's complement integer `a'
  400. | to the double-precision floating-point format. The conversion is performed
  401. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  402. *----------------------------------------------------------------------------*}
  403. Function int64_to_float64( a: int64 ): float64; compilerproc;
  404. Function qword_to_float64( a: qword ): float64; compilerproc;
  405. {*----------------------------------------------------------------------------
  406. | Returns the result of converting the 64-bit two's complement integer `a'
  407. | to the single-precision floating-point format. The conversion is performed
  408. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  409. *----------------------------------------------------------------------------*}
  410. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  411. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  412. // +++
  413. function float32_to_int64( a: float32 ): int64;
  414. function float32_to_int64_round_to_zero( a: float32 ): int64;
  415. function float32_eq_signaling( a: float32; b: float32) : flag;
  416. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  417. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  418. function float32_is_signaling_nan( a : float32 ): flag;
  419. function float32_is_nan( a : float32 ): flag;
  420. function float64_to_int64( a: float64 ): int64;
  421. function float64_to_int64_round_to_zero( a: float64 ): int64;
  422. function float64_eq_signaling( a: float64; b: float64): flag;
  423. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  424. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  425. function float64_is_signaling_nan( a : float64 ): flag;
  426. function float64_is_nan( a : float64 ): flag;
  427. // ===
  428. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  429. {*----------------------------------------------------------------------------
  430. | Extended double-precision rounding precision
  431. *----------------------------------------------------------------------------*}
  432. var // threadvar!?
  433. floatx80_rounding_precision : int8 = 80;
  434. function int32_to_floatx80( a: int32 ): floatx80;
  435. function int64_to_floatx80( a: int64 ): floatx80;
  436. function qword_to_floatx80( a: qword ): floatx80;
  437. function float32_to_floatx80( a: float32 ): floatx80;
  438. function float64_to_floatx80( a: float64 ): floatx80;
  439. function floatx80_to_int32( a: floatx80 ): int32;
  440. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  441. function floatx80_to_int64( a: floatx80 ): int64;
  442. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  443. function floatx80_to_float32( a: floatx80 ): float32;
  444. function floatx80_to_float64( a: floatx80 ): float64;
  445. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  446. function floatx80_to_float128( a: floatx80 ): float128;
  447. {$endif FPC_SOFTFLOAT_FLOAT128}
  448. function floatx80_round_to_int( a: floatx80 ): floatx80;
  449. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  450. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  451. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  452. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  453. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  454. function floatx80_sqrt( a: floatx80 ): floatx80;
  455. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  456. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  457. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  458. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  459. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  460. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  461. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  462. function floatx80_is_nan(a : floatx80 ): flag;
  463. {$endif FPC_SOFTFLOAT_FLOATX80}
  464. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  465. function int32_to_float128( a: int32 ): float128;
  466. function int64_to_float128( a: int64 ): float128;
  467. function qword_to_float128( a: qword ): float128;
  468. function float32_to_float128( a: float32 ): float128;
  469. function float128_is_nan( a : float128): flag;
  470. function float128_is_signaling_nan( a : float128): flag;
  471. function float128_to_int32(a: float128): int32;
  472. function float128_to_int32_round_to_zero(a: float128): int32;
  473. function float128_to_int64(a: float128): int64;
  474. function float128_to_int64_round_to_zero(a: float128): int64;
  475. function float128_to_float32(a: float128): float32;
  476. function float128_to_float64(a: float128): float64;
  477. function float64_to_float128( a : float64) : float128;
  478. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  479. function float128_to_floatx80(a: float128): floatx80;
  480. {$endif FPC_SOFTFLOAT_FLOATX80}
  481. function float128_round_to_int(a: float128): float128;
  482. function float128_add(a: float128; b: float128): float128;
  483. function float128_sub(a: float128; b: float128): float128;
  484. function float128_mul(a: float128; b: float128): float128;
  485. function float128_div(a: float128; b: float128): float128;
  486. function float128_rem(a: float128; b: float128): float128;
  487. function float128_sqrt(a: float128): float128;
  488. function float128_eq(a: float128; b: float128): flag;
  489. function float128_le(a: float128; b: float128): flag;
  490. function float128_lt(a: float128; b: float128): flag;
  491. function float128_eq_signaling(a: float128; b: float128): flag;
  492. function float128_le_quiet(a: float128; b: float128): flag;
  493. function float128_lt_quiet(a: float128; b: float128): flag;
  494. {$endif FPC_SOFTFLOAT_FLOAT128}
  495. CONST
  496. {-------------------------------------------------------------------------------
  497. Software IEC/IEEE floating-point underflow tininess-detection mode.
  498. -------------------------------------------------------------------------------
  499. *}
  500. float_tininess_after_rounding = 0;
  501. float_tininess_before_rounding = 1;
  502. {*
  503. -------------------------------------------------------------------------------
  504. Underflow tininess-detection mode, statically initialized to default value.
  505. (The declaration in `softfloat.h' must match the `int8' type here.)
  506. -------------------------------------------------------------------------------
  507. *}
  508. var // threadvar!?
  509. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  510. {$endif not(defined(fpc_softfpu_implementation))}
  511. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  512. implementation
  513. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  514. {$if not(defined(fpc_softfpu_interface))}
  515. (*****************************************************************************)
  516. (*----------------------------------------------------------------------------*)
  517. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  518. (* division and square root approximations. (Can be specialized to target if *)
  519. (* desired.) *)
  520. (* ---------------------------------------------------------------------------*)
  521. (*****************************************************************************)
  522. { This procedure serves as a single access point to softfloat_exception_flags.
  523. It also helps to reduce code size a bit because softfloat_exception_flags is
  524. a threadvar. }
  525. procedure set_inexact_flag;
  526. begin
  527. include(softfloat_exception_flags,float_flag_inexact);
  528. end;
  529. {*----------------------------------------------------------------------------
  530. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  531. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  532. | input. If `zSign' is 1, the input is negated before being converted to an
  533. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  534. | is simply rounded to an integer, with the inexact exception raised if the
  535. | input cannot be represented exactly as an integer. However, if the fixed-
  536. | point input is too large, the invalid exception is raised and the largest
  537. | positive or negative integer is returned.
  538. *----------------------------------------------------------------------------*}
  539. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  540. var
  541. roundingMode: TFPURoundingMode;
  542. roundNearestEven: boolean;
  543. roundIncrement, roundBits: int8;
  544. z: int32;
  545. begin
  546. roundingMode := softfloat_rounding_mode;
  547. roundNearestEven := (roundingMode = float_round_nearest_even);
  548. roundIncrement := $40;
  549. if not roundNearestEven then
  550. begin
  551. if ( roundingMode = float_round_to_zero ) then
  552. begin
  553. roundIncrement := 0;
  554. end
  555. else begin
  556. roundIncrement := $7F;
  557. if ( zSign<>0 ) then
  558. begin
  559. if ( roundingMode = float_round_up ) then
  560. roundIncrement := 0;
  561. end
  562. else begin
  563. if ( roundingMode = float_round_down ) then
  564. roundIncrement := 0;
  565. end;
  566. end;
  567. end;
  568. roundBits := lo(absZ) and $7F;
  569. absZ := ( absZ + roundIncrement ) shr 7;
  570. absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
  571. z := absZ;
  572. if ( zSign<>0 ) then
  573. z := - z;
  574. if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  575. begin
  576. float_raise( float_flag_invalid );
  577. if zSign<>0 then
  578. result:=sbits32($80000000)
  579. else
  580. result:=$7FFFFFFF;
  581. exit;
  582. end;
  583. if ( roundBits<>0 ) then
  584. set_inexact_flag;
  585. result:=z;
  586. end;
  587. {*----------------------------------------------------------------------------
  588. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  589. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  590. | and returns the properly rounded 64-bit integer corresponding to the input.
  591. | If `zSign' is 1, the input is negated before being converted to an integer.
  592. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  593. | the inexact exception raised if the input cannot be represented exactly as
  594. | an integer. However, if the fixed-point input is too large, the invalid
  595. | exception is raised and the largest positive or negative integer is
  596. | returned.
  597. *----------------------------------------------------------------------------*}
  598. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  599. var
  600. roundingMode: TFPURoundingMode;
  601. roundNearestEven, increment: flag;
  602. z: int64;
  603. label
  604. overflow;
  605. begin
  606. roundingMode := softfloat_rounding_mode;
  607. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  608. increment := ord( sbits64(absZ1) < 0 );
  609. if ( roundNearestEven=0 ) then
  610. begin
  611. if ( roundingMode = float_round_to_zero ) then
  612. begin
  613. increment := 0;
  614. end
  615. else begin
  616. if ( zSign<>0 ) then
  617. begin
  618. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  619. end
  620. else begin
  621. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  622. end;
  623. end;
  624. end;
  625. if ( increment<>0 ) then
  626. begin
  627. inc(absZ0);
  628. if ( absZ0 = 0 ) then
  629. goto overflow;
  630. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  631. end;
  632. z := absZ0;
  633. if ( zSign<>0 ) then
  634. z := - z;
  635. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  636. begin
  637. overflow:
  638. float_raise( float_flag_invalid );
  639. if zSign<>0 then
  640. result:=int64($8000000000000000)
  641. else
  642. result:=int64($7FFFFFFFFFFFFFFF);
  643. exit;
  644. end;
  645. if ( absZ1<>0 ) then
  646. set_inexact_flag;
  647. result:=z;
  648. end;
  649. {*
  650. -------------------------------------------------------------------------------
  651. Shifts `a' right by the number of bits given in `count'. If any nonzero
  652. bits are shifted off, they are ``jammed'' into the least significant bit of
  653. the result by setting the least significant bit to 1. The value of `count'
  654. can be arbitrarily large; in particular, if `count' is greater than 32, the
  655. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  656. The result is stored in the location pointed to by `zPtr'.
  657. -------------------------------------------------------------------------------
  658. *}
  659. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  660. var
  661. z: Bits32;
  662. Begin
  663. if ( count = 0 ) then
  664. z := a
  665. else
  666. if ( count < 32 ) then
  667. Begin
  668. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  669. End
  670. else
  671. Begin
  672. z := bits32( a <> 0 );
  673. End;
  674. zPtr := z;
  675. End;
  676. {*----------------------------------------------------------------------------
  677. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  678. | number of bits given in `count'. Any bits shifted off are lost. The value
  679. | of `count' can be arbitrarily large; in particular, if `count' is greater
  680. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  681. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  682. *----------------------------------------------------------------------------*}
  683. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  684. var
  685. z0, z1: bits64;
  686. negCount: int8;
  687. begin
  688. negCount := ( - count ) and 63;
  689. if ( count = 0 ) then
  690. begin
  691. z1 := a1;
  692. z0 := a0;
  693. end
  694. else if ( count < 64 ) then
  695. begin
  696. z1 := ( a0 shl negCount ) or ( a1 shr count );
  697. z0 := a0 shr count;
  698. end
  699. else
  700. begin
  701. if ( count < 128 ) then
  702. z1 := a0 shr ( count and 63 )
  703. else
  704. z1 := 0;
  705. z0 := 0;
  706. end;
  707. z1Ptr := z1;
  708. z0Ptr := z0;
  709. end;
  710. {*----------------------------------------------------------------------------
  711. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  712. | number of bits given in `count'. If any nonzero bits are shifted off, they
  713. | are ``jammed'' into the least significant bit of the result by setting the
  714. | least significant bit to 1. The value of `count' can be arbitrarily large;
  715. | in particular, if `count' is greater than 128, the result will be either
  716. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  717. | nonzero. The result is broken into two 64-bit pieces which are stored at
  718. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  719. *----------------------------------------------------------------------------*}
  720. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  721. var
  722. z0,z1 : bits64;
  723. negCount : int8;
  724. begin
  725. negCount := ( - count ) and 63;
  726. if ( count = 0 ) then begin
  727. z1 := a1;
  728. z0 := a0;
  729. end
  730. else if ( count < 64 ) then begin
  731. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  732. z0 := a0 shr count;
  733. end
  734. else begin
  735. if ( count = 64 ) then begin
  736. z1 := a0 or ord( a1 <> 0 );
  737. end
  738. else if ( count < 128 ) then begin
  739. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  740. end
  741. else begin
  742. z1 := ord( ( a0 or a1 ) <> 0 );
  743. end;
  744. z0 := 0;
  745. end;
  746. z1Ptr := z1;
  747. z0Ptr := z0;
  748. end;
  749. {*
  750. -------------------------------------------------------------------------------
  751. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  752. number of bits given in `count'. Any bits shifted off are lost. The value
  753. of `count' can be arbitrarily large; in particular, if `count' is greater
  754. than 64, the result will be 0. The result is broken into two 32-bit pieces
  755. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  756. -------------------------------------------------------------------------------
  757. *}
  758. Procedure
  759. shift64Right(
  760. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  761. Var
  762. z0, z1: bits32;
  763. negCount : int8;
  764. Begin
  765. negCount := ( - count ) AND 31;
  766. if ( count = 0 ) then
  767. Begin
  768. z1 := a1;
  769. z0 := a0;
  770. End
  771. else if ( count < 32 ) then
  772. Begin
  773. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  774. z0 := a0 shr count;
  775. End
  776. else
  777. Begin
  778. if (count < 64) then
  779. z1 := ( a0 shr ( count AND 31 ) )
  780. else
  781. z1 := 0;
  782. z0 := 0;
  783. End;
  784. z1Ptr := z1;
  785. z0Ptr := z0;
  786. End;
  787. {*
  788. -------------------------------------------------------------------------------
  789. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  790. number of bits given in `count'. If any nonzero bits are shifted off, they
  791. are ``jammed'' into the least significant bit of the result by setting the
  792. least significant bit to 1. The value of `count' can be arbitrarily large;
  793. in particular, if `count' is greater than 64, the result will be either 0
  794. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  795. nonzero. The result is broken into two 32-bit pieces which are stored at
  796. the locations pointed to by `z0Ptr' and `z1Ptr'.
  797. -------------------------------------------------------------------------------
  798. *}
  799. Procedure
  800. shift64RightJamming(
  801. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  802. VAR
  803. z0, z1 : bits32;
  804. negCount : int8;
  805. Begin
  806. negCount := ( - count ) AND 31;
  807. if ( count = 0 ) then
  808. Begin
  809. z1 := a1;
  810. z0 := a0;
  811. End
  812. else
  813. if ( count < 32 ) then
  814. Begin
  815. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  816. z0 := a0 shr count;
  817. End
  818. else
  819. Begin
  820. if ( count = 32 ) then
  821. Begin
  822. z1 := a0 OR bits32( a1 <> 0 );
  823. End
  824. else
  825. if ( count < 64 ) Then
  826. Begin
  827. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  828. End
  829. else
  830. Begin
  831. z1 := bits32( ( a0 OR a1 ) <> 0 );
  832. End;
  833. z0 := 0;
  834. End;
  835. z1Ptr := z1;
  836. z0Ptr := z0;
  837. End;
  838. {*----------------------------------------------------------------------------
  839. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  840. | bits are shifted off, they are ``jammed'' into the least significant bit of
  841. | the result by setting the least significant bit to 1. The value of `count'
  842. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  843. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  844. | The result is stored in the location pointed to by `zPtr'.
  845. *----------------------------------------------------------------------------*}
  846. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  847. var
  848. z: bits64;
  849. begin
  850. if ( count = 0 ) then
  851. begin
  852. z := a;
  853. end
  854. else if ( count < 64 ) then
  855. begin
  856. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  857. end
  858. else
  859. begin
  860. z := ord( a <> 0 );
  861. end;
  862. zPtr := z;
  863. end;
  864. {$if not defined(shift64ExtraRightJamming)}
  865. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  866. overload;
  867. forward;
  868. {$endif}
  869. {*
  870. -------------------------------------------------------------------------------
  871. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  872. by 32 _plus_ the number of bits given in `count'. The shifted result is
  873. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  874. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  875. off form a third 32-bit result as follows: The _last_ bit shifted off is
  876. the most-significant bit of the extra result, and the other 31 bits of the
  877. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  878. were all zero. This extra result is stored in the location pointed to by
  879. `z2Ptr'. The value of `count' can be arbitrarily large.
  880. (This routine makes more sense if `a0', `a1', and `a2' are considered
  881. to form a fixed-point value with binary point between `a1' and `a2'. This
  882. fixed-point value is shifted right by the number of bits given in `count',
  883. and the integer part of the result is returned at the locations pointed to
  884. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  885. corrupted as described above, and is returned at the location pointed to by
  886. `z2Ptr'.)
  887. -------------------------------------------------------------------------------
  888. }
  889. Procedure
  890. shift64ExtraRightJamming(
  891. a0: bits32;
  892. a1: bits32;
  893. a2: bits32;
  894. count: int16;
  895. VAR z0Ptr: bits32;
  896. VAR z1Ptr: bits32;
  897. VAR z2Ptr: bits32
  898. ); overload;
  899. Var
  900. z0, z1, z2: bits32;
  901. negCount : int8;
  902. Begin
  903. negCount := ( - count ) AND 31;
  904. if ( count = 0 ) then
  905. Begin
  906. z2 := a2;
  907. z1 := a1;
  908. z0 := a0;
  909. End
  910. else
  911. Begin
  912. if ( count < 32 ) Then
  913. Begin
  914. z2 := a1 shl negCount;
  915. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  916. z0 := a0 shr count;
  917. End
  918. else
  919. Begin
  920. if ( count = 32 ) then
  921. Begin
  922. z2 := a1;
  923. z1 := a0;
  924. End
  925. else
  926. Begin
  927. a2 := a2 or a1;
  928. if ( count < 64 ) then
  929. Begin
  930. z2 := a0 shl negCount;
  931. z1 := a0 shr ( count AND 31 );
  932. End
  933. else
  934. Begin
  935. if count = 64 then
  936. z2 := a0
  937. else
  938. z2 := bits32(a0 <> 0);
  939. z1 := 0;
  940. End;
  941. End;
  942. z0 := 0;
  943. End;
  944. z2 := z2 or bits32( a2 <> 0 );
  945. End;
  946. z2Ptr := z2;
  947. z1Ptr := z1;
  948. z0Ptr := z0;
  949. End;
  950. {*
  951. -------------------------------------------------------------------------------
  952. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  953. number of bits given in `count'. Any bits shifted off are lost. The value
  954. of `count' must be less than 32. The result is broken into two 32-bit
  955. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  956. -------------------------------------------------------------------------------
  957. *}
  958. Procedure
  959. shortShift64Left(
  960. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  961. Begin
  962. z1Ptr := a1 shl count;
  963. if count = 0 then
  964. z0Ptr := a0
  965. else
  966. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  967. End;
  968. {*
  969. -------------------------------------------------------------------------------
  970. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  971. by the number of bits given in `count'. Any bits shifted off are lost.
  972. The value of `count' must be less than 32. The result is broken into three
  973. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  974. `z1Ptr', and `z2Ptr'.
  975. -------------------------------------------------------------------------------
  976. *}
  977. Procedure
  978. shortShift96Left(
  979. a0: bits32;
  980. a1: bits32;
  981. a2: bits32;
  982. count: int16;
  983. VAR z0Ptr: bits32;
  984. VAR z1Ptr: bits32;
  985. VAR z2Ptr: bits32
  986. );
  987. Var
  988. z0, z1, z2: bits32;
  989. negCount: int8;
  990. Begin
  991. z2 := a2 shl count;
  992. z1 := a1 shl count;
  993. z0 := a0 shl count;
  994. if ( 0 < count ) then
  995. Begin
  996. negCount := ( ( - count ) AND 31 );
  997. z1 := z1 or (a2 shr negCount);
  998. z0 := z0 or (a1 shr negCount);
  999. End;
  1000. z2Ptr := z2;
  1001. z1Ptr := z1;
  1002. z0Ptr := z0;
  1003. End;
  1004. {*----------------------------------------------------------------------------
  1005. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1006. | number of bits given in `count'. Any bits shifted off are lost. The value
  1007. | of `count' must be less than 64. The result is broken into two 64-bit
  1008. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1009. *----------------------------------------------------------------------------*}
  1010. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1011. begin
  1012. z1Ptr := a1 shl count;
  1013. if count=0 then
  1014. z0Ptr:=a0
  1015. else
  1016. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1017. end;
  1018. {*
  1019. -------------------------------------------------------------------------------
  1020. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1021. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1022. any carry out is lost. The result is broken into two 32-bit pieces which
  1023. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1024. -------------------------------------------------------------------------------
  1025. *}
  1026. Procedure
  1027. add64(
  1028. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1029. Var
  1030. z1: bits32;
  1031. Begin
  1032. z1 := a1 + b1;
  1033. z1Ptr := z1;
  1034. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1035. End;
  1036. {*
  1037. -------------------------------------------------------------------------------
  1038. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1039. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1040. modulo 2^96, so any carry out is lost. The result is broken into three
  1041. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1042. `z1Ptr', and `z2Ptr'.
  1043. -------------------------------------------------------------------------------
  1044. *}
  1045. Procedure
  1046. add96(
  1047. a0: bits32;
  1048. a1: bits32;
  1049. a2: bits32;
  1050. b0: bits32;
  1051. b1: bits32;
  1052. b2: bits32;
  1053. VAR z0Ptr: bits32;
  1054. VAR z1Ptr: bits32;
  1055. VAR z2Ptr: bits32
  1056. );
  1057. var
  1058. z0, z1, z2: bits32;
  1059. carry0, carry1: int8;
  1060. Begin
  1061. z2 := a2 + b2;
  1062. carry1 := int8( z2 < a2 );
  1063. z1 := a1 + b1;
  1064. carry0 := int8( z1 < a1 );
  1065. z0 := a0 + b0;
  1066. z1 := z1 + carry1;
  1067. z0 := z0 + bits32( z1 < carry1 );
  1068. z0 := z0 + carry0;
  1069. z2Ptr := z2;
  1070. z1Ptr := z1;
  1071. z0Ptr := z0;
  1072. End;
  1073. {*----------------------------------------------------------------------------
  1074. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1075. | by the number of bits given in `count'. Any bits shifted off are lost.
  1076. | The value of `count' must be less than 64. The result is broken into three
  1077. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1078. | `z1Ptr', and `z2Ptr'.
  1079. *----------------------------------------------------------------------------*}
  1080. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1081. var
  1082. z0, z1, z2 : bits64;
  1083. negCount : int8;
  1084. begin
  1085. z2 := a2 shl count;
  1086. z1 := a1 shl count;
  1087. z0 := a0 shl count;
  1088. if ( 0 < count ) then
  1089. begin
  1090. negCount := ( ( - count ) and 63 );
  1091. z1 := z1 or (a2 shr negCount);
  1092. z0 := z0 or (a1 shr negCount);
  1093. end;
  1094. z2Ptr := z2;
  1095. z1Ptr := z1;
  1096. z0Ptr := z0;
  1097. end;
  1098. {*----------------------------------------------------------------------------
  1099. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1100. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1101. | any carry out is lost. The result is broken into two 64-bit pieces which
  1102. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1103. *----------------------------------------------------------------------------*}
  1104. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1105. var
  1106. z1 : bits64;
  1107. begin
  1108. z1 := a1 + b1;
  1109. z1Ptr := z1;
  1110. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1111. end;
  1112. {*----------------------------------------------------------------------------
  1113. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1114. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1115. | modulo 2^192, so any carry out is lost. The result is broken into three
  1116. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1117. | `z1Ptr', and `z2Ptr'.
  1118. *----------------------------------------------------------------------------*}
  1119. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1120. var
  1121. z0, z1, z2 : bits64;
  1122. carry0, carry1 : int8;
  1123. begin
  1124. z2 := a2 + b2;
  1125. carry1 := ord( z2 < a2 );
  1126. z1 := a1 + b1;
  1127. carry0 := ord( z1 < a1 );
  1128. z0 := a0 + b0;
  1129. inc(z1, carry1);
  1130. inc(z0, ord( z1 < carry1 ));
  1131. inc(z0, carry0);
  1132. z2Ptr := z2;
  1133. z1Ptr := z1;
  1134. z0Ptr := z0;
  1135. end;
  1136. {*
  1137. -------------------------------------------------------------------------------
  1138. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1139. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1140. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1141. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1142. `z1Ptr'.
  1143. -------------------------------------------------------------------------------
  1144. *}
  1145. Procedure
  1146. sub64(
  1147. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1148. Begin
  1149. z1Ptr := a1 - b1;
  1150. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1151. End;
  1152. {*
  1153. -------------------------------------------------------------------------------
  1154. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1155. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1156. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1157. into three 32-bit pieces which are stored at the locations pointed to by
  1158. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1159. -------------------------------------------------------------------------------
  1160. *}
  1161. Procedure
  1162. sub96(
  1163. a0:bits32;
  1164. a1:bits32;
  1165. a2:bits32;
  1166. b0:bits32;
  1167. b1:bits32;
  1168. b2:bits32;
  1169. VAR z0Ptr:bits32;
  1170. VAR z1Ptr:bits32;
  1171. VAR z2Ptr:bits32
  1172. );
  1173. Var
  1174. z0, z1, z2: bits32;
  1175. borrow0, borrow1: int8;
  1176. Begin
  1177. z2 := a2 - b2;
  1178. borrow1 := int8( a2 < b2 );
  1179. z1 := a1 - b1;
  1180. borrow0 := int8( a1 < b1 );
  1181. z0 := a0 - b0;
  1182. z0 := z0 - bits32( z1 < borrow1 );
  1183. z1 := z1 - borrow1;
  1184. z0 := z0 -borrow0;
  1185. z2Ptr := z2;
  1186. z1Ptr := z1;
  1187. z0Ptr := z0;
  1188. End;
  1189. {*----------------------------------------------------------------------------
  1190. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1191. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1192. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1193. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1194. | `z1Ptr'.
  1195. *----------------------------------------------------------------------------*}
  1196. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1197. begin
  1198. z1Ptr := a1 - b1;
  1199. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1200. end;
  1201. {*----------------------------------------------------------------------------
  1202. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1203. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1204. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1205. | result is broken into three 64-bit pieces which are stored at the locations
  1206. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1207. *----------------------------------------------------------------------------*}
  1208. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1209. var
  1210. z0, z1, z2 : bits64;
  1211. borrow0, borrow1 : int8;
  1212. begin
  1213. z2 := a2 - b2;
  1214. borrow1 := ord( a2 < b2 );
  1215. z1 := a1 - b1;
  1216. borrow0 := ord( a1 < b1 );
  1217. z0 := a0 - b0;
  1218. dec(z0, ord( z1 < borrow1 ));
  1219. dec(z1, borrow1);
  1220. dec(z0, borrow0);
  1221. z2Ptr := z2;
  1222. z1Ptr := z1;
  1223. z0Ptr := z0;
  1224. end;
  1225. {*
  1226. -------------------------------------------------------------------------------
  1227. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1228. into two 32-bit pieces which are stored at the locations pointed to by
  1229. `z0Ptr' and `z1Ptr'.
  1230. -------------------------------------------------------------------------------
  1231. *}
  1232. {$IFDEF SOFTFPU_COMPILER_MUL32TO64}
  1233. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1234. var
  1235. tmp: qword;
  1236. begin
  1237. tmp:=qword(a) * b;
  1238. z0ptr:=hi(tmp);
  1239. z1ptr:=lo(tmp);
  1240. end;
  1241. {$ELSE}
  1242. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1243. :bits32 );
  1244. Var
  1245. aHigh, aLow, bHigh, bLow: bits16;
  1246. z0, zMiddleA, zMiddleB, z1: bits32;
  1247. Begin
  1248. aLow := bits16(a);
  1249. aHigh := a shr 16;
  1250. bLow := bits16(b);
  1251. bHigh := b shr 16;
  1252. z1 := ( bits32( aLow) ) * bLow;
  1253. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1254. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1255. z0 := ( bits32 (aHigh) ) * bHigh;
  1256. zMiddleA := zMiddleA + zMiddleB;
  1257. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1258. zMiddleA := zmiddleA shl 16;
  1259. z1 := z1 + zMiddleA;
  1260. z0 := z0 + bits32( z1 < zMiddleA );
  1261. z1Ptr := z1;
  1262. z0Ptr := z0;
  1263. End;
  1264. {$ENDIF}
  1265. {*
  1266. -------------------------------------------------------------------------------
  1267. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1268. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1269. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1270. `z2Ptr'.
  1271. -------------------------------------------------------------------------------
  1272. *}
  1273. Procedure
  1274. mul64By32To96(
  1275. a0:bits32;
  1276. a1:bits32;
  1277. b:bits32;
  1278. VAR z0Ptr:bits32;
  1279. VAR z1Ptr:bits32;
  1280. VAR z2Ptr:bits32
  1281. );
  1282. Var
  1283. z0, z1, z2, more1: bits32;
  1284. Begin
  1285. mul32To64( a1, b, z1, z2 );
  1286. mul32To64( a0, b, z0, more1 );
  1287. add64( z0, more1, 0, z1, z0, z1 );
  1288. z2Ptr := z2;
  1289. z1Ptr := z1;
  1290. z0Ptr := z0;
  1291. End;
  1292. {*
  1293. -------------------------------------------------------------------------------
  1294. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1295. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1296. product. The product is broken into four 32-bit pieces which are stored at
  1297. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1298. -------------------------------------------------------------------------------
  1299. *}
  1300. Procedure
  1301. mul64To128(
  1302. a0:bits32;
  1303. a1:bits32;
  1304. b0:bits32;
  1305. b1:bits32;
  1306. VAR z0Ptr:bits32;
  1307. VAR z1Ptr:bits32;
  1308. VAR z2Ptr:bits32;
  1309. VAR z3Ptr:bits32
  1310. );
  1311. Var
  1312. z0, z1, z2, z3: bits32;
  1313. more1, more2: bits32;
  1314. Begin
  1315. mul32To64( a1, b1, z2, z3 );
  1316. mul32To64( a1, b0, z1, more2 );
  1317. add64( z1, more2, 0, z2, z1, z2 );
  1318. mul32To64( a0, b0, z0, more1 );
  1319. add64( z0, more1, 0, z1, z0, z1 );
  1320. mul32To64( a0, b1, more1, more2 );
  1321. add64( more1, more2, 0, z2, more1, z2 );
  1322. add64( z0, z1, 0, more1, z0, z1 );
  1323. z3Ptr := z3;
  1324. z2Ptr := z2;
  1325. z1Ptr := z1;
  1326. z0Ptr := z0;
  1327. End;
  1328. {*----------------------------------------------------------------------------
  1329. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1330. | into two 64-bit pieces which are stored at the locations pointed to by
  1331. | `z0Ptr' and `z1Ptr'.
  1332. *----------------------------------------------------------------------------*}
  1333. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1334. var
  1335. aHigh, aLow, bHigh, bLow : bits32;
  1336. z0, zMiddleA, zMiddleB, z1 : bits64;
  1337. begin
  1338. aLow := a;
  1339. aHigh := a shr 32;
  1340. bLow := b;
  1341. bHigh := b shr 32;
  1342. z1 := ( bits64(aLow) ) * bLow;
  1343. zMiddleA := ( bits64( aLow )) * bHigh;
  1344. zMiddleB := ( bits64( aHigh )) * bLow;
  1345. z0 := ( bits64(aHigh) ) * bHigh;
  1346. inc(zMiddleA, zMiddleB);
  1347. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1348. zMiddleA := zMiddleA shl 32;
  1349. inc(z1, zMiddleA);
  1350. inc(z0, ord( z1 < zMiddleA ));
  1351. z1Ptr := z1;
  1352. z0Ptr := z0;
  1353. end;
  1354. {*----------------------------------------------------------------------------
  1355. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1356. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1357. | product. The product is broken into four 64-bit pieces which are stored at
  1358. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1359. *----------------------------------------------------------------------------*}
  1360. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1361. var
  1362. z0,z1,z2,z3,more1,more2 : bits64;
  1363. begin
  1364. mul64To128( a1, b1, z2, z3 );
  1365. mul64To128( a1, b0, z1, more2 );
  1366. add128( z1, more2, 0, z2, z1, z2 );
  1367. mul64To128( a0, b0, z0, more1 );
  1368. add128( z0, more1, 0, z1, z0, z1 );
  1369. mul64To128( a0, b1, more1, more2 );
  1370. add128( more1, more2, 0, z2, more1, z2 );
  1371. add128( z0, z1, 0, more1, z0, z1 );
  1372. z3Ptr := z3;
  1373. z2Ptr := z2;
  1374. z1Ptr := z1;
  1375. z0Ptr := z0;
  1376. end;
  1377. {*----------------------------------------------------------------------------
  1378. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1379. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1380. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1381. | `z2Ptr'.
  1382. *----------------------------------------------------------------------------*}
  1383. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1384. var
  1385. z0, z1, z2, more1 : bits64;
  1386. begin
  1387. mul64To128( a1, b, z1, z2 );
  1388. mul64To128( a0, b, z0, more1 );
  1389. add128( z0, more1, 0, z1, z0, z1 );
  1390. z2Ptr := z2;
  1391. z1Ptr := z1;
  1392. z0Ptr := z0;
  1393. end;
  1394. {*----------------------------------------------------------------------------
  1395. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1396. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1397. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1398. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1399. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1400. | unsigned integer is returned.
  1401. *----------------------------------------------------------------------------*}
  1402. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1403. var
  1404. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1405. begin
  1406. if ( b <= a0 ) then
  1407. begin
  1408. result:=qword( $FFFFFFFFFFFFFFFF );
  1409. exit;
  1410. end;
  1411. b0 := b shr 32;
  1412. if ( b0 shl 32 <= a0 ) then
  1413. z:=qword( $FFFFFFFF00000000 )
  1414. else
  1415. z:=( a0 div b0 ) shl 32;
  1416. mul64To128( b, z, term0, term1 );
  1417. sub128( a0, a1, term0, term1, rem0, rem1 );
  1418. while ( ( sbits64(rem0) ) < 0 ) do begin
  1419. dec(z,qword( $100000000 ));
  1420. b1 := b shl 32;
  1421. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1422. end;
  1423. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1424. if ( b0 shl 32 <= rem0 ) then
  1425. z:=z or $FFFFFFFF
  1426. else
  1427. z:=z or rem0 div b0;
  1428. result:=z;
  1429. end;
  1430. {*
  1431. -------------------------------------------------------------------------------
  1432. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1433. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1434. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1435. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1436. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1437. unsigned integer is returned.
  1438. -------------------------------------------------------------------------------
  1439. *}
  1440. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1441. Var
  1442. b0, b1: bits32;
  1443. rem0, rem1, term0, term1: bits32;
  1444. z: bits32;
  1445. Begin
  1446. if ( b <= a0 ) then
  1447. Begin
  1448. estimateDiv64To32 := $FFFFFFFF;
  1449. exit;
  1450. End;
  1451. b0 := b shr 16;
  1452. if ( b0 shl 16 <= a0 ) then
  1453. z:= $FFFF0000
  1454. else
  1455. z:= ( a0 div b0 ) shl 16;
  1456. mul32To64( b, z, term0, term1 );
  1457. sub64( a0, a1, term0, term1, rem0, rem1 );
  1458. while ( ( sbits32 (rem0) ) < 0 ) do
  1459. Begin
  1460. z := z - $10000;
  1461. b1 := b shl 16;
  1462. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1463. End;
  1464. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1465. if ( b0 shl 16 <= rem0 ) then
  1466. z := z or $FFFF
  1467. else
  1468. z := z or (rem0 div b0);
  1469. estimateDiv64To32 := z;
  1470. End;
  1471. {*
  1472. -------------------------------------------------------------------------------
  1473. Returns an approximation to the square root of the 32-bit significand given
  1474. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1475. `aExp' (the least significant bit) is 1, the integer returned approximates
  1476. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1477. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1478. case, the approximation returned lies strictly within +/-2 of the exact
  1479. value.
  1480. -------------------------------------------------------------------------------
  1481. *}
  1482. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1483. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1484. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1485. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1486. );
  1487. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1488. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1489. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1490. );
  1491. Var
  1492. index: int8;
  1493. z: bits32;
  1494. Begin
  1495. index := ( a shr 27 ) AND 15;
  1496. if ( aExp AND 1 ) <> 0 then
  1497. Begin
  1498. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1499. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1500. a := a shr 1;
  1501. End
  1502. else
  1503. Begin
  1504. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1505. z := a div z + z;
  1506. if ( $20000 <= z ) then
  1507. z := $FFFF8000
  1508. else
  1509. z := ( z shl 15 );
  1510. if ( z <= a ) then
  1511. Begin
  1512. estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
  1513. exit;
  1514. End;
  1515. End;
  1516. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1517. End;
  1518. {*
  1519. -------------------------------------------------------------------------------
  1520. Returns the number of leading 0 bits before the most-significant 1 bit of
  1521. `a'. If `a' is zero, 32 is returned.
  1522. -------------------------------------------------------------------------------
  1523. *}
  1524. Function countLeadingZeros32( a:bits32 ): int8;
  1525. const countLeadingZerosHigh:array[0..255] of int8 = (
  1526. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1527. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1528. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1529. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1530. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1531. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1532. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1533. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1534. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1535. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1536. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1537. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1538. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1539. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1540. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1541. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1542. );
  1543. Var
  1544. shiftCount: int8;
  1545. Begin
  1546. shiftCount := 0;
  1547. if ( a < $10000 ) then
  1548. Begin
  1549. shiftCount := shiftcount + 16;
  1550. a := a shl 16;
  1551. End;
  1552. if ( a < $1000000 ) then
  1553. Begin
  1554. shiftCount := shiftcount + 8;
  1555. a := a shl 8;
  1556. end;
  1557. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1558. countLeadingZeros32:= shiftCount;
  1559. End;
  1560. {*----------------------------------------------------------------------------
  1561. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1562. | `a'. If `a' is zero, 64 is returned.
  1563. *----------------------------------------------------------------------------*}
  1564. function countLeadingZeros64( a : bits64): int8;
  1565. var
  1566. shiftcount : int8;
  1567. Begin
  1568. shiftCount := 0;
  1569. if ( a < bits64(bits64(1) shl 32 )) then
  1570. shiftCount := shiftcount + 32
  1571. else
  1572. a := a shr 32;
  1573. shiftCount := shiftCount + countLeadingZeros32( a );
  1574. countLeadingZeros64:= shiftCount;
  1575. End;
  1576. {*
  1577. -------------------------------------------------------------------------------
  1578. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1579. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1580. Otherwise, returns 0.
  1581. -------------------------------------------------------------------------------
  1582. *}
  1583. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1584. Begin
  1585. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1586. End;
  1587. {*
  1588. -------------------------------------------------------------------------------
  1589. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1590. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1591. returns 0.
  1592. -------------------------------------------------------------------------------
  1593. *}
  1594. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1595. Begin
  1596. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1597. End;
  1598. const
  1599. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1600. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1601. (*****************************************************************************)
  1602. (* End Low-Level arithmetic *)
  1603. (*****************************************************************************)
  1604. {*
  1605. -------------------------------------------------------------------------------
  1606. Functions and definitions to determine: (1) whether tininess for underflow
  1607. is detected before or after rounding by default, (2) what (if anything)
  1608. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1609. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1610. are propagated from function inputs to output. These details are ENDIAN
  1611. specific
  1612. -------------------------------------------------------------------------------
  1613. *}
  1614. {$IFDEF ENDIAN_LITTLE}
  1615. {*
  1616. -------------------------------------------------------------------------------
  1617. Internal canonical NaN format.
  1618. -------------------------------------------------------------------------------
  1619. *}
  1620. TYPE
  1621. commonNaNT = record
  1622. high, low : bits32;
  1623. sign: flag;
  1624. end;
  1625. {*
  1626. -------------------------------------------------------------------------------
  1627. The pattern for a default generated single-precision NaN.
  1628. -------------------------------------------------------------------------------
  1629. *}
  1630. const float32_default_nan = $FFC00000;
  1631. {*
  1632. -------------------------------------------------------------------------------
  1633. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1634. otherwise returns 0.
  1635. -------------------------------------------------------------------------------
  1636. *}
  1637. Function float32_is_nan( a : float32 ): flag;
  1638. Begin
  1639. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1640. End;
  1641. {*
  1642. -------------------------------------------------------------------------------
  1643. Returns 1 if the single-precision floating-point value `a' is a signaling
  1644. NaN; otherwise returns 0.
  1645. -------------------------------------------------------------------------------
  1646. *}
  1647. Function float32_is_signaling_nan( a : float32 ): flag;
  1648. Begin
  1649. float32_is_signaling_nan := flag
  1650. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1651. End;
  1652. {*
  1653. -------------------------------------------------------------------------------
  1654. Returns the result of converting the single-precision floating-point NaN
  1655. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1656. exception is raised.
  1657. -------------------------------------------------------------------------------
  1658. *}
  1659. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1660. var
  1661. z : commonNaNT ;
  1662. Begin
  1663. if ( float32_is_signaling_nan( a ) <> 0) then
  1664. float_raise( float_flag_invalid );
  1665. z.sign := a shr 31;
  1666. z.low := 0;
  1667. z.high := a shl 9;
  1668. c := z;
  1669. End;
  1670. {*
  1671. -------------------------------------------------------------------------------
  1672. Returns the result of converting the canonical NaN `a' to the single-
  1673. precision floating-point format.
  1674. -------------------------------------------------------------------------------
  1675. *}
  1676. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1677. Begin
  1678. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1679. End;
  1680. {*
  1681. -------------------------------------------------------------------------------
  1682. Takes two single-precision floating-point values `a' and `b', one of which
  1683. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1684. signaling NaN, the invalid exception is raised.
  1685. -------------------------------------------------------------------------------
  1686. *}
  1687. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1688. Var
  1689. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1690. label returnLargerSignificand;
  1691. Begin
  1692. aIsNaN := float32_is_nan( a );
  1693. aIsSignalingNaN := float32_is_signaling_nan( a );
  1694. bIsNaN := float32_is_nan( b );
  1695. bIsSignalingNaN := float32_is_signaling_nan( b );
  1696. a := a or $00400000;
  1697. b := b or $00400000;
  1698. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1699. float_raise( float_flag_invalid );
  1700. if ( aIsSignalingNaN )<> 0 then
  1701. Begin
  1702. if ( bIsSignalingNaN ) <> 0 then
  1703. goto returnLargerSignificand;
  1704. if bIsNan <> 0 then
  1705. propagateFloat32NaN := b
  1706. else
  1707. propagateFloat32NaN := a;
  1708. exit;
  1709. End
  1710. else if ( aIsNaN <> 0) then
  1711. Begin
  1712. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1713. Begin
  1714. propagateFloat32NaN := a;
  1715. exit;
  1716. End;
  1717. returnLargerSignificand:
  1718. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1719. Begin
  1720. propagateFloat32NaN := b;
  1721. exit;
  1722. End;
  1723. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1724. Begin
  1725. propagateFloat32NaN := a;
  1726. End;
  1727. if a < b then
  1728. propagateFloat32NaN := a
  1729. else
  1730. propagateFloat32NaN := b;
  1731. exit;
  1732. End
  1733. else
  1734. Begin
  1735. propagateFloat32NaN := b;
  1736. exit;
  1737. End;
  1738. End;
  1739. {*
  1740. -------------------------------------------------------------------------------
  1741. The pattern for a default generated double-precision NaN. The `high' and
  1742. `low' values hold the most- and least-significant bits, respectively.
  1743. -------------------------------------------------------------------------------
  1744. *}
  1745. const
  1746. float64_default_nan_high = $FFF80000;
  1747. float64_default_nan_low = $00000000;
  1748. {*
  1749. -------------------------------------------------------------------------------
  1750. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1751. otherwise returns 0.
  1752. -------------------------------------------------------------------------------
  1753. *}
  1754. Function float64_is_nan( a : float64 ) : flag;
  1755. Begin
  1756. float64_is_nan :=
  1757. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1758. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1759. End;
  1760. {*
  1761. -------------------------------------------------------------------------------
  1762. Returns 1 if the double-precision floating-point value `a' is a signaling
  1763. NaN; otherwise returns 0.
  1764. -------------------------------------------------------------------------------
  1765. *}
  1766. Function float64_is_signaling_nan( a : float64 ): flag;
  1767. Begin
  1768. float64_is_signaling_nan :=
  1769. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1770. and ( a.low or ( a.high and $0007FFFF ) );
  1771. End;
  1772. {*
  1773. -------------------------------------------------------------------------------
  1774. Returns the result of converting the double-precision floating-point NaN
  1775. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1776. exception is raised.
  1777. -------------------------------------------------------------------------------
  1778. *}
  1779. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1780. Var
  1781. z : commonNaNT;
  1782. Begin
  1783. if ( float64_is_signaling_nan( a )<>0 ) then
  1784. float_raise( float_flag_invalid );
  1785. z.sign := a.high shr 31;
  1786. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1787. c := z;
  1788. End;
  1789. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1790. Var
  1791. z : commonNaNT;
  1792. Begin
  1793. if ( float64_is_signaling_nan( a )<>0 ) then
  1794. float_raise( float_flag_invalid );
  1795. z.sign := a.high shr 31;
  1796. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1797. result := z;
  1798. End;
  1799. {*
  1800. -------------------------------------------------------------------------------
  1801. Returns the result of converting the canonical NaN `a' to the double-
  1802. precision floating-point format.
  1803. -------------------------------------------------------------------------------
  1804. *}
  1805. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1806. Var
  1807. z: float64;
  1808. Begin
  1809. shift64Right( a.high, a.low, 12, z.high, z.low );
  1810. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1811. c := z;
  1812. End;
  1813. {*
  1814. -------------------------------------------------------------------------------
  1815. Takes two double-precision floating-point values `a' and `b', one of which
  1816. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1817. signaling NaN, the invalid exception is raised.
  1818. -------------------------------------------------------------------------------
  1819. *}
  1820. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1821. Var
  1822. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1823. label returnLargerSignificand;
  1824. Begin
  1825. aIsNaN := float64_is_nan( a );
  1826. aIsSignalingNaN := float64_is_signaling_nan( a );
  1827. bIsNaN := float64_is_nan( b );
  1828. bIsSignalingNaN := float64_is_signaling_nan( b );
  1829. a.high := a.high or $00080000;
  1830. b.high := b.high or $00080000;
  1831. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1832. float_raise( float_flag_invalid );
  1833. if ( aIsSignalingNaN )<>0 then
  1834. Begin
  1835. if ( bIsSignalingNaN )<>0 then
  1836. goto returnLargerSignificand;
  1837. if bIsNan <> 0 then
  1838. c := b
  1839. else
  1840. c := a;
  1841. exit;
  1842. End
  1843. else if ( aIsNaN )<> 0 then
  1844. Begin
  1845. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1846. Begin
  1847. c := a;
  1848. exit;
  1849. End;
  1850. returnLargerSignificand:
  1851. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1852. Begin
  1853. c := b;
  1854. exit;
  1855. End;
  1856. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1857. Begin
  1858. c := a;
  1859. exit;
  1860. End;
  1861. if a.high < b.high then
  1862. c := a
  1863. else
  1864. c := b;
  1865. exit;
  1866. End
  1867. else
  1868. Begin
  1869. c := b;
  1870. exit;
  1871. End;
  1872. End;
  1873. {*----------------------------------------------------------------------------
  1874. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1875. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1876. | returns 0.
  1877. *----------------------------------------------------------------------------*}
  1878. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1879. begin
  1880. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1881. end;
  1882. {*----------------------------------------------------------------------------
  1883. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1884. | otherwise returns 0.
  1885. *----------------------------------------------------------------------------*}
  1886. function float128_is_nan( a : float128): flag;
  1887. begin
  1888. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1889. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1890. end;
  1891. {*----------------------------------------------------------------------------
  1892. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1893. | signaling NaN; otherwise returns 0.
  1894. *----------------------------------------------------------------------------*}
  1895. function float128_is_signaling_nan( a : float128): flag;
  1896. begin
  1897. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1898. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1899. end;
  1900. {*----------------------------------------------------------------------------
  1901. | Returns the result of converting the quadruple-precision floating-point NaN
  1902. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1903. | exception is raised.
  1904. *----------------------------------------------------------------------------*}
  1905. function float128ToCommonNaN( a : float128): commonNaNT;
  1906. var
  1907. z: commonNaNT;
  1908. qhigh,qlow : qword;
  1909. begin
  1910. if ( float128_is_signaling_nan( a )<>0) then
  1911. float_raise( float_flag_invalid );
  1912. z.sign := a.high shr 63;
  1913. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1914. z.high:=qhigh shr 32;
  1915. z.low:=qhigh and $ffffffff;
  1916. result:=z;
  1917. end;
  1918. {*----------------------------------------------------------------------------
  1919. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1920. | precision floating-point format.
  1921. *----------------------------------------------------------------------------*}
  1922. function commonNaNToFloat128( a : commonNaNT): float128;
  1923. var
  1924. z: float128;
  1925. begin
  1926. shift128Right( a.high, a.low, 16, z.high, z.low );
  1927. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1928. result:=z;
  1929. end;
  1930. {*----------------------------------------------------------------------------
  1931. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1932. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1933. | `b' is a signaling NaN, the invalid exception is raised.
  1934. *----------------------------------------------------------------------------*}
  1935. function propagateFloat128NaN( a: float128; b : float128): float128;
  1936. var
  1937. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1938. label
  1939. returnLargerSignificand;
  1940. begin
  1941. aIsNaN := float128_is_nan( a );
  1942. aIsSignalingNaN := float128_is_signaling_nan( a );
  1943. bIsNaN := float128_is_nan( b );
  1944. bIsSignalingNaN := float128_is_signaling_nan( b );
  1945. a.high := a.high or int64( $0000800000000000 );
  1946. b.high := b.high or int64( $0000800000000000 );
  1947. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1948. float_raise( float_flag_invalid );
  1949. if ( aIsSignalingNaN )<>0 then
  1950. begin
  1951. if ( bIsSignalingNaN )<>0 then
  1952. goto returnLargerSignificand;
  1953. if bIsNaN<>0 then
  1954. result := b
  1955. else
  1956. result := a;
  1957. exit;
  1958. end
  1959. else if ( aIsNaN )<>0 then
  1960. begin
  1961. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1962. begin
  1963. result := a;
  1964. exit;
  1965. end;
  1966. returnLargerSignificand:
  1967. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1968. begin
  1969. result := b;
  1970. exit;
  1971. end;
  1972. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1973. begin
  1974. result := a;
  1975. exit
  1976. end;
  1977. if ( a.high < b.high ) then
  1978. result := a
  1979. else
  1980. result := b;
  1981. exit;
  1982. end
  1983. else
  1984. result:=b;
  1985. end;
  1986. {$ELSE}
  1987. { Big endian code }
  1988. (*----------------------------------------------------------------------------
  1989. | Internal canonical NaN format.
  1990. *----------------------------------------------------------------------------*)
  1991. type
  1992. commonNANT = record
  1993. high, low : bits32;
  1994. sign : flag;
  1995. end;
  1996. (*----------------------------------------------------------------------------
  1997. | The pattern for a default generated single-precision NaN.
  1998. *----------------------------------------------------------------------------*)
  1999. const float32_default_nan = $7FFFFFFF;
  2000. (*----------------------------------------------------------------------------
  2001. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  2002. | otherwise returns 0.
  2003. *----------------------------------------------------------------------------*)
  2004. function float32_is_nan(a: float32): flag;
  2005. begin
  2006. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2007. end;
  2008. (*----------------------------------------------------------------------------
  2009. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2010. | NaN; otherwise returns 0.
  2011. *----------------------------------------------------------------------------*)
  2012. function float32_is_signaling_nan(a: float32):flag;
  2013. begin
  2014. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2015. end;
  2016. (*----------------------------------------------------------------------------
  2017. | Returns the result of converting the single-precision floating-point NaN
  2018. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2019. | exception is raised.
  2020. *----------------------------------------------------------------------------*)
  2021. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  2022. var
  2023. z: commonNANT;
  2024. begin
  2025. if float32_is_signaling_nan(a)<>0 then
  2026. float_raise(float_flag_invalid);
  2027. z.sign := a shr 31;
  2028. z.low := 0;
  2029. z.high := a shl 9;
  2030. c:=z;
  2031. end;
  2032. (*----------------------------------------------------------------------------
  2033. | Returns the result of converting the canonical NaN `a' to the single-
  2034. | precision floating-point format.
  2035. *----------------------------------------------------------------------------*)
  2036. function CommonNanToFloat32(a : CommonNaNT): float32;
  2037. begin
  2038. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2039. end;
  2040. (*----------------------------------------------------------------------------
  2041. | Takes two single-precision floating-point values `a' and `b', one of which
  2042. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2043. | signaling NaN, the invalid exception is raised.
  2044. *----------------------------------------------------------------------------*)
  2045. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2046. var
  2047. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2048. begin
  2049. aIsNaN := float32_is_nan( a );
  2050. aIsSignalingNaN := float32_is_signaling_nan( a );
  2051. bIsNaN := float32_is_nan( b );
  2052. bIsSignalingNaN := float32_is_signaling_nan( b );
  2053. a := a or $00400000;
  2054. b := b or $00400000;
  2055. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2056. float_raise( float_flag_invalid );
  2057. if bIsSignalingNaN<>0 then
  2058. propagateFloat32Nan := b
  2059. else if aIsSignalingNan<>0 then
  2060. propagateFloat32Nan := a
  2061. else if bIsNan<>0 then
  2062. propagateFloat32Nan := b
  2063. else
  2064. propagateFloat32Nan := a;
  2065. end;
  2066. (*----------------------------------------------------------------------------
  2067. | The pattern for a default generated double-precision NaN. The `high' and
  2068. | `low' values hold the most- and least-significant bits, respectively.
  2069. *----------------------------------------------------------------------------*)
  2070. const
  2071. float64_default_nan_high = $7FFFFFFF;
  2072. float64_default_nan_low = $FFFFFFFF;
  2073. (*----------------------------------------------------------------------------
  2074. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2075. | otherwise returns 0.
  2076. *----------------------------------------------------------------------------*)
  2077. function float64_is_nan(a: float64): flag;
  2078. begin
  2079. float64_is_nan := flag (
  2080. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2081. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2082. end;
  2083. (*----------------------------------------------------------------------------
  2084. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2085. | NaN; otherwise returns 0.
  2086. *----------------------------------------------------------------------------*)
  2087. function float64_is_signaling_nan( a:float64): flag;
  2088. begin
  2089. float64_is_signaling_nan := flag(
  2090. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2091. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2092. end;
  2093. (*----------------------------------------------------------------------------
  2094. | Returns the result of converting the double-precision floating-point NaN
  2095. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2096. | exception is raised.
  2097. *----------------------------------------------------------------------------*)
  2098. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2099. var
  2100. z : commonNaNT;
  2101. begin
  2102. if ( float64_is_signaling_nan( a )<>0 ) then
  2103. float_raise( float_flag_invalid );
  2104. z.sign := a.high shr 31;
  2105. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2106. c:=z;
  2107. end;
  2108. (*----------------------------------------------------------------------------
  2109. | Returns the result of converting the canonical NaN `a' to the double-
  2110. | precision floating-point format.
  2111. *----------------------------------------------------------------------------*)
  2112. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2113. var
  2114. z: float64;
  2115. begin
  2116. shift64Right( a.high, a.low, 12, z.high, z.low );
  2117. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2118. c:=z;
  2119. end;
  2120. (*----------------------------------------------------------------------------
  2121. | Takes two double-precision floating-point values `a' and `b', one of which
  2122. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2123. | signaling NaN, the invalid exception is raised.
  2124. *----------------------------------------------------------------------------*)
  2125. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2126. var
  2127. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2128. begin
  2129. aIsNaN := float64_is_nan( a );
  2130. aIsSignalingNaN := float64_is_signaling_nan( a );
  2131. bIsNaN := float64_is_nan( b );
  2132. bIsSignalingNaN := float64_is_signaling_nan( b );
  2133. a.high := a.high or $00080000;
  2134. b.high := b.high or $00080000;
  2135. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2136. float_raise( float_flag_invalid );
  2137. if bIsSignalingNaN<>0 then
  2138. c := b
  2139. else if aIsSignalingNan<>0 then
  2140. c := a
  2141. else if bIsNan<>0 then
  2142. c := b
  2143. else
  2144. c := a;
  2145. end;
  2146. {$ENDIF}
  2147. (****************************************************************************)
  2148. (* END ENDIAN SPECIFIC CODE *)
  2149. (****************************************************************************)
  2150. {*
  2151. -------------------------------------------------------------------------------
  2152. Returns the fraction bits of the single-precision floating-point value `a'.
  2153. -------------------------------------------------------------------------------
  2154. *}
  2155. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2156. Begin
  2157. ExtractFloat32Frac := A AND $007FFFFF;
  2158. End;
  2159. {*
  2160. -------------------------------------------------------------------------------
  2161. Returns the exponent bits of the single-precision floating-point value `a'.
  2162. -------------------------------------------------------------------------------
  2163. *}
  2164. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2165. Begin
  2166. extractFloat32Exp := (a shr 23) AND $FF;
  2167. End;
  2168. {*
  2169. -------------------------------------------------------------------------------
  2170. Returns the sign bit of the single-precision floating-point value `a'.
  2171. -------------------------------------------------------------------------------
  2172. *}
  2173. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2174. Begin
  2175. extractFloat32Sign := a shr 31;
  2176. End;
  2177. {*
  2178. -------------------------------------------------------------------------------
  2179. Normalizes the subnormal single-precision floating-point value represented
  2180. by the denormalized significand `aSig'. The normalized exponent and
  2181. significand are stored at the locations pointed to by `zExpPtr' and
  2182. `zSigPtr', respectively.
  2183. -------------------------------------------------------------------------------
  2184. *}
  2185. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2186. Var
  2187. ShiftCount : BYTE;
  2188. Begin
  2189. shiftCount := countLeadingZeros32( aSig ) - 8;
  2190. zSigPtr := aSig shl shiftCount;
  2191. zExpPtr := 1 - shiftCount;
  2192. End;
  2193. {*
  2194. -------------------------------------------------------------------------------
  2195. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2196. single-precision floating-point value, returning the result. After being
  2197. shifted into the proper positions, the three fields are simply added
  2198. together to form the result. This means that any integer portion of `zSig'
  2199. will be added into the exponent. Since a properly normalized significand
  2200. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2201. than the desired result exponent whenever `zSig' is a complete, normalized
  2202. significand.
  2203. -------------------------------------------------------------------------------
  2204. *}
  2205. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2206. Begin
  2207. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2208. + zSig;
  2209. End;
  2210. {*
  2211. -------------------------------------------------------------------------------
  2212. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2213. and significand `zSig', and returns the proper single-precision floating-
  2214. point value corresponding to the abstract input. Ordinarily, the abstract
  2215. value is simply rounded and packed into the single-precision format, with
  2216. the inexact exception raised if the abstract input cannot be represented
  2217. exactly. However, if the abstract value is too large, the overflow and
  2218. inexact exceptions are raised and an infinity or maximal finite value is
  2219. returned. If the abstract value is too small, the input value is rounded to
  2220. a subnormal number, and the underflow and inexact exceptions are raised if
  2221. the abstract input cannot be represented exactly as a subnormal single-
  2222. precision floating-point number.
  2223. The input significand `zSig' has its binary point between bits 30
  2224. and 29, which is 7 bits to the left of the usual location. This shifted
  2225. significand must be normalized or smaller. If `zSig' is not normalized,
  2226. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2227. and it must not require rounding. In the usual case that `zSig' is
  2228. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2229. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2230. Binary Floating-Point Arithmetic.
  2231. -------------------------------------------------------------------------------
  2232. *}
  2233. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2234. Var
  2235. roundingMode : TFPURoundingMode;
  2236. roundNearestEven : boolean;
  2237. roundIncrement, roundBits : BYTE;
  2238. IsTiny : boolean;
  2239. Begin
  2240. roundingMode := softfloat_rounding_mode;
  2241. roundNearestEven := (roundingMode = float_round_nearest_even);
  2242. roundIncrement := $40;
  2243. if not roundNearestEven then
  2244. Begin
  2245. if ( roundingMode = float_round_to_zero ) Then
  2246. Begin
  2247. roundIncrement := 0;
  2248. End
  2249. else
  2250. Begin
  2251. roundIncrement := $7F;
  2252. if ( zSign <> 0 ) then
  2253. Begin
  2254. if roundingMode = float_round_up then roundIncrement := 0;
  2255. End
  2256. else
  2257. Begin
  2258. if roundingMode = float_round_down then roundIncrement := 0;
  2259. End;
  2260. End
  2261. End;
  2262. roundBits := zSig AND $7F;
  2263. if ($FD <= bits16 (zExp) ) then
  2264. Begin
  2265. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2266. Begin
  2267. float_raise( [float_flag_overflow,float_flag_inexact] );
  2268. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2269. exit;
  2270. End;
  2271. if ( zExp < 0 ) then
  2272. Begin
  2273. isTiny :=
  2274. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2275. OR ( zExp < -1 )
  2276. OR ( (zSig + roundIncrement) < $80000000 );
  2277. shift32RightJamming( zSig, - zExp, zSig );
  2278. zExp := 0;
  2279. roundBits := zSig AND $7F;
  2280. if ( isTiny and (roundBits<>0) ) then
  2281. float_raise( float_flag_underflow );
  2282. End;
  2283. End;
  2284. if ( roundBits )<> 0 then
  2285. set_inexact_flag;
  2286. zSig := ( zSig + roundIncrement ) shr 7;
  2287. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2288. if ( zSig = 0 ) then zExp := 0;
  2289. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2290. End;
  2291. {*
  2292. -------------------------------------------------------------------------------
  2293. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2294. and significand `zSig', and returns the proper single-precision floating-
  2295. point value corresponding to the abstract input. This routine is just like
  2296. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2297. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2298. floating-point exponent.
  2299. -------------------------------------------------------------------------------
  2300. *}
  2301. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2302. Var
  2303. ShiftCount : int8;
  2304. Begin
  2305. shiftCount := countLeadingZeros32( zSig ) - 1;
  2306. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2307. End;
  2308. {*
  2309. -------------------------------------------------------------------------------
  2310. Returns the most-significant 20 fraction bits of the double-precision
  2311. floating-point value `a'.
  2312. -------------------------------------------------------------------------------
  2313. *}
  2314. Function extractFloat64Frac0(a: float64): bits32; inline;
  2315. Begin
  2316. extractFloat64Frac0 := a.high and $000FFFFF;
  2317. End;
  2318. {*
  2319. -------------------------------------------------------------------------------
  2320. Returns the least-significant 32 fraction bits of the double-precision
  2321. floating-point value `a'.
  2322. -------------------------------------------------------------------------------
  2323. *}
  2324. Function extractFloat64Frac1(a: float64): bits32; inline;
  2325. Begin
  2326. extractFloat64Frac1 := a.low;
  2327. End;
  2328. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2329. Function extractFloat64Frac(a: float64): bits64; inline;
  2330. Begin
  2331. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2332. End;
  2333. {*
  2334. -------------------------------------------------------------------------------
  2335. Returns the exponent bits of the double-precision floating-point value `a'.
  2336. -------------------------------------------------------------------------------
  2337. *}
  2338. Function extractFloat64Exp(a: float64): int16; inline;
  2339. Begin
  2340. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2341. End;
  2342. {*
  2343. -------------------------------------------------------------------------------
  2344. Returns the sign bit of the double-precision floating-point value `a'.
  2345. -------------------------------------------------------------------------------
  2346. *}
  2347. Function extractFloat64Sign(a: float64) : flag; inline;
  2348. Begin
  2349. extractFloat64Sign := a.high shr 31;
  2350. End;
  2351. {*
  2352. -------------------------------------------------------------------------------
  2353. Normalizes the subnormal double-precision floating-point value represented
  2354. by the denormalized significand formed by the concatenation of `aSig0' and
  2355. `aSig1'. The normalized exponent is stored at the location pointed to by
  2356. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2357. stored at the location pointed to by `zSig0Ptr', and the least significant
  2358. 32 bits of the normalized significand are stored at the location pointed to
  2359. by `zSig1Ptr'.
  2360. -------------------------------------------------------------------------------
  2361. *}
  2362. Procedure normalizeFloat64Subnormal(
  2363. aSig0: bits32;
  2364. aSig1: bits32;
  2365. VAR zExpPtr : Int16;
  2366. VAR zSig0Ptr : Bits32;
  2367. VAR zSig1Ptr : Bits32
  2368. );
  2369. Var
  2370. ShiftCount : Int8;
  2371. Begin
  2372. if ( aSig0 = 0 ) then
  2373. Begin
  2374. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2375. if ( shiftCount < 0 ) then
  2376. Begin
  2377. zSig0Ptr := aSig1 shr ( - shiftCount );
  2378. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2379. End
  2380. else
  2381. Begin
  2382. zSig0Ptr := aSig1 shl shiftCount;
  2383. zSig1Ptr := 0;
  2384. End;
  2385. zExpPtr := - shiftCount - 31;
  2386. End
  2387. else
  2388. Begin
  2389. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2390. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2391. zExpPtr := 1 - shiftCount;
  2392. End;
  2393. End;
  2394. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2395. var
  2396. shiftCount : int8;
  2397. begin
  2398. shiftCount := countLeadingZeros64( aSig ) - 11;
  2399. zSigPtr := aSig shl shiftCount;
  2400. zExpPtr := 1 - shiftCount;
  2401. end;
  2402. {*
  2403. -------------------------------------------------------------------------------
  2404. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2405. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2406. point value, returning the result. After being shifted into the proper
  2407. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2408. together to form the most significant 32 bits of the result. This means
  2409. that any integer portion of `zSig0' will be added into the exponent. Since
  2410. a properly normalized significand will have an integer portion equal to 1,
  2411. the `zExp' input should be 1 less than the desired result exponent whenever
  2412. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2413. -------------------------------------------------------------------------------
  2414. *}
  2415. Procedure
  2416. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2417. var
  2418. z: Float64;
  2419. Begin
  2420. z.low := zSig1;
  2421. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2422. c := z;
  2423. End;
  2424. {*----------------------------------------------------------------------------
  2425. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2426. | double-precision floating-point value, returning the result. After being
  2427. | shifted into the proper positions, the three fields are simply added
  2428. | together to form the result. This means that any integer portion of `zSig'
  2429. | will be added into the exponent. Since a properly normalized significand
  2430. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2431. | than the desired result exponent whenever `zSig' is a complete, normalized
  2432. | significand.
  2433. *----------------------------------------------------------------------------*}
  2434. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2435. begin
  2436. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2437. end;
  2438. {*
  2439. -------------------------------------------------------------------------------
  2440. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2441. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2442. and `zSig2', and returns the proper double-precision floating-point value
  2443. corresponding to the abstract input. Ordinarily, the abstract value is
  2444. simply rounded and packed into the double-precision format, with the inexact
  2445. exception raised if the abstract input cannot be represented exactly.
  2446. However, if the abstract value is too large, the overflow and inexact
  2447. exceptions are raised and an infinity or maximal finite value is returned.
  2448. If the abstract value is too small, the input value is rounded to a
  2449. subnormal number, and the underflow and inexact exceptions are raised if the
  2450. abstract input cannot be represented exactly as a subnormal double-precision
  2451. floating-point number.
  2452. The input significand must be normalized or smaller. If the input
  2453. significand is not normalized, `zExp' must be 0; in that case, the result
  2454. returned is a subnormal number, and it must not require rounding. In the
  2455. usual case that the input significand is normalized, `zExp' must be 1 less
  2456. than the ``true'' floating-point exponent. The handling of underflow and
  2457. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2458. -------------------------------------------------------------------------------
  2459. *}
  2460. Procedure
  2461. roundAndPackFloat64(
  2462. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2463. Var
  2464. roundingMode : TFPURoundingMode;
  2465. roundNearestEven, increment, isTiny : Flag;
  2466. Begin
  2467. roundingMode := softfloat_rounding_mode;
  2468. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2469. increment := flag( sbits32 (zSig2) < 0 );
  2470. if ( roundNearestEven = flag(FALSE) ) then
  2471. Begin
  2472. if ( roundingMode = float_round_to_zero ) then
  2473. increment := 0
  2474. else
  2475. Begin
  2476. if ( zSign )<> 0 then
  2477. Begin
  2478. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2479. End
  2480. else
  2481. Begin
  2482. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2483. End
  2484. End
  2485. End;
  2486. if ( $7FD <= bits16 (zExp) ) then
  2487. Begin
  2488. if (( $7FD < zExp )
  2489. or (( zExp = $7FD )
  2490. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2491. and (increment<>0)
  2492. )
  2493. ) then
  2494. Begin
  2495. float_raise( [float_flag_overflow,float_flag_inexact] );
  2496. if (( roundingMode = float_round_to_zero )
  2497. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2498. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2499. ) then
  2500. Begin
  2501. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2502. exit;
  2503. End;
  2504. packFloat64( zSign, $7FF, 0, 0, c );
  2505. exit;
  2506. End;
  2507. if ( zExp < 0 ) then
  2508. Begin
  2509. isTiny :=
  2510. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2511. or flag( zExp < -1 )
  2512. or flag(increment = 0)
  2513. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2514. shift64ExtraRightJamming(
  2515. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2516. zExp := 0;
  2517. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2518. if ( roundNearestEven )<>0 then
  2519. Begin
  2520. increment := flag( sbits32 (zSig2) < 0 );
  2521. End
  2522. else
  2523. Begin
  2524. if ( zSign )<>0 then
  2525. Begin
  2526. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2527. End
  2528. else
  2529. Begin
  2530. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2531. End
  2532. End;
  2533. End;
  2534. End;
  2535. if ( zSig2 )<>0 then
  2536. set_inexact_flag;
  2537. if ( increment )<>0 then
  2538. Begin
  2539. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2540. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2541. End
  2542. else
  2543. Begin
  2544. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2545. End;
  2546. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2547. End;
  2548. {*----------------------------------------------------------------------------
  2549. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2550. | and significand `zSig', and returns the proper double-precision floating-
  2551. | point value corresponding to the abstract input. Ordinarily, the abstract
  2552. | value is simply rounded and packed into the double-precision format, with
  2553. | the inexact exception raised if the abstract input cannot be represented
  2554. | exactly. However, if the abstract value is too large, the overflow and
  2555. | inexact exceptions are raised and an infinity or maximal finite value is
  2556. | returned. If the abstract value is too small, the input value is rounded
  2557. | to a subnormal number, and the underflow and inexact exceptions are raised
  2558. | if the abstract input cannot be represented exactly as a subnormal double-
  2559. | precision floating-point number.
  2560. | The input significand `zSig' has its binary point between bits 62
  2561. | and 61, which is 10 bits to the left of the usual location. This shifted
  2562. | significand must be normalized or smaller. If `zSig' is not normalized,
  2563. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2564. | and it must not require rounding. In the usual case that `zSig' is
  2565. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2566. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2567. | Binary Floating-Point Arithmetic.
  2568. *----------------------------------------------------------------------------*}
  2569. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2570. var
  2571. roundingMode: TFPURoundingMode;
  2572. roundNearestEven: flag;
  2573. roundIncrement, roundBits: int16;
  2574. isTiny: flag;
  2575. begin
  2576. roundingMode := softfloat_rounding_mode;
  2577. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2578. roundIncrement := $200;
  2579. if ( roundNearestEven=0 ) then
  2580. begin
  2581. if ( roundingMode = float_round_to_zero ) then
  2582. begin
  2583. roundIncrement := 0;
  2584. end
  2585. else begin
  2586. roundIncrement := $3FF;
  2587. if ( zSign<>0 ) then
  2588. begin
  2589. if ( roundingMode = float_round_up ) then
  2590. roundIncrement := 0;
  2591. end
  2592. else begin
  2593. if ( roundingMode = float_round_down ) then
  2594. roundIncrement := 0;
  2595. end
  2596. end
  2597. end;
  2598. roundBits := zSig and $3FF;
  2599. if ( $7FD <= bits16(zExp) ) then
  2600. begin
  2601. if ( ( $7FD < zExp )
  2602. or ( ( zExp = $7FD )
  2603. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2604. ) then
  2605. begin
  2606. float_raise( [float_flag_overflow,float_flag_inexact] );
  2607. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2608. exit;
  2609. end;
  2610. if ( zExp < 0 ) then
  2611. begin
  2612. isTiny := ord(
  2613. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2614. or ( zExp < -1 )
  2615. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2616. shift64RightJamming( zSig, - zExp, zSig );
  2617. zExp := 0;
  2618. roundBits := zSig and $3FF;
  2619. if ( isTiny and roundBits )<>0 then
  2620. float_raise( float_flag_underflow );
  2621. end
  2622. end;
  2623. if ( roundBits<>0 ) then
  2624. set_inexact_flag;
  2625. zSig := ( zSig + roundIncrement ) shr 10;
  2626. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2627. if ( zSig = 0 ) then
  2628. zExp := 0;
  2629. result:=packFloat64( zSign, zExp, zSig );
  2630. end;
  2631. {*
  2632. -------------------------------------------------------------------------------
  2633. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2634. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2635. returns the proper double-precision floating-point value corresponding
  2636. to the abstract input. This routine is just like `roundAndPackFloat64'
  2637. except that the input significand has fewer bits and does not have to be
  2638. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2639. point exponent.
  2640. -------------------------------------------------------------------------------
  2641. *}
  2642. Procedure
  2643. normalizeRoundAndPackFloat64(
  2644. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2645. Var
  2646. shiftCount : int8;
  2647. zSig2 : bits32;
  2648. Begin
  2649. if ( zSig0 = 0 ) then
  2650. Begin
  2651. zSig0 := zSig1;
  2652. zSig1 := 0;
  2653. zExp := zExp -32;
  2654. End;
  2655. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2656. if ( 0 <= shiftCount ) then
  2657. Begin
  2658. zSig2 := 0;
  2659. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2660. End
  2661. else
  2662. Begin
  2663. shift64ExtraRightJamming
  2664. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2665. End;
  2666. zExp := zExp - shiftCount;
  2667. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2668. End;
  2669. {*
  2670. ----------------------------------------------------------------------------
  2671. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2672. and significand `zSig', and returns the proper double-precision floating-
  2673. point value corresponding to the abstract input. This routine is just like
  2674. `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  2675. Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2676. floating-point exponent.
  2677. ----------------------------------------------------------------------------
  2678. *}
  2679. function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  2680. var
  2681. shiftCount: int8;
  2682. begin
  2683. shiftCount := countLeadingZeros64( zSig ) - 1;
  2684. result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  2685. end;
  2686. {*
  2687. -------------------------------------------------------------------------------
  2688. Returns the result of converting the 32-bit two's complement integer `a' to
  2689. the single-precision floating-point format. The conversion is performed
  2690. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2691. -------------------------------------------------------------------------------
  2692. *}
  2693. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2694. Var
  2695. zSign : Flag;
  2696. Begin
  2697. if ( a = 0 ) then
  2698. Begin
  2699. int32_to_float32.float32 := 0;
  2700. exit;
  2701. End;
  2702. if ( a = sbits32 ($80000000) ) then
  2703. Begin
  2704. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2705. exit;
  2706. end;
  2707. zSign := flag( a < 0 );
  2708. If zSign<>0 then
  2709. a := -a;
  2710. int32_to_float32.float32:=
  2711. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2712. End;
  2713. {*
  2714. -------------------------------------------------------------------------------
  2715. Returns the result of converting the 32-bit two's complement integer `a' to
  2716. the double-precision floating-point format. The conversion is performed
  2717. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2718. -------------------------------------------------------------------------------
  2719. *}
  2720. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2721. var
  2722. zSign : flag;
  2723. absA : bits32;
  2724. shiftCount : int8;
  2725. zSig0, zSig1 : bits32;
  2726. Begin
  2727. if ( a = 0 ) then
  2728. Begin
  2729. packFloat64( 0, 0, 0, 0, result );
  2730. exit;
  2731. end;
  2732. zSign := flag( a < 0 );
  2733. if ZSign<>0 then
  2734. AbsA := -a
  2735. else
  2736. AbsA := a;
  2737. shiftCount := countLeadingZeros32( absA ) - 11;
  2738. if ( 0 <= shiftCount ) then
  2739. Begin
  2740. zSig0 := absA shl shiftCount;
  2741. zSig1 := 0;
  2742. End
  2743. else
  2744. Begin
  2745. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2746. End;
  2747. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2748. End;
  2749. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2750. {$if not defined(packFloatx80)}
  2751. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2752. forward;
  2753. {$endif}
  2754. {*----------------------------------------------------------------------------
  2755. | Returns the result of converting the 32-bit two's complement integer `a'
  2756. | to the extended double-precision floating-point format. The conversion
  2757. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2758. | Arithmetic.
  2759. *----------------------------------------------------------------------------*}
  2760. function int32_to_floatx80( a: int32 ): floatx80;
  2761. var
  2762. zSign: flag;
  2763. absA: uint32;
  2764. shiftCount: int8;
  2765. zSig: bits64;
  2766. begin
  2767. if ( a = 0 ) then begin
  2768. result := packFloatx80( 0, 0, 0 );
  2769. exit;
  2770. end;
  2771. zSign := ord( a < 0 );
  2772. if zSign <> 0 then absA := - a else absA := a;
  2773. shiftCount := countLeadingZeros32( absA ) + 32;
  2774. zSig := absA;
  2775. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2776. end;
  2777. {$endif FPC_SOFTFLOAT_FLOATX80}
  2778. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2779. {$if not defined(packFloat128)}
  2780. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2781. forward;
  2782. {$endif}
  2783. {*----------------------------------------------------------------------------
  2784. | Returns the result of converting the 32-bit two's complement integer `a' to
  2785. | the quadruple-precision floating-point format. The conversion is performed
  2786. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2787. *----------------------------------------------------------------------------*}
  2788. function int32_to_float128( a: int32 ): float128;
  2789. var
  2790. zSign: flag;
  2791. absA: uint32;
  2792. shiftCount: int8;
  2793. zSig0: bits64;
  2794. begin
  2795. if ( a = 0 ) then begin
  2796. result := packFloat128( 0, 0, 0, 0 );
  2797. exit;
  2798. end;
  2799. zSign := ord( a < 0 );
  2800. if zSign <> 0 then absA := - a else absA := a;
  2801. shiftCount := countLeadingZeros32( absA ) + 17;
  2802. zSig0 := absA;
  2803. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2804. end;
  2805. {$endif FPC_SOFTFLOAT_FLOAT128}
  2806. {*
  2807. -------------------------------------------------------------------------------
  2808. Returns the result of converting the single-precision floating-point value
  2809. `a' to the 32-bit two's complement integer format. The conversion is
  2810. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2811. Arithmetic---which means in particular that the conversion is rounded
  2812. according to the current rounding mode. If `a' is a NaN, the largest
  2813. positive integer is returned. Otherwise, if the conversion overflows, the
  2814. largest integer with the same sign as `a' is returned.
  2815. -------------------------------------------------------------------------------
  2816. *}
  2817. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2818. Var
  2819. aSign: flag;
  2820. aExp, shiftCount: int16;
  2821. aSig, aSigExtra: bits32;
  2822. z: int32;
  2823. roundingMode: TFPURoundingMode;
  2824. Begin
  2825. aSig := extractFloat32Frac( a.float32 );
  2826. aExp := extractFloat32Exp( a.float32 );
  2827. aSign := extractFloat32Sign( a.float32 );
  2828. shiftCount := aExp - $96;
  2829. if ( 0 <= shiftCount ) then
  2830. Begin
  2831. if ( $9E <= aExp ) then
  2832. Begin
  2833. if ( a.float32 <> $CF000000 ) then
  2834. Begin
  2835. float_raise( float_flag_invalid );
  2836. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2837. Begin
  2838. float32_to_int32 := $7FFFFFFF;
  2839. exit;
  2840. End;
  2841. End;
  2842. float32_to_int32 := sbits32 ($80000000);
  2843. exit;
  2844. End;
  2845. z := ( aSig or $00800000 ) shl shiftCount;
  2846. if ( aSign<>0 ) then z := - z;
  2847. End
  2848. else
  2849. Begin
  2850. if ( aExp < $7E ) then
  2851. Begin
  2852. aSigExtra := aExp OR aSig;
  2853. z := 0;
  2854. End
  2855. else
  2856. Begin
  2857. aSig := aSig OR $00800000;
  2858. aSigExtra := aSig shl ( shiftCount and 31 );
  2859. z := aSig shr ( - shiftCount );
  2860. End;
  2861. if ( aSigExtra<>0 ) then
  2862. set_inexact_flag;
  2863. roundingMode := softfloat_rounding_mode;
  2864. if ( roundingMode = float_round_nearest_even ) then
  2865. Begin
  2866. if ( sbits32 (aSigExtra) < 0 ) then
  2867. Begin
  2868. Inc(z);
  2869. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2870. z := z and not 1;
  2871. End;
  2872. if ( aSign<>0 ) then
  2873. z := - z;
  2874. End
  2875. else
  2876. Begin
  2877. aSigExtra := flag( aSigExtra <> 0 );
  2878. if ( aSign<>0 ) then
  2879. Begin
  2880. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2881. z := - z;
  2882. End
  2883. else
  2884. Begin
  2885. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2886. End
  2887. End;
  2888. End;
  2889. float32_to_int32 := z;
  2890. End;
  2891. {*
  2892. -------------------------------------------------------------------------------
  2893. Returns the result of converting the single-precision floating-point value
  2894. `a' to the 32-bit two's complement integer format. The conversion is
  2895. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2896. Arithmetic, except that the conversion is always rounded toward zero.
  2897. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2898. the conversion overflows, the largest integer with the same sign as `a' is
  2899. returned.
  2900. -------------------------------------------------------------------------------
  2901. *}
  2902. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2903. Var
  2904. aSign : flag;
  2905. aExp, shiftCount : int16;
  2906. aSig : bits32;
  2907. z : int32;
  2908. Begin
  2909. aSig := extractFloat32Frac( a.float32 );
  2910. aExp := extractFloat32Exp( a.float32 );
  2911. aSign := extractFloat32Sign( a.float32 );
  2912. shiftCount := aExp - $9E;
  2913. if ( 0 <= shiftCount ) then
  2914. Begin
  2915. if ( a.float32 <> $CF000000 ) then
  2916. Begin
  2917. float_raise( float_flag_invalid );
  2918. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2919. Begin
  2920. float32_to_int32_round_to_zero := $7FFFFFFF;
  2921. exit;
  2922. end;
  2923. End;
  2924. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2925. exit;
  2926. End
  2927. else
  2928. if ( aExp <= $7E ) then
  2929. Begin
  2930. if ( aExp or aSig )<>0 then
  2931. set_inexact_flag;
  2932. float32_to_int32_round_to_zero := 0;
  2933. exit;
  2934. End;
  2935. aSig := ( aSig or $00800000 ) shl 8;
  2936. z := aSig shr ( - shiftCount );
  2937. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2938. Begin
  2939. set_inexact_flag;
  2940. End;
  2941. if ( aSign<>0 ) then z := - z;
  2942. float32_to_int32_round_to_zero := z;
  2943. End;
  2944. {*----------------------------------------------------------------------------
  2945. | Returns the result of converting the single-precision floating-point value
  2946. | `a' to the 64-bit two's complement integer format. The conversion is
  2947. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2948. | Arithmetic---which means in particular that the conversion is rounded
  2949. | according to the current rounding mode. If `a' is a NaN, the largest
  2950. | positive integer is returned. Otherwise, if the conversion overflows, the
  2951. | largest integer with the same sign as `a' is returned.
  2952. *----------------------------------------------------------------------------*}
  2953. function float32_to_int64( a: float32 ): int64;
  2954. var
  2955. aSign: flag;
  2956. aExp, shiftCount: int16;
  2957. aSig: bits32;
  2958. aSig64, aSigExtra: bits64;
  2959. begin
  2960. aSig := extractFloat32Frac( a );
  2961. aExp := extractFloat32Exp( a );
  2962. aSign := extractFloat32Sign( a );
  2963. shiftCount := $BE - aExp;
  2964. if ( shiftCount < 0 ) then begin
  2965. float_raise( float_flag_invalid );
  2966. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2967. result := $7FFFFFFFFFFFFFFF;
  2968. exit;
  2969. end;
  2970. result := $8000000000000000;
  2971. exit;
  2972. end;
  2973. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  2974. aSig64 := aSig;
  2975. aSig64 := aSig64 shl 40;
  2976. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  2977. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  2978. end;
  2979. {*----------------------------------------------------------------------------
  2980. | Returns the result of converting the single-precision floating-point value
  2981. | `a' to the 64-bit two's complement integer format. The conversion is
  2982. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2983. | Arithmetic, except that the conversion is always rounded toward zero. If
  2984. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  2985. | conversion overflows, the largest integer with the same sign as `a' is
  2986. | returned.
  2987. *----------------------------------------------------------------------------*}
  2988. function float32_to_int64_round_to_zero( a: float32 ): int64;
  2989. var
  2990. aSign: flag;
  2991. aExp, shiftCount: int16;
  2992. aSig: bits32;
  2993. aSig64: bits64;
  2994. z: int64;
  2995. begin
  2996. aSig := extractFloat32Frac( a );
  2997. aExp := extractFloat32Exp( a );
  2998. aSign := extractFloat32Sign( a );
  2999. shiftCount := aExp - $BE;
  3000. if ( 0 <= shiftCount ) then begin
  3001. if ( a <> $DF000000 ) then begin
  3002. float_raise( float_flag_invalid );
  3003. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3004. result := $7FFFFFFFFFFFFFFF;
  3005. exit;
  3006. end;
  3007. end;
  3008. result := $8000000000000000;
  3009. exit;
  3010. end
  3011. else if ( aExp <= $7E ) then begin
  3012. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3013. result := 0;
  3014. exit;
  3015. end;
  3016. aSig64 := aSig or $00800000;
  3017. aSig64 := aSig64 shl 40;
  3018. z := aSig64 shr ( - shiftCount );
  3019. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3020. set_inexact_flag;
  3021. if ( aSign <> 0 ) then z := - z;
  3022. result := z;
  3023. end;
  3024. {*
  3025. -------------------------------------------------------------------------------
  3026. Returns the result of converting the single-precision floating-point value
  3027. `a' to the double-precision floating-point format. The conversion is
  3028. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3029. Arithmetic.
  3030. -------------------------------------------------------------------------------
  3031. *}
  3032. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  3033. Var
  3034. aSign : flag;
  3035. aExp : int16;
  3036. aSig, zSig0, zSig1: bits32;
  3037. tmp : CommonNanT;
  3038. Begin
  3039. aSig := extractFloat32Frac( a.float32 );
  3040. aExp := extractFloat32Exp( a.float32 );
  3041. aSign := extractFloat32Sign( a.float32 );
  3042. if ( aExp = $FF ) then
  3043. Begin
  3044. if ( aSig<>0 ) then
  3045. Begin
  3046. float32ToCommonNaN(a.float32, tmp);
  3047. commonNaNToFloat64(tmp , result);
  3048. exit;
  3049. End;
  3050. packFloat64( aSign, $7FF, 0, 0, result);
  3051. exit;
  3052. End;
  3053. if ( aExp = 0 ) then
  3054. Begin
  3055. if ( aSig = 0 ) then
  3056. Begin
  3057. packFloat64( aSign, 0, 0, 0, result );
  3058. exit;
  3059. end;
  3060. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3061. Dec(aExp);
  3062. End;
  3063. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3064. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3065. End;
  3066. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3067. {*----------------------------------------------------------------------------
  3068. | Returns the result of converting the canonical NaN `a' to the extended
  3069. | double-precision floating-point format.
  3070. *----------------------------------------------------------------------------*}
  3071. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3072. var
  3073. z : floatx80;
  3074. begin
  3075. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3076. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3077. result := z;
  3078. end;
  3079. {*----------------------------------------------------------------------------
  3080. | Returns the result of converting the single-precision floating-point value
  3081. | `a' to the extended double-precision floating-point format. The conversion
  3082. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3083. | Arithmetic.
  3084. *----------------------------------------------------------------------------*}
  3085. function float32_to_floatx80( a: float32 ): floatx80;
  3086. var
  3087. aSign: flag;
  3088. aExp: int16;
  3089. aSig: bits32;
  3090. tmp: commonNaNT;
  3091. begin
  3092. aSig := extractFloat32Frac( a );
  3093. aExp := extractFloat32Exp( a );
  3094. aSign := extractFloat32Sign( a );
  3095. if ( aExp = $FF ) then begin
  3096. if ( aSig <> 0 ) then begin
  3097. float32ToCommonNaN( a, tmp );
  3098. result := commonNaNToFloatx80( tmp );
  3099. exit;
  3100. end;
  3101. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3102. exit;
  3103. end;
  3104. if ( aExp = 0 ) then begin
  3105. if ( aSig = 0 ) then begin
  3106. result := packFloatx80( aSign, 0, 0 );
  3107. exit;
  3108. end;
  3109. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3110. end;
  3111. aSig := aSig or $00800000;
  3112. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3113. end;
  3114. {$endif FPC_SOFTFLOAT_FLOATX80}
  3115. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3116. {*----------------------------------------------------------------------------
  3117. | Returns the result of converting the single-precision floating-point value
  3118. | `a' to the double-precision floating-point format. The conversion is
  3119. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3120. | Arithmetic.
  3121. *----------------------------------------------------------------------------*}
  3122. function float32_to_float128( a: float32 ): float128;
  3123. var
  3124. aSign: flag;
  3125. aExp: int16;
  3126. aSig: bits32;
  3127. tmp: commonNaNT;
  3128. begin
  3129. aSig := extractFloat32Frac( a );
  3130. aExp := extractFloat32Exp( a );
  3131. aSign := extractFloat32Sign( a );
  3132. if ( aExp = $FF ) then begin
  3133. if ( aSig <> 0 ) then begin
  3134. float32ToCommonNaN( a, tmp );
  3135. result := commonNaNToFloat128( tmp );
  3136. exit;
  3137. end;
  3138. result := packFloat128( aSign, $7FFF, 0, 0 );
  3139. exit;
  3140. end;
  3141. if ( aExp = 0 ) then begin
  3142. if ( aSig = 0 ) then begin
  3143. result := packFloat128( aSign, 0, 0, 0 );
  3144. exit;
  3145. end;
  3146. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3147. dec( aExp );
  3148. end;
  3149. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3150. end;
  3151. {$endif FPC_SOFTFLOAT_FLOAT128}
  3152. {*
  3153. -------------------------------------------------------------------------------
  3154. Rounds the single-precision floating-point value `a' to an integer,
  3155. and returns the result as a single-precision floating-point value. The
  3156. operation is performed according to the IEC/IEEE Standard for Binary
  3157. Floating-Point Arithmetic.
  3158. -------------------------------------------------------------------------------
  3159. *}
  3160. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  3161. Var
  3162. aSign: flag;
  3163. aExp: int16;
  3164. lastBitMask, roundBitsMask: bits32;
  3165. roundingMode: TFPURoundingMode;
  3166. z: float32;
  3167. Begin
  3168. aExp := extractFloat32Exp( a.float32 );
  3169. if ( $96 <= aExp ) then
  3170. Begin
  3171. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3172. Begin
  3173. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3174. exit;
  3175. End;
  3176. float32_round_to_int:=a;
  3177. exit;
  3178. End;
  3179. if ( aExp <= $7E ) then
  3180. Begin
  3181. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3182. Begin
  3183. float32_round_to_int:=a;
  3184. exit;
  3185. end;
  3186. set_inexact_flag;
  3187. aSign := extractFloat32Sign( a.float32 );
  3188. case ( softfloat_rounding_mode ) of
  3189. float_round_nearest_even:
  3190. Begin
  3191. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3192. Begin
  3193. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3194. exit;
  3195. End;
  3196. End;
  3197. float_round_down:
  3198. Begin
  3199. if aSign <> 0 then
  3200. float32_round_to_int.float32 := $BF800000
  3201. else
  3202. float32_round_to_int.float32 := 0;
  3203. exit;
  3204. End;
  3205. float_round_up:
  3206. Begin
  3207. if aSign <> 0 then
  3208. float32_round_to_int.float32 := $80000000
  3209. else
  3210. float32_round_to_int.float32 := $3F800000;
  3211. exit;
  3212. End;
  3213. end;
  3214. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3215. exit;
  3216. End;
  3217. lastBitMask := 1;
  3218. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3219. lastBitMask := lastBitMask shl ($96 - aExp);
  3220. roundBitsMask := lastBitMask - 1;
  3221. z := a.float32;
  3222. roundingMode := softfloat_rounding_mode;
  3223. if ( roundingMode = float_round_nearest_even ) then
  3224. Begin
  3225. z := z + (lastBitMask shr 1);
  3226. if ( ( z and roundBitsMask ) = 0 ) then
  3227. z := z and not lastBitMask;
  3228. End
  3229. else if ( roundingMode <> float_round_to_zero ) then
  3230. Begin
  3231. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3232. Begin
  3233. z := z + roundBitsMask;
  3234. End;
  3235. End;
  3236. z := z and not roundBitsMask;
  3237. if ( z <> a.float32 ) then
  3238. set_inexact_flag;
  3239. float32_round_to_int.float32 := z;
  3240. End;
  3241. {*
  3242. -------------------------------------------------------------------------------
  3243. Returns the result of adding the absolute values of the single-precision
  3244. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3245. before being returned. `zSign' is ignored if the result is a NaN.
  3246. The addition is performed according to the IEC/IEEE Standard for Binary
  3247. Floating-Point Arithmetic.
  3248. -------------------------------------------------------------------------------
  3249. *}
  3250. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3251. Var
  3252. aExp, bExp, zExp: int16;
  3253. aSig, bSig, zSig: bits32;
  3254. expDiff: int16;
  3255. label roundAndPack;
  3256. Begin
  3257. aSig:=extractFloat32Frac( a );
  3258. aExp:=extractFloat32Exp( a );
  3259. bSig:=extractFloat32Frac( b );
  3260. bExp := extractFloat32Exp( b );
  3261. expDiff := aExp - bExp;
  3262. aSig := aSig shl 6;
  3263. bSig := bSig shl 6;
  3264. if ( 0 < expDiff ) then
  3265. Begin
  3266. if ( aExp = $FF ) then
  3267. Begin
  3268. if ( aSig <> 0) then
  3269. Begin
  3270. addFloat32Sigs := propagateFloat32NaN( a, b );
  3271. exit;
  3272. End;
  3273. addFloat32Sigs := a;
  3274. exit;
  3275. End;
  3276. if ( bExp = 0 ) then
  3277. Begin
  3278. Dec(expDiff);
  3279. End
  3280. else
  3281. Begin
  3282. bSig := bSig or $20000000;
  3283. End;
  3284. shift32RightJamming( bSig, expDiff, bSig );
  3285. zExp := aExp;
  3286. End
  3287. else
  3288. If ( expDiff < 0 ) then
  3289. Begin
  3290. if ( bExp = $FF ) then
  3291. Begin
  3292. if ( bSig<>0 ) then
  3293. Begin
  3294. addFloat32Sigs := propagateFloat32NaN( a, b );
  3295. exit;
  3296. end;
  3297. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3298. exit;
  3299. End;
  3300. if ( aExp = 0 ) then
  3301. Begin
  3302. Inc(expDiff);
  3303. End
  3304. else
  3305. Begin
  3306. aSig := aSig OR $20000000;
  3307. End;
  3308. shift32RightJamming( aSig, - expDiff, aSig );
  3309. zExp := bExp;
  3310. End
  3311. else
  3312. Begin
  3313. if ( aExp = $FF ) then
  3314. Begin
  3315. if ( aSig OR bSig )<> 0 then
  3316. Begin
  3317. addFloat32Sigs := propagateFloat32NaN( a, b );
  3318. exit;
  3319. end;
  3320. addFloat32Sigs := a;
  3321. exit;
  3322. End;
  3323. if ( aExp = 0 ) then
  3324. Begin
  3325. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3326. exit;
  3327. end;
  3328. zSig := $40000000 + aSig + bSig;
  3329. zExp := aExp;
  3330. goto roundAndPack;
  3331. End;
  3332. aSig := aSig OR $20000000;
  3333. zSig := ( aSig + bSig ) shl 1;
  3334. Dec(zExp);
  3335. if ( sbits32 (zSig) < 0 ) then
  3336. Begin
  3337. zSig := aSig + bSig;
  3338. Inc(zExp);
  3339. End;
  3340. roundAndPack:
  3341. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3342. End;
  3343. {*
  3344. -------------------------------------------------------------------------------
  3345. Returns the result of subtracting the absolute values of the single-
  3346. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3347. difference is negated before being returned. `zSign' is ignored if the
  3348. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3349. Standard for Binary Floating-Point Arithmetic.
  3350. -------------------------------------------------------------------------------
  3351. *}
  3352. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3353. Var
  3354. aExp, bExp, zExp: int16;
  3355. aSig, bSig, zSig: bits32;
  3356. expDiff : int16;
  3357. label aExpBigger;
  3358. label bExpBigger;
  3359. label aBigger;
  3360. label bBigger;
  3361. label normalizeRoundAndPack;
  3362. Begin
  3363. aSig := extractFloat32Frac( a );
  3364. aExp := extractFloat32Exp( a );
  3365. bSig := extractFloat32Frac( b );
  3366. bExp := extractFloat32Exp( b );
  3367. expDiff := aExp - bExp;
  3368. aSig := aSig shl 7;
  3369. bSig := bSig shl 7;
  3370. if ( 0 < expDiff ) then goto aExpBigger;
  3371. if ( expDiff < 0 ) then goto bExpBigger;
  3372. if ( aExp = $FF ) then
  3373. Begin
  3374. if ( aSig OR bSig )<> 0 then
  3375. Begin
  3376. subFloat32Sigs := propagateFloat32NaN( a, b );
  3377. exit;
  3378. End;
  3379. float_raise( float_flag_invalid );
  3380. subFloat32Sigs := float32_default_nan;
  3381. exit;
  3382. End;
  3383. if ( aExp = 0 ) then
  3384. Begin
  3385. aExp := 1;
  3386. bExp := 1;
  3387. End;
  3388. if ( bSig < aSig ) Then goto aBigger;
  3389. if ( aSig < bSig ) Then goto bBigger;
  3390. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3391. exit;
  3392. bExpBigger:
  3393. if ( bExp = $FF ) then
  3394. Begin
  3395. if ( bSig<>0 ) then
  3396. Begin
  3397. subFloat32Sigs := propagateFloat32NaN( a, b );
  3398. exit;
  3399. End;
  3400. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3401. exit;
  3402. End;
  3403. if ( aExp = 0 ) then
  3404. Begin
  3405. Inc(expDiff);
  3406. End
  3407. else
  3408. Begin
  3409. aSig := aSig OR $40000000;
  3410. End;
  3411. shift32RightJamming( aSig, - expDiff, aSig );
  3412. bSig := bSig OR $40000000;
  3413. bBigger:
  3414. zSig := bSig - aSig;
  3415. zExp := bExp;
  3416. zSign := zSign xor 1;
  3417. goto normalizeRoundAndPack;
  3418. aExpBigger:
  3419. if ( aExp = $FF ) then
  3420. Begin
  3421. if ( aSig <> 0) then
  3422. Begin
  3423. subFloat32Sigs := propagateFloat32NaN( a, b );
  3424. exit;
  3425. End;
  3426. subFloat32Sigs := a;
  3427. exit;
  3428. End;
  3429. if ( bExp = 0 ) then
  3430. Begin
  3431. Dec(expDiff);
  3432. End
  3433. else
  3434. Begin
  3435. bSig := bSig OR $40000000;
  3436. End;
  3437. shift32RightJamming( bSig, expDiff, bSig );
  3438. aSig := aSig OR $40000000;
  3439. aBigger:
  3440. zSig := aSig - bSig;
  3441. zExp := aExp;
  3442. normalizeRoundAndPack:
  3443. Dec(zExp);
  3444. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3445. End;
  3446. {*
  3447. -------------------------------------------------------------------------------
  3448. Returns the result of adding the single-precision floating-point values `a'
  3449. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3450. Binary Floating-Point Arithmetic.
  3451. -------------------------------------------------------------------------------
  3452. *}
  3453. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3454. Var
  3455. aSign, bSign: Flag;
  3456. Begin
  3457. aSign := extractFloat32Sign( a.float32 );
  3458. bSign := extractFloat32Sign( b.float32 );
  3459. if ( aSign = bSign ) then
  3460. Begin
  3461. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3462. End
  3463. else
  3464. Begin
  3465. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3466. End;
  3467. End;
  3468. {*
  3469. -------------------------------------------------------------------------------
  3470. Returns the result of subtracting the single-precision floating-point values
  3471. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3472. for Binary Floating-Point Arithmetic.
  3473. -------------------------------------------------------------------------------
  3474. *}
  3475. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3476. Var
  3477. aSign, bSign: flag;
  3478. Begin
  3479. aSign := extractFloat32Sign( a.float32 );
  3480. bSign := extractFloat32Sign( b.float32 );
  3481. if ( aSign = bSign ) then
  3482. Begin
  3483. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3484. End
  3485. else
  3486. Begin
  3487. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3488. End;
  3489. End;
  3490. {*
  3491. -------------------------------------------------------------------------------
  3492. Returns the result of multiplying the single-precision floating-point values
  3493. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3494. for Binary Floating-Point Arithmetic.
  3495. -------------------------------------------------------------------------------
  3496. *}
  3497. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3498. Var
  3499. aSign, bSign, zSign: flag;
  3500. aExp, bExp, zExp : int16;
  3501. aSig, bSig, zSig0, zSig1: bits32;
  3502. Begin
  3503. aSig := extractFloat32Frac( a.float32 );
  3504. aExp := extractFloat32Exp( a.float32 );
  3505. aSign := extractFloat32Sign( a.float32 );
  3506. bSig := extractFloat32Frac( b.float32 );
  3507. bExp := extractFloat32Exp( b.float32 );
  3508. bSign := extractFloat32Sign( b.float32 );
  3509. zSign := aSign xor bSign;
  3510. if ( aExp = $FF ) then
  3511. Begin
  3512. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3513. Begin
  3514. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3515. exit;
  3516. End;
  3517. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3518. Begin
  3519. float_raise( float_flag_invalid );
  3520. float32_mul.float32 := float32_default_nan;
  3521. exit;
  3522. End;
  3523. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3524. exit;
  3525. End;
  3526. if ( bExp = $FF ) then
  3527. Begin
  3528. if ( bSig <> 0 ) then
  3529. Begin
  3530. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3531. exit;
  3532. End;
  3533. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3534. Begin
  3535. float_raise( float_flag_invalid );
  3536. float32_mul.float32 := float32_default_nan;
  3537. exit;
  3538. End;
  3539. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3540. exit;
  3541. End;
  3542. if ( aExp = 0 ) then
  3543. Begin
  3544. if ( aSig = 0 ) then
  3545. Begin
  3546. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3547. exit;
  3548. End;
  3549. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3550. End;
  3551. if ( bExp = 0 ) then
  3552. Begin
  3553. if ( bSig = 0 ) then
  3554. Begin
  3555. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3556. exit;
  3557. End;
  3558. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3559. End;
  3560. zExp := aExp + bExp - $7F;
  3561. aSig := ( aSig OR $00800000 ) shl 7;
  3562. bSig := ( bSig OR $00800000 ) shl 8;
  3563. mul32To64( aSig, bSig, zSig0, zSig1 );
  3564. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3565. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3566. Begin
  3567. zSig0 := zSig0 shl 1;
  3568. Dec(zExp);
  3569. End;
  3570. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3571. End;
  3572. {*
  3573. -------------------------------------------------------------------------------
  3574. Returns the result of dividing the single-precision floating-point value `a'
  3575. by the corresponding value `b'. The operation is performed according to the
  3576. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3577. -------------------------------------------------------------------------------
  3578. *}
  3579. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3580. Var
  3581. aSign, bSign, zSign: flag;
  3582. aExp, bExp, zExp: int16;
  3583. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3584. Begin
  3585. aSig := extractFloat32Frac( a.float32 );
  3586. aExp := extractFloat32Exp( a.float32 );
  3587. aSign := extractFloat32Sign( a.float32 );
  3588. bSig := extractFloat32Frac( b.float32 );
  3589. bExp := extractFloat32Exp( b.float32 );
  3590. bSign := extractFloat32Sign( b.float32 );
  3591. zSign := aSign xor bSign;
  3592. if ( aExp = $FF ) then
  3593. Begin
  3594. if ( aSig <> 0 ) then
  3595. Begin
  3596. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3597. exit;
  3598. End;
  3599. if ( bExp = $FF ) then
  3600. Begin
  3601. if ( bSig <> 0) then
  3602. Begin
  3603. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3604. exit;
  3605. End;
  3606. float_raise( float_flag_invalid );
  3607. float32_div.float32 := float32_default_nan;
  3608. exit;
  3609. End;
  3610. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3611. exit;
  3612. End;
  3613. if ( bExp = $FF ) then
  3614. Begin
  3615. if ( bSig <> 0) then
  3616. Begin
  3617. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3618. exit;
  3619. End;
  3620. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3621. exit;
  3622. End;
  3623. if ( bExp = 0 ) Then
  3624. Begin
  3625. if ( bSig = 0 ) Then
  3626. Begin
  3627. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3628. Begin
  3629. float_raise( float_flag_invalid );
  3630. float32_div.float32 := float32_default_nan;
  3631. exit;
  3632. End;
  3633. float_raise( float_flag_divbyzero );
  3634. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3635. exit;
  3636. End;
  3637. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3638. End;
  3639. if ( aExp = 0 ) Then
  3640. Begin
  3641. if ( aSig = 0 ) Then
  3642. Begin
  3643. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3644. exit;
  3645. End;
  3646. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3647. End;
  3648. zExp := aExp - bExp + $7D;
  3649. aSig := ( aSig OR $00800000 ) shl 7;
  3650. bSig := ( bSig OR $00800000 ) shl 8;
  3651. if ( bSig <= ( aSig + aSig ) ) then
  3652. Begin
  3653. aSig := aSig shr 1;
  3654. Inc(zExp);
  3655. End;
  3656. zSig := estimateDiv64To32( aSig, 0, bSig );
  3657. if ( ( zSig and $3F ) <= 2 ) then
  3658. Begin
  3659. mul32To64( bSig, zSig, term0, term1 );
  3660. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3661. while ( sbits32 (rem0) < 0 ) do
  3662. Begin
  3663. Dec(zSig);
  3664. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3665. End;
  3666. zSig := zSig or bits32( rem1 <> 0 );
  3667. End;
  3668. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3669. End;
  3670. {*
  3671. -------------------------------------------------------------------------------
  3672. Returns the remainder of the single-precision floating-point value `a'
  3673. with respect to the corresponding value `b'. The operation is performed
  3674. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3675. -------------------------------------------------------------------------------
  3676. *}
  3677. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3678. Var
  3679. aSign, zSign: flag;
  3680. aExp, bExp, expDiff: int16;
  3681. aSig, bSig, q, alternateASig: bits32;
  3682. sigMean: sbits32;
  3683. Begin
  3684. aSig := extractFloat32Frac( a.float32 );
  3685. aExp := extractFloat32Exp( a.float32 );
  3686. aSign := extractFloat32Sign( a.float32 );
  3687. bSig := extractFloat32Frac( b.float32 );
  3688. bExp := extractFloat32Exp( b.float32 );
  3689. if ( aExp = $FF ) then
  3690. Begin
  3691. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3692. Begin
  3693. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3694. exit;
  3695. End;
  3696. float_raise( float_flag_invalid );
  3697. float32_rem.float32 := float32_default_nan;
  3698. exit;
  3699. End;
  3700. if ( bExp = $FF ) then
  3701. Begin
  3702. if ( bSig <> 0 ) then
  3703. Begin
  3704. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3705. exit;
  3706. End;
  3707. float32_rem := a;
  3708. exit;
  3709. End;
  3710. if ( bExp = 0 ) then
  3711. Begin
  3712. if ( bSig = 0 ) then
  3713. Begin
  3714. float_raise( float_flag_invalid );
  3715. float32_rem.float32 := float32_default_nan;
  3716. exit;
  3717. End;
  3718. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3719. End;
  3720. if ( aExp = 0 ) then
  3721. Begin
  3722. if ( aSig = 0 ) then
  3723. Begin
  3724. float32_rem := a;
  3725. exit;
  3726. End;
  3727. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3728. End;
  3729. expDiff := aExp - bExp;
  3730. aSig := ( aSig OR $00800000 ) shl 8;
  3731. bSig := ( bSig OR $00800000 ) shl 8;
  3732. if ( expDiff < 0 ) then
  3733. Begin
  3734. if ( expDiff < -1 ) then
  3735. Begin
  3736. float32_rem := a;
  3737. exit;
  3738. End;
  3739. aSig := aSig shr 1;
  3740. End;
  3741. q := bits32( bSig <= aSig );
  3742. if ( q <> 0) then
  3743. aSig := aSig - bSig;
  3744. expDiff := expDiff - 32;
  3745. while ( 0 < expDiff ) do
  3746. Begin
  3747. q := estimateDiv64To32( aSig, 0, bSig );
  3748. if (2 < q) then
  3749. q := q - 2
  3750. else
  3751. q := 0;
  3752. aSig := - ( ( bSig shr 2 ) * q );
  3753. expDiff := expDiff - 30;
  3754. End;
  3755. expDiff := expDiff + 32;
  3756. if ( 0 < expDiff ) then
  3757. Begin
  3758. q := estimateDiv64To32( aSig, 0, bSig );
  3759. if (2 < q) then
  3760. q := q - 2
  3761. else
  3762. q := 0;
  3763. q := q shr (32 - expDiff);
  3764. bSig := bSig shr 2;
  3765. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3766. End
  3767. else
  3768. Begin
  3769. aSig := aSig shr 2;
  3770. bSig := bSig shr 2;
  3771. End;
  3772. Repeat
  3773. alternateASig := aSig;
  3774. Inc(q);
  3775. aSig := aSig - bSig;
  3776. Until not ( 0 <= sbits32 (aSig) );
  3777. sigMean := aSig + alternateASig;
  3778. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3779. Begin
  3780. aSig := alternateASig;
  3781. End;
  3782. zSign := flag( sbits32 (aSig) < 0 );
  3783. if ( zSign<>0 ) then
  3784. aSig := - aSig;
  3785. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3786. End;
  3787. {*
  3788. -------------------------------------------------------------------------------
  3789. Returns the square root of the single-precision floating-point value `a'.
  3790. The operation is performed according to the IEC/IEEE Standard for Binary
  3791. Floating-Point Arithmetic.
  3792. -------------------------------------------------------------------------------
  3793. *}
  3794. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3795. Var
  3796. aSign : flag;
  3797. aExp, zExp : int16;
  3798. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3799. label roundAndPack;
  3800. Begin
  3801. aSig := extractFloat32Frac( a.float32 );
  3802. aExp := extractFloat32Exp( a.float32 );
  3803. aSign := extractFloat32Sign( a.float32 );
  3804. if ( aExp = $FF ) then
  3805. Begin
  3806. if ( aSig <> 0) then
  3807. Begin
  3808. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3809. exit;
  3810. End;
  3811. if ( aSign = 0) then
  3812. Begin
  3813. float32_sqrt := a;
  3814. exit;
  3815. End;
  3816. float_raise( float_flag_invalid );
  3817. float32_sqrt.float32 := float32_default_nan;
  3818. exit;
  3819. End;
  3820. if ( aSign <> 0) then
  3821. Begin
  3822. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3823. Begin
  3824. float32_sqrt := a;
  3825. exit;
  3826. End;
  3827. float_raise( float_flag_invalid );
  3828. float32_sqrt.float32 := float32_default_nan;
  3829. exit;
  3830. End;
  3831. if ( aExp = 0 ) then
  3832. Begin
  3833. if ( aSig = 0 ) then
  3834. Begin
  3835. float32_sqrt.float32 := 0;
  3836. exit;
  3837. End;
  3838. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3839. End;
  3840. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3841. aSig := ( aSig OR $00800000 ) shl 8;
  3842. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3843. if ( ( zSig and $7F ) <= 5 ) then
  3844. Begin
  3845. if ( zSig < 2 ) then
  3846. Begin
  3847. zSig := $7FFFFFFF;
  3848. goto roundAndPack;
  3849. End
  3850. else
  3851. Begin
  3852. aSig := aSig shr (aExp and 1);
  3853. mul32To64( zSig, zSig, term0, term1 );
  3854. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3855. while ( sbits32 (rem0) < 0 ) do
  3856. Begin
  3857. Dec(zSig);
  3858. shortShift64Left( 0, zSig, 1, term0, term1 );
  3859. term1 := term1 or 1;
  3860. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3861. End;
  3862. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3863. End;
  3864. End;
  3865. shift32RightJamming( zSig, 1, zSig );
  3866. roundAndPack:
  3867. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3868. End;
  3869. {*
  3870. -------------------------------------------------------------------------------
  3871. Returns 1 if the single-precision floating-point value `a' is equal to
  3872. the corresponding value `b', and 0 otherwise. The comparison is performed
  3873. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3874. -------------------------------------------------------------------------------
  3875. *}
  3876. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3877. Begin
  3878. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3879. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3880. ) then
  3881. Begin
  3882. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3883. Begin
  3884. float_raise( float_flag_invalid );
  3885. End;
  3886. float32_eq := 0;
  3887. exit;
  3888. End;
  3889. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3890. End;
  3891. {*
  3892. -------------------------------------------------------------------------------
  3893. Returns 1 if the single-precision floating-point value `a' is less than
  3894. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3895. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3896. Arithmetic.
  3897. -------------------------------------------------------------------------------
  3898. *}
  3899. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3900. var
  3901. aSign, bSign: flag;
  3902. Begin
  3903. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3904. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3905. ) then
  3906. Begin
  3907. float_raise( float_flag_invalid );
  3908. float32_le := 0;
  3909. exit;
  3910. End;
  3911. aSign := extractFloat32Sign( a.float32 );
  3912. bSign := extractFloat32Sign( b.float32 );
  3913. if ( aSign <> bSign ) then
  3914. Begin
  3915. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3916. exit;
  3917. End;
  3918. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3919. End;
  3920. {*
  3921. -------------------------------------------------------------------------------
  3922. Returns 1 if the single-precision floating-point value `a' is less than
  3923. the corresponding value `b', and 0 otherwise. The comparison is performed
  3924. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3925. -------------------------------------------------------------------------------
  3926. *}
  3927. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3928. var
  3929. aSign, bSign: flag;
  3930. Begin
  3931. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3932. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3933. ) then
  3934. Begin
  3935. float_raise( float_flag_invalid );
  3936. float32_lt :=0;
  3937. exit;
  3938. End;
  3939. aSign := extractFloat32Sign( a.float32 );
  3940. bSign := extractFloat32Sign( b.float32 );
  3941. if ( aSign <> bSign ) then
  3942. Begin
  3943. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3944. exit;
  3945. End;
  3946. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3947. End;
  3948. {*
  3949. -------------------------------------------------------------------------------
  3950. Returns 1 if the single-precision floating-point value `a' is equal to
  3951. the corresponding value `b', and 0 otherwise. The invalid exception is
  3952. raised if either operand is a NaN. Otherwise, the comparison is performed
  3953. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3954. -------------------------------------------------------------------------------
  3955. *}
  3956. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3957. Begin
  3958. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3959. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3960. ) then
  3961. Begin
  3962. float_raise( float_flag_invalid );
  3963. float32_eq_signaling := 0;
  3964. exit;
  3965. End;
  3966. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3967. End;
  3968. {*
  3969. -------------------------------------------------------------------------------
  3970. Returns 1 if the single-precision floating-point value `a' is less than or
  3971. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3972. cause an exception. Otherwise, the comparison is performed according to the
  3973. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3974. -------------------------------------------------------------------------------
  3975. *}
  3976. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3977. Var
  3978. aSign, bSign: flag;
  3979. Begin
  3980. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3981. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3982. ) then
  3983. Begin
  3984. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3985. Begin
  3986. float_raise( float_flag_invalid );
  3987. End;
  3988. float32_le_quiet := 0;
  3989. exit;
  3990. End;
  3991. aSign := extractFloat32Sign( a );
  3992. bSign := extractFloat32Sign( b );
  3993. if ( aSign <> bSign ) then
  3994. Begin
  3995. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3996. exit;
  3997. End;
  3998. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3999. End;
  4000. {*
  4001. -------------------------------------------------------------------------------
  4002. Returns 1 if the single-precision floating-point value `a' is less than
  4003. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4004. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4005. Standard for Binary Floating-Point Arithmetic.
  4006. -------------------------------------------------------------------------------
  4007. *}
  4008. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4009. Var
  4010. aSign, bSign: flag;
  4011. Begin
  4012. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4013. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4014. ) then
  4015. Begin
  4016. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4017. Begin
  4018. float_raise( float_flag_invalid );
  4019. End;
  4020. float32_lt_quiet := 0;
  4021. exit;
  4022. End;
  4023. aSign := extractFloat32Sign( a );
  4024. bSign := extractFloat32Sign( b );
  4025. if ( aSign <> bSign ) then
  4026. Begin
  4027. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4028. exit;
  4029. End;
  4030. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4031. End;
  4032. {*
  4033. -------------------------------------------------------------------------------
  4034. Returns the result of converting the double-precision floating-point value
  4035. `a' to the 32-bit two's complement integer format. The conversion is
  4036. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4037. Arithmetic---which means in particular that the conversion is rounded
  4038. according to the current rounding mode. If `a' is a NaN, the largest
  4039. positive integer is returned. Otherwise, if the conversion overflows, the
  4040. largest integer with the same sign as `a' is returned.
  4041. -------------------------------------------------------------------------------
  4042. *}
  4043. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4044. var
  4045. aSign: flag;
  4046. aExp, shiftCount: int16;
  4047. aSig0, aSig1, absZ, aSigExtra: bits32;
  4048. z: int32;
  4049. roundingMode: TFPURoundingMode;
  4050. label invalid;
  4051. Begin
  4052. aSig1 := extractFloat64Frac1( a );
  4053. aSig0 := extractFloat64Frac0( a );
  4054. aExp := extractFloat64Exp( a );
  4055. aSign := extractFloat64Sign( a );
  4056. shiftCount := aExp - $413;
  4057. if ( 0 <= shiftCount ) then
  4058. Begin
  4059. if ( $41E < aExp ) then
  4060. Begin
  4061. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4062. aSign := 0;
  4063. goto invalid;
  4064. End;
  4065. shortShift64Left(
  4066. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4067. if ( $80000000 < absZ ) then
  4068. goto invalid;
  4069. End
  4070. else
  4071. Begin
  4072. aSig1 := flag( aSig1 <> 0 );
  4073. if ( aExp < $3FE ) then
  4074. Begin
  4075. aSigExtra := aExp OR aSig0 OR aSig1;
  4076. absZ := 0;
  4077. End
  4078. else
  4079. Begin
  4080. aSig0 := aSig0 OR $00100000;
  4081. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4082. absZ := aSig0 shr ( - shiftCount );
  4083. End;
  4084. End;
  4085. roundingMode := softfloat_rounding_mode;
  4086. if ( roundingMode = float_round_nearest_even ) then
  4087. Begin
  4088. if ( sbits32(aSigExtra) < 0 ) then
  4089. Begin
  4090. Inc(absZ);
  4091. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4092. absZ := absZ and not 1;
  4093. End;
  4094. if aSign <> 0 then
  4095. z := - absZ
  4096. else
  4097. z := absZ;
  4098. End
  4099. else
  4100. Begin
  4101. aSigExtra := bits32( aSigExtra <> 0 );
  4102. if ( aSign <> 0) then
  4103. Begin
  4104. z := - ( absZ
  4105. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4106. End
  4107. else
  4108. Begin
  4109. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4110. End
  4111. End;
  4112. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4113. Begin
  4114. invalid:
  4115. float_raise( float_flag_invalid );
  4116. if (aSign <> 0 ) then
  4117. float64_to_int32 := sbits32 ($80000000)
  4118. else
  4119. float64_to_int32 := $7FFFFFFF;
  4120. exit;
  4121. End;
  4122. if ( aSigExtra <> 0) then
  4123. set_inexact_flag;
  4124. float64_to_int32 := z;
  4125. End;
  4126. {*
  4127. -------------------------------------------------------------------------------
  4128. Returns the result of converting the double-precision floating-point value
  4129. `a' to the 32-bit two's complement integer format. The conversion is
  4130. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4131. Arithmetic, except that the conversion is always rounded toward zero.
  4132. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4133. the conversion overflows, the largest integer with the same sign as `a' is
  4134. returned.
  4135. -------------------------------------------------------------------------------
  4136. *}
  4137. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4138. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4139. Var
  4140. aSign: flag;
  4141. aExp, shiftCount: int16;
  4142. aSig0, aSig1, absZ, aSigExtra: bits32;
  4143. z: int32;
  4144. label invalid;
  4145. Begin
  4146. aSig1 := extractFloat64Frac1( a );
  4147. aSig0 := extractFloat64Frac0( a );
  4148. aExp := extractFloat64Exp( a );
  4149. aSign := extractFloat64Sign( a );
  4150. shiftCount := aExp - $413;
  4151. if ( 0 <= shiftCount ) then
  4152. Begin
  4153. if ( $41E < aExp ) then
  4154. Begin
  4155. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4156. aSign := 0;
  4157. goto invalid;
  4158. End;
  4159. shortShift64Left(
  4160. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4161. End
  4162. else
  4163. Begin
  4164. if ( aExp < $3FF ) then
  4165. Begin
  4166. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4167. Begin
  4168. set_inexact_flag;
  4169. End;
  4170. float64_to_int32_round_to_zero := 0;
  4171. exit;
  4172. End;
  4173. aSig0 := aSig0 or $00100000;
  4174. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4175. absZ := aSig0 shr ( - shiftCount );
  4176. End;
  4177. if aSign <> 0 then
  4178. z := - absZ
  4179. else
  4180. z := absZ;
  4181. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4182. Begin
  4183. invalid:
  4184. float_raise( float_flag_invalid );
  4185. if (aSign <> 0) then
  4186. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4187. else
  4188. float64_to_int32_round_to_zero := $7FFFFFFF;
  4189. exit;
  4190. End;
  4191. if ( aSigExtra <> 0) then
  4192. set_inexact_flag;
  4193. float64_to_int32_round_to_zero := z;
  4194. End;
  4195. {*----------------------------------------------------------------------------
  4196. | Returns the result of converting the double-precision floating-point value
  4197. | `a' to the 64-bit two's complement integer format. The conversion is
  4198. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4199. | Arithmetic---which means in particular that the conversion is rounded
  4200. | according to the current rounding mode. If `a' is a NaN, the largest
  4201. | positive integer is returned. Otherwise, if the conversion overflows, the
  4202. | largest integer with the same sign as `a' is returned.
  4203. *----------------------------------------------------------------------------*}
  4204. function float64_to_int64( a: float64 ): int64;
  4205. var
  4206. aSign: flag;
  4207. aExp, shiftCount: int16;
  4208. aSig, aSigExtra: bits64;
  4209. begin
  4210. aSig := extractFloat64Frac( a );
  4211. aExp := extractFloat64Exp( a );
  4212. aSign := extractFloat64Sign( a );
  4213. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4214. shiftCount := $433 - aExp;
  4215. if ( shiftCount <= 0 ) then begin
  4216. if ( $43E < aExp ) then begin
  4217. float_raise( float_flag_invalid );
  4218. if ( ( aSign = 0 )
  4219. or ( ( aExp = $7FF )
  4220. and ( aSig <> $0010000000000000 ) )
  4221. ) then begin
  4222. result := $7FFFFFFFFFFFFFFF;
  4223. exit;
  4224. end;
  4225. result := $8000000000000000;
  4226. exit;
  4227. end;
  4228. aSigExtra := 0;
  4229. aSig := aSig shl ( - shiftCount );
  4230. end
  4231. else
  4232. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4233. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4234. end;
  4235. {*----------------------------------------------------------------------------
  4236. | Returns the result of converting the double-precision floating-point value
  4237. | `a' to the 64-bit two's complement integer format. The conversion is
  4238. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4239. | Arithmetic, except that the conversion is always rounded toward zero.
  4240. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4241. | the conversion overflows, the largest integer with the same sign as `a' is
  4242. | returned.
  4243. *----------------------------------------------------------------------------*}
  4244. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4245. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4246. var
  4247. aSign: flag;
  4248. aExp, shiftCount: int16;
  4249. aSig: bits64;
  4250. z: int64;
  4251. begin
  4252. aSig := extractFloat64Frac( a );
  4253. aExp := extractFloat64Exp( a );
  4254. aSign := extractFloat64Sign( a );
  4255. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4256. shiftCount := aExp - $433;
  4257. if ( 0 <= shiftCount ) then begin
  4258. if ( $43E <= aExp ) then begin
  4259. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4260. float_raise( float_flag_invalid );
  4261. if ( ( aSign = 0 )
  4262. or ( ( aExp = $7FF )
  4263. and ( aSig <> $0010000000000000 ) )
  4264. ) then begin
  4265. result := $7FFFFFFFFFFFFFFF;
  4266. exit;
  4267. end;
  4268. end;
  4269. result := $8000000000000000;
  4270. exit;
  4271. end;
  4272. z := aSig shl shiftCount;
  4273. end
  4274. else begin
  4275. if ( aExp < $3FE ) then begin
  4276. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4277. result := 0;
  4278. exit;
  4279. end;
  4280. z := aSig shr ( - shiftCount );
  4281. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4282. set_inexact_flag;
  4283. end;
  4284. if ( aSign <> 0 ) then z := - z;
  4285. result := z;
  4286. end;
  4287. {*
  4288. -------------------------------------------------------------------------------
  4289. Returns the result of converting the double-precision floating-point value
  4290. `a' to the single-precision floating-point format. The conversion is
  4291. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4292. Arithmetic.
  4293. -------------------------------------------------------------------------------
  4294. *}
  4295. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  4296. Var
  4297. aSign: flag;
  4298. aExp: int16;
  4299. aSig0, aSig1, zSig: bits32;
  4300. allZero: bits32;
  4301. tmp : CommonNanT;
  4302. Begin
  4303. aSig1 := extractFloat64Frac1( a );
  4304. aSig0 := extractFloat64Frac0( a );
  4305. aExp := extractFloat64Exp( a );
  4306. aSign := extractFloat64Sign( a );
  4307. if ( aExp = $7FF ) then
  4308. Begin
  4309. if ( aSig0 OR aSig1 ) <> 0 then
  4310. Begin
  4311. float64ToCommonNaN( a, tmp );
  4312. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4313. exit;
  4314. End;
  4315. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4316. exit;
  4317. End;
  4318. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4319. if ( aExp <> 0) then
  4320. zSig := zSig OR $40000000;
  4321. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4322. End;
  4323. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4324. {*----------------------------------------------------------------------------
  4325. | Returns the result of converting the double-precision floating-point value
  4326. | `a' to the extended double-precision floating-point format. The conversion
  4327. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4328. | Arithmetic.
  4329. *----------------------------------------------------------------------------*}
  4330. function float64_to_floatx80( a: float64 ): floatx80;
  4331. var
  4332. aSign: flag;
  4333. aExp: int16;
  4334. aSig: bits64;
  4335. begin
  4336. aSig := extractFloat64Frac( a );
  4337. aExp := extractFloat64Exp( a );
  4338. aSign := extractFloat64Sign( a );
  4339. if ( aExp = $7FF ) then begin
  4340. if ( aSig <> 0 ) then begin
  4341. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4342. exit;
  4343. end;
  4344. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4345. exit;
  4346. end;
  4347. if ( aExp = 0 ) then begin
  4348. if ( aSig = 0 ) then begin
  4349. result := packFloatx80( aSign, 0, 0 );
  4350. exit;
  4351. end;
  4352. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4353. end;
  4354. result :=
  4355. packFloatx80(
  4356. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4357. end;
  4358. {$endif FPC_SOFTFLOAT_FLOATX80}
  4359. {*
  4360. -------------------------------------------------------------------------------
  4361. Rounds the double-precision floating-point value `a' to an integer,
  4362. and returns the result as a double-precision floating-point value. The
  4363. operation is performed according to the IEC/IEEE Standard for Binary
  4364. Floating-Point Arithmetic.
  4365. -------------------------------------------------------------------------------
  4366. *}
  4367. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4368. Var
  4369. aSign: flag;
  4370. aExp: int16;
  4371. lastBitMask, roundBitsMask: bits32;
  4372. roundingMode: TFPURoundingMode;
  4373. z: float64;
  4374. Begin
  4375. aExp := extractFloat64Exp( a );
  4376. if ( $413 <= aExp ) then
  4377. Begin
  4378. if ( $433 <= aExp ) then
  4379. Begin
  4380. if ( ( aExp = $7FF )
  4381. AND
  4382. (
  4383. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4384. ) <>0)
  4385. ) then
  4386. Begin
  4387. propagateFloat64NaN( a, a, result );
  4388. exit;
  4389. End;
  4390. result := a;
  4391. exit;
  4392. End;
  4393. lastBitMask := 1;
  4394. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4395. roundBitsMask := lastBitMask - 1;
  4396. z := a;
  4397. roundingMode := softfloat_rounding_mode;
  4398. if ( roundingMode = float_round_nearest_even ) then
  4399. Begin
  4400. if ( lastBitMask <> 0) then
  4401. Begin
  4402. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4403. if ( ( z.low and roundBitsMask ) = 0 ) then
  4404. z.low := z.low and not lastBitMask;
  4405. End
  4406. else
  4407. Begin
  4408. if ( sbits32 (z.low) < 0 ) then
  4409. Begin
  4410. Inc(z.high);
  4411. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4412. z.high := z.high and not 1;
  4413. End;
  4414. End;
  4415. End
  4416. else if ( roundingMode <> float_round_to_zero ) then
  4417. Begin
  4418. if ( extractFloat64Sign( z )
  4419. xor flag( roundingMode = float_round_up ) )<> 0 then
  4420. Begin
  4421. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4422. End;
  4423. End;
  4424. z.low := z.low and not roundBitsMask;
  4425. End
  4426. else
  4427. Begin
  4428. if ( aExp <= $3FE ) then
  4429. Begin
  4430. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4431. Begin
  4432. result := a;
  4433. exit;
  4434. End;
  4435. set_inexact_flag;
  4436. aSign := extractFloat64Sign( a );
  4437. case ( softfloat_rounding_mode ) of
  4438. float_round_nearest_even:
  4439. Begin
  4440. if ( ( aExp = $3FE )
  4441. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4442. ) then
  4443. Begin
  4444. packFloat64( aSign, $3FF, 0, 0, result );
  4445. exit;
  4446. End;
  4447. End;
  4448. float_round_down:
  4449. Begin
  4450. if aSign<>0 then
  4451. packFloat64( 1, $3FF, 0, 0, result )
  4452. else
  4453. packFloat64( 0, 0, 0, 0, result );
  4454. exit;
  4455. End;
  4456. float_round_up:
  4457. Begin
  4458. if aSign <> 0 then
  4459. packFloat64( 1, 0, 0, 0, result )
  4460. else
  4461. packFloat64( 0, $3FF, 0, 0, result );
  4462. exit;
  4463. End;
  4464. end;
  4465. packFloat64( aSign, 0, 0, 0, result );
  4466. exit;
  4467. End;
  4468. lastBitMask := 1;
  4469. lastBitMask := lastBitMask shl ($413 - aExp);
  4470. roundBitsMask := lastBitMask - 1;
  4471. z.low := 0;
  4472. z.high := a.high;
  4473. roundingMode := softfloat_rounding_mode;
  4474. if ( roundingMode = float_round_nearest_even ) then
  4475. Begin
  4476. z.high := z.high + lastBitMask shr 1;
  4477. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4478. Begin
  4479. z.high := z.high and not lastBitMask;
  4480. End;
  4481. End
  4482. else if ( roundingMode <> float_round_to_zero ) then
  4483. Begin
  4484. if ( extractFloat64Sign( z )
  4485. xor flag( roundingMode = float_round_up ) )<> 0 then
  4486. Begin
  4487. z.high := z.high or bits32( a.low <> 0 );
  4488. z.high := z.high + roundBitsMask;
  4489. End;
  4490. End;
  4491. z.high := z.high and not roundBitsMask;
  4492. End;
  4493. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4494. Begin
  4495. set_inexact_flag;
  4496. End;
  4497. result := z;
  4498. End;
  4499. {*
  4500. -------------------------------------------------------------------------------
  4501. Returns the result of adding the absolute values of the double-precision
  4502. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4503. before being returned. `zSign' is ignored if the result is a NaN.
  4504. The addition is performed according to the IEC/IEEE Standard for Binary
  4505. Floating-Point Arithmetic.
  4506. -------------------------------------------------------------------------------
  4507. *}
  4508. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4509. Var
  4510. aExp, bExp, zExp: int16;
  4511. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4512. expDiff: int16;
  4513. label shiftRight1;
  4514. label roundAndPack;
  4515. Begin
  4516. aSig1 := extractFloat64Frac1( a );
  4517. aSig0 := extractFloat64Frac0( a );
  4518. aExp := extractFloat64Exp( a );
  4519. bSig1 := extractFloat64Frac1( b );
  4520. bSig0 := extractFloat64Frac0( b );
  4521. bExp := extractFloat64Exp( b );
  4522. expDiff := aExp - bExp;
  4523. if ( 0 < expDiff ) then
  4524. Begin
  4525. if ( aExp = $7FF ) then
  4526. Begin
  4527. if ( aSig0 OR aSig1 ) <> 0 then
  4528. Begin
  4529. propagateFloat64NaN( a, b, out );
  4530. exit;
  4531. end;
  4532. out := a;
  4533. exit;
  4534. End;
  4535. if ( bExp = 0 ) then
  4536. Begin
  4537. Dec(expDiff);
  4538. End
  4539. else
  4540. Begin
  4541. bSig0 := bSig0 or $00100000;
  4542. End;
  4543. shift64ExtraRightJamming(
  4544. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4545. zExp := aExp;
  4546. End
  4547. else if ( expDiff < 0 ) then
  4548. Begin
  4549. if ( bExp = $7FF ) then
  4550. Begin
  4551. if ( bSig0 OR bSig1 ) <> 0 then
  4552. Begin
  4553. propagateFloat64NaN( a, b, out );
  4554. exit;
  4555. End;
  4556. packFloat64( zSign, $7FF, 0, 0, out );
  4557. exit;
  4558. End;
  4559. if ( aExp = 0 ) then
  4560. Begin
  4561. Inc(expDiff);
  4562. End
  4563. else
  4564. Begin
  4565. aSig0 := aSig0 or $00100000;
  4566. End;
  4567. shift64ExtraRightJamming(
  4568. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4569. zExp := bExp;
  4570. End
  4571. else
  4572. Begin
  4573. if ( aExp = $7FF ) then
  4574. Begin
  4575. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4576. Begin
  4577. propagateFloat64NaN( a, b, out );
  4578. exit;
  4579. End;
  4580. out := a;
  4581. exit;
  4582. End;
  4583. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4584. if ( aExp = 0 ) then
  4585. Begin
  4586. packFloat64( zSign, 0, zSig0, zSig1, out );
  4587. exit;
  4588. End;
  4589. zSig2 := 0;
  4590. zSig0 := zSig0 or $00200000;
  4591. zExp := aExp;
  4592. goto shiftRight1;
  4593. End;
  4594. aSig0 := aSig0 or $00100000;
  4595. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4596. Dec(zExp);
  4597. if ( zSig0 < $00200000 ) then
  4598. goto roundAndPack;
  4599. Inc(zExp);
  4600. shiftRight1:
  4601. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4602. roundAndPack:
  4603. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4604. End;
  4605. {*
  4606. -------------------------------------------------------------------------------
  4607. Returns the result of subtracting the absolute values of the double-
  4608. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4609. difference is negated before being returned. `zSign' is ignored if the
  4610. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4611. Standard for Binary Floating-Point Arithmetic.
  4612. -------------------------------------------------------------------------------
  4613. *}
  4614. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4615. Var
  4616. aExp, bExp, zExp: int16;
  4617. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4618. expDiff: int16;
  4619. z: float64;
  4620. label aExpBigger;
  4621. label bExpBigger;
  4622. label aBigger;
  4623. label bBigger;
  4624. label normalizeRoundAndPack;
  4625. Begin
  4626. aSig1 := extractFloat64Frac1( a );
  4627. aSig0 := extractFloat64Frac0( a );
  4628. aExp := extractFloat64Exp( a );
  4629. bSig1 := extractFloat64Frac1( b );
  4630. bSig0 := extractFloat64Frac0( b );
  4631. bExp := extractFloat64Exp( b );
  4632. expDiff := aExp - bExp;
  4633. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4634. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4635. if ( 0 < expDiff ) then goto aExpBigger;
  4636. if ( expDiff < 0 ) then goto bExpBigger;
  4637. if ( aExp = $7FF ) then
  4638. Begin
  4639. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4640. Begin
  4641. propagateFloat64NaN( a, b, out );
  4642. exit;
  4643. End;
  4644. float_raise( float_flag_invalid );
  4645. z.low := float64_default_nan_low;
  4646. z.high := float64_default_nan_high;
  4647. out := z;
  4648. exit;
  4649. End;
  4650. if ( aExp = 0 ) then
  4651. Begin
  4652. aExp := 1;
  4653. bExp := 1;
  4654. End;
  4655. if ( bSig0 < aSig0 ) then goto aBigger;
  4656. if ( aSig0 < bSig0 ) then goto bBigger;
  4657. if ( bSig1 < aSig1 ) then goto aBigger;
  4658. if ( aSig1 < bSig1 ) then goto bBigger;
  4659. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4660. exit;
  4661. bExpBigger:
  4662. if ( bExp = $7FF ) then
  4663. Begin
  4664. if ( bSig0 OR bSig1 ) <> 0 then
  4665. Begin
  4666. propagateFloat64NaN( a, b, out );
  4667. exit;
  4668. End;
  4669. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4670. exit;
  4671. End;
  4672. if ( aExp = 0 ) then
  4673. Begin
  4674. Inc(expDiff);
  4675. End
  4676. else
  4677. Begin
  4678. aSig0 := aSig0 or $40000000;
  4679. End;
  4680. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4681. bSig0 := bSig0 or $40000000;
  4682. bBigger:
  4683. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4684. zExp := bExp;
  4685. zSign := zSign xor 1;
  4686. goto normalizeRoundAndPack;
  4687. aExpBigger:
  4688. if ( aExp = $7FF ) then
  4689. Begin
  4690. if ( aSig0 OR aSig1 ) <> 0 then
  4691. Begin
  4692. propagateFloat64NaN( a, b, out );
  4693. exit;
  4694. End;
  4695. out := a;
  4696. exit;
  4697. End;
  4698. if ( bExp = 0 ) then
  4699. Begin
  4700. Dec(expDiff);
  4701. End
  4702. else
  4703. Begin
  4704. bSig0 := bSig0 or $40000000;
  4705. End;
  4706. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4707. aSig0 := aSig0 or $40000000;
  4708. aBigger:
  4709. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4710. zExp := aExp;
  4711. normalizeRoundAndPack:
  4712. Dec(zExp);
  4713. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4714. End;
  4715. {*
  4716. -------------------------------------------------------------------------------
  4717. Returns the result of adding the double-precision floating-point values `a'
  4718. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4719. Binary Floating-Point Arithmetic.
  4720. -------------------------------------------------------------------------------
  4721. *}
  4722. Function float64_add( a: float64; b : float64) : Float64;
  4723. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4724. Var
  4725. aSign, bSign: flag;
  4726. Begin
  4727. aSign := extractFloat64Sign( a );
  4728. bSign := extractFloat64Sign( b );
  4729. if ( aSign = bSign ) then
  4730. Begin
  4731. addFloat64Sigs( a, b, aSign, result );
  4732. End
  4733. else
  4734. Begin
  4735. subFloat64Sigs( a, b, aSign, result );
  4736. End;
  4737. End;
  4738. {*
  4739. -------------------------------------------------------------------------------
  4740. Returns the result of subtracting the double-precision floating-point values
  4741. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4742. for Binary Floating-Point Arithmetic.
  4743. -------------------------------------------------------------------------------
  4744. *}
  4745. Function float64_sub(a: float64; b : float64) : Float64;
  4746. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4747. Var
  4748. aSign, bSign: flag;
  4749. Begin
  4750. aSign := extractFloat64Sign( a );
  4751. bSign := extractFloat64Sign( b );
  4752. if ( aSign = bSign ) then
  4753. Begin
  4754. subFloat64Sigs( a, b, aSign, result );
  4755. End
  4756. else
  4757. Begin
  4758. addFloat64Sigs( a, b, aSign, result );
  4759. End;
  4760. End;
  4761. {*
  4762. -------------------------------------------------------------------------------
  4763. Returns the result of multiplying the double-precision floating-point values
  4764. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4765. for Binary Floating-Point Arithmetic.
  4766. -------------------------------------------------------------------------------
  4767. *}
  4768. Function float64_mul( a: float64; b:float64) : Float64;
  4769. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4770. Var
  4771. aSign, bSign, zSign: flag;
  4772. aExp, bExp, zExp: int16;
  4773. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4774. z: float64;
  4775. label invalid;
  4776. Begin
  4777. aSig1 := extractFloat64Frac1( a );
  4778. aSig0 := extractFloat64Frac0( a );
  4779. aExp := extractFloat64Exp( a );
  4780. aSign := extractFloat64Sign( a );
  4781. bSig1 := extractFloat64Frac1( b );
  4782. bSig0 := extractFloat64Frac0( b );
  4783. bExp := extractFloat64Exp( b );
  4784. bSign := extractFloat64Sign( b );
  4785. zSign := aSign xor bSign;
  4786. if ( aExp = $7FF ) then
  4787. Begin
  4788. if ( (( aSig0 OR aSig1 ) <>0)
  4789. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4790. Begin
  4791. propagateFloat64NaN( a, b, result );
  4792. exit;
  4793. End;
  4794. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4795. packFloat64( zSign, $7FF, 0, 0, result );
  4796. exit;
  4797. End;
  4798. if ( bExp = $7FF ) then
  4799. Begin
  4800. if ( bSig0 OR bSig1 )<> 0 then
  4801. Begin
  4802. propagateFloat64NaN( a, b, result );
  4803. exit;
  4804. End;
  4805. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4806. Begin
  4807. invalid:
  4808. float_raise( float_flag_invalid );
  4809. z.low := float64_default_nan_low;
  4810. z.high := float64_default_nan_high;
  4811. result := z;
  4812. exit;
  4813. End;
  4814. packFloat64( zSign, $7FF, 0, 0, result );
  4815. exit;
  4816. End;
  4817. if ( aExp = 0 ) then
  4818. Begin
  4819. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4820. Begin
  4821. packFloat64( zSign, 0, 0, 0, result );
  4822. exit;
  4823. End;
  4824. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4825. End;
  4826. if ( bExp = 0 ) then
  4827. Begin
  4828. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4829. Begin
  4830. packFloat64( zSign, 0, 0, 0, result );
  4831. exit;
  4832. End;
  4833. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4834. End;
  4835. zExp := aExp + bExp - $400;
  4836. aSig0 := aSig0 or $00100000;
  4837. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4838. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4839. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4840. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4841. if ( $00200000 <= zSig0 ) then
  4842. Begin
  4843. shift64ExtraRightJamming(
  4844. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4845. Inc(zExp);
  4846. End;
  4847. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4848. End;
  4849. {*
  4850. -------------------------------------------------------------------------------
  4851. Returns the result of dividing the double-precision floating-point value `a'
  4852. by the corresponding value `b'. The operation is performed according to the
  4853. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4854. -------------------------------------------------------------------------------
  4855. *}
  4856. Function float64_div(a: float64; b : float64) : Float64;
  4857. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4858. Var
  4859. aSign, bSign, zSign: flag;
  4860. aExp, bExp, zExp: int16;
  4861. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4862. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4863. z: float64;
  4864. label invalid;
  4865. Begin
  4866. aSig1 := extractFloat64Frac1( a );
  4867. aSig0 := extractFloat64Frac0( a );
  4868. aExp := extractFloat64Exp( a );
  4869. aSign := extractFloat64Sign( a );
  4870. bSig1 := extractFloat64Frac1( b );
  4871. bSig0 := extractFloat64Frac0( b );
  4872. bExp := extractFloat64Exp( b );
  4873. bSign := extractFloat64Sign( b );
  4874. zSign := aSign xor bSign;
  4875. if ( aExp = $7FF ) then
  4876. Begin
  4877. if ( aSig0 OR aSig1 )<> 0 then
  4878. Begin
  4879. propagateFloat64NaN( a, b, result );
  4880. exit;
  4881. end;
  4882. if ( bExp = $7FF ) then
  4883. Begin
  4884. if ( bSig0 OR bSig1 )<>0 then
  4885. Begin
  4886. propagateFloat64NaN( a, b, result );
  4887. exit;
  4888. End;
  4889. goto invalid;
  4890. End;
  4891. packFloat64( zSign, $7FF, 0, 0, result );
  4892. exit;
  4893. End;
  4894. if ( bExp = $7FF ) then
  4895. Begin
  4896. if ( bSig0 OR bSig1 )<> 0 then
  4897. Begin
  4898. propagateFloat64NaN( a, b, result );
  4899. exit;
  4900. End;
  4901. packFloat64( zSign, 0, 0, 0, result );
  4902. exit;
  4903. End;
  4904. if ( bExp = 0 ) then
  4905. Begin
  4906. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4907. Begin
  4908. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  4909. Begin
  4910. invalid:
  4911. float_raise( float_flag_invalid );
  4912. z.low := float64_default_nan_low;
  4913. z.high := float64_default_nan_high;
  4914. result := z;
  4915. exit;
  4916. End;
  4917. float_raise( float_flag_divbyzero );
  4918. packFloat64( zSign, $7FF, 0, 0, result );
  4919. exit;
  4920. End;
  4921. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4922. End;
  4923. if ( aExp = 0 ) then
  4924. Begin
  4925. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4926. Begin
  4927. packFloat64( zSign, 0, 0, 0, result );
  4928. exit;
  4929. End;
  4930. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4931. End;
  4932. zExp := aExp - bExp + $3FD;
  4933. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4934. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4935. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4936. Begin
  4937. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4938. Inc(zExp);
  4939. End;
  4940. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4941. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4942. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4943. while ( sbits32 (rem0) < 0 ) do
  4944. Begin
  4945. Dec(zSig0);
  4946. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4947. End;
  4948. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4949. if ( ( zSig1 and $3FF ) <= 4 ) then
  4950. Begin
  4951. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4952. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4953. while ( sbits32 (rem1) < 0 ) do
  4954. Begin
  4955. Dec(zSig1);
  4956. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4957. End;
  4958. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4959. End;
  4960. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4961. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4962. End;
  4963. {*
  4964. -------------------------------------------------------------------------------
  4965. Returns the remainder of the double-precision floating-point value `a'
  4966. with respect to the corresponding value `b'. The operation is performed
  4967. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4968. -------------------------------------------------------------------------------
  4969. *}
  4970. Function float64_rem(a: float64; b : float64) : float64;
  4971. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4972. Var
  4973. aSign, zSign: flag;
  4974. aExp, bExp, expDiff: int16;
  4975. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4976. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4977. sigMean0: sbits32;
  4978. z: float64;
  4979. label invalid;
  4980. Begin
  4981. aSig1 := extractFloat64Frac1( a );
  4982. aSig0 := extractFloat64Frac0( a );
  4983. aExp := extractFloat64Exp( a );
  4984. aSign := extractFloat64Sign( a );
  4985. bSig1 := extractFloat64Frac1( b );
  4986. bSig0 := extractFloat64Frac0( b );
  4987. bExp := extractFloat64Exp( b );
  4988. if ( aExp = $7FF ) then
  4989. Begin
  4990. if ((( aSig0 OR aSig1 )<>0)
  4991. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4992. Begin
  4993. propagateFloat64NaN( a, b, result );
  4994. exit;
  4995. End;
  4996. goto invalid;
  4997. End;
  4998. if ( bExp = $7FF ) then
  4999. Begin
  5000. if ( bSig0 OR bSig1 ) <> 0 then
  5001. Begin
  5002. propagateFloat64NaN( a, b, result );
  5003. exit;
  5004. End;
  5005. result := a;
  5006. exit;
  5007. End;
  5008. if ( bExp = 0 ) then
  5009. Begin
  5010. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5011. Begin
  5012. invalid:
  5013. float_raise( float_flag_invalid );
  5014. z.low := float64_default_nan_low;
  5015. z.high := float64_default_nan_high;
  5016. result := z;
  5017. exit;
  5018. End;
  5019. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5020. End;
  5021. if ( aExp = 0 ) then
  5022. Begin
  5023. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5024. Begin
  5025. result := a;
  5026. exit;
  5027. End;
  5028. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5029. End;
  5030. expDiff := aExp - bExp;
  5031. if ( expDiff < -1 ) then
  5032. Begin
  5033. result := a;
  5034. exit;
  5035. End;
  5036. shortShift64Left(
  5037. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5038. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5039. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5040. if ( q )<>0 then
  5041. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5042. expDiff := expDiff - 32;
  5043. while ( 0 < expDiff ) do
  5044. Begin
  5045. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5046. if 4 < q then
  5047. q:= q - 4
  5048. else
  5049. q := 0;
  5050. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5051. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5052. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5053. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5054. expDiff := expDiff - 29;
  5055. End;
  5056. if ( -32 < expDiff ) then
  5057. Begin
  5058. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5059. if 4 < q then
  5060. q := q - 4
  5061. else
  5062. q := 0;
  5063. q := q shr (- expDiff);
  5064. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5065. expDiff := expDiff + 24;
  5066. if ( expDiff < 0 ) then
  5067. Begin
  5068. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5069. End
  5070. else
  5071. Begin
  5072. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5073. End;
  5074. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5075. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5076. End
  5077. else
  5078. Begin
  5079. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5080. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5081. End;
  5082. Repeat
  5083. alternateASig0 := aSig0;
  5084. alternateASig1 := aSig1;
  5085. Inc(q);
  5086. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5087. Until not ( 0 <= sbits32 (aSig0) );
  5088. add64(
  5089. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5090. if ( ( sigMean0 < 0 )
  5091. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5092. Begin
  5093. aSig0 := alternateASig0;
  5094. aSig1 := alternateASig1;
  5095. End;
  5096. zSign := flag( sbits32 (aSig0) < 0 );
  5097. if ( zSign <> 0 ) then
  5098. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5099. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5100. End;
  5101. {*
  5102. -------------------------------------------------------------------------------
  5103. Returns the square root of the double-precision floating-point value `a'.
  5104. The operation is performed according to the IEC/IEEE Standard for Binary
  5105. Floating-Point Arithmetic.
  5106. -------------------------------------------------------------------------------
  5107. *}
  5108. function float64_sqrt( a: float64 ): float64;
  5109. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5110. Var
  5111. aSign: flag;
  5112. aExp, zExp: int16;
  5113. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5114. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5115. label invalid;
  5116. Begin
  5117. aSig1 := extractFloat64Frac1( a );
  5118. aSig0 := extractFloat64Frac0( a );
  5119. aExp := extractFloat64Exp( a );
  5120. aSign := extractFloat64Sign( a );
  5121. if ( aExp = $7FF ) then
  5122. Begin
  5123. if ( aSig0 OR aSig1 ) <> 0 then
  5124. Begin
  5125. propagateFloat64NaN( a, a, result );
  5126. exit;
  5127. End;
  5128. if ( aSign = 0) then
  5129. Begin
  5130. result := a;
  5131. exit;
  5132. End;
  5133. goto invalid;
  5134. End;
  5135. if ( aSign <> 0 ) then
  5136. Begin
  5137. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5138. Begin
  5139. result := a;
  5140. exit;
  5141. End;
  5142. invalid:
  5143. float_raise( float_flag_invalid );
  5144. result.low := float64_default_nan_low;
  5145. result.high := float64_default_nan_high;
  5146. exit;
  5147. End;
  5148. if ( aExp = 0 ) then
  5149. Begin
  5150. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5151. Begin
  5152. packFloat64( 0, 0, 0, 0, result );
  5153. exit;
  5154. End;
  5155. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5156. End;
  5157. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5158. aSig0 := aSig0 or $00100000;
  5159. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5160. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5161. if ( zSig0 = 0 ) then
  5162. zSig0 := $7FFFFFFF;
  5163. doubleZSig0 := zSig0 + zSig0;
  5164. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5165. mul32To64( zSig0, zSig0, term0, term1 );
  5166. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5167. while ( sbits32 (rem0) < 0 ) do
  5168. Begin
  5169. Dec(zSig0);
  5170. doubleZSig0 := doubleZSig0 - 2;
  5171. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5172. End;
  5173. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5174. if ( ( zSig1 and $1FF ) <= 5 ) then
  5175. Begin
  5176. if ( zSig1 = 0 ) then
  5177. zSig1 := 1;
  5178. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5179. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5180. mul32To64( zSig1, zSig1, term2, term3 );
  5181. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5182. while ( sbits32 (rem1) < 0 ) do
  5183. Begin
  5184. Dec(zSig1);
  5185. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5186. term3 := term3 or 1;
  5187. term2 := term2 or doubleZSig0;
  5188. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5189. End;
  5190. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5191. End;
  5192. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5193. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5194. End;
  5195. {*
  5196. -------------------------------------------------------------------------------
  5197. Returns 1 if the double-precision floating-point value `a' is equal to
  5198. the corresponding value `b', and 0 otherwise. The comparison is performed
  5199. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5200. -------------------------------------------------------------------------------
  5201. *}
  5202. Function float64_eq(a: float64; b: float64): flag;
  5203. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5204. Begin
  5205. if
  5206. (
  5207. ( extractFloat64Exp( a ) = $7FF )
  5208. AND
  5209. (
  5210. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5211. )
  5212. )
  5213. OR (
  5214. ( extractFloat64Exp( b ) = $7FF )
  5215. AND (
  5216. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5217. )
  5218. )
  5219. ) then
  5220. Begin
  5221. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5222. float_raise( float_flag_invalid );
  5223. float64_eq := 0;
  5224. exit;
  5225. End;
  5226. float64_eq := flag(
  5227. ( a.low = b.low )
  5228. AND ( ( a.high = b.high )
  5229. OR ( ( a.low = 0 )
  5230. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5231. ));
  5232. End;
  5233. {*
  5234. -------------------------------------------------------------------------------
  5235. Returns 1 if the double-precision floating-point value `a' is less than
  5236. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5237. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5238. Arithmetic.
  5239. -------------------------------------------------------------------------------
  5240. *}
  5241. Function float64_le(a: float64;b: float64): flag;
  5242. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5243. Var
  5244. aSign, bSign: flag;
  5245. Begin
  5246. if
  5247. (
  5248. ( extractFloat64Exp( a ) = $7FF )
  5249. AND
  5250. (
  5251. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5252. )
  5253. )
  5254. OR (
  5255. ( extractFloat64Exp( b ) = $7FF )
  5256. AND (
  5257. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5258. )
  5259. )
  5260. ) then
  5261. Begin
  5262. float_raise( float_flag_invalid );
  5263. float64_le := 0;
  5264. exit;
  5265. End;
  5266. aSign := extractFloat64Sign( a );
  5267. bSign := extractFloat64Sign( b );
  5268. if ( aSign <> bSign ) then
  5269. Begin
  5270. float64_le := flag(
  5271. (aSign <> 0)
  5272. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5273. = 0 ));
  5274. exit;
  5275. End;
  5276. if aSign <> 0 then
  5277. float64_le := le64( b.high, b.low, a.high, a.low )
  5278. else
  5279. float64_le := le64( a.high, a.low, b.high, b.low );
  5280. End;
  5281. {*
  5282. -------------------------------------------------------------------------------
  5283. Returns 1 if the double-precision floating-point value `a' is less than
  5284. the corresponding value `b', and 0 otherwise. The comparison is performed
  5285. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5286. -------------------------------------------------------------------------------
  5287. *}
  5288. Function float64_lt(a: float64;b: float64): flag;
  5289. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5290. Var
  5291. aSign, bSign: flag;
  5292. Begin
  5293. if
  5294. (
  5295. ( extractFloat64Exp( a ) = $7FF )
  5296. AND
  5297. (
  5298. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5299. )
  5300. )
  5301. OR (
  5302. ( extractFloat64Exp( b ) = $7FF )
  5303. AND (
  5304. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5305. )
  5306. )
  5307. ) then
  5308. Begin
  5309. float_raise( float_flag_invalid );
  5310. float64_lt := 0;
  5311. exit;
  5312. End;
  5313. aSign := extractFloat64Sign( a );
  5314. bSign := extractFloat64Sign( b );
  5315. if ( aSign <> bSign ) then
  5316. Begin
  5317. float64_lt := flag(
  5318. (aSign <> 0)
  5319. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5320. <> 0 ));
  5321. exit;
  5322. End;
  5323. if aSign <> 0 then
  5324. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5325. else
  5326. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5327. End;
  5328. {*
  5329. -------------------------------------------------------------------------------
  5330. Returns 1 if the double-precision floating-point value `a' is equal to
  5331. the corresponding value `b', and 0 otherwise. The invalid exception is
  5332. raised if either operand is a NaN. Otherwise, the comparison is performed
  5333. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5334. -------------------------------------------------------------------------------
  5335. *}
  5336. Function float64_eq_signaling( a: float64; b: float64): flag;
  5337. Begin
  5338. if
  5339. (
  5340. ( extractFloat64Exp( a ) = $7FF )
  5341. AND
  5342. (
  5343. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5344. )
  5345. )
  5346. OR (
  5347. ( extractFloat64Exp( b ) = $7FF )
  5348. AND (
  5349. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5350. )
  5351. )
  5352. ) then
  5353. Begin
  5354. float_raise( float_flag_invalid );
  5355. float64_eq_signaling := 0;
  5356. exit;
  5357. End;
  5358. float64_eq_signaling := flag(
  5359. ( a.low = b.low )
  5360. AND ( ( a.high = b.high )
  5361. OR ( ( a.low = 0 )
  5362. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5363. ));
  5364. End;
  5365. {*
  5366. -------------------------------------------------------------------------------
  5367. Returns 1 if the double-precision floating-point value `a' is less than or
  5368. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5369. cause an exception. Otherwise, the comparison is performed according to the
  5370. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5371. -------------------------------------------------------------------------------
  5372. *}
  5373. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5374. Var
  5375. aSign, bSign : flag;
  5376. Begin
  5377. if
  5378. (
  5379. ( extractFloat64Exp( a ) = $7FF )
  5380. AND
  5381. (
  5382. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5383. )
  5384. )
  5385. OR (
  5386. ( extractFloat64Exp( b ) = $7FF )
  5387. AND (
  5388. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5389. )
  5390. )
  5391. ) then
  5392. Begin
  5393. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5394. float_raise( float_flag_invalid );
  5395. float64_le_quiet := 0;
  5396. exit;
  5397. End;
  5398. aSign := extractFloat64Sign( a );
  5399. bSign := extractFloat64Sign( b );
  5400. if ( aSign <> bSign ) then
  5401. Begin
  5402. float64_le_quiet := flag
  5403. ((aSign <> 0)
  5404. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5405. = 0 ));
  5406. exit;
  5407. End;
  5408. if aSign <> 0 then
  5409. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5410. else
  5411. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5412. End;
  5413. {*
  5414. -------------------------------------------------------------------------------
  5415. Returns 1 if the double-precision floating-point value `a' is less than
  5416. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5417. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5418. Standard for Binary Floating-Point Arithmetic.
  5419. -------------------------------------------------------------------------------
  5420. *}
  5421. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5422. Var
  5423. aSign, bSign: flag;
  5424. Begin
  5425. if
  5426. (
  5427. ( extractFloat64Exp( a ) = $7FF )
  5428. AND
  5429. (
  5430. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5431. )
  5432. )
  5433. OR (
  5434. ( extractFloat64Exp( b ) = $7FF )
  5435. AND (
  5436. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5437. )
  5438. )
  5439. ) then
  5440. Begin
  5441. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5442. float_raise( float_flag_invalid );
  5443. float64_lt_quiet := 0;
  5444. exit;
  5445. End;
  5446. aSign := extractFloat64Sign( a );
  5447. bSign := extractFloat64Sign( b );
  5448. if ( aSign <> bSign ) then
  5449. Begin
  5450. float64_lt_quiet := flag(
  5451. (aSign<>0)
  5452. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5453. <> 0 ));
  5454. exit;
  5455. End;
  5456. If aSign <> 0 then
  5457. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5458. else
  5459. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5460. End;
  5461. {*----------------------------------------------------------------------------
  5462. | Returns the result of converting the 64-bit two's complement integer `a'
  5463. | to the single-precision floating-point format. The conversion is performed
  5464. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5465. *----------------------------------------------------------------------------*}
  5466. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5467. var
  5468. zSign : flag;
  5469. absA : uint64;
  5470. shiftCount: int8;
  5471. Begin
  5472. if ( a = 0 ) then
  5473. begin
  5474. int64_to_float32.float32 := 0;
  5475. exit;
  5476. end;
  5477. if a < 0 then
  5478. zSign := flag(TRUE)
  5479. else
  5480. zSign := flag(FALSE);
  5481. if zSign<>0 then
  5482. absA := -a
  5483. else
  5484. absA := a;
  5485. shiftCount := countLeadingZeros64( absA ) - 40;
  5486. if ( 0 <= shiftCount ) then
  5487. begin
  5488. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5489. end
  5490. else
  5491. begin
  5492. shiftCount := shiftCount + 7;
  5493. if ( shiftCount < 0 ) then
  5494. shift64RightJamming( absA, - shiftCount, absA )
  5495. else
  5496. absA := absA shl shiftCount;
  5497. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5498. end;
  5499. End;
  5500. {*----------------------------------------------------------------------------
  5501. | Returns the result of converting the 64-bit two's complement integer `a'
  5502. | to the single-precision floating-point format. The conversion is performed
  5503. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5504. | Unisgned version.
  5505. *----------------------------------------------------------------------------*}
  5506. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5507. var
  5508. absA : uint64;
  5509. shiftCount: int8;
  5510. Begin
  5511. if ( a = 0 ) then
  5512. begin
  5513. qword_to_float32.float32 := 0;
  5514. exit;
  5515. end;
  5516. absA := a;
  5517. shiftCount := countLeadingZeros64( absA ) - 40;
  5518. if ( 0 <= shiftCount ) then
  5519. begin
  5520. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5521. end
  5522. else
  5523. begin
  5524. shiftCount := shiftCount + 7;
  5525. if ( shiftCount < 0 ) then
  5526. shift64RightJamming( absA, - shiftCount, absA )
  5527. else
  5528. absA := absA shl shiftCount;
  5529. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5530. end;
  5531. End;
  5532. {*----------------------------------------------------------------------------
  5533. | Returns the result of converting the 64-bit two's complement integer `a'
  5534. | to the double-precision floating-point format. The conversion is performed
  5535. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5536. *----------------------------------------------------------------------------*}
  5537. function qword_to_float64( a: qword ): float64;
  5538. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5539. var
  5540. shiftCount: int8;
  5541. Begin
  5542. if ( a = 0 ) then
  5543. result := packFloat64( 0, 0, 0 )
  5544. else
  5545. begin
  5546. shiftCount := countLeadingZeros64(a) - 1;
  5547. { numbers with <= 53 significant bits are converted exactly }
  5548. if (shiftCount > 9) then
  5549. result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
  5550. else if (shiftCount>=0) then
  5551. result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
  5552. else
  5553. begin
  5554. { the only possible negative value is -1, in case bit 63 of 'a' is set }
  5555. shift64RightJamming(a, 1, a);
  5556. result := roundAndPackFloat64(0, $43d, a);
  5557. end;
  5558. end;
  5559. End;
  5560. {*----------------------------------------------------------------------------
  5561. | Returns the result of converting the 64-bit two's complement integer `a'
  5562. | to the double-precision floating-point format. The conversion is performed
  5563. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5564. *----------------------------------------------------------------------------*}
  5565. function int64_to_float64( a: int64 ): float64;
  5566. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5567. Begin
  5568. if ( a = 0 ) then
  5569. result := packFloat64( 0, 0, 0 )
  5570. else if (a = int64($8000000000000000)) then
  5571. result := packFloat64( 1, $43e, 0 )
  5572. else if (a < 0) then
  5573. result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  5574. else
  5575. result := normalizeRoundAndPackFloat64( 0, $43c, a );
  5576. End;
  5577. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5578. {*----------------------------------------------------------------------------
  5579. | Returns the result of converting the 64-bit two's complement integer `a'
  5580. | to the extended double-precision floating-point format. The conversion
  5581. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5582. | Arithmetic.
  5583. *----------------------------------------------------------------------------*}
  5584. function int64_to_floatx80( a: int64 ): floatx80;
  5585. var
  5586. zSign: flag;
  5587. absA: uint64;
  5588. shiftCount: int8;
  5589. begin
  5590. if ( a = 0 ) then begin
  5591. result := packFloatx80( 0, 0, 0 );
  5592. exit;
  5593. end;
  5594. zSign := ord( a < 0 );
  5595. if zSign <> 0 then absA := - a else absA := a;
  5596. shiftCount := countLeadingZeros64( absA );
  5597. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5598. end;
  5599. {*----------------------------------------------------------------------------
  5600. | Returns the result of converting the 64-bit two's complement integer `a'
  5601. | to the extended double-precision floating-point format. The conversion
  5602. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5603. | Arithmetic.
  5604. | Unsigned version.
  5605. *----------------------------------------------------------------------------*}
  5606. function qword_to_floatx80( a: qword ): floatx80;
  5607. var
  5608. absA: bits64;
  5609. shiftCount: int8;
  5610. begin
  5611. if ( a = 0 ) then begin
  5612. result := packFloatx80( 0, 0, 0 );
  5613. exit;
  5614. end;
  5615. absA := a;
  5616. shiftCount := countLeadingZeros64( absA );
  5617. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5618. end;
  5619. {$endif FPC_SOFTFLOAT_FLOATX80}
  5620. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5621. {*----------------------------------------------------------------------------
  5622. | Returns the result of converting the 64-bit two's complement integer `a' to
  5623. | the quadruple-precision floating-point format. The conversion is performed
  5624. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5625. *----------------------------------------------------------------------------*}
  5626. function int64_to_float128( a: int64 ): float128;
  5627. var
  5628. zSign: flag;
  5629. absA: uint64;
  5630. shiftCount: int8;
  5631. zExp: int32;
  5632. zSig0, zSig1: bits64;
  5633. begin
  5634. if ( a = 0 ) then begin
  5635. result := packFloat128( 0, 0, 0, 0 );
  5636. exit;
  5637. end;
  5638. zSign := ord( a < 0 );
  5639. if zSign <> 0 then absA := - a else absA := a;
  5640. shiftCount := countLeadingZeros64( absA ) + 49;
  5641. zExp := $406E - shiftCount;
  5642. if ( 64 <= shiftCount ) then begin
  5643. zSig1 := 0;
  5644. zSig0 := absA;
  5645. dec( shiftCount, 64 );
  5646. end
  5647. else begin
  5648. zSig1 := absA;
  5649. zSig0 := 0;
  5650. end;
  5651. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5652. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5653. end;
  5654. {*----------------------------------------------------------------------------
  5655. | Returns the result of converting the 64-bit two's complement integer `a' to
  5656. | the quadruple-precision floating-point format. The conversion is performed
  5657. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5658. | Unsigned version.
  5659. *----------------------------------------------------------------------------*}
  5660. function qword_to_float128( a: qword ): float128;
  5661. var
  5662. absA: bits64;
  5663. shiftCount: int8;
  5664. zExp: int32;
  5665. zSig0, zSig1: bits64;
  5666. begin
  5667. if ( a = 0 ) then begin
  5668. result := packFloat128( 0, 0, 0, 0 );
  5669. exit;
  5670. end;
  5671. absA := a;
  5672. shiftCount := countLeadingZeros64( absA ) + 49;
  5673. zExp := $406E - shiftCount;
  5674. if ( 64 <= shiftCount ) then begin
  5675. zSig1 := 0;
  5676. zSig0 := absA;
  5677. dec( shiftCount, 64 );
  5678. end
  5679. else begin
  5680. zSig1 := absA;
  5681. zSig0 := 0;
  5682. end;
  5683. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5684. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5685. end;
  5686. {$endif FPC_SOFTFLOAT_FLOAT128}
  5687. {*----------------------------------------------------------------------------
  5688. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5689. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5690. | Otherwise, returns 0.
  5691. *----------------------------------------------------------------------------*}
  5692. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5693. begin
  5694. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5695. end;
  5696. {*----------------------------------------------------------------------------
  5697. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5698. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5699. | Otherwise, returns 0.
  5700. *----------------------------------------------------------------------------*}
  5701. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5702. begin
  5703. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5704. end;
  5705. {*----------------------------------------------------------------------------
  5706. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5707. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5708. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5709. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5710. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5711. | the most-significant bit of the extra result, and the other 63 bits of the
  5712. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5713. | were all zero. This extra result is stored in the location pointed to by
  5714. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5715. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5716. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5717. | fixed-point value is shifted right by the number of bits given in `count',
  5718. | and the integer part of the result is returned at the locations pointed to
  5719. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5720. | corrupted as described above, and is returned at the location pointed to by
  5721. | `z2Ptr'.)
  5722. *----------------------------------------------------------------------------*}
  5723. procedure shift128ExtraRightJamming(
  5724. a0: bits64;
  5725. a1: bits64;
  5726. a2: bits64;
  5727. count: int16;
  5728. var z0Ptr: bits64;
  5729. var z1Ptr: bits64;
  5730. var z2Ptr: bits64);
  5731. var
  5732. z0, z1, z2: bits64;
  5733. negCount: int8;
  5734. begin
  5735. negCount := ( - count ) and 63;
  5736. if ( count = 0 ) then
  5737. begin
  5738. z2 := a2;
  5739. z1 := a1;
  5740. z0 := a0;
  5741. end
  5742. else begin
  5743. if ( count < 64 ) then
  5744. begin
  5745. z2 := a1 shl negCount;
  5746. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5747. z0 := a0 shr count;
  5748. end
  5749. else begin
  5750. if ( count = 64 ) then
  5751. begin
  5752. z2 := a1;
  5753. z1 := a0;
  5754. end
  5755. else begin
  5756. a2 := a2 or a1;
  5757. if ( count < 128 ) then
  5758. begin
  5759. z2 := a0 shl negCount;
  5760. z1 := a0 shr ( count and 63 );
  5761. end
  5762. else begin
  5763. if ( count = 128 ) then
  5764. z2 := a0
  5765. else
  5766. z2 := ord( a0 <> 0 );
  5767. z1 := 0;
  5768. end;
  5769. end;
  5770. z0 := 0;
  5771. end;
  5772. z2 := z2 or ord( a2 <> 0 );
  5773. end;
  5774. z2Ptr := z2;
  5775. z1Ptr := z1;
  5776. z0Ptr := z0;
  5777. end;
  5778. {*----------------------------------------------------------------------------
  5779. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5780. | _plus_ the number of bits given in `count'. The shifted result is at most
  5781. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5782. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5783. | shifted off is the most-significant bit of the extra result, and the other
  5784. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5785. | bits shifted off were all zero. This extra result is stored in the location
  5786. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5787. | (This routine makes more sense if `a0' and `a1' are considered to form
  5788. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5789. | point value is shifted right by the number of bits given in `count', and
  5790. | the integer part of the result is returned at the location pointed to by
  5791. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5792. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5793. *----------------------------------------------------------------------------*}
  5794. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5795. var
  5796. z0, z1: bits64;
  5797. negCount: int8;
  5798. begin
  5799. negCount := ( - count ) and 63;
  5800. if ( count = 0 ) then
  5801. begin
  5802. z1 := a1;
  5803. z0 := a0;
  5804. end
  5805. else if ( count < 64 ) then
  5806. begin
  5807. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5808. z0 := a0 shr count;
  5809. end
  5810. else begin
  5811. if ( count = 64 ) then
  5812. begin
  5813. z1 := a0 or ord( a1 <> 0 );
  5814. end
  5815. else begin
  5816. z1 := ord( ( a0 or a1 ) <> 0 );
  5817. end;
  5818. z0 := 0;
  5819. end;
  5820. z1Ptr := z1;
  5821. z0Ptr := z0;
  5822. end;
  5823. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5824. {*----------------------------------------------------------------------------
  5825. | Returns the fraction bits of the extended double-precision floating-point
  5826. | value `a'.
  5827. *----------------------------------------------------------------------------*}
  5828. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5829. begin
  5830. result:=a.low;
  5831. end;
  5832. {*----------------------------------------------------------------------------
  5833. | Returns the exponent bits of the extended double-precision floating-point
  5834. | value `a'.
  5835. *----------------------------------------------------------------------------*}
  5836. function extractFloatx80Exp(a : floatx80): int32;inline;
  5837. begin
  5838. result:=a.high and $7FFF;
  5839. end;
  5840. {*----------------------------------------------------------------------------
  5841. | Returns the sign bit of the extended double-precision floating-point value
  5842. | `a'.
  5843. *----------------------------------------------------------------------------*}
  5844. function extractFloatx80Sign(a : floatx80): flag;inline;
  5845. begin
  5846. result:=a.high shr 15;
  5847. end;
  5848. {*----------------------------------------------------------------------------
  5849. | Normalizes the subnormal extended double-precision floating-point value
  5850. | represented by the denormalized significand `aSig'. The normalized exponent
  5851. | and significand are stored at the locations pointed to by `zExpPtr' and
  5852. | `zSigPtr', respectively.
  5853. *----------------------------------------------------------------------------*}
  5854. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5855. var
  5856. shiftCount: int8;
  5857. begin
  5858. shiftCount := countLeadingZeros64( aSig );
  5859. zSigPtr := aSig shl shiftCount;
  5860. zExpPtr := 1 - shiftCount;
  5861. end;
  5862. {*----------------------------------------------------------------------------
  5863. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5864. | extended double-precision floating-point value, returning the result.
  5865. *----------------------------------------------------------------------------*}
  5866. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5867. var
  5868. z: floatx80;
  5869. begin
  5870. z.low := zSig;
  5871. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5872. result:=z;
  5873. end;
  5874. {*----------------------------------------------------------------------------
  5875. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5876. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5877. | and returns the proper extended double-precision floating-point value
  5878. | corresponding to the abstract input. Ordinarily, the abstract value is
  5879. | rounded and packed into the extended double-precision format, with the
  5880. | inexact exception raised if the abstract input cannot be represented
  5881. | exactly. However, if the abstract value is too large, the overflow and
  5882. | inexact exceptions are raised and an infinity or maximal finite value is
  5883. | returned. If the abstract value is too small, the input value is rounded to
  5884. | a subnormal number, and the underflow and inexact exceptions are raised if
  5885. | the abstract input cannot be represented exactly as a subnormal extended
  5886. | double-precision floating-point number.
  5887. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5888. | number of bits as single or double precision, respectively. Otherwise, the
  5889. | result is rounded to the full precision of the extended double-precision
  5890. | format.
  5891. | The input significand must be normalized or smaller. If the input
  5892. | significand is not normalized, `zExp' must be 0; in that case, the result
  5893. | returned is a subnormal number, and it must not require rounding. The
  5894. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5895. | Floating-Point Arithmetic.
  5896. *----------------------------------------------------------------------------*}
  5897. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5898. var
  5899. roundingMode: TFPURoundingMode;
  5900. roundNearestEven, increment, isTiny: flag;
  5901. roundIncrement, roundMask, roundBits: int64;
  5902. label
  5903. precision80, overflow;
  5904. begin
  5905. roundingMode := softfloat_rounding_mode;
  5906. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5907. if ( roundingPrecision = 80 ) then
  5908. goto precision80;
  5909. if ( roundingPrecision = 64 ) then
  5910. begin
  5911. roundIncrement := int64( $0000000000000400 );
  5912. roundMask := int64( $00000000000007FF );
  5913. end
  5914. else if ( roundingPrecision = 32 ) then
  5915. begin
  5916. roundIncrement := int64( $0000008000000000 );
  5917. roundMask := int64( $000000FFFFFFFFFF );
  5918. end
  5919. else begin
  5920. goto precision80;
  5921. end;
  5922. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5923. if ( not (roundNearestEven<>0) ) then
  5924. begin
  5925. if ( roundingMode = float_round_to_zero ) then
  5926. begin
  5927. roundIncrement := 0;
  5928. end
  5929. else begin
  5930. roundIncrement := roundMask;
  5931. if ( zSign<>0 ) then
  5932. begin
  5933. if ( roundingMode = float_round_up ) then
  5934. roundIncrement := 0;
  5935. end
  5936. else begin
  5937. if ( roundingMode = float_round_down ) then
  5938. roundIncrement := 0;
  5939. end;
  5940. end;
  5941. end;
  5942. roundBits := zSig0 and roundMask;
  5943. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  5944. if ( ( $7FFE < zExp )
  5945. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5946. ) then begin
  5947. goto overflow;
  5948. end;
  5949. if ( zExp <= 0 ) then begin
  5950. isTiny := ord (
  5951. ( softfloat_detect_tininess = float_tininess_before_rounding )
  5952. or ( zExp < 0 )
  5953. or ( zSig0 <= zSig0 + roundIncrement ) );
  5954. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5955. zExp := 0;
  5956. roundBits := zSig0 and roundMask;
  5957. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  5958. if ( roundBits <> 0 ) then set_inexact_flag;
  5959. inc( zSig0, roundIncrement );
  5960. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  5961. roundIncrement := roundMask + 1;
  5962. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  5963. roundMask := roundMask or roundIncrement;
  5964. end;
  5965. zSig0 := zSig0 and not roundMask;
  5966. result:=packFloatx80( zSign, zExp, zSig0 );
  5967. exit;
  5968. end;
  5969. end;
  5970. if ( roundBits <> 0 ) then set_inexact_flag;
  5971. inc( zSig0, roundIncrement );
  5972. if ( zSig0 < roundIncrement ) then begin
  5973. inc(zExp);
  5974. zSig0 := bits64( $8000000000000000 );
  5975. end;
  5976. roundIncrement := roundMask + 1;
  5977. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  5978. roundMask := roundMask or roundIncrement;
  5979. end;
  5980. zSig0 := zSig0 and not roundMask;
  5981. if ( zSig0 = 0 ) then zExp := 0;
  5982. result:=packFloatx80( zSign, zExp, zSig0 );
  5983. exit;
  5984. precision80:
  5985. increment := ord ( sbits64( zSig1 ) < 0 );
  5986. if ( roundNearestEven = 0 ) then begin
  5987. if ( roundingMode = float_round_to_zero ) then begin
  5988. increment := 0;
  5989. end
  5990. else begin
  5991. if ( zSign <> 0 ) then begin
  5992. increment := ord ( roundingMode = float_round_down ) and zSig1;
  5993. end
  5994. else begin
  5995. increment := ord ( roundingMode = float_round_up ) and zSig1;
  5996. end;
  5997. end;
  5998. end;
  5999. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6000. if ( ( $7FFE < zExp )
  6001. or ( ( zExp = $7FFE )
  6002. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6003. and ( increment <> 0 )
  6004. )
  6005. ) then begin
  6006. roundMask := 0;
  6007. overflow:
  6008. float_raise( [float_flag_overflow,float_flag_inexact] );
  6009. if ( ( roundingMode = float_round_to_zero )
  6010. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6011. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6012. ) then begin
  6013. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6014. exit;
  6015. end;
  6016. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6017. exit;
  6018. end;
  6019. if ( zExp <= 0 ) then begin
  6020. isTiny := ord(
  6021. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6022. or ( zExp < 0 )
  6023. or ( increment = 0 )
  6024. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6025. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6026. zExp := 0;
  6027. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6028. if ( zSig1 <> 0 ) then set_inexact_flag;
  6029. if ( roundNearestEven <> 0 ) then begin
  6030. increment := ord( sbits64( zSig1 ) < 0 );
  6031. end
  6032. else begin
  6033. if ( zSign <> 0 ) then begin
  6034. increment := ord( roundingMode = float_round_down ) and zSig1;
  6035. end
  6036. else begin
  6037. increment := ord( roundingMode = float_round_up ) and zSig1;
  6038. end;
  6039. end;
  6040. if ( increment <> 0 ) then begin
  6041. inc(zSig0);
  6042. zSig0 :=
  6043. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6044. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6045. end;
  6046. result:=packFloatx80( zSign, zExp, zSig0 );
  6047. exit;
  6048. end;
  6049. end;
  6050. if ( zSig1 <> 0 ) then set_inexact_flag;
  6051. if ( increment <> 0 ) then begin
  6052. inc(zSig0);
  6053. if ( zSig0 = 0 ) then begin
  6054. inc(zExp);
  6055. zSig0 := bits64( $8000000000000000 );
  6056. end
  6057. else begin
  6058. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6059. end;
  6060. end
  6061. else begin
  6062. if ( zSig0 = 0 ) then zExp := 0;
  6063. end;
  6064. result:=packFloatx80( zSign, zExp, zSig0 );
  6065. end;
  6066. {*----------------------------------------------------------------------------
  6067. | Takes an abstract floating-point value having sign `zSign', exponent
  6068. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6069. | and returns the proper extended double-precision floating-point value
  6070. | corresponding to the abstract input. This routine is just like
  6071. | `roundAndPackFloatx80' except that the input significand does not have to be
  6072. | normalized.
  6073. *----------------------------------------------------------------------------*}
  6074. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6075. var
  6076. shiftCount: int8;
  6077. begin
  6078. if ( zSig0 = 0 ) then begin
  6079. zSig0 := zSig1;
  6080. zSig1 := 0;
  6081. dec( zExp, 64 );
  6082. end;
  6083. shiftCount := countLeadingZeros64( zSig0 );
  6084. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6085. zExp := zExp - shiftCount;
  6086. result :=
  6087. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6088. end;
  6089. {*----------------------------------------------------------------------------
  6090. | Returns the result of converting the extended double-precision floating-
  6091. | point value `a' to the 32-bit two's complement integer format. The
  6092. | conversion is performed according to the IEC/IEEE Standard for Binary
  6093. | Floating-Point Arithmetic---which means in particular that the conversion
  6094. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6095. | largest positive integer is returned. Otherwise, if the conversion
  6096. | overflows, the largest integer with the same sign as `a' is returned.
  6097. *----------------------------------------------------------------------------*}
  6098. function floatx80_to_int32(a: floatx80): int32;
  6099. var
  6100. aSign: flag;
  6101. aExp, shiftCount: int32;
  6102. aSig: bits64;
  6103. begin
  6104. aSig := extractFloatx80Frac( a );
  6105. aExp := extractFloatx80Exp( a );
  6106. aSign := extractFloatx80Sign( a );
  6107. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6108. shiftCount := $4037 - aExp;
  6109. if ( shiftCount <= 0 ) then shiftCount := 1;
  6110. shift64RightJamming( aSig, shiftCount, aSig );
  6111. result := roundAndPackInt32( aSign, aSig );
  6112. end;
  6113. {*----------------------------------------------------------------------------
  6114. | Returns the result of converting the extended double-precision floating-
  6115. | point value `a' to the 32-bit two's complement integer format. The
  6116. | conversion is performed according to the IEC/IEEE Standard for Binary
  6117. | Floating-Point Arithmetic, except that the conversion is always rounded
  6118. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6119. | Otherwise, if the conversion overflows, the largest integer with the same
  6120. | sign as `a' is returned.
  6121. *----------------------------------------------------------------------------*}
  6122. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6123. var
  6124. aSign: flag;
  6125. aExp, shiftCount: int32;
  6126. aSig, savedASig: bits64;
  6127. z: int32;
  6128. label
  6129. invalid;
  6130. begin
  6131. aSig := extractFloatx80Frac( a );
  6132. aExp := extractFloatx80Exp( a );
  6133. aSign := extractFloatx80Sign( a );
  6134. if ( $401E < aExp ) then begin
  6135. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6136. goto invalid;
  6137. end
  6138. else if ( aExp < $3FFF ) then begin
  6139. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6140. result := 0;
  6141. exit;
  6142. end;
  6143. shiftCount := $403E - aExp;
  6144. savedASig := aSig;
  6145. aSig := aSig shr shiftCount;
  6146. z := aSig;
  6147. if ( aSign <> 0 ) then z := - z;
  6148. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6149. invalid:
  6150. float_raise( float_flag_invalid );
  6151. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6152. exit;
  6153. end;
  6154. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6155. set_inexact_flag;
  6156. end;
  6157. result := z;
  6158. end;
  6159. {*----------------------------------------------------------------------------
  6160. | Returns the result of converting the extended double-precision floating-
  6161. | point value `a' to the 64-bit two's complement integer format. The
  6162. | conversion is performed according to the IEC/IEEE Standard for Binary
  6163. | Floating-Point Arithmetic---which means in particular that the conversion
  6164. | is rounded according to the current rounding mode. If `a' is a NaN,
  6165. | the largest positive integer is returned. Otherwise, if the conversion
  6166. | overflows, the largest integer with the same sign as `a' is returned.
  6167. *----------------------------------------------------------------------------*}
  6168. function floatx80_to_int64(a: floatx80): int64;
  6169. var
  6170. aSign: flag;
  6171. aExp, shiftCount: int32;
  6172. aSig, aSigExtra: bits64;
  6173. begin
  6174. aSig := extractFloatx80Frac( a );
  6175. aExp := extractFloatx80Exp( a );
  6176. aSign := extractFloatx80Sign( a );
  6177. shiftCount := $403E - aExp;
  6178. if ( shiftCount <= 0 ) then begin
  6179. if ( shiftCount <> 0 ) then begin
  6180. float_raise( float_flag_invalid );
  6181. if ( ( aSign = 0 )
  6182. or ( ( aExp = $7FFF )
  6183. and ( aSig <> bits64( $8000000000000000 ) ) )
  6184. ) then begin
  6185. result := $7FFFFFFFFFFFFFFF;
  6186. exit;
  6187. end;
  6188. result := $8000000000000000;
  6189. exit;
  6190. end;
  6191. aSigExtra := 0;
  6192. end
  6193. else begin
  6194. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6195. end;
  6196. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6197. end;
  6198. {*----------------------------------------------------------------------------
  6199. | Returns the result of converting the extended double-precision floating-
  6200. | point value `a' to the 64-bit two's complement integer format. The
  6201. | conversion is performed according to the IEC/IEEE Standard for Binary
  6202. | Floating-Point Arithmetic, except that the conversion is always rounded
  6203. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6204. | Otherwise, if the conversion overflows, the largest integer with the same
  6205. | sign as `a' is returned.
  6206. *----------------------------------------------------------------------------*}
  6207. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6208. var
  6209. aSign: flag;
  6210. aExp, shiftCount: int32;
  6211. aSig: bits64;
  6212. z: int64;
  6213. begin
  6214. aSig := extractFloatx80Frac( a );
  6215. aExp := extractFloatx80Exp( a );
  6216. aSign := extractFloatx80Sign( a );
  6217. shiftCount := aExp - $403E;
  6218. if ( 0 <= shiftCount ) then begin
  6219. aSig := $7FFFFFFFFFFFFFFF;
  6220. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6221. float_raise( float_flag_invalid );
  6222. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6223. result := $7FFFFFFFFFFFFFFF;
  6224. exit;
  6225. end;
  6226. end;
  6227. result := $8000000000000000;
  6228. exit;
  6229. end
  6230. else if ( aExp < $3FFF ) then begin
  6231. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6232. result := 0;
  6233. exit;
  6234. end;
  6235. z := aSig shr ( - shiftCount );
  6236. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6237. set_inexact_flag;
  6238. end;
  6239. if ( aSign <> 0 ) then z := - z;
  6240. result := z;
  6241. end;
  6242. {*----------------------------------------------------------------------------
  6243. | The pattern for a default generated extended double-precision NaN. The
  6244. | `high' and `low' values hold the most- and least-significant bits,
  6245. | respectively.
  6246. *----------------------------------------------------------------------------*}
  6247. const
  6248. floatx80_default_nan_high = $FFFF;
  6249. floatx80_default_nan_low = bits64( $C000000000000000 );
  6250. {*----------------------------------------------------------------------------
  6251. | Returns 1 if the extended double-precision floating-point value `a' is a
  6252. | signaling NaN; otherwise returns 0.
  6253. *----------------------------------------------------------------------------*}
  6254. function floatx80_is_signaling_nan(a : floatx80): flag;
  6255. var
  6256. aLow: bits64;
  6257. begin
  6258. aLow := a.low and not $4000000000000000;
  6259. result := ord(
  6260. ( a.high and $7FFF = $7FFF )
  6261. and ( bits64( aLow shl 1 ) <> 0 )
  6262. and ( a.low = aLow ) );
  6263. end;
  6264. {*----------------------------------------------------------------------------
  6265. | Returns the result of converting the extended double-precision floating-
  6266. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6267. | invalid exception is raised.
  6268. *----------------------------------------------------------------------------*}
  6269. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6270. var
  6271. z: commonNaNT;
  6272. begin
  6273. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6274. z.sign := a.high shr 15;
  6275. z.low := 0;
  6276. z.high := a.low shl 1;
  6277. result := z;
  6278. end;
  6279. {*----------------------------------------------------------------------------
  6280. | Returns 1 if the extended double-precision floating-point value `a' is a
  6281. | NaN; otherwise returns 0.
  6282. *----------------------------------------------------------------------------*}
  6283. function floatx80_is_nan(a : floatx80 ): flag;
  6284. begin
  6285. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6286. end;
  6287. {*----------------------------------------------------------------------------
  6288. | Takes two extended double-precision floating-point values `a' and `b', one
  6289. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6290. | `b' is a signaling NaN, the invalid exception is raised.
  6291. *----------------------------------------------------------------------------*}
  6292. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6293. var
  6294. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6295. label
  6296. returnLargerSignificand;
  6297. begin
  6298. aIsNaN := floatx80_is_nan( a );
  6299. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6300. bIsNaN := floatx80_is_nan( b );
  6301. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6302. a.low := a.low or $C000000000000000;
  6303. b.low := b.low or $C000000000000000;
  6304. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6305. if aIsSignalingNaN <> 0 then begin
  6306. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6307. if bIsNaN <> 0 then result := b else result := a;
  6308. exit;
  6309. end
  6310. else if aIsNaN <>0 then begin
  6311. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6312. result := a;
  6313. exit;
  6314. end;
  6315. returnLargerSignificand:
  6316. if ( a.low < b.low ) then begin
  6317. result := b;
  6318. exit;
  6319. end;
  6320. if ( b.low < a.low ) then begin
  6321. result := a;
  6322. exit;
  6323. end;
  6324. if a.high < b.high then result := a else result := b;
  6325. exit;
  6326. end
  6327. else
  6328. result := b;
  6329. end;
  6330. {*----------------------------------------------------------------------------
  6331. | Returns the result of converting the extended double-precision floating-
  6332. | point value `a' to the single-precision floating-point format. The
  6333. | conversion is performed according to the IEC/IEEE Standard for Binary
  6334. | Floating-Point Arithmetic.
  6335. *----------------------------------------------------------------------------*}
  6336. function floatx80_to_float32(a: floatx80): float32;
  6337. var
  6338. aSign: flag;
  6339. aExp: int32;
  6340. aSig: bits64;
  6341. begin
  6342. aSig := extractFloatx80Frac( a );
  6343. aExp := extractFloatx80Exp( a );
  6344. aSign := extractFloatx80Sign( a );
  6345. if ( aExp = $7FFF ) then begin
  6346. if bits64( aSig shl 1 ) <> 0 then begin
  6347. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6348. exit;
  6349. end;
  6350. result := packFloat32( aSign, $FF, 0 );
  6351. exit;
  6352. end;
  6353. shift64RightJamming( aSig, 33, aSig );
  6354. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6355. result := roundAndPackFloat32( aSign, aExp, aSig );
  6356. end;
  6357. {*----------------------------------------------------------------------------
  6358. | Returns the result of converting the extended double-precision floating-
  6359. | point value `a' to the double-precision floating-point format. The
  6360. | conversion is performed according to the IEC/IEEE Standard for Binary
  6361. | Floating-Point Arithmetic.
  6362. *----------------------------------------------------------------------------*}
  6363. function floatx80_to_float64(a: floatx80): float64;
  6364. var
  6365. aSign: flag;
  6366. aExp: int32;
  6367. aSig, zSig: bits64;
  6368. begin
  6369. aSig := extractFloatx80Frac( a );
  6370. aExp := extractFloatx80Exp( a );
  6371. aSign := extractFloatx80Sign( a );
  6372. if ( aExp = $7FFF ) then begin
  6373. if bits64( aSig shl 1 ) <> 0 then begin
  6374. commonNaNToFloat64( floatx80ToCommonNaN( a ), result );
  6375. exit;
  6376. end;
  6377. result := packFloat64( aSign, $7FF, 0 );
  6378. exit;
  6379. end;
  6380. shift64RightJamming( aSig, 1, zSig );
  6381. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6382. result := roundAndPackFloat64( aSign, aExp, zSig );
  6383. end;
  6384. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6385. {*----------------------------------------------------------------------------
  6386. | Returns the result of converting the extended double-precision floating-
  6387. | point value `a' to the quadruple-precision floating-point format. The
  6388. | conversion is performed according to the IEC/IEEE Standard for Binary
  6389. | Floating-Point Arithmetic.
  6390. *----------------------------------------------------------------------------*}
  6391. function floatx80_to_float128(a: floatx80): float128;
  6392. var
  6393. aSign: flag;
  6394. aExp: int16;
  6395. aSig, zSig0, zSig1: bits64;
  6396. begin
  6397. aSig := extractFloatx80Frac( a );
  6398. aExp := extractFloatx80Exp( a );
  6399. aSign := extractFloatx80Sign( a );
  6400. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6401. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6402. exit;
  6403. end;
  6404. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6405. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6406. end;
  6407. {$endif FPC_SOFTFLOAT_FLOAT128}
  6408. {*----------------------------------------------------------------------------
  6409. | Rounds the extended double-precision floating-point value `a' to an integer,
  6410. | and Returns the result as an extended quadruple-precision floating-point
  6411. | value. The operation is performed according to the IEC/IEEE Standard for
  6412. | Binary Floating-Point Arithmetic.
  6413. *----------------------------------------------------------------------------*}
  6414. function floatx80_round_to_int(a: floatx80): floatx80;
  6415. var
  6416. aSign: flag;
  6417. aExp: int32;
  6418. lastBitMask, roundBitsMask: bits64;
  6419. roundingMode: TFPURoundingMode;
  6420. z: floatx80;
  6421. begin
  6422. aExp := extractFloatx80Exp( a );
  6423. if ( $403E <= aExp ) then begin
  6424. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6425. result := propagateFloatx80NaN( a, a );
  6426. exit;
  6427. end;
  6428. result := a;
  6429. exit;
  6430. end;
  6431. if ( aExp < $3FFF ) then begin
  6432. if ( ( aExp = 0 )
  6433. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6434. result := a;
  6435. exit;
  6436. end;
  6437. set_inexact_flag;
  6438. aSign := extractFloatx80Sign( a );
  6439. case softfloat_rounding_mode of
  6440. float_round_nearest_even:
  6441. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6442. ) then begin
  6443. result :=
  6444. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6445. exit;
  6446. end;
  6447. float_round_down: begin
  6448. if aSign <> 0 then
  6449. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6450. else
  6451. result := packFloatx80( 0, 0, 0 );
  6452. exit;
  6453. end;
  6454. float_round_up: begin
  6455. if aSign <> 0 then
  6456. result := packFloatx80( 1, 0, 0 )
  6457. else
  6458. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6459. exit;
  6460. end;
  6461. end;
  6462. result := packFloatx80( aSign, 0, 0 );
  6463. exit;
  6464. end;
  6465. lastBitMask := 1;
  6466. lastBitMask := lastBitMask shl ( $403E - aExp );
  6467. roundBitsMask := lastBitMask - 1;
  6468. z := a;
  6469. roundingMode := softfloat_rounding_mode;
  6470. if ( roundingMode = float_round_nearest_even ) then begin
  6471. inc( z.low, lastBitMask shr 1 );
  6472. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6473. end
  6474. else if ( roundingMode <> float_round_to_zero ) then begin
  6475. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6476. inc( z.low, roundBitsMask );
  6477. end;
  6478. end;
  6479. z.low := z.low and not roundBitsMask;
  6480. if ( z.low = 0 ) then begin
  6481. inc(z.high);
  6482. z.low := bits64( $8000000000000000 );
  6483. end;
  6484. if ( z.low <> a.low ) then set_inexact_flag;
  6485. result := z;
  6486. end;
  6487. {*----------------------------------------------------------------------------
  6488. | Returns the result of adding the absolute values of the extended double-
  6489. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6490. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6491. | The addition is performed according to the IEC/IEEE Standard for Binary
  6492. | Floating-Point Arithmetic.
  6493. *----------------------------------------------------------------------------*}
  6494. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6495. var
  6496. aExp, bExp, zExp: int32;
  6497. aSig, bSig, zSig0, zSig1: bits64;
  6498. expDiff: int32;
  6499. label
  6500. shiftRight1, roundAndPack;
  6501. begin
  6502. aSig := extractFloatx80Frac( a );
  6503. aExp := extractFloatx80Exp( a );
  6504. bSig := extractFloatx80Frac( b );
  6505. bExp := extractFloatx80Exp( b );
  6506. expDiff := aExp - bExp;
  6507. if ( 0 < expDiff ) then begin
  6508. if ( aExp = $7FFF ) then begin
  6509. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6510. result := propagateFloatx80NaN( a, b );
  6511. exit;
  6512. end;
  6513. result := a;
  6514. exit;
  6515. end;
  6516. if ( bExp = 0 ) then dec(expDiff);
  6517. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6518. zExp := aExp;
  6519. end
  6520. else if ( expDiff < 0 ) then begin
  6521. if ( bExp = $7FFF ) then begin
  6522. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6523. result := propagateFloatx80NaN( a, b );
  6524. exit;
  6525. end;
  6526. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6527. exit;
  6528. end;
  6529. if ( aExp = 0 ) then inc(expDiff);
  6530. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6531. zExp := bExp;
  6532. end
  6533. else begin
  6534. if ( aExp = $7FFF ) then begin
  6535. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6536. result := propagateFloatx80NaN( a, b );
  6537. exit;
  6538. end;
  6539. result := a;
  6540. exit;
  6541. end;
  6542. zSig1 := 0;
  6543. zSig0 := aSig + bSig;
  6544. if ( aExp = 0 ) then begin
  6545. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6546. goto roundAndPack;
  6547. end;
  6548. zExp := aExp;
  6549. goto shiftRight1;
  6550. end;
  6551. zSig0 := aSig + bSig;
  6552. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6553. shiftRight1:
  6554. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6555. zSig0 := zSig0 or $8000000000000000;
  6556. inc(zExp);
  6557. roundAndPack:
  6558. result :=
  6559. roundAndPackFloatx80(
  6560. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6561. end;
  6562. {*----------------------------------------------------------------------------
  6563. | Returns the result of subtracting the absolute values of the extended
  6564. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6565. | difference is negated before being returned. `zSign' is ignored if the
  6566. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6567. | Standard for Binary Floating-Point Arithmetic.
  6568. *----------------------------------------------------------------------------*}
  6569. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6570. var
  6571. aExp, bExp, zExp: int32;
  6572. aSig, bSig, zSig0, zSig1: bits64;
  6573. expDiff: int32;
  6574. z: floatx80;
  6575. label
  6576. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6577. begin
  6578. aSig := extractFloatx80Frac( a );
  6579. aExp := extractFloatx80Exp( a );
  6580. bSig := extractFloatx80Frac( b );
  6581. bExp := extractFloatx80Exp( b );
  6582. expDiff := aExp - bExp;
  6583. if ( 0 < expDiff ) then goto aExpBigger;
  6584. if ( expDiff < 0 ) then goto bExpBigger;
  6585. if ( aExp = $7FFF ) then begin
  6586. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6587. result := propagateFloatx80NaN( a, b );
  6588. exit;
  6589. end;
  6590. float_raise( float_flag_invalid );
  6591. z.low := floatx80_default_nan_low;
  6592. z.high := floatx80_default_nan_high;
  6593. result := z;
  6594. exit;
  6595. end;
  6596. if ( aExp = 0 ) then begin
  6597. aExp := 1;
  6598. bExp := 1;
  6599. end;
  6600. zSig1 := 0;
  6601. if ( bSig < aSig ) then goto aBigger;
  6602. if ( aSig < bSig ) then goto bBigger;
  6603. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6604. exit;
  6605. bExpBigger:
  6606. if ( bExp = $7FFF ) then begin
  6607. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6608. result := propagateFloatx80NaN( a, b );
  6609. exit;
  6610. end;
  6611. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6612. exit;
  6613. end;
  6614. if ( aExp = 0 ) then inc(expDiff);
  6615. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6616. bBigger:
  6617. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6618. zExp := bExp;
  6619. zSign := zSign xor 1;
  6620. goto normalizeRoundAndPack;
  6621. aExpBigger:
  6622. if ( aExp = $7FFF ) then begin
  6623. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6624. result := propagateFloatx80NaN( a, b );
  6625. exit;
  6626. end;
  6627. result := a;
  6628. exit;
  6629. end;
  6630. if ( bExp = 0 ) then dec(expDiff);
  6631. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6632. aBigger:
  6633. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6634. zExp := aExp;
  6635. normalizeRoundAndPack:
  6636. result :=
  6637. normalizeRoundAndPackFloatx80(
  6638. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6639. end;
  6640. {*----------------------------------------------------------------------------
  6641. | Returns the result of adding the extended double-precision floating-point
  6642. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6643. | Standard for Binary Floating-Point Arithmetic.
  6644. *----------------------------------------------------------------------------*}
  6645. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6646. var
  6647. aSign, bSign: flag;
  6648. begin
  6649. aSign := extractFloatx80Sign( a );
  6650. bSign := extractFloatx80Sign( b );
  6651. if ( aSign = bSign ) then begin
  6652. result := addFloatx80Sigs( a, b, aSign );
  6653. end
  6654. else begin
  6655. result := subFloatx80Sigs( a, b, aSign );
  6656. end;
  6657. end;
  6658. {*----------------------------------------------------------------------------
  6659. | Returns the result of subtracting the extended double-precision floating-
  6660. | point values `a' and `b'. The operation is performed according to the
  6661. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6662. *----------------------------------------------------------------------------*}
  6663. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6664. var
  6665. aSign, bSign: flag;
  6666. begin
  6667. aSign := extractFloatx80Sign( a );
  6668. bSign := extractFloatx80Sign( b );
  6669. if ( aSign = bSign ) then begin
  6670. result := subFloatx80Sigs( a, b, aSign );
  6671. end
  6672. else begin
  6673. result := addFloatx80Sigs( a, b, aSign );
  6674. end;
  6675. end;
  6676. {*----------------------------------------------------------------------------
  6677. | Returns the result of multiplying the extended double-precision floating-
  6678. | point values `a' and `b'. The operation is performed according to the
  6679. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6680. *----------------------------------------------------------------------------*}
  6681. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6682. var
  6683. aSign, bSign, zSign: flag;
  6684. aExp, bExp, zExp: int32;
  6685. aSig, bSig, zSig0, zSig1: bits64;
  6686. z: floatx80;
  6687. label
  6688. invalid;
  6689. begin
  6690. aSig := extractFloatx80Frac( a );
  6691. aExp := extractFloatx80Exp( a );
  6692. aSign := extractFloatx80Sign( a );
  6693. bSig := extractFloatx80Frac( b );
  6694. bExp := extractFloatx80Exp( b );
  6695. bSign := extractFloatx80Sign( b );
  6696. zSign := aSign xor bSign;
  6697. if ( aExp = $7FFF ) then begin
  6698. if ( bits64( aSig shl 1 ) <> 0 )
  6699. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6700. result := propagateFloatx80NaN( a, b );
  6701. exit;
  6702. end;
  6703. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6704. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6705. exit;
  6706. end;
  6707. if ( bExp = $7FFF ) then begin
  6708. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6709. result := propagateFloatx80NaN( a, b );
  6710. exit;
  6711. end;
  6712. if ( ( aExp or aSig ) = 0 ) then begin
  6713. invalid:
  6714. float_raise( float_flag_invalid );
  6715. z.low := floatx80_default_nan_low;
  6716. z.high := floatx80_default_nan_high;
  6717. result := z;
  6718. exit;
  6719. end;
  6720. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6721. exit;
  6722. end;
  6723. if ( aExp = 0 ) then begin
  6724. if ( aSig = 0 ) then begin
  6725. result := packFloatx80( zSign, 0, 0 );
  6726. exit;
  6727. end;
  6728. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6729. end;
  6730. if ( bExp = 0 ) then begin
  6731. if ( bSig = 0 ) then begin
  6732. result := packFloatx80( zSign, 0, 0 );
  6733. exit;
  6734. end;
  6735. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6736. end;
  6737. zExp := aExp + bExp - $3FFE;
  6738. mul64To128( aSig, bSig, zSig0, zSig1 );
  6739. if 0 < sbits64( zSig0 ) then begin
  6740. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6741. dec(zExp);
  6742. end;
  6743. result :=
  6744. roundAndPackFloatx80(
  6745. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6746. end;
  6747. {*----------------------------------------------------------------------------
  6748. | Returns the result of dividing the extended double-precision floating-point
  6749. | value `a' by the corresponding value `b'. The operation is performed
  6750. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6751. *----------------------------------------------------------------------------*}
  6752. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6753. var
  6754. aSign, bSign, zSign: flag;
  6755. aExp, bExp, zExp: int32;
  6756. aSig, bSig, zSig0, zSig1: bits64;
  6757. rem0, rem1, rem2, term0, term1, term2: bits64;
  6758. z: floatx80;
  6759. label
  6760. invalid;
  6761. begin
  6762. aSig := extractFloatx80Frac( a );
  6763. aExp := extractFloatx80Exp( a );
  6764. aSign := extractFloatx80Sign( a );
  6765. bSig := extractFloatx80Frac( b );
  6766. bExp := extractFloatx80Exp( b );
  6767. bSign := extractFloatx80Sign( b );
  6768. zSign := aSign xor bSign;
  6769. if ( aExp = $7FFF ) then begin
  6770. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6771. result := propagateFloatx80NaN( a, b );
  6772. exit;
  6773. end;
  6774. if ( bExp = $7FFF ) then begin
  6775. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6776. result := propagateFloatx80NaN( a, b );
  6777. exit;
  6778. end;
  6779. goto invalid;
  6780. end;
  6781. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6782. exit;
  6783. end;
  6784. if ( bExp = $7FFF ) then begin
  6785. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6786. result := propagateFloatx80NaN( a, b );
  6787. exit;
  6788. end;
  6789. result := packFloatx80( zSign, 0, 0 );
  6790. exit;
  6791. end;
  6792. if ( bExp = 0 ) then begin
  6793. if ( bSig = 0 ) then begin
  6794. if ( ( aExp or aSig ) = 0 ) then begin
  6795. invalid:
  6796. float_raise( float_flag_invalid );
  6797. z.low := floatx80_default_nan_low;
  6798. z.high := floatx80_default_nan_high;
  6799. result := z;
  6800. exit;
  6801. end;
  6802. float_raise( float_flag_divbyzero );
  6803. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6804. exit;
  6805. end;
  6806. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6807. end;
  6808. if ( aExp = 0 ) then begin
  6809. if ( aSig = 0 ) then begin
  6810. result := packFloatx80( zSign, 0, 0 );
  6811. exit;
  6812. end;
  6813. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6814. end;
  6815. zExp := aExp - bExp + $3FFE;
  6816. rem1 := 0;
  6817. if ( bSig <= aSig ) then begin
  6818. shift128Right( aSig, 0, 1, aSig, rem1 );
  6819. inc(zExp);
  6820. end;
  6821. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6822. mul64To128( bSig, zSig0, term0, term1 );
  6823. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6824. while ( sbits64( rem0 ) < 0 ) do begin
  6825. dec(zSig0);
  6826. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6827. end;
  6828. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6829. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6830. mul64To128( bSig, zSig1, term1, term2 );
  6831. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6832. while ( sbits64( rem1 ) < 0 ) do begin
  6833. dec(zSig1);
  6834. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6835. end;
  6836. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6837. end;
  6838. result :=
  6839. roundAndPackFloatx80(
  6840. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6841. end;
  6842. {*----------------------------------------------------------------------------
  6843. | Returns the remainder of the extended double-precision floating-point value
  6844. | `a' with respect to the corresponding value `b'. The operation is performed
  6845. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6846. *----------------------------------------------------------------------------*}
  6847. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6848. var
  6849. aSign, zSign: flag;
  6850. aExp, bExp, expDiff: int32;
  6851. aSig0, aSig1, bSig: bits64;
  6852. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6853. z: floatx80;
  6854. label
  6855. invalid;
  6856. begin
  6857. aSig0 := extractFloatx80Frac( a );
  6858. aExp := extractFloatx80Exp( a );
  6859. aSign := extractFloatx80Sign( a );
  6860. bSig := extractFloatx80Frac( b );
  6861. bExp := extractFloatx80Exp( b );
  6862. if ( aExp = $7FFF ) then begin
  6863. if ( bits64( aSig0 shl 1 ) <> 0 )
  6864. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6865. result := propagateFloatx80NaN( a, b );
  6866. exit;
  6867. end;
  6868. goto invalid;
  6869. end;
  6870. if ( bExp = $7FFF ) then begin
  6871. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6872. result := propagateFloatx80NaN( a, b );
  6873. exit;
  6874. end;
  6875. result := a;
  6876. exit;
  6877. end;
  6878. if ( bExp = 0 ) then begin
  6879. if ( bSig = 0 ) then begin
  6880. invalid:
  6881. float_raise( float_flag_invalid );
  6882. z.low := floatx80_default_nan_low;
  6883. z.high := floatx80_default_nan_high;
  6884. result := z;
  6885. exit;
  6886. end;
  6887. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6888. end;
  6889. if ( aExp = 0 ) then begin
  6890. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6891. result := a;
  6892. exit;
  6893. end;
  6894. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6895. end;
  6896. bSig := bSig or $8000000000000000;
  6897. zSign := aSign;
  6898. expDiff := aExp - bExp;
  6899. aSig1 := 0;
  6900. if ( expDiff < 0 ) then begin
  6901. if ( expDiff < -1 ) then begin
  6902. result := a;
  6903. exit;
  6904. end;
  6905. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6906. expDiff := 0;
  6907. end;
  6908. q := ord( bSig <= aSig0 );
  6909. if ( q <> 0 ) then dec( aSig0, bSig );
  6910. dec( expDiff, 64 );
  6911. while ( 0 < expDiff ) do begin
  6912. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6913. if ( 2 < q ) then q := q - 2 else q := 0;
  6914. mul64To128( bSig, q, term0, term1 );
  6915. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6916. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6917. dec( expDiff, 62 );
  6918. end;
  6919. inc( expDiff, 64 );
  6920. if ( 0 < expDiff ) then begin
  6921. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6922. if ( 2 < q ) then q:= q - 2 else q := 0;
  6923. q := q shr ( 64 - expDiff );
  6924. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6925. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6926. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6927. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  6928. inc(q);
  6929. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6930. end;
  6931. end
  6932. else begin
  6933. term1 := 0;
  6934. term0 := bSig;
  6935. end;
  6936. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6937. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  6938. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  6939. and ( q and 1 <> 0 ) )
  6940. then begin
  6941. aSig0 := alternateASig0;
  6942. aSig1 := alternateASig1;
  6943. zSign := ord( zSign = 0 );
  6944. end;
  6945. result :=
  6946. normalizeRoundAndPackFloatx80(
  6947. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6948. end;
  6949. {*----------------------------------------------------------------------------
  6950. | Returns the square root of the extended double-precision floating-point
  6951. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6952. | for Binary Floating-Point Arithmetic.
  6953. *----------------------------------------------------------------------------*}
  6954. function floatx80_sqrt(a: floatx80): floatx80;
  6955. var
  6956. aSign: flag;
  6957. aExp, zExp: int32;
  6958. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6959. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6960. z: floatx80;
  6961. label
  6962. invalid;
  6963. begin
  6964. aSig0 := extractFloatx80Frac( a );
  6965. aExp := extractFloatx80Exp( a );
  6966. aSign := extractFloatx80Sign( a );
  6967. if ( aExp = $7FFF ) then begin
  6968. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  6969. result := propagateFloatx80NaN( a, a );
  6970. exit;
  6971. end;
  6972. if ( aSign = 0 ) then begin
  6973. result := a;
  6974. exit;
  6975. end;
  6976. goto invalid;
  6977. end;
  6978. if ( aSign <> 0 ) then begin
  6979. if ( ( aExp or aSig0 ) = 0 ) then begin
  6980. result := a;
  6981. exit;
  6982. end;
  6983. invalid:
  6984. float_raise( float_flag_invalid );
  6985. z.low := floatx80_default_nan_low;
  6986. z.high := floatx80_default_nan_high;
  6987. result := z;
  6988. exit;
  6989. end;
  6990. if ( aExp = 0 ) then begin
  6991. if ( aSig0 = 0 ) then begin
  6992. result := packFloatx80( 0, 0, 0 );
  6993. exit;
  6994. end;
  6995. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6996. end;
  6997. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  6998. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  6999. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7000. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7001. doubleZSig0 := zSig0 shl 1;
  7002. mul64To128( zSig0, zSig0, term0, term1 );
  7003. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7004. while ( sbits64( rem0 ) < 0 ) do begin
  7005. dec(zSig0);
  7006. dec( doubleZSig0, 2 );
  7007. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7008. end;
  7009. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7010. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7011. if ( zSig1 = 0 ) then zSig1 := 1;
  7012. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7013. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7014. mul64To128( zSig1, zSig1, term2, term3 );
  7015. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7016. while ( sbits64( rem1 ) < 0 ) do begin
  7017. dec(zSig1);
  7018. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7019. term3 := term3 or 1;
  7020. term2 := term2 or doubleZSig0;
  7021. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7022. end;
  7023. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7024. end;
  7025. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7026. zSig0 := zSig0 or doubleZSig0;
  7027. result :=
  7028. roundAndPackFloatx80(
  7029. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7030. end;
  7031. {*----------------------------------------------------------------------------
  7032. | Returns 1 if the extended double-precision floating-point value `a' is
  7033. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7034. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7035. | Arithmetic.
  7036. *----------------------------------------------------------------------------*}
  7037. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7038. begin
  7039. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7040. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7041. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7042. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7043. ) then begin
  7044. if ( floatx80_is_signaling_nan( a )
  7045. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7046. float_raise( float_flag_invalid );
  7047. end;
  7048. result := 0;
  7049. exit;
  7050. end;
  7051. result := ord(
  7052. ( a.low = b.low )
  7053. and ( ( a.high = b.high )
  7054. or ( ( a.low = 0 )
  7055. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7056. ) );
  7057. end;
  7058. {*----------------------------------------------------------------------------
  7059. | Returns 1 if the extended double-precision floating-point value `a' is
  7060. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7061. | comparison is performed according to the IEC/IEEE Standard for Binary
  7062. | Floating-Point Arithmetic.
  7063. *----------------------------------------------------------------------------*}
  7064. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7065. var
  7066. aSign, bSign: flag;
  7067. begin
  7068. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7069. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7070. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7071. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7072. then begin
  7073. float_raise( float_flag_invalid );
  7074. result := 0;
  7075. exit;
  7076. end;
  7077. aSign := extractFloatx80Sign( a );
  7078. bSign := extractFloatx80Sign( b );
  7079. if ( aSign <> bSign ) then begin
  7080. result := ord(
  7081. ( aSign <> 0 )
  7082. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7083. exit;
  7084. end;
  7085. if aSign<>0 then
  7086. result := le128( b.high, b.low, a.high, a.low )
  7087. else
  7088. result := le128( a.high, a.low, b.high, b.low );
  7089. end;
  7090. {*----------------------------------------------------------------------------
  7091. | Returns 1 if the extended double-precision floating-point value `a' is
  7092. | less than the corresponding value `b', and 0 otherwise. The comparison
  7093. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7094. | Arithmetic.
  7095. *----------------------------------------------------------------------------*}
  7096. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7097. var
  7098. aSign, bSign: flag;
  7099. begin
  7100. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7101. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7102. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7103. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7104. then begin
  7105. float_raise( float_flag_invalid );
  7106. result := 0;
  7107. exit;
  7108. end;
  7109. aSign := extractFloatx80Sign( a );
  7110. bSign := extractFloatx80Sign( b );
  7111. if ( aSign <> bSign ) then begin
  7112. result := ord(
  7113. ( aSign <> 0 )
  7114. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7115. exit;
  7116. end;
  7117. if aSign <> 0 then
  7118. result := lt128( b.high, b.low, a.high, a.low )
  7119. else
  7120. result := lt128( a.high, a.low, b.high, b.low );
  7121. end;
  7122. {*----------------------------------------------------------------------------
  7123. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7124. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7125. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7126. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7127. *----------------------------------------------------------------------------*}
  7128. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7129. begin
  7130. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7131. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7132. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7133. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7134. then begin
  7135. float_raise( float_flag_invalid );
  7136. result := 0;
  7137. exit;
  7138. end;
  7139. result := ord(
  7140. ( a.low = b.low )
  7141. and ( ( a.high = b.high )
  7142. or ( ( a.low = 0 )
  7143. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7144. ) );
  7145. end;
  7146. {*----------------------------------------------------------------------------
  7147. | Returns 1 if the extended double-precision floating-point value `a' is less
  7148. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7149. | do not cause an exception. Otherwise, the comparison is performed according
  7150. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7151. *----------------------------------------------------------------------------*}
  7152. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7153. var
  7154. aSign, bSign: flag;
  7155. begin
  7156. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7157. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7158. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7159. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7160. then begin
  7161. if ( floatx80_is_signaling_nan( a )
  7162. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7163. float_raise( float_flag_invalid );
  7164. end;
  7165. result := 0;
  7166. exit;
  7167. end;
  7168. aSign := extractFloatx80Sign( a );
  7169. bSign := extractFloatx80Sign( b );
  7170. if ( aSign <> bSign ) then begin
  7171. result := ord(
  7172. ( aSign <> 0 )
  7173. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7174. exit;
  7175. end;
  7176. if aSign <> 0 then
  7177. result := le128( b.high, b.low, a.high, a.low )
  7178. else
  7179. result := le128( a.high, a.low, b.high, b.low );
  7180. end;
  7181. {*----------------------------------------------------------------------------
  7182. | Returns 1 if the extended double-precision floating-point value `a' is less
  7183. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7184. | an exception. Otherwise, the comparison is performed according to the
  7185. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7186. *----------------------------------------------------------------------------*}
  7187. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7188. var
  7189. aSign, bSign: flag;
  7190. begin
  7191. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7192. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7193. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7194. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7195. then begin
  7196. if ( floatx80_is_signaling_nan( a )
  7197. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7198. float_raise( float_flag_invalid );
  7199. end;
  7200. result := 0;
  7201. exit;
  7202. end;
  7203. aSign := extractFloatx80Sign( a );
  7204. bSign := extractFloatx80Sign( b );
  7205. if ( aSign <> bSign ) then begin
  7206. result := ord(
  7207. ( aSign <> 0 )
  7208. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7209. exit;
  7210. end;
  7211. if aSign <> 0 then
  7212. result := lt128( b.high, b.low, a.high, a.low )
  7213. else
  7214. result := lt128( a.high, a.low, b.high, b.low );
  7215. end;
  7216. {$endif FPC_SOFTFLOAT_FLOATX80}
  7217. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7218. {*----------------------------------------------------------------------------
  7219. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7220. | floating-point value `a'.
  7221. *----------------------------------------------------------------------------*}
  7222. function extractFloat128Frac1(a : float128): bits64;
  7223. begin
  7224. result:=a.low;
  7225. end;
  7226. {*----------------------------------------------------------------------------
  7227. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7228. | floating-point value `a'.
  7229. *----------------------------------------------------------------------------*}
  7230. function extractFloat128Frac0(a : float128): bits64;
  7231. begin
  7232. result:=a.high and int64($0000FFFFFFFFFFFF);
  7233. end;
  7234. {*----------------------------------------------------------------------------
  7235. | Returns the exponent bits of the quadruple-precision floating-point value
  7236. | `a'.
  7237. *----------------------------------------------------------------------------*}
  7238. function extractFloat128Exp(a : float128): int32;
  7239. begin
  7240. result:=( a.high shr 48 ) and $7FFF;
  7241. end;
  7242. {*----------------------------------------------------------------------------
  7243. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7244. *----------------------------------------------------------------------------*}
  7245. function extractFloat128Sign(a : float128): flag;
  7246. begin
  7247. result:=a.high shr 63;
  7248. end;
  7249. {*----------------------------------------------------------------------------
  7250. | Normalizes the subnormal quadruple-precision floating-point value
  7251. | represented by the denormalized significand formed by the concatenation of
  7252. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7253. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7254. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7255. | least significant 64 bits of the normalized significand are stored at the
  7256. | location pointed to by `zSig1Ptr'.
  7257. *----------------------------------------------------------------------------*}
  7258. procedure normalizeFloat128Subnormal(
  7259. aSig0: bits64;
  7260. aSig1: bits64;
  7261. var zExpPtr: int32;
  7262. var zSig0Ptr: bits64;
  7263. var zSig1Ptr: bits64);
  7264. var
  7265. shiftCount: int8;
  7266. begin
  7267. if ( aSig0 = 0 ) then
  7268. begin
  7269. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7270. if ( shiftCount < 0 ) then
  7271. begin
  7272. zSig0Ptr := aSig1 shr ( - shiftCount );
  7273. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7274. end
  7275. else begin
  7276. zSig0Ptr := aSig1 shl shiftCount;
  7277. zSig1Ptr := 0;
  7278. end;
  7279. zExpPtr := - shiftCount - 63;
  7280. end
  7281. else begin
  7282. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7283. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7284. zExpPtr := 1 - shiftCount;
  7285. end;
  7286. end;
  7287. {*----------------------------------------------------------------------------
  7288. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7289. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7290. | floating-point value, returning the result. After being shifted into the
  7291. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7292. | added together to form the most significant 32 bits of the result. This
  7293. | means that any integer portion of `zSig0' will be added into the exponent.
  7294. | Since a properly normalized significand will have an integer portion equal
  7295. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7296. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7297. | significand.
  7298. *----------------------------------------------------------------------------*}
  7299. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7300. var
  7301. z: float128;
  7302. begin
  7303. z.low := zSig1;
  7304. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7305. result:=z;
  7306. end;
  7307. {*----------------------------------------------------------------------------
  7308. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7309. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7310. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7311. | corresponding to the abstract input. Ordinarily, the abstract value is
  7312. | simply rounded and packed into the quadruple-precision format, with the
  7313. | inexact exception raised if the abstract input cannot be represented
  7314. | exactly. However, if the abstract value is too large, the overflow and
  7315. | inexact exceptions are raised and an infinity or maximal finite value is
  7316. | returned. If the abstract value is too small, the input value is rounded to
  7317. | a subnormal number, and the underflow and inexact exceptions are raised if
  7318. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7319. | precision floating-point number.
  7320. | The input significand must be normalized or smaller. If the input
  7321. | significand is not normalized, `zExp' must be 0; in that case, the result
  7322. | returned is a subnormal number, and it must not require rounding. In the
  7323. | usual case that the input significand is normalized, `zExp' must be 1 less
  7324. | than the ``true'' floating-point exponent. The handling of underflow and
  7325. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7326. *----------------------------------------------------------------------------*}
  7327. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7328. var
  7329. roundingMode: TFPURoundingMode;
  7330. roundNearestEven, increment, isTiny: flag;
  7331. begin
  7332. roundingMode := softfloat_rounding_mode;
  7333. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7334. increment := ord( sbits64(zSig2) < 0 );
  7335. if ( roundNearestEven=0 ) then
  7336. begin
  7337. if ( roundingMode = float_round_to_zero ) then
  7338. begin
  7339. increment := 0;
  7340. end
  7341. else begin
  7342. if ( zSign<>0 ) then
  7343. begin
  7344. increment := ord( roundingMode = float_round_down ) and zSig2;
  7345. end
  7346. else begin
  7347. increment := ord( roundingMode = float_round_up ) and zSig2;
  7348. end;
  7349. end;
  7350. end;
  7351. if ( $7FFD <= bits32(zExp) ) then
  7352. begin
  7353. if ( ord( $7FFD < zExp )
  7354. or ( ord( zExp = $7FFD )
  7355. and eq128(
  7356. int64( $0001FFFFFFFFFFFF ),
  7357. bits64( $FFFFFFFFFFFFFFFF ),
  7358. zSig0,
  7359. zSig1
  7360. )
  7361. and increment
  7362. )
  7363. )<>0 then
  7364. begin
  7365. float_raise( [float_flag_overflow,float_flag_inexact] );
  7366. if ( ord( roundingMode = float_round_to_zero )
  7367. or ( zSign and ord( roundingMode = float_round_up ) )
  7368. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7369. )<>0 then
  7370. begin
  7371. result :=
  7372. packFloat128(
  7373. zSign,
  7374. $7FFE,
  7375. int64( $0000FFFFFFFFFFFF ),
  7376. bits64( $FFFFFFFFFFFFFFFF )
  7377. );
  7378. exit;
  7379. end;
  7380. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7381. exit;
  7382. end;
  7383. if ( zExp < 0 ) then
  7384. begin
  7385. isTiny :=
  7386. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7387. or ( zExp < -1 )
  7388. or not( increment<>0 )
  7389. or boolean(lt128(
  7390. zSig0,
  7391. zSig1,
  7392. int64( $0001FFFFFFFFFFFF ),
  7393. bits64( $FFFFFFFFFFFFFFFF )
  7394. )));
  7395. shift128ExtraRightJamming(
  7396. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7397. zExp := 0;
  7398. if ( isTiny and zSig2 )<>0 then
  7399. float_raise( float_flag_underflow );
  7400. if ( roundNearestEven<>0 ) then
  7401. begin
  7402. increment := ord( sbits64(zSig2) < 0 );
  7403. end
  7404. else begin
  7405. if ( zSign<>0 ) then
  7406. begin
  7407. increment := ord( roundingMode = float_round_down ) and zSig2;
  7408. end
  7409. else begin
  7410. increment := ord( roundingMode = float_round_up ) and zSig2;
  7411. end;
  7412. end;
  7413. end;
  7414. end;
  7415. if ( zSig2<>0 ) then
  7416. set_inexact_flag;
  7417. if ( increment<>0 ) then
  7418. begin
  7419. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7420. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7421. end
  7422. else begin
  7423. if ( ( zSig0 or zSig1 ) = 0 ) then
  7424. zExp := 0;
  7425. end;
  7426. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7427. end;
  7428. {*----------------------------------------------------------------------------
  7429. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7430. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7431. | returns the proper quadruple-precision floating-point value corresponding
  7432. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7433. | except that the input significand has fewer bits and does not have to be
  7434. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7435. | point exponent.
  7436. *----------------------------------------------------------------------------*}
  7437. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7438. var
  7439. shiftCount: int8;
  7440. zSig2: bits64;
  7441. begin
  7442. if ( zSig0 = 0 ) then
  7443. begin
  7444. zSig0 := zSig1;
  7445. zSig1 := 0;
  7446. dec(zExp, 64);
  7447. end;
  7448. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7449. if ( 0 <= shiftCount ) then
  7450. begin
  7451. zSig2 := 0;
  7452. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7453. end
  7454. else begin
  7455. shift128ExtraRightJamming(
  7456. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7457. end;
  7458. dec(zExp, shiftCount);
  7459. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7460. end;
  7461. {*----------------------------------------------------------------------------
  7462. | Returns the result of converting the quadruple-precision floating-point
  7463. | value `a' to the 32-bit two's complement integer format. The conversion
  7464. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7465. | Arithmetic---which means in particular that the conversion is rounded
  7466. | according to the current rounding mode. If `a' is a NaN, the largest
  7467. | positive integer is returned. Otherwise, if the conversion overflows, the
  7468. | largest integer with the same sign as `a' is returned.
  7469. *----------------------------------------------------------------------------*}
  7470. function float128_to_int32(a: float128): int32;
  7471. var
  7472. aSign: flag;
  7473. aExp, shiftCount: int32;
  7474. aSig0, aSig1: bits64;
  7475. begin
  7476. aSig1 := extractFloat128Frac1( a );
  7477. aSig0 := extractFloat128Frac0( a );
  7478. aExp := extractFloat128Exp( a );
  7479. aSign := extractFloat128Sign( a );
  7480. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7481. aSign := 0;
  7482. if ( aExp<>0 ) then
  7483. aSig0 := aSig0 or int64( $0001000000000000 );
  7484. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7485. shiftCount := $4028 - aExp;
  7486. if ( 0 < shiftCount ) then
  7487. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7488. result := roundAndPackInt32( aSign, aSig0 );
  7489. end;
  7490. {*----------------------------------------------------------------------------
  7491. | Returns the result of converting the quadruple-precision floating-point
  7492. | value `a' to the 32-bit two's complement integer format. The conversion
  7493. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7494. | Arithmetic, except that the conversion is always rounded toward zero. If
  7495. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7496. | conversion overflows, the largest integer with the same sign as `a' is
  7497. | returned.
  7498. *----------------------------------------------------------------------------*}
  7499. function float128_to_int32_round_to_zero(a: float128): int32;
  7500. var
  7501. aSign: flag;
  7502. aExp, shiftCount: int32;
  7503. aSig0, aSig1, savedASig: bits64;
  7504. z: int32;
  7505. label
  7506. invalid;
  7507. begin
  7508. aSig1 := extractFloat128Frac1( a );
  7509. aSig0 := extractFloat128Frac0( a );
  7510. aExp := extractFloat128Exp( a );
  7511. aSign := extractFloat128Sign( a );
  7512. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7513. if ( $401E < aExp ) then
  7514. begin
  7515. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7516. aSign := 0;
  7517. goto invalid;
  7518. end
  7519. else if ( aExp < $3FFF ) then
  7520. begin
  7521. if ( aExp or aSig0 )<>0 then
  7522. set_inexact_flag;
  7523. result := 0;
  7524. exit;
  7525. end;
  7526. aSig0 := aSig0 or int64( $0001000000000000 );
  7527. shiftCount := $402F - aExp;
  7528. savedASig := aSig0;
  7529. aSig0 := aSig0 shr shiftCount;
  7530. z := aSig0;
  7531. if ( aSign )<>0 then
  7532. z := - z;
  7533. if ( ord( z < 0 ) xor aSign )<>0 then
  7534. begin
  7535. invalid:
  7536. float_raise( float_flag_invalid );
  7537. if aSign<>0 then
  7538. result:= int32( $80000000 )
  7539. else
  7540. result:=$7FFFFFFF;
  7541. exit;
  7542. end;
  7543. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7544. begin
  7545. set_inexact_flag;
  7546. end;
  7547. result := z;
  7548. end;
  7549. {*----------------------------------------------------------------------------
  7550. | Returns the result of converting the quadruple-precision floating-point
  7551. | value `a' to the 64-bit two's complement integer format. The conversion
  7552. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7553. | Arithmetic---which means in particular that the conversion is rounded
  7554. | according to the current rounding mode. If `a' is a NaN, the largest
  7555. | positive integer is returned. Otherwise, if the conversion overflows, the
  7556. | largest integer with the same sign as `a' is returned.
  7557. *----------------------------------------------------------------------------*}
  7558. function float128_to_int64(a: float128): int64;
  7559. var
  7560. aSign: flag;
  7561. aExp, shiftCount: int32;
  7562. aSig0, aSig1: bits64;
  7563. begin
  7564. aSig1 := extractFloat128Frac1( a );
  7565. aSig0 := extractFloat128Frac0( a );
  7566. aExp := extractFloat128Exp( a );
  7567. aSign := extractFloat128Sign( a );
  7568. if ( aExp<>0 ) then
  7569. aSig0 := aSig0 or int64( $0001000000000000 );
  7570. shiftCount := $402F - aExp;
  7571. if ( shiftCount <= 0 ) then
  7572. begin
  7573. if ( $403E < aExp ) then
  7574. begin
  7575. float_raise( float_flag_invalid );
  7576. if ( (aSign=0)
  7577. or ( ( aExp = $7FFF )
  7578. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7579. )
  7580. ) then
  7581. begin
  7582. result := int64( $7FFFFFFFFFFFFFFF );
  7583. exit;
  7584. end;
  7585. result := int64( $8000000000000000 );
  7586. exit;
  7587. end;
  7588. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7589. end
  7590. else begin
  7591. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7592. end;
  7593. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7594. end;
  7595. {*----------------------------------------------------------------------------
  7596. | Returns the result of converting the quadruple-precision floating-point
  7597. | value `a' to the 64-bit two's complement integer format. The conversion
  7598. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7599. | Arithmetic, except that the conversion is always rounded toward zero.
  7600. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7601. | the conversion overflows, the largest integer with the same sign as `a' is
  7602. | returned.
  7603. *----------------------------------------------------------------------------*}
  7604. function float128_to_int64_round_to_zero(a: float128): int64;
  7605. var
  7606. aSign: flag;
  7607. aExp, shiftCount: int32;
  7608. aSig0, aSig1: bits64;
  7609. z: int64;
  7610. begin
  7611. aSig1 := extractFloat128Frac1( a );
  7612. aSig0 := extractFloat128Frac0( a );
  7613. aExp := extractFloat128Exp( a );
  7614. aSign := extractFloat128Sign( a );
  7615. if ( aExp<>0 ) then
  7616. aSig0 := aSig0 or int64( $0001000000000000 );
  7617. shiftCount := aExp - $402F;
  7618. if ( 0 < shiftCount ) then
  7619. begin
  7620. if ( $403E <= aExp ) then
  7621. begin
  7622. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7623. if ( ( a.high = bits64( $C03E000000000000 ) )
  7624. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7625. begin
  7626. if ( aSig1<>0 ) then
  7627. set_inexact_flag;
  7628. end
  7629. else begin
  7630. float_raise( float_flag_invalid );
  7631. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7632. begin
  7633. result := int64( $7FFFFFFFFFFFFFFF );
  7634. exit;
  7635. end;
  7636. end;
  7637. result := int64( $8000000000000000 );
  7638. exit;
  7639. end;
  7640. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7641. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7642. begin
  7643. set_inexact_flag;
  7644. end;
  7645. end
  7646. else begin
  7647. if ( aExp < $3FFF ) then
  7648. begin
  7649. if ( aExp or aSig0 or aSig1 )<>0 then
  7650. begin
  7651. set_inexact_flag;
  7652. end;
  7653. result := 0;
  7654. exit;
  7655. end;
  7656. z := aSig0 shr ( - shiftCount );
  7657. if ( (aSig1<>0)
  7658. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7659. begin
  7660. set_inexact_flag;
  7661. end;
  7662. end;
  7663. if ( aSign<>0 ) then
  7664. z := - z;
  7665. result := z;
  7666. end;
  7667. {*----------------------------------------------------------------------------
  7668. | Returns the result of converting the quadruple-precision floating-point
  7669. | value `a' to the single-precision floating-point format. The conversion
  7670. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7671. | Arithmetic.
  7672. *----------------------------------------------------------------------------*}
  7673. function float128_to_float32(a: float128): float32;
  7674. var
  7675. aSign: flag;
  7676. aExp: int32;
  7677. aSig0, aSig1: bits64;
  7678. zSig: bits32;
  7679. begin
  7680. aSig1 := extractFloat128Frac1( a );
  7681. aSig0 := extractFloat128Frac0( a );
  7682. aExp := extractFloat128Exp( a );
  7683. aSign := extractFloat128Sign( a );
  7684. if ( aExp = $7FFF ) then
  7685. begin
  7686. if ( aSig0 or aSig1 )<>0 then
  7687. begin
  7688. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7689. exit;
  7690. end;
  7691. result := packFloat32( aSign, $FF, 0 );
  7692. exit;
  7693. end;
  7694. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7695. shift64RightJamming( aSig0, 18, aSig0 );
  7696. zSig := aSig0;
  7697. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7698. begin
  7699. zSig := zSig or $40000000;
  7700. dec(aExp,$3F81);
  7701. end;
  7702. result := roundAndPackFloat32( aSign, aExp, zSig );
  7703. end;
  7704. {*----------------------------------------------------------------------------
  7705. | Returns the result of converting the quadruple-precision floating-point
  7706. | value `a' to the double-precision floating-point format. The conversion
  7707. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7708. | Arithmetic.
  7709. *----------------------------------------------------------------------------*}
  7710. function float128_to_float64(a: float128): float64;
  7711. var
  7712. aSign: flag;
  7713. aExp: int32;
  7714. aSig0, aSig1: bits64;
  7715. begin
  7716. aSig1 := extractFloat128Frac1( a );
  7717. aSig0 := extractFloat128Frac0( a );
  7718. aExp := extractFloat128Exp( a );
  7719. aSign := extractFloat128Sign( a );
  7720. if ( aExp = $7FFF ) then
  7721. begin
  7722. if ( aSig0 or aSig1 )<>0 then
  7723. begin
  7724. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7725. exit;
  7726. end;
  7727. result:=packFloat64( aSign, $7FF, 0);
  7728. exit;
  7729. end;
  7730. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7731. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7732. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7733. begin
  7734. aSig0 := aSig0 or int64( $4000000000000000 );
  7735. dec(aExp,$3C01);
  7736. end;
  7737. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7738. end;
  7739. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7740. {*----------------------------------------------------------------------------
  7741. | Returns the result of converting the quadruple-precision floating-point
  7742. | value `a' to the extended double-precision floating-point format. The
  7743. | conversion is performed according to the IEC/IEEE Standard for Binary
  7744. | Floating-Point Arithmetic.
  7745. *----------------------------------------------------------------------------*}
  7746. function float128_to_floatx80(a: float128): floatx80;
  7747. var
  7748. aSign: flag;
  7749. aExp: int32;
  7750. aSig0, aSig1: bits64;
  7751. begin
  7752. aSig1 := extractFloat128Frac1( a );
  7753. aSig0 := extractFloat128Frac0( a );
  7754. aExp := extractFloat128Exp( a );
  7755. aSign := extractFloat128Sign( a );
  7756. if ( aExp = $7FFF ) then begin
  7757. if ( aSig0 or aSig1 <> 0 ) then begin
  7758. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7759. exit;
  7760. end;
  7761. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7762. exit;
  7763. end;
  7764. if ( aExp = 0 ) then begin
  7765. if ( ( aSig0 or aSig1 ) = 0 ) then
  7766. begin
  7767. result := packFloatx80( aSign, 0, 0 );
  7768. exit;
  7769. end;
  7770. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7771. end
  7772. else begin
  7773. aSig0 := aSig0 or int64( $0001000000000000 );
  7774. end;
  7775. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7776. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7777. end;
  7778. {$endif FPC_SOFTFLOAT_FLOATX80}
  7779. {*----------------------------------------------------------------------------
  7780. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7781. | Returns the result as a quadruple-precision floating-point value. The
  7782. | operation is performed according to the IEC/IEEE Standard for Binary
  7783. | Floating-Point Arithmetic.
  7784. *----------------------------------------------------------------------------*}
  7785. function float128_round_to_int(a: float128): float128;
  7786. var
  7787. aSign: flag;
  7788. aExp: int32;
  7789. lastBitMask, roundBitsMask: bits64;
  7790. roundingMode: TFPURoundingMode;
  7791. z: float128;
  7792. begin
  7793. aExp := extractFloat128Exp( a );
  7794. if ( $402F <= aExp ) then
  7795. begin
  7796. if ( $406F <= aExp ) then
  7797. begin
  7798. if ( ( aExp = $7FFF )
  7799. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7800. ) then
  7801. begin
  7802. result := propagateFloat128NaN( a, a );
  7803. exit;
  7804. end;
  7805. result := a;
  7806. exit;
  7807. end;
  7808. lastBitMask := 1;
  7809. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7810. roundBitsMask := lastBitMask - 1;
  7811. z := a;
  7812. roundingMode := softfloat_rounding_mode;
  7813. if ( roundingMode = float_round_nearest_even ) then
  7814. begin
  7815. if ( lastBitMask )<>0 then
  7816. begin
  7817. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7818. if ( ( z.low and roundBitsMask ) = 0 ) then
  7819. z.low := z.low and not(lastBitMask);
  7820. end
  7821. else begin
  7822. if ( sbits64(z.low) < 0 ) then
  7823. begin
  7824. inc(z.high);
  7825. if ( bits64( z.low shl 1 ) = 0 ) then
  7826. z.high := z.high and not bits64( 1 );
  7827. end;
  7828. end;
  7829. end
  7830. else if ( roundingMode <> float_round_to_zero ) then
  7831. begin
  7832. if ( extractFloat128Sign( z )
  7833. xor ord( roundingMode = float_round_up ) )<>0 then
  7834. begin
  7835. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7836. end;
  7837. end;
  7838. z.low := z.low and not(roundBitsMask);
  7839. end
  7840. else begin
  7841. if ( aExp < $3FFF ) then
  7842. begin
  7843. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7844. begin
  7845. result := a;
  7846. exit;
  7847. end;
  7848. set_inexact_flag;
  7849. aSign := extractFloat128Sign( a );
  7850. case softfloat_rounding_mode of
  7851. float_round_nearest_even:
  7852. if ( ( aExp = $3FFE )
  7853. and ( (extractFloat128Frac0( a )<>0)
  7854. or (extractFloat128Frac1( a )<>0) )
  7855. ) then begin
  7856. begin
  7857. result := packFloat128( aSign, $3FFF, 0, 0 );
  7858. exit;
  7859. end;
  7860. end;
  7861. float_round_down:
  7862. begin
  7863. if aSign<>0 then
  7864. result:=packFloat128( 1, $3FFF, 0, 0 )
  7865. else
  7866. result:=packFloat128( 0, 0, 0, 0 );
  7867. exit;
  7868. end;
  7869. float_round_up:
  7870. begin
  7871. if aSign<>0 then
  7872. result := packFloat128( 1, 0, 0, 0 )
  7873. else
  7874. result:=packFloat128( 0, $3FFF, 0, 0 );
  7875. exit;
  7876. end;
  7877. end;
  7878. result := packFloat128( aSign, 0, 0, 0 );
  7879. exit;
  7880. end;
  7881. lastBitMask := 1;
  7882. lastBitMask := lastBitMask shl ($402F - aExp);
  7883. roundBitsMask := lastBitMask - 1;
  7884. z.low := 0;
  7885. z.high := a.high;
  7886. roundingMode := softfloat_rounding_mode;
  7887. if ( roundingMode = float_round_nearest_even ) then begin
  7888. inc(z.high,lastBitMask shr 1);
  7889. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7890. z.high := z.high and not(lastBitMask);
  7891. end;
  7892. end
  7893. else if ( roundingMode <> float_round_to_zero ) then begin
  7894. if ( (extractFloat128Sign( z )<>0)
  7895. xor ( roundingMode = float_round_up ) ) then begin
  7896. z.high := z.high or ord( a.low <> 0 );
  7897. z.high := z.high+roundBitsMask;
  7898. end;
  7899. end;
  7900. z.high := z.high and not(roundBitsMask);
  7901. end;
  7902. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7903. set_inexact_flag;
  7904. end;
  7905. result := z;
  7906. end;
  7907. {*----------------------------------------------------------------------------
  7908. | Returns the result of adding the absolute values of the quadruple-precision
  7909. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7910. | before being returned. `zSign' is ignored if the result is a NaN.
  7911. | The addition is performed according to the IEC/IEEE Standard for Binary
  7912. | Floating-Point Arithmetic.
  7913. *----------------------------------------------------------------------------*}
  7914. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7915. var
  7916. aExp, bExp, zExp: int32;
  7917. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7918. expDiff: int32;
  7919. label
  7920. shiftRight1,roundAndPack;
  7921. begin
  7922. aSig1 := extractFloat128Frac1( a );
  7923. aSig0 := extractFloat128Frac0( a );
  7924. aExp := extractFloat128Exp( a );
  7925. bSig1 := extractFloat128Frac1( b );
  7926. bSig0 := extractFloat128Frac0( b );
  7927. bExp := extractFloat128Exp( b );
  7928. expDiff := aExp - bExp;
  7929. if ( 0 < expDiff ) then begin
  7930. if ( aExp = $7FFF ) then begin
  7931. if ( aSig0 or aSig1 )<>0 then
  7932. begin
  7933. result := propagateFloat128NaN( a, b );
  7934. exit;
  7935. end;
  7936. result := a;
  7937. exit;
  7938. end;
  7939. if ( bExp = 0 ) then begin
  7940. dec(expDiff);
  7941. end
  7942. else begin
  7943. bSig0 := bSig0 or int64( $0001000000000000 );
  7944. end;
  7945. shift128ExtraRightJamming(
  7946. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7947. zExp := aExp;
  7948. end
  7949. else if ( expDiff < 0 ) then begin
  7950. if ( bExp = $7FFF ) then begin
  7951. if ( bSig0 or bSig1 )<>0 then
  7952. begin
  7953. result := propagateFloat128NaN( a, b );
  7954. exit;
  7955. end;
  7956. result := packFloat128( zSign, $7FFF, 0, 0 );
  7957. exit;
  7958. end;
  7959. if ( aExp = 0 ) then begin
  7960. inc(expDiff);
  7961. end
  7962. else begin
  7963. aSig0 := aSig0 or int64( $0001000000000000 );
  7964. end;
  7965. shift128ExtraRightJamming(
  7966. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7967. zExp := bExp;
  7968. end
  7969. else begin
  7970. if ( aExp = $7FFF ) then begin
  7971. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7972. result := propagateFloat128NaN( a, b );
  7973. exit;
  7974. end;
  7975. result := a;
  7976. exit;
  7977. end;
  7978. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7979. if ( aExp = 0 ) then
  7980. begin
  7981. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7982. exit;
  7983. end;
  7984. zSig2 := 0;
  7985. zSig0 := zSig0 or int64( $0002000000000000 );
  7986. zExp := aExp;
  7987. goto shiftRight1;
  7988. end;
  7989. aSig0 := aSig0 or int64( $0001000000000000 );
  7990. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7991. dec(zExp);
  7992. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7993. inc(zExp);
  7994. shiftRight1:
  7995. shift128ExtraRightJamming(
  7996. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7997. roundAndPack:
  7998. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7999. end;
  8000. {*----------------------------------------------------------------------------
  8001. | Returns the result of subtracting the absolute values of the quadruple-
  8002. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8003. | difference is negated before being returned. `zSign' is ignored if the
  8004. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8005. | Standard for Binary Floating-Point Arithmetic.
  8006. *----------------------------------------------------------------------------*}
  8007. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8008. var
  8009. aExp, bExp, zExp: int32;
  8010. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8011. expDiff: int32;
  8012. z: float128;
  8013. label
  8014. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8015. begin
  8016. aSig1 := extractFloat128Frac1( a );
  8017. aSig0 := extractFloat128Frac0( a );
  8018. aExp := extractFloat128Exp( a );
  8019. bSig1 := extractFloat128Frac1( b );
  8020. bSig0 := extractFloat128Frac0( b );
  8021. bExp := extractFloat128Exp( b );
  8022. expDiff := aExp - bExp;
  8023. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8024. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8025. if ( 0 < expDiff ) then goto aExpBigger;
  8026. if ( expDiff < 0 ) then goto bExpBigger;
  8027. if ( aExp = $7FFF ) then begin
  8028. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8029. result := propagateFloat128NaN( a, b );
  8030. exit;
  8031. end;
  8032. float_raise( float_flag_invalid );
  8033. z.low := float128_default_nan_low;
  8034. z.high := float128_default_nan_high;
  8035. result := z;
  8036. exit;
  8037. end;
  8038. if ( aExp = 0 ) then begin
  8039. aExp := 1;
  8040. bExp := 1;
  8041. end;
  8042. if ( bSig0 < aSig0 ) then goto aBigger;
  8043. if ( aSig0 < bSig0 ) then goto bBigger;
  8044. if ( bSig1 < aSig1 ) then goto aBigger;
  8045. if ( aSig1 < bSig1 ) then goto bBigger;
  8046. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8047. exit;
  8048. bExpBigger:
  8049. if ( bExp = $7FFF ) then begin
  8050. if ( bSig0 or bSig1 )<>0 then
  8051. begin
  8052. result := propagateFloat128NaN( a, b );
  8053. exit;
  8054. end;
  8055. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8056. exit;
  8057. end;
  8058. if ( aExp = 0 ) then begin
  8059. inc(expDiff);
  8060. end
  8061. else begin
  8062. aSig0 := aSig0 or int64( $4000000000000000 );
  8063. end;
  8064. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8065. bSig0 := bSig0 or int64( $4000000000000000 );
  8066. bBigger:
  8067. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8068. zExp := bExp;
  8069. zSign := zSign xor 1;
  8070. goto normalizeRoundAndPack;
  8071. aExpBigger:
  8072. if ( aExp = $7FFF ) then begin
  8073. if ( aSig0 or aSig1 )<>0 then
  8074. begin
  8075. result := propagateFloat128NaN( a, b );
  8076. exit;
  8077. end;
  8078. result := a;
  8079. exit;
  8080. end;
  8081. if ( bExp = 0 ) then begin
  8082. dec(expDiff);
  8083. end
  8084. else begin
  8085. bSig0 := bSig0 or int64( $4000000000000000 );
  8086. end;
  8087. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8088. aSig0 := aSig0 or int64( $4000000000000000 );
  8089. aBigger:
  8090. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8091. zExp := aExp;
  8092. normalizeRoundAndPack:
  8093. dec(zExp);
  8094. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8095. end;
  8096. {*----------------------------------------------------------------------------
  8097. | Returns the result of adding the quadruple-precision floating-point values
  8098. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8099. | for Binary Floating-Point Arithmetic.
  8100. *----------------------------------------------------------------------------*}
  8101. function float128_add(a: float128; b: float128): float128;
  8102. var
  8103. aSign, bSign: flag;
  8104. begin
  8105. aSign := extractFloat128Sign( a );
  8106. bSign := extractFloat128Sign( b );
  8107. if ( aSign = bSign ) then begin
  8108. result := addFloat128Sigs( a, b, aSign );
  8109. end
  8110. else begin
  8111. result := subFloat128Sigs( a, b, aSign );
  8112. end;
  8113. end;
  8114. {*----------------------------------------------------------------------------
  8115. | Returns the result of subtracting the quadruple-precision floating-point
  8116. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8117. | Standard for Binary Floating-Point Arithmetic.
  8118. *----------------------------------------------------------------------------*}
  8119. function float128_sub(a: float128; b: float128): float128;
  8120. var
  8121. aSign, bSign: flag;
  8122. begin
  8123. aSign := extractFloat128Sign( a );
  8124. bSign := extractFloat128Sign( b );
  8125. if ( aSign = bSign ) then begin
  8126. result := subFloat128Sigs( a, b, aSign );
  8127. end
  8128. else begin
  8129. result := addFloat128Sigs( a, b, aSign );
  8130. end;
  8131. end;
  8132. {*----------------------------------------------------------------------------
  8133. | Returns the result of multiplying the quadruple-precision floating-point
  8134. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8135. | Standard for Binary Floating-Point Arithmetic.
  8136. *----------------------------------------------------------------------------*}
  8137. function float128_mul(a: float128; b: float128): float128;
  8138. var
  8139. aSign, bSign, zSign: flag;
  8140. aExp, bExp, zExp: int32;
  8141. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8142. z: float128;
  8143. label
  8144. invalid;
  8145. begin
  8146. aSig1 := extractFloat128Frac1( a );
  8147. aSig0 := extractFloat128Frac0( a );
  8148. aExp := extractFloat128Exp( a );
  8149. aSign := extractFloat128Sign( a );
  8150. bSig1 := extractFloat128Frac1( b );
  8151. bSig0 := extractFloat128Frac0( b );
  8152. bExp := extractFloat128Exp( b );
  8153. bSign := extractFloat128Sign( b );
  8154. zSign := aSign xor bSign;
  8155. if ( aExp = $7FFF ) then begin
  8156. if ( (( aSig0 or aSig1 )<>0)
  8157. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8158. result := propagateFloat128NaN( a, b );
  8159. exit;
  8160. end;
  8161. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8162. result := packFloat128( zSign, $7FFF, 0, 0 );
  8163. exit;
  8164. end;
  8165. if ( bExp = $7FFF ) then begin
  8166. if ( bSig0 or bSig1 )<>0 then
  8167. begin
  8168. result := propagateFloat128NaN( a, b );
  8169. exit;
  8170. end;
  8171. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8172. invalid:
  8173. float_raise( float_flag_invalid );
  8174. z.low := float128_default_nan_low;
  8175. z.high := float128_default_nan_high;
  8176. result := z;
  8177. exit;
  8178. end;
  8179. result := packFloat128( zSign, $7FFF, 0, 0 );
  8180. exit;
  8181. end;
  8182. if ( aExp = 0 ) then begin
  8183. if ( ( aSig0 or aSig1 ) = 0 ) then
  8184. begin
  8185. result := packFloat128( zSign, 0, 0, 0 );
  8186. exit;
  8187. end;
  8188. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8189. end;
  8190. if ( bExp = 0 ) then begin
  8191. if ( ( bSig0 or bSig1 ) = 0 ) then
  8192. begin
  8193. result := packFloat128( zSign, 0, 0, 0 );
  8194. exit;
  8195. end;
  8196. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8197. end;
  8198. zExp := aExp + bExp - $4000;
  8199. aSig0 := aSig0 or int64( $0001000000000000 );
  8200. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8201. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8202. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8203. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8204. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8205. shift128ExtraRightJamming(
  8206. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8207. inc(zExp);
  8208. end;
  8209. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8210. end;
  8211. {*----------------------------------------------------------------------------
  8212. | Returns the result of dividing the quadruple-precision floating-point value
  8213. | `a' by the corresponding value `b'. The operation is performed according to
  8214. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8215. *----------------------------------------------------------------------------*}
  8216. function float128_div(a: float128; b: float128): float128;
  8217. var
  8218. aSign, bSign, zSign: flag;
  8219. aExp, bExp, zExp: int32;
  8220. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8221. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8222. z: float128;
  8223. label
  8224. invalid;
  8225. begin
  8226. aSig1 := extractFloat128Frac1( a );
  8227. aSig0 := extractFloat128Frac0( a );
  8228. aExp := extractFloat128Exp( a );
  8229. aSign := extractFloat128Sign( a );
  8230. bSig1 := extractFloat128Frac1( b );
  8231. bSig0 := extractFloat128Frac0( b );
  8232. bExp := extractFloat128Exp( b );
  8233. bSign := extractFloat128Sign( b );
  8234. zSign := aSign xor bSign;
  8235. if ( aExp = $7FFF ) then begin
  8236. if ( aSig0 or aSig1 )<>0 then
  8237. begin
  8238. result := propagateFloat128NaN( a, b );
  8239. exit;
  8240. end;
  8241. if ( bExp = $7FFF ) then begin
  8242. if ( bSig0 or bSig1 )<>0 then
  8243. begin
  8244. result := propagateFloat128NaN( a, b );
  8245. exit;
  8246. end;
  8247. goto invalid;
  8248. end;
  8249. result := packFloat128( zSign, $7FFF, 0, 0 );
  8250. exit;
  8251. end;
  8252. if ( bExp = $7FFF ) then begin
  8253. if ( bSig0 or bSig1 )<>0 then
  8254. begin
  8255. result := propagateFloat128NaN( a, b );
  8256. exit;
  8257. end;
  8258. result := packFloat128( zSign, 0, 0, 0 );
  8259. exit;
  8260. end;
  8261. if ( bExp = 0 ) then begin
  8262. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8263. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8264. invalid:
  8265. float_raise( float_flag_invalid );
  8266. z.low := float128_default_nan_low;
  8267. z.high := float128_default_nan_high;
  8268. result := z;
  8269. exit;
  8270. end;
  8271. float_raise( float_flag_divbyzero );
  8272. result := packFloat128( zSign, $7FFF, 0, 0 );
  8273. exit;
  8274. end;
  8275. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8276. end;
  8277. if ( aExp = 0 ) then begin
  8278. if ( ( aSig0 or aSig1 ) = 0 ) then
  8279. begin
  8280. result := packFloat128( zSign, 0, 0, 0 );
  8281. exit;
  8282. end;
  8283. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8284. end;
  8285. zExp := aExp - bExp + $3FFD;
  8286. shortShift128Left(
  8287. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8288. shortShift128Left(
  8289. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8290. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8291. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8292. inc(zExp);
  8293. end;
  8294. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8295. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8296. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8297. while ( sbits64(rem0) < 0 ) do begin
  8298. dec(zSig0);
  8299. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8300. end;
  8301. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8302. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8303. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8304. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8305. while ( sbits64(rem1) < 0 ) do begin
  8306. dec(zSig1);
  8307. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8308. end;
  8309. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8310. end;
  8311. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8312. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8313. end;
  8314. {*----------------------------------------------------------------------------
  8315. | Returns the remainder of the quadruple-precision floating-point value `a'
  8316. | with respect to the corresponding value `b'. The operation is performed
  8317. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8318. *----------------------------------------------------------------------------*}
  8319. function float128_rem(a: float128; b: float128): float128;
  8320. var
  8321. aSign, zSign: flag;
  8322. aExp, bExp, expDiff: int32;
  8323. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8324. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8325. sigMean0: sbits64;
  8326. z: float128;
  8327. label
  8328. invalid;
  8329. begin
  8330. aSig1 := extractFloat128Frac1( a );
  8331. aSig0 := extractFloat128Frac0( a );
  8332. aExp := extractFloat128Exp( a );
  8333. aSign := extractFloat128Sign( a );
  8334. bSig1 := extractFloat128Frac1( b );
  8335. bSig0 := extractFloat128Frac0( b );
  8336. bExp := extractFloat128Exp( b );
  8337. if ( aExp = $7FFF ) then begin
  8338. if ( (( aSig0 or aSig1 )<>0)
  8339. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8340. result := propagateFloat128NaN( a, b );
  8341. exit;
  8342. end;
  8343. goto invalid;
  8344. end;
  8345. if ( bExp = $7FFF ) then begin
  8346. if ( bSig0 or bSig1 )<>0 then
  8347. begin
  8348. result := propagateFloat128NaN( a, b );
  8349. exit;
  8350. end;
  8351. result := a;
  8352. exit;
  8353. end;
  8354. if ( bExp = 0 ) then begin
  8355. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8356. invalid:
  8357. float_raise( float_flag_invalid );
  8358. z.low := float128_default_nan_low;
  8359. z.high := float128_default_nan_high;
  8360. result := z;
  8361. exit;
  8362. end;
  8363. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8364. end;
  8365. if ( aExp = 0 ) then begin
  8366. if ( ( aSig0 or aSig1 ) = 0 ) then
  8367. begin
  8368. result := a;
  8369. exit;
  8370. end;
  8371. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8372. end;
  8373. expDiff := aExp - bExp;
  8374. if ( expDiff < -1 ) then
  8375. begin
  8376. result := a;
  8377. exit;
  8378. end;
  8379. shortShift128Left(
  8380. aSig0 or int64( $0001000000000000 ),
  8381. aSig1,
  8382. 15 - ord( expDiff < 0 ),
  8383. aSig0,
  8384. aSig1
  8385. );
  8386. shortShift128Left(
  8387. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8388. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8389. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8390. dec(expDiff,64);
  8391. while ( 0 < expDiff ) do begin
  8392. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8393. if ( 4 < q ) then
  8394. q := q - 4
  8395. else
  8396. q := 0;
  8397. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8398. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8399. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8400. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8401. dec(expDiff,61);
  8402. end;
  8403. if ( -64 < expDiff ) then begin
  8404. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8405. if ( 4 < q ) then
  8406. q := q - 4
  8407. else
  8408. q := 0;
  8409. q := q shr (- expDiff);
  8410. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8411. inc(expDiff,52);
  8412. if ( expDiff < 0 ) then begin
  8413. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8414. end
  8415. else begin
  8416. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8417. end;
  8418. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8419. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8420. end
  8421. else begin
  8422. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8423. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8424. end;
  8425. repeat
  8426. alternateASig0 := aSig0;
  8427. alternateASig1 := aSig1;
  8428. inc(q);
  8429. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8430. until not( 0 <= sbits64(aSig0) );
  8431. add128(
  8432. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8433. if ( ( sigMean0 < 0 )
  8434. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8435. aSig0 := alternateASig0;
  8436. aSig1 := alternateASig1;
  8437. end;
  8438. zSign := ord( sbits64(aSig0) < 0 );
  8439. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8440. result :=
  8441. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8442. end;
  8443. {*----------------------------------------------------------------------------
  8444. | Returns the square root of the quadruple-precision floating-point value `a'.
  8445. | The operation is performed according to the IEC/IEEE Standard for Binary
  8446. | Floating-Point Arithmetic.
  8447. *----------------------------------------------------------------------------*}
  8448. function float128_sqrt(a: float128): float128;
  8449. var
  8450. aSign: flag;
  8451. aExp, zExp: int32;
  8452. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8453. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8454. z: float128;
  8455. label
  8456. invalid;
  8457. begin
  8458. aSig1 := extractFloat128Frac1( a );
  8459. aSig0 := extractFloat128Frac0( a );
  8460. aExp := extractFloat128Exp( a );
  8461. aSign := extractFloat128Sign( a );
  8462. if ( aExp = $7FFF ) then begin
  8463. if ( aSig0 or aSig1 )<>0 then
  8464. begin
  8465. result := propagateFloat128NaN( a, a );
  8466. exit;
  8467. end;
  8468. if ( aSign=0 ) then
  8469. begin
  8470. result := a;
  8471. exit;
  8472. end;
  8473. goto invalid;
  8474. end;
  8475. if ( aSign<>0 ) then begin
  8476. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8477. begin
  8478. result := a;
  8479. exit;
  8480. end;
  8481. invalid:
  8482. float_raise( float_flag_invalid );
  8483. z.low := float128_default_nan_low;
  8484. z.high := float128_default_nan_high;
  8485. result := z;
  8486. exit;
  8487. end;
  8488. if ( aExp = 0 ) then begin
  8489. if ( ( aSig0 or aSig1 ) = 0 ) then
  8490. begin
  8491. result := packFloat128( 0, 0, 0, 0 );
  8492. exit;
  8493. end;
  8494. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8495. end;
  8496. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8497. aSig0 := aSig0 or int64( $0001000000000000 );
  8498. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8499. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8500. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8501. doubleZSig0 := zSig0 shl 1;
  8502. mul64To128( zSig0, zSig0, term0, term1 );
  8503. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8504. while ( sbits64(rem0) < 0 ) do begin
  8505. dec(zSig0);
  8506. dec(doubleZSig0,2);
  8507. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8508. end;
  8509. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8510. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8511. if ( zSig1 = 0 ) then zSig1 := 1;
  8512. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8513. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8514. mul64To128( zSig1, zSig1, term2, term3 );
  8515. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8516. while ( sbits64(rem1) < 0 ) do begin
  8517. dec(zSig1);
  8518. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8519. term3 := term3 or 1;
  8520. term2 := term2 or doubleZSig0;
  8521. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8522. end;
  8523. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8524. end;
  8525. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8526. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8527. end;
  8528. {*----------------------------------------------------------------------------
  8529. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8530. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8531. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8532. *----------------------------------------------------------------------------*}
  8533. function float128_eq(a: float128; b: float128): flag;
  8534. begin
  8535. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8536. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8537. or ( ( extractFloat128Exp( b ) = $7FFF )
  8538. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8539. ) then begin
  8540. if ( (float128_is_signaling_nan( a )<>0)
  8541. or (float128_is_signaling_nan( b )<>0) ) then begin
  8542. float_raise( float_flag_invalid );
  8543. end;
  8544. result := 0;
  8545. exit;
  8546. end;
  8547. result := ord(
  8548. ( a.low = b.low )
  8549. and ( ( a.high = b.high )
  8550. or ( ( a.low = 0 )
  8551. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8552. ));
  8553. end;
  8554. {*----------------------------------------------------------------------------
  8555. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8556. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8557. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8558. | Arithmetic.
  8559. *----------------------------------------------------------------------------*}
  8560. function float128_le(a: float128; b: float128): flag;
  8561. var
  8562. aSign, bSign: flag;
  8563. begin
  8564. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8565. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8566. or ( ( extractFloat128Exp( b ) = $7FFF )
  8567. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8568. ) then begin
  8569. float_raise( float_flag_invalid );
  8570. result := 0;
  8571. exit;
  8572. end;
  8573. aSign := extractFloat128Sign( a );
  8574. bSign := extractFloat128Sign( b );
  8575. if ( aSign <> bSign ) then begin
  8576. result := ord(
  8577. (aSign<>0)
  8578. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8579. = 0 ));
  8580. exit;
  8581. end;
  8582. if aSign<>0 then
  8583. result := le128( b.high, b.low, a.high, a.low )
  8584. else
  8585. result := le128( a.high, a.low, b.high, b.low );
  8586. end;
  8587. {*----------------------------------------------------------------------------
  8588. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8589. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8590. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8591. *----------------------------------------------------------------------------*}
  8592. function float128_lt(a: float128; b: float128): flag;
  8593. var
  8594. aSign, bSign: flag;
  8595. begin
  8596. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8597. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8598. or ( ( extractFloat128Exp( b ) = $7FFF )
  8599. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8600. ) then begin
  8601. float_raise( float_flag_invalid );
  8602. result := 0;
  8603. exit;
  8604. end;
  8605. aSign := extractFloat128Sign( a );
  8606. bSign := extractFloat128Sign( b );
  8607. if ( aSign <> bSign ) then begin
  8608. result := ord(
  8609. (aSign<>0)
  8610. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8611. <> 0 ));
  8612. exit;
  8613. end;
  8614. if aSign<>0 then
  8615. result := lt128( b.high, b.low, a.high, a.low )
  8616. else
  8617. result := lt128( a.high, a.low, b.high, b.low );
  8618. end;
  8619. {*----------------------------------------------------------------------------
  8620. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8621. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8622. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8623. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8624. *----------------------------------------------------------------------------*}
  8625. function float128_eq_signaling(a: float128; b: float128): flag;
  8626. begin
  8627. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8628. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8629. or ( ( extractFloat128Exp( b ) = $7FFF )
  8630. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8631. ) then begin
  8632. float_raise( float_flag_invalid );
  8633. result := 0;
  8634. exit;
  8635. end;
  8636. result := ord(
  8637. ( a.low = b.low )
  8638. and ( ( a.high = b.high )
  8639. or ( ( a.low = 0 )
  8640. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8641. ));
  8642. end;
  8643. {*----------------------------------------------------------------------------
  8644. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8645. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8646. | cause an exception. Otherwise, the comparison is performed according to the
  8647. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8648. *----------------------------------------------------------------------------*}
  8649. function float128_le_quiet(a: float128; b: float128): flag;
  8650. var
  8651. aSign, bSign: flag;
  8652. begin
  8653. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8654. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8655. or ( ( extractFloat128Exp( b ) = $7FFF )
  8656. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8657. ) then begin
  8658. if ( (float128_is_signaling_nan( a )<>0)
  8659. or (float128_is_signaling_nan( b )<>0) ) then begin
  8660. float_raise( float_flag_invalid );
  8661. end;
  8662. result := 0;
  8663. exit;
  8664. end;
  8665. aSign := extractFloat128Sign( a );
  8666. bSign := extractFloat128Sign( b );
  8667. if ( aSign <> bSign ) then begin
  8668. result := ord(
  8669. (aSign<>0)
  8670. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8671. = 0 ));
  8672. exit;
  8673. end;
  8674. if aSign<>0 then
  8675. result := le128( b.high, b.low, a.high, a.low )
  8676. else
  8677. result := le128( a.high, a.low, b.high, b.low );
  8678. end;
  8679. {*----------------------------------------------------------------------------
  8680. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8681. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8682. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8683. | Standard for Binary Floating-Point Arithmetic.
  8684. *----------------------------------------------------------------------------*}
  8685. function float128_lt_quiet(a: float128; b: float128): flag;
  8686. var
  8687. aSign, bSign: flag;
  8688. begin
  8689. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8690. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8691. or ( ( extractFloat128Exp( b ) = $7FFF )
  8692. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8693. ) then begin
  8694. if ( (float128_is_signaling_nan( a )<>0)
  8695. or (float128_is_signaling_nan( b )<>0) ) then begin
  8696. float_raise( float_flag_invalid );
  8697. end;
  8698. result := 0;
  8699. exit;
  8700. end;
  8701. aSign := extractFloat128Sign( a );
  8702. bSign := extractFloat128Sign( b );
  8703. if ( aSign <> bSign ) then begin
  8704. result := ord(
  8705. (aSign<>0)
  8706. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8707. <> 0 ));
  8708. exit;
  8709. end;
  8710. if aSign<>0 then
  8711. result:=lt128( b.high, b.low, a.high, a.low )
  8712. else
  8713. result:=lt128( a.high, a.low, b.high, b.low );
  8714. end;
  8715. {----------------------------------------------------------------------------
  8716. | Returns the result of converting the double-precision floating-point value
  8717. | `a' to the quadruple-precision floating-point format. The conversion is
  8718. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8719. | Arithmetic.
  8720. *----------------------------------------------------------------------------}
  8721. function float64_to_float128( a : float64) : float128;
  8722. var
  8723. aSign : flag;
  8724. aExp : int16;
  8725. aSig, zSig0, zSig1 : bits64;
  8726. begin
  8727. aSig := extractFloat64Frac( a );
  8728. aExp := extractFloat64Exp( a );
  8729. aSign := extractFloat64Sign( a );
  8730. if ( aExp = $7FF ) then begin
  8731. if ( aSig<>0 ) then begin
  8732. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8733. exit;
  8734. end;
  8735. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8736. exit;
  8737. end;
  8738. if ( aExp = 0 ) then begin
  8739. if ( aSig = 0 ) then
  8740. begin
  8741. result:=packFloat128( aSign, 0, 0, 0 );
  8742. exit;
  8743. end;
  8744. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8745. dec(aExp);
  8746. end;
  8747. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8748. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8749. end;
  8750. {$endif FPC_SOFTFLOAT_FLOAT128}
  8751. {$endif not(defined(fpc_softfpu_interface))}
  8752. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8753. end.
  8754. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}